In [2]:
import pandas as pd 
import wikipedia
import os,re

**Step 1: Parsing Wikipedia By-election Data**

This notebook first parses (by-)election data from Wikipedia into a set of `.tsv` (tab-separated) files.

In [6]:
# Wikipedia "List of United Kingdom by-elections" pages, one per period.
# The en dash between the two years is URL-encoded as %E2%80%93.
urls = [
    "https://en.wikipedia.org/wiki/List_of_United_Kingdom_by-elections_(1918%E2%80%931931)",
    "https://en.wikipedia.org/wiki/List_of_United_Kingdom_by-elections_(1931%E2%80%931950)",
    "https://en.wikipedia.org/wiki/List_of_United_Kingdom_by-elections_(1950%E2%80%931979)",
    "https://en.wikipedia.org/wiki/List_of_United_Kingdom_by-elections_(1979%E2%80%932010)",
    "https://en.wikipedia.org/wiki/List_of_United_Kingdom_by-elections_(2010%E2%80%93present)",
]

In [7]:
def transform(url):
    """Download the first table on a page and strip footnote markers from it.

    Parameters
    ----------
    url : str
        Address of the page; passed unchanged to ``pandas.read_html``.

    Returns
    -------
    pandas.DataFrame
        The first table found on the page, restricted to the columns in which
        no cell renders as "nan", with bracketed footnote markers such as
        ``[1]`` or ``[note 3]`` removed from every string cell. Non-string
        cells are returned unchanged.
    """
    # Match one bracketed marker at a time. A pattern that is allowed to run
    # across closing brackets would turn "A[1] B[2]" into "A", silently
    # dropping the text between two markers.
    footnote = re.compile(r'\[[^\]]+\]')

    table = pd.read_html(url)[0]
    complete = [c for c in table.columns if all(str(x) != "nan" for x in table[c])]

    # Copy so the assignments below write to an independent frame instead of a
    # slice of `table` (avoids SettingWithCopyWarning).
    df = table[complete].copy()
    for c in df.columns:
        # Only strings can carry footnote markers; numbers and other objects are
        # left alone rather than making the regex call raise TypeError.
        df[c] = [footnote.sub('', x) if isinstance(x, str) else x for x in df[c]]
    return df

In [16]:
# Parse and write data. Use years in URLs for file names (I changed the last one to "2010-2020" manually)
# For every page in `urls`: derive a file name from the period in the URL, clean the
# page's table with `transform`, and save it as a tab-separated file.
# NOTE(review): the end-year slice takes the last four characters before ')', so the
# "...present)" URL yields "sent" instead of a year -- hence the manual rename noted above.
for u in urls:
    start_year = u.split('(')[1][:4]  # first four characters after the first '('
    end_year =  u.split(')')[0][-4:]  # last four characters before the first ')'
    fname = f"byelection-{start_year}-{end_year}.tsv"
    print(fname)
    d = transform(u)
    # NOTE(review): hard-coded absolute output path -- adjust before running elsewhere.
    # No index=False is passed, so the DataFrame index is written as the first column.
    d.to_csv(f'/media/ruben/Elements/PhD/data/hansard/resources/wikipedia-election-results/{fname}',sep='\t')

**Step 2: Parsing Wikipedia Election Data**

After the byelections, do the same for the regular elections. Note that in 1974 two general elections were held.

In [4]:
# General elections to fetch. The two elections held in 1974 are given as string
# labels; every other election is identified by its year as an integer.
list_years = [
    1918, 1922, 1923, 1924, 1929,
    1931, 1935, 1945, 1950, 1951,
    1955, 1959, 1964, 1966,
    "February_1974", "October_1974",
    1979, 1983, 1987, 1992, 1997,
    2001, 2005, 2010, 2015, 2017, 2019,
]

def parse_members(year):
    """Fetch and tidy the Wikipedia list of MPs elected at one general election.

    Parameters
    ----------
    year : int or str
        Election identifier, e.g. ``1918`` or ``"February_1974"``. Its last four
        characters must form the calendar year.

    Returns
    -------
    pandas.DataFrame
        The first table on the page with at least 350 rows. Rows whose first
        cell contains ``[`` are dropped, every cell is converted to a string
        with bracketed footnote markers removed and "Co. " expanded to
        "Coalition ", and anything after the first "." in a column name is cut
        off. For elections from 2001 up to (not including) 2010 only the first
        three columns are kept and named ``constituency``, ``name``, ``party``;
        for earlier elections the same names are applied only when the table
        has exactly three columns. Tables for 2010 and later are returned
        without renaming.

    Raises
    ------
    IndexError
        If the Wikipedia search returns no result, or the page holds no table
        with at least 350 rows.
    """
    election_year = int(str(year)[-4:])
    # Match one bracketed marker at a time; a pattern that may run across closing
    # brackets would also swallow the text between two markers.
    footnote = re.compile(r'\[[^\]]+\]')

    candidates = wikipedia.search(f'List of MPs elected in the {year} United Kingdom general election')
    if not candidates:
        raise IndexError(f"Wikipedia search returned no page for election {year!r}")
    url = f"https://en.wikipedia.org/wiki/{candidates[0].replace(' ','_')}"

    large_tables = [t for t in pd.read_html(url) if len(t) >= 350]
    if not large_tables:
        raise IndexError(f"no table with at least 350 rows found at {url}")
    df = large_tables[0]

    # Drop rows whose first cell contains a literal '['. Copy the result so the
    # column assignments below do not write into a slice of the parsed table.
    has_bracket = df.iloc[:, 0].astype(str).str.contains('[', regex=False)
    df = df[~has_bracket].copy()

    # A first column labelled 0 means the table was parsed without a header:
    # promote the first remaining row to column names.
    if str(df.columns[0]) == "0":
        df.columns = df.iloc[0]
        df = df.drop(df.index[0]).reset_index(drop=True)

    for c in df.columns:
        df[c] = [footnote.sub('', str(x)).replace('Co. ', 'Coalition ') for x in df[c]]

    # Cut suffixes such as ".1" from column names. Non-string labels (for example
    # NaN coming from a promoted header row) are kept as they are instead of
    # raising TypeError.
    df.columns = [x.split('.')[0] if isinstance(x, str) else x for x in df.columns]

    standard_columns = ["constituency", "name", "party"]
    if 2001 <= election_year < 2010:
        df = df.iloc[:, :3].copy()
        df.columns = standard_columns
    elif election_year < 2001 and len(df.columns) == 3:
        df.columns = standard_columns

    return df

In [170]:
# Manual check, as with the by-elections: parse the member list for every election in
# `list_years`. The tables for elections after 2010 turned out to have no party column;
# the party abbreviation follows the member's name in parentheses instead. Those tables
# are discarded, as this step is only required for the period 1918-2010.
# NOTE(review): the export below is commented out, so this loop only parses each table
# and rebinds `d`; nothing is written to disk.

for x in list_years:
    d = parse_members(x)
    #d.to_csv(f'/media/ruben/Elements/PhD/data/hansard/resources/wikipedia-election-results/election-{str(x).replace(" ","_").lower()}.tsv',sep='\t',index=False)