In [2]:
import pandas as pd
from glob import glob as gb
import os
from fuzzywuzzy import fuzz
import wikipedia
from wikidata.client import Client
from tqdm import tqdm
# Shared Wikidata client; the enrichment loop calls client.get(<QID>).label on it
# to turn party QIDs into readable labels.
client = Client()
import requests
from bs4 import BeautifulSoup as bs
from fuzzywuzzy import process
import itertools
from collections import Counter

In [1]:
def load_soup(wd_key, timeout=30):
    """Download one Wikidata entity via ``wbgetentities`` and parse the XML reply.

    Parameters
    ----------
    wd_key : str
        Entity id sent as the ``ids`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error. Without a timeout a single stalled connection blocks the run
        indefinitely.

    Returns
    -------
    BeautifulSoup
        Parsed response body. The HTTP status is not checked, so an error
        page is parsed and returned like any other reply.
    """
    r = requests.get(
        "https://www.wikidata.org/w/api.php",
        # Let requests build and escape the query string.
        params={"action": "wbgetentities", "ids": wd_key, "format": "xml"},
        timeout=timeout,
    )
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
    # installed. Left as-is because the lookups downstream were written
    # against that tree shape; pin a parser only after re-checking them.
    s = bs(r.content)
    return s

def find_party_claim(s):
    """Return the id of the first P102 (member of political party) claim value.

    ``s`` is the parsed entity document. Any failure along the way (no P102
    element, missing nested tag, missing ``id`` attribute, ...) yields the
    string ``"na"`` instead of raising.
    """
    try:
        party_props = s.find_all("property", {"id": "P102"})
        # Keep only P102 elements whose first descendant is a <claim>;
        # presumably this skips P102 elements nested in other positions.
        top_level = [
            prop for prop in party_props
            if prop.findChildren()[0].name == "claim"
        ]
        claim = top_level[0].find_all('claim')[0]
        mainsnak = claim.find_all('mainsnak')[0]
        datavalue = mainsnak.find_all('datavalue')[0]
        value = datavalue.find_all('value')[0]
        return value.attrs['id']
    except Exception:
        return "na"

def find_parl_group(s, filename_df):
    """Return the parliamentary group (P4100) recorded for one parliamentary term.

    The term is identified by the QID at the end of ``filename_df``'s base
    name, i.e. the part after the last ``-`` with the extension removed
    (``.../1918-Q41582582.tsv`` -> ``Q41582582``). Each P39 (position held)
    claim in ``s`` whose value id equals that QID contributes its P4100
    qualifier value id.

    Parameters
    ----------
    s : parsed entity document (as returned by ``load_soup``)
    filename_df : str
        Path of the member file being enriched.

    Returns
    -------
    str
        The group's QID, or ``"na"`` when the entity has no P39 element, no
        matching claim, or the matching claim has no P4100 qualifier.
    """
    try:
        term_key = os.path.splitext(os.path.basename(filename_df))[0].split('-')[-1]

        positions_prop = s.find("property", {"id": "P39"})
        if positions_prop is None:
            return "na"

        groups = {}
        for claim in positions_prop.find_all('claim'):
            position = claim.find('value')
            group_prop = claim.find("property", {"id": "P4100"})
            group = group_prop.find('value') if group_prop is not None else None
            # A claim without a group qualifier must not abort the lookup
            # for the other claims, so it is simply skipped.
            if position is None or group is None:
                continue
            position_id = position.attrs.get('id')
            group_id = group.attrs.get('id')
            if position_id is None or group_id is None:
                continue
            groups[position_id] = group_id

        return groups.get(term_key, "na")
    # Only lookup-style failures are treated as "not found"; programming
    # errors (e.g. a NameError) are no longer silently turned into "na".
    except (AttributeError, KeyError, TypeError):
        return "na"

def find_party_wikipedia_election(df_elections, name, min_score=90):
    """Look up a member's party in a general-election results table.

    An exact match on the ``name`` column wins. Otherwise the closest name by
    ``fuzz.token_set_ratio`` is accepted when its score is strictly greater
    than ``min_score``.

    Parameters
    ----------
    df_elections : pandas.DataFrame
        Must have ``name`` and ``party`` columns.
    name : str
        Member name to look up.
    min_score : int, optional
        Fuzzy-match score that must be exceeded (default 90).

    Returns
    -------
    The ``party`` cell of the first matching row, or ``"na"`` when nothing
    matches, the table has no names, or ``name`` is not a string.
    """
    exact = df_elections.loc[df_elections['name'] == name, 'party']
    if len(exact) != 0:
        return exact.iloc[0]

    # A missing name (NaN) cannot be fuzzy-matched.
    if not isinstance(name, str):
        return "na"

    candidates = set(df_elections['name'].dropna())
    if not candidates:
        return "na"

    matches = process.extract(name, candidates, limit=1, scorer=fuzz.token_set_ratio)
    if not matches:
        return "na"

    best_name, best_score = matches[0][0], matches[0][1]
    if best_score > min_score:
        return df_elections.loc[df_elections['name'] == best_name, 'party'].iloc[0]
    return "na"

def find_party_wikipedia_byelection(df_byelections, name, min_score=90):
    """Look up a member's party in a by-election results table.

    An exact match on the ``Winner`` column wins. Otherwise the closest
    winner name by ``fuzz.token_set_ratio`` is accepted when its score is
    strictly greater than ``min_score``.

    Parameters
    ----------
    df_byelections : pandas.DataFrame
        Must have ``Winner`` and ``PartyNew`` columns.
    name : str
        Member name to look up.
    min_score : int, optional
        Fuzzy-match score that must be exceeded (default 90).

    Returns
    -------
    The ``PartyNew`` cell of the first matching row, or ``"na"`` when nothing
    matches, the table has no winners, or ``name`` is not a string.
    """
    exact = df_byelections.loc[df_byelections['Winner'] == name, 'PartyNew']
    if len(exact) != 0:
        return exact.iloc[0]

    # A missing name (NaN) cannot be fuzzy-matched.
    if not isinstance(name, str):
        return "na"

    candidates = set(df_byelections['Winner'].dropna())
    if not candidates:
        return "na"

    matches = process.extract(name, candidates, limit=1, scorer=fuzz.token_set_ratio)
    if not matches:
        return "na"

    best_name, best_score = matches[0][0], matches[0][1]
    if best_score > min_score:
        return df_byelections.loc[df_byelections['Winner'] == best_name, 'PartyNew'].iloc[0]
    return "na"

In [43]:
# Root folder of the Hansard resources; every path below hangs off it.
RESOURCES_DIR = '/media/ruben/Elements/PhD/data/hansard/resources'

# Member files named "<year>-Q<id>..." for the years 1917-1931 (inclusive).
# Files this notebook wrote earlier ("...-enriched.tsv") live in the same
# folder and also contain "-Q", so they are excluded explicitly; otherwise a
# re-run would enrich its own output.
list_wikidata = [
    x for x in gb(f'{RESOURCES_DIR}/wikidata-members/reformatted/*')
    if "-Q" in x
    and not x.endswith('-enriched.tsv')
    and 1917 <= int(x.split('-Q')[0][-4:]) <= 1931
]

# Maps the file-name part before "-Q" (the year) to that year's election table.
list_elections = {}
for x in list_wikidata:
    year = x.split('-Q')[0].split('/')[-1]
    list_elections[year] = f"{RESOURCES_DIR}/wikipedia-election-results/election-{year}.tsv"

df_byelections = pd.read_csv(
    f'{RESOURCES_DIR}/wikipedia-election-results/byelection-1918-1931.tsv',
    sep='\t',
)

In [100]:
# Labels already fetched from Wikidata, keyed by QID. Shared across files so
# each party entity is requested only once.
label_cache = {}

for wikidata_df in list_wikidata:
    df = pd.read_csv(wikidata_df, sep='\t')
    # The election table is selected by the file-name part before "-Q".
    df_elections = list_elections[wikidata_df.split('-Q')[0].split('/')[-1]]
    df_elections = pd.read_csv(df_elections, sep='\t')

    # One [mp_ref, name, party] row per member whose party is missing.
    d = []

    for c, i in tqdm(enumerate(df['mp_ref']), total=len(df)):
        # Members that already have a party need no lookup.
        if str(df['party'][c]) != "nan":
            continue

        # Last path segment of the reference minus its final character
        # (presumably a trailing ">" - TODO confirm against the input files).
        wd_key = i.split('/')[-1][:-1]
        soup = load_soup(wd_key)

        # Try the sources in order and stop at the first that yields a party.
        party = find_party_claim(soup)
        if party == "na":
            party = find_parl_group(soup, wikidata_df)
        if party == "na":
            party = find_party_wikipedia_election(df_elections, df['mp'][c])
        if party == "na":
            party = find_party_wikipedia_byelection(df_byelections, df['mp'][c])
        d.append([i, df['mp'][c], party])

    dfr = pd.DataFrame(d, columns=["wd", "name", "party"])

    # Resolve QIDs to labels. Only strings shaped like "Q<digits>" are treated
    # as QIDs: a plain party name that merely contains the letter "Q", or a
    # missing value, is passed through untouched instead of being sent to
    # Wikidata.
    resolved = []
    for x in tqdm(dfr['party']):
        if isinstance(x, str) and x.startswith("Q") and x[1:].isdigit():
            if x not in label_cache:
                label_cache[x] = str(client.get(x).label)
            resolved.append(label_cache[x])
        else:
            resolved.append(x)
    dfr['party'] = resolved

    # Fill the gaps in the original frame; rows that had a party keep it.
    found_by_ref = dict(zip(dfr['wd'], dfr['party']))
    df['party'] = [
        x if str(x) != "nan" else found_by_ref[df['mp_ref'][c]]
        for c, x in enumerate(df['party'])
    ]

    # Add party reference: reuse the first non-missing reference seen for each
    # party name. (The earlier version stored a string and then tried to
    # .append to it, which raised as soon as one party had two references.)
    party_dict = {}
    for party_name, party_ref in zip(df['party'], df['party_ref']):
        if str(party_ref) != "nan":
            party_dict.setdefault(party_name, party_ref)

    df['party_ref'] = [party_dict.get(x, "nan") for x in df['party']]
    df.to_csv(wikidata_df[:-4] + "-enriched.tsv", index=False, sep='\t')

801it [03:05,  4.32it/s]
100%|██████████| 333/333 [02:02<00:00,  2.73it/s]
631it [00:46, 13.47it/s]
100%|██████████| 98/98 [00:36<00:00,  2.66it/s]
625it [00:26, 23.31it/s]
100%|██████████| 55/55 [00:21<00:00,  2.52it/s]
676it [00:23, 28.53it/s]
100%|██████████| 47/47 [00:14<00:00,  3.18it/s]
653it [00:17, 38.29it/s]
100%|██████████| 32/32 [00:30<00:00,  1.05it/s]
677it [00:10, 66.41it/s]
100%|██████████| 25/25 [00:08<00:00,  2.91it/s]


'/media/ruben/Elements/PhD/data/hansard/resources/wikidata-members/reformatted/1918-Q41582582-enriched.tsv'

In [101]:
# Re-read the file named by the loop variable left over from the enrichment
# loop (i.e. the last file it handled).
df = pd.read_csv(wikidata_df, delimiter='\t')

In [103]:
# Number of rows in the reloaded frame.
df.shape[0]

677