### Plan:
1) Parse pickle file to retrieve entity names for IDs
2) Use EntityToLabel module to retrieve entity names for rows not found in pickle
3) Use Redirects module to retrieve redirects for rows not found in pickle

In [1]:
import pandas as pd 
import pickle
from tqdm import tqdm
import math
import time

### Download data

In [4]:
path_to_train_simple_questions = "/data/annotated_wd_data_test_answerable.txt"
data = pd.read_table(path_to_train_simple_questions, header=None).rename(columns = {0:"subject", 1:"property", 2:"object", 3:"question"})

In [79]:
! wget https://dl.fbaipublicfiles.com/GENRE/lang_title2wikidataID-normalized_with_redirect.pkl

--2022-09-26 14:46:04--  https://dl.fbaipublicfiles.com/GENRE/lang_title2wikidataID-normalized_with_redirect.pkl
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 172.67.9.4, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3881415585 (3.6G) [application/octet-stream]
Saving to: ‘lang_title2wikidataID-normalized_with_redirect.pkl’


2022-09-26 14:49:22 (18.8 MB/s) - ‘lang_title2wikidataID-normalized_with_redirect.pkl’ saved [3881415585/3881415585]



In [80]:
with open("lang_title2wikidataID-normalized_with_redirect.pkl", "rb") as f:
    lang_title2wikidataID = pickle.load(f)

### Part 1: Parse pickle file 

In [82]:
mydict = lang_title2wikidataID

In [83]:
data_df = pd.DataFrame(mydict.items(), columns=['key', 'value'])

In [84]:
data_df['lang'], data_df['name'] = data_df.key.str

  data_df['lang'], data_df['name'] = data_df.key.str


In [85]:
data_df = data_df.query('lang == "en"')

In [86]:
data_df = data_df.reset_index()

In [88]:
dict_opt = dict()
for i in tqdm(range(len(data_df))):
    cur_val = str(data_df.loc[i, 'value']).strip("{,},''" )
    if cur_val in dict_opt:
        dict_opt[cur_val].append(data_df.loc[i, 'name'])
    else:
         dict_opt[cur_val] = [data_df.loc[i, 'name']]

100%|██████████| 14751661/14751661 [05:18<00:00, 46282.56it/s]


In [89]:
for i in tqdm(range(len(data))):
    cur_subject = data.loc[i, 'subject']
    try:
        cur_labels = dict_opt[cur_subject]
        data.loc[i, 'subject_text'] = ', '.join(cur_labels)
    except:
        continue

100%|██████████| 5622/5622 [00:00<00:00, 12936.95it/s]


In [90]:
data.head()

Unnamed: 0,subject,property,object,question,subject_text
0,Q7358590,P20,Q1637790,Where did roger marquis die,Roger Marquis
1,Q154335,P509,Q12152,what was the cause of death of yves klein,"Yves Klein, The Void (artwork)"
2,Q2747238,P413,Q5059480,What position does carlos gomez play?,"Carlos Gómez, Carlos Gomez, Gómez, Carlos"
3,Q62498,P21,Q6581097,how does engelbert zaschka identify,"Engelbert Zaschka, Englebert Zaschka, Rotation..."
4,Q182485,P413,Q1143358,what position does pee wee reese play in baseball,"Pee Wee Reese, Harold H. Reese, Harold Henry &..."


In [91]:
data.to_csv('data_first_iteration.csv', mode = "w", index = False)

In [3]:
data = pd.read_csv('data_first_iteration.csv')

### Part 2: use WikidataEntityToLabel module

In [5]:
from kbqa.caches.wikidata_entity_to_label import WikidataEntityToLabel
entity2label = WikidataEntityToLabel()

In [6]:
for i in tqdm(range(len(data))):
    if pd.isna(data.loc[i, 'subject_text']) == True:
        time.sleep(1)
        try:
            label = entity2label.get_label(data.loc[i, 'subject'])
            data.loc[i, 'subject_text'] = label
        except:
            continue

100%|██████████| 5622/5622 [07:21<00:00, 12.74it/s]


In [7]:
data[data['subject_text'].isna()]

Unnamed: 0,subject,property,object,question,subject_text


In [9]:
data.to_csv('data_second_iteration.csv', mode = "w", index = False)

In [10]:
for j in range(len(data)):
    if len(data.loc[j, 'subject_text'].split(',')) != 1:
        data.loc[j, 'flag'] = 0
    else:
        data.loc[j, 'flag'] = 1
data.to_csv('data_second_iteration_flag.csv', index = False)

### Part 3: use WikidataRedirectsCache module

In [11]:
data_redirects = pd.read_csv('data_second_iteration_flag.csv')

In [13]:
from kbqa.caches.wikidata_redirects import WikidataRedirectsCache
redirects = WikidataRedirectsCache()

In [14]:
for k in tqdm(range(len(data_redirects))):
    if data_redirects.loc[k, 'flag'] == 1:
        text = data_redirects.loc[k, 'subject_text']
        output = redirects.get_redirects(text)
        if output != 'No results found' and output != 'Problem communicating with the server:':
            data_redirects.loc[k, 'subject_text_add'] = ', '.join(output)
            

100%|██████████| 5622/5622 [14:02<00:00,  6.67it/s] 


In [16]:
import numpy as np
for m in range(len(data_redirects)):
    if data_redirects.loc[m, 'subject_text'] == 'Nan':
        data_redirects.loc[m, 'subject_text_add'] = np.nan

In [17]:
data_redirects[data_redirects['subject_text_add'].notna()]

Unnamed: 0,subject,property,object,question,subject_text,flag,subject_text_add
121,Q13422918,P50,Q234865,"who wrote \\""w\\"" is for wasted","""W"" Is for Wasted",1.0,"Problem communicating with the server: ,"
489,Q188605,R509,Q6386212,Who died from emphysema?,pulmonary emphysema,1.0,"Emphysema, Centriacinar emphysema, Centrilobul..."
539,Q2405480,R106,Q3106805,Name a voice actor.,voice actor,1.0,"Voice acting, Amateur Voice Acting, Amateur vo..."
614,Q4540294,P50,Q234865,"Who wrote \\""l\\"" is for lawless?","""L"" Is for Lawless",1.0,"Problem communicating with the server: ,"
665,Q6581097,R21,Q1771345,who is a male character?,male,1.0,"Maleness, Males, MALE, Andromorphic, Maled"
...,...,...,...,...,...,...,...
5064,Q727543,P162,Q51583,Who produced double wedding,Double Wedding,1.0,"Wedding, Wedding Venue, Wedding Venues, Weddin..."
5225,Q2411068,P495,Q30,Which country is the film the end from,The End,1.0,"The End, The End (band), The End (disambiguati..."
5327,Q6581097,R21,Q2050686,who is a male character from csi: miami?,male,1.0,"Maleness, Males, MALE, Andromorphic, Maled"
5360,Q4838235,P495,Q30,What country was the film Baby Clothes produce...,Baby Clothes,1.0,"Infant clothing, Toddler clothes, Toddler clot..."


In [19]:
data_redirects['subject_text_all'] = data_redirects[['subject_text', 'subject_text_add']].apply(lambda x: ', '.join(x[x.notnull()]), axis = 1)


In [21]:
data_redirects.to_csv('data_third_iteration.csv', index=False)