In [1]:
import pandas as pd
import requests
import re
import nltk
import wikimodel

In [2]:
wiki = wikimodel.WikiModel("../../../enwiki")

In [3]:
wiki_dict = {}
for doc in wiki.docs():
    wiki_dict[doc.title] = doc.text

In [4]:
id_to_title = {}

In [5]:
def normalize_text(text):
    text = text.replace('"', ' ')
    text = text.replace('\n', ' ')
    text = text.replace('[note]', ' ')
    text = text.replace('(disambiguation)', ' ')
    text = text.replace('[citation needed]', ' ')
    text = text.replace('[update]', ' ')
    text = text.replace('[contradictory]', ' ')
    text = text.replace('[page needed]', ' ')
    text = re.sub(r'\[relevant\?\s*\–\s*discuss\]', ' ', text)
    text = text.replace('[clarification needed]', ' ')
    text = re.sub(r'\[[a-z]*\?\]', ' ', text)
    text = re.sub(r'\[nb\s[0-9]*\]', ' ', text)
    text = re.sub(r'\[nb\]', ' ', text)
    text = re.sub(r'\[[a-z]\]', ' ', text)
    text = re.sub(r'\[[A-Z]\]', ' ', text)
    text = re.sub(r'\[[0-9]+\]', ' ', text)
    text = re.sub(r'[a-z]\.\^', ' ', text)
    text = " ".join(text.split())
    return text

In [6]:
def is_sent_contain_entities(e1, e2, sent):
    contain = False
    if e1 in e2:
        if e1 in sent:
            sent_temp = sent.replace(e1, '<<<ENTITY1>>>')
            if e2 in sent_temp:
                contain = True
    elif e2 in e1:
        if e2 in sent:
            sent_temp = sent.replace(e2, '<<<ENTITY2>>>')
            if e1 in sent_temp:
                contain = True
    else:
        if e1 in sent and e2 in sent:
            contain = True
    return contain

In [7]:
def get_list_sents_dump(title): 
    if title == 'N/A':
        return []
    
    try:
        paragraph = wiki_dict[title]
        paragraph = normalize_text(paragraph)
        sentences = nltk.sent_tokenize(paragraph)
        return sentences
    except KeyError:
        return []

In [8]:
def get_wikipedia_title(wikidata_id):
    if wikidata_id == 'NUM':
        return 'N/A'
    
    try:
        title = id_to_title[wikidata_id]
    except KeyError:
        url = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=sitelinks&ids=' + wikidata_id + '&sitefilter=enwiki'
        r = requests.get(url)
        result = r.json()
        try:
            title = result['entities'][wikidata_id]['sitelinks']['enwiki']['title']
        except KeyError:
            title = 'N/A'

        id_to_title[wikidata_id] = title
    
    return title

In [41]:
triples_data = pd.read_csv('../train_triples.tsv', sep='\t', header=None, names=['e1_id', 'e2_id', 'rel_id', 'e1_label', 'e2_label', 'rel_label'])
valid = pd.read_csv('../valid_triples.tsv', sep='\t', header=None, names=['e1_id', 'e2_id', 'rel_id', 'e1_label', 'e2_label', 'rel_label'])
merged = triples_data.merge(valid, how='left', indicator=True)
triples_data = merged[merged['_merge'] == 'left_only']
triples_data = triples_data.reset_index(drop=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [42]:
triples_data

Unnamed: 0,e1_id,e2_id,rel_id,e1_label,e2_label,rel_label,_merge
0,Q20950174,Q20966585,P659,NCBI homo sapiens annotation release 107,Genome assembly GRCh38,genomic assembly,left_only
1,Q223122,Q22006653,P462,Three Way,color,color,left_only
2,Q7762756,Q838368,P462,The Sea Ghost,Black-and-white,color,left_only
3,Q3795055,Q22006653,P462,The Past Is a Foreign Land,color,color,left_only
4,Q8053816,Q838368,P462,"Yira, yira",black-and-white,color,left_only
5,Q692208,Q22006653,P462,Whatever Works,colour,color,left_only
6,Q15632404,Q22006653,P462,Bandits of the West,colour,color,left_only
7,Q3940740,Q838368,P462,Romanticismo,Black-and-white,color,left_only
8,Q19683032,Q838368,P462,"La télévision, œil de demain",Black-and-white,color,left_only
9,Q3989650,Q838368,P462,The Tip,Black-and-white,color,left_only


In [49]:
# filtered_sents = []
len_triples = len(triples_data)
len_idx = len_triples - 1
max_sent = 10

for i in range(2357375, len_triples):
    print(str(i) + '/' + str(len_idx))
    
    e1_label = triples_data['e1_label'][i]
    e2_label = triples_data['e2_label'][i]
    e1_id = triples_data['e1_id'][i]
    e2_id = triples_data['e2_id'][i]
    rel_id = triples_data['rel_id'][i]
    rel_label = triples_data['rel_label'][i]

    sents = get_list_sents_dump(e1_label) + get_list_sents_dump(e2_label)

    sent_idx = 0
    for sent in sents:
        is_valid = is_sent_contain_entities(e1_label, e2_label, sent)
        if is_valid == True:
            filtered_sents.append((e1_id, e2_id, rel_id, e1_label, e2_label, rel_label, sent))
            sent_idx += 1
        if sent_idx > max_sent:
            break

In [50]:
len(filtered_sents)

223533

In [45]:
with open('../train_sentences.tsv', 'w') as f:
    for item in filtered_sents:
        f.write(item[0] + '\t' + item[1] + '\t' + item[2] + '\t' + item[3] + '\t' + item[4] +
                '\t' + item[5] + '\t' + item[6] + '\n')
f.closed

True