In [1]:
import re
from urllib.parse import quote

import nltk
import pandas as pd
import requests

In [2]:
# Load the triples from a tab-separated file that has no header row
# (header=None), so the six column names are supplied explicitly.
triples_data = pd.read_csv(
    '../valid_triples.tsv',
    sep='\t',
    header=None,
    names=['e1_id', 'e2_id', 'rel_id', 'e1_label', 'e2_label', 'rel_label'],
)

In [3]:
# Show the loaded triples DataFrame as this cell's output.
triples_data

Unnamed: 0,e1_id,e2_id,rel_id,e1_label,e2_label,rel_label
0,Q496290,Q884,P495,Pasta,South Korea,country of origin
1,Q449880,Q60,P20,Alwin Nikolais,New York City,place of death
2,Q184402,Q432,P140,ayatollah,Islam,religion
3,Q2894354,Q9219,P108,Frederick Kagan,United States Military Academy,employer
4,Q205903,Q1860,P364,Doc,English,original language of work
5,Q3946,Q29,P17,Cantabria,Spain,country
6,Q795470,Q162954,P466,Pepsi Center,Denver Nuggets,occupant
7,Q321280,Q5372,P641,Dick Vitale,basketball,sport
8,Q928,Q1065,P463,Philippines,United Nations,member of
9,Q275740,Q869,P131,Kanchanaburi,Thailand,located in the administrative territorial entity


In [4]:
def get_list_sents(title, timeout=30):
    """Return the sentences of the English Wikipedia summary for ``title``.

    Parameters
    ----------
    title : str
        Wikipedia page title, or the sentinel ``'N/A'`` meaning "no page".
    timeout : float, optional
        Seconds to wait for the HTTP response before requests gives up.
        Without a timeout a single stalled connection would block forever.

    Returns
    -------
    list of str
        The summary's ``extract`` text split with ``nltk.sent_tokenize``.
        An empty list when ``title`` is ``'N/A'`` or when the JSON response
        has no ``extract`` key.

    Raises
    ------
    Network failures, timeouts and non-JSON responses are not caught here;
    whatever ``requests`` raises propagates to the caller.
    """
    if title == 'N/A':
        return []

    # The title becomes a single URL path segment, so every reserved
    # character (including '/', '?', '#', '%') must be percent-encoded;
    # safe='' stops quote() from leaving '/' untouched.
    url = 'https://en.wikipedia.org/api/rest_v1/page/summary/' + quote(title, safe='')
    response = requests.get(url, timeout=timeout).json()
    try:
        paragraph = response['extract']
    except KeyError:
        # No summary text in the response.
        return []
    return nltk.sent_tokenize(paragraph)

In [5]:
def get_wikipedia_title(wikidata_id, timeout=30):
    """Look up the English Wikipedia page title linked to a Wikidata entity.

    Parameters
    ----------
    wikidata_id : str
        Wikidata entity identifier, used both as the ``ids`` query parameter
        and as the key into the response's ``entities`` mapping.
    timeout : float, optional
        Seconds to wait for the HTTP response before requests gives up.

    Returns
    -------
    str
        The ``enwiki`` sitelink title, or ``'N/A'`` when the response does
        not contain one.

    Raises
    ------
    Network failures, timeouts and non-JSON responses are not caught here;
    whatever ``requests`` raises propagates to the caller.
    """
    # Let requests build and escape the query string instead of
    # concatenating the identifier into the URL by hand.
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'props': 'sitelinks',
        'ids': wikidata_id,
        'sitefilter': 'enwiki',
    }
    result = requests.get('https://www.wikidata.org/w/api.php',
                          params=params, timeout=timeout).json()
    try:
        return result['entities'][wikidata_id]['sitelinks']['enwiki']['title']
    except (KeyError, TypeError):
        # KeyError: some level of the path is missing (error payload,
        # unknown entity, no enwiki sitelink).
        # TypeError: some level is not a mapping -- presumably possible if
        # an empty sitelinks map is serialised as a list; TODO confirm.
        return 'N/A'

In [6]:
def _mentions_both(sent, e1_label, e2_label):
    """Return True when ``sent`` contains both labels as separate mentions.

    When one label is a substring of the other (e.g. 'Asia' inside
    'Southeast Asia'), a plain ``in`` test on the shorter label would be
    satisfied by the longer one alone. In that case every occurrence of the
    longer label is masked first and the shorter one must still be present.
    """
    if e1_label in e2_label:
        longer, shorter = e2_label, e1_label
    elif e2_label in e1_label:
        longer, shorter = e1_label, e2_label
    else:
        return e1_label in sent and e2_label in sent

    if longer not in sent:
        return False
    # Mask with a NUL character rather than a readable placeholder: a
    # placeholder made of letters could itself contain a very short label
    # and produce a false match.
    return shorter in sent.replace(longer, '\x00')


def _fetch_sentences(e1_id, e2_id, attempts=2):
    """Return the summary sentences of both entities' Wikipedia pages.

    The lookup is attempted up to ``attempts`` times (must be >= 1) when an
    ``OSError`` is raised; the error from the last attempt propagates.
    """
    for attempt in range(1, attempts + 1):
        try:
            e1 = get_wikipedia_title(e1_id)
            e2 = get_wikipedia_title(e2_id)
            return get_list_sents(e1) + get_list_sents(e2)
        except OSError:
            if attempt == attempts:
                raise


# For every triple, keep each summary sentence (from either entity's page)
# that mentions both entity labels. Each kept record is the tuple
# (e1_id, e2_id, rel_id, e1_label, e2_label, rel_label, sentence).
filtered_sents = []
len_triples = len(triples_data['e1_id'])
len_idx = len_triples - 1

triple_columns = ['e1_id', 'e2_id', 'rel_id', 'e1_label', 'e2_label', 'rel_label']
for i, triple in enumerate(triples_data[triple_columns].itertuples(index=False, name=None)):
    print(str(i) + '/' + str(len_idx))

    e1_id, e2_id, rel_id, e1_label, e2_label, rel_label = triple
    for sent in _fetch_sentences(e1_id, e2_id):
        if _mentions_both(sent, e1_label, e2_label):
            filtered_sents.append(triple + (sent.strip(),))

0/4048
1/4048
2/4048
3/4048
4/4048
5/4048
6/4048
7/4048
8/4048
9/4048
10/4048
11/4048
12/4048
13/4048
14/4048
15/4048
16/4048
17/4048
18/4048
19/4048
20/4048
21/4048
22/4048
23/4048
24/4048
25/4048
26/4048
27/4048
28/4048
29/4048
30/4048
31/4048
32/4048
33/4048
34/4048
35/4048
36/4048
37/4048
38/4048
39/4048
40/4048
41/4048
42/4048
43/4048
44/4048
45/4048
46/4048
47/4048
48/4048
49/4048
50/4048
51/4048
52/4048
53/4048
54/4048
55/4048
56/4048
57/4048
58/4048
59/4048
60/4048
61/4048
62/4048
63/4048
64/4048
65/4048
66/4048
67/4048
68/4048
69/4048
70/4048
71/4048
72/4048
73/4048
74/4048
75/4048
76/4048
77/4048
78/4048
79/4048
80/4048
81/4048
82/4048
83/4048
84/4048
85/4048
86/4048
87/4048
88/4048
89/4048
90/4048
91/4048
92/4048
93/4048
94/4048
95/4048
96/4048
97/4048
98/4048
99/4048
100/4048
101/4048
102/4048
103/4048
104/4048
105/4048
106/4048
107/4048
108/4048
109/4048
110/4048
111/4048
112/4048
113/4048
114/4048
115/4048
116/4048
117/4048
118/4048
119/4048
120/4048
121/4048
122/4048
123

923/4048
924/4048
925/4048
926/4048
927/4048
928/4048
929/4048
930/4048
931/4048
932/4048
933/4048
934/4048
935/4048
936/4048
937/4048
938/4048
939/4048
940/4048
941/4048
942/4048
943/4048
944/4048
945/4048
946/4048
947/4048
948/4048
949/4048
950/4048
951/4048
952/4048
953/4048
954/4048
955/4048
956/4048
957/4048
958/4048
959/4048
960/4048
961/4048
962/4048
963/4048
964/4048
965/4048
966/4048
967/4048
968/4048
969/4048
970/4048
971/4048
972/4048
973/4048
974/4048
975/4048
976/4048
977/4048
978/4048
979/4048
980/4048
981/4048
982/4048
983/4048
984/4048
985/4048
986/4048
987/4048
988/4048
989/4048
990/4048
991/4048
992/4048
993/4048
994/4048
995/4048
996/4048
997/4048
998/4048
999/4048
1000/4048
1001/4048
1002/4048
1003/4048
1004/4048
1005/4048
1006/4048
1007/4048
1008/4048
1009/4048
1010/4048
1011/4048
1012/4048
1013/4048
1014/4048
1015/4048
1016/4048
1017/4048
1018/4048
1019/4048
1020/4048
1021/4048
1022/4048
1023/4048
1024/4048
1025/4048
1026/4048
1027/4048
1028/4048
1029/4048
1030/40

1750/4048
1751/4048
1752/4048
1753/4048
1754/4048
1755/4048
1756/4048
1757/4048
1758/4048
1759/4048
1760/4048
1761/4048
1762/4048
1763/4048
1764/4048
1765/4048
1766/4048
1767/4048
1768/4048
1769/4048
1770/4048
1771/4048
1772/4048
1773/4048
1774/4048
1775/4048
1776/4048
1777/4048
1778/4048
1779/4048
1780/4048
1781/4048
1782/4048
1783/4048
1784/4048
1785/4048
1786/4048
1787/4048
1788/4048
1789/4048
1790/4048
1791/4048
1792/4048
1793/4048
1794/4048
1795/4048
1796/4048
1797/4048
1798/4048
1799/4048
1800/4048
1801/4048
1802/4048
1803/4048
1804/4048
1805/4048
1806/4048
1807/4048
1808/4048
1809/4048
1810/4048
1811/4048
1812/4048
1813/4048
1814/4048
1815/4048
1816/4048
1817/4048
1818/4048
1819/4048
1820/4048
1821/4048
1822/4048
1823/4048
1824/4048
1825/4048
1826/4048
1827/4048
1828/4048
1829/4048
1830/4048
1831/4048
1832/4048
1833/4048
1834/4048
1835/4048
1836/4048
1837/4048
1838/4048
1839/4048
1840/4048
1841/4048
1842/4048
1843/4048
1844/4048
1845/4048
1846/4048
1847/4048
1848/4048
1849/4048


2570/4048
2571/4048
2572/4048
2573/4048
2574/4048
2575/4048
2576/4048
2577/4048
2578/4048
2579/4048
2580/4048
2581/4048
2582/4048
2583/4048
2584/4048
2585/4048
2586/4048
2587/4048
2588/4048
2589/4048
2590/4048
2591/4048
2592/4048
2593/4048
2594/4048
2595/4048
2596/4048
2597/4048
2598/4048
2599/4048
2600/4048
2601/4048
2602/4048
2603/4048
2604/4048
2605/4048
2606/4048
2607/4048
2608/4048
2609/4048
2610/4048
2611/4048
2612/4048
2613/4048
2614/4048
2615/4048
2616/4048
2617/4048
2618/4048
2619/4048
2620/4048
2621/4048
2622/4048
2623/4048
2624/4048
2625/4048
2626/4048
2627/4048
2628/4048
2629/4048
2630/4048
2631/4048
2632/4048
2633/4048
2634/4048
2635/4048
2636/4048
2637/4048
2638/4048
2639/4048
2640/4048
2641/4048
2642/4048
2643/4048
2644/4048
2645/4048
2646/4048
2647/4048
2648/4048
2649/4048
2650/4048
2651/4048
2652/4048
2653/4048
2654/4048
2655/4048
2656/4048
2657/4048
2658/4048
2659/4048
2660/4048
2661/4048
2662/4048
2663/4048
2664/4048
2665/4048
2666/4048
2667/4048
2668/4048
2669/4048


3390/4048
3391/4048
3392/4048
3393/4048
3394/4048
3395/4048
3396/4048
3397/4048
3398/4048
3399/4048
3400/4048
3401/4048
3402/4048
3403/4048
3404/4048
3405/4048
3406/4048
3407/4048
3408/4048
3409/4048
3410/4048
3411/4048
3412/4048
3413/4048
3414/4048
3415/4048
3416/4048
3417/4048
3418/4048
3419/4048
3420/4048
3421/4048
3422/4048
3423/4048
3424/4048
3425/4048
3426/4048
3427/4048
3428/4048
3429/4048
3430/4048
3431/4048
3432/4048
3433/4048
3434/4048
3435/4048
3436/4048
3437/4048
3438/4048
3439/4048
3440/4048
3441/4048
3442/4048
3443/4048
3444/4048
3445/4048
3446/4048
3447/4048
3448/4048
3449/4048
3450/4048
3451/4048
3452/4048
3453/4048
3454/4048
3455/4048
3456/4048
3457/4048
3458/4048
3459/4048
3460/4048
3461/4048
3462/4048
3463/4048
3464/4048
3465/4048
3466/4048
3467/4048
3468/4048
3469/4048
3470/4048
3471/4048
3472/4048
3473/4048
3474/4048
3475/4048
3476/4048
3477/4048
3478/4048
3479/4048
3480/4048
3481/4048
3482/4048
3483/4048
3484/4048
3485/4048
3486/4048
3487/4048
3488/4048
3489/4048


In [9]:
# Number of (triple, sentence) records collected in filtered_sents.
len(filtered_sents)

1626

In [8]:
# Write the collected records as tab-separated values, one record per line,
# in the field order stored in each filtered_sents tuple.
# The encoding is fixed to UTF-8 because the sentences are Wikipedia text;
# relying on the platform default can fail on non-ASCII characters.
with open('../valid_sentences.tsv', 'w', encoding='utf-8') as f:
    for item in filtered_sents:
        # A tab or line break inside a field would split the record, so
        # each one is replaced by a single space before joining.
        fields = [
            field.replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
            for field in item
        ]
        f.write('\t'.join(fields) + '\n')
f.closed

True