In [1]:
import pandas as pd
import time
import spacy
import csv
import json
nlp = spacy.load('en_core_web_md')
from wikidata.client import Client
client = Client()

In [2]:
import enteater

Setting up Enteater [1] Loading Spacy Model ... Done
Setting up Enteater [2] Loading Wikidata ID dictionary ... Done
Setting up Enteater [3] Loading Wikidata-Freebase Mapping table ... Done


In [3]:
# Open KB data
data_kb = pd.read_csv('../valid_sentences_filtered.tsv', sep='\t', header=None, names=['e1_id', 'e2_id', 'rel_id', 'e1_label', 'e2_label', 'rel_label', 'sent'], quoting=csv.QUOTE_NONE)

# Open graphene output
with open('../graphene/output_re_valid_sentences_filtered.txt') as f:
    data_openie = json.load(f)
    
# Sentence ID map
sent_id_df = pd.read_csv('../graphene/valid_sent_id_map.tsv', sep='\t', quoting=csv.QUOTE_NONE)
sent_id_dict = dict([(i,j) for i, j in zip(sent_id_df.sentenceIdx, sent_id_df.sentOrigIdx)])

In [4]:
data_kb

Unnamed: 0,e1_id,e2_id,rel_id,e1_label,e2_label,rel_label,sent
0,Q3946,Q29,P17,Cantabria,Spain,country,"Cantabria belongs to Green Spain, the name giv..."
1,Q3946,Q29,P17,Cantabria,Spain,country,"Due to the gulf stream, Cantabria, as well as ..."
2,Q3946,Q29,P17,Cantabria,Spain,country,In relative contrast to other regions of Spain...
3,Q3946,Q29,P17,Cantabria,Spain,country,"During the Ancien Régime, the greatest jurisdi..."
4,Q3946,Q29,P17,Cantabria,Spain,country,Following the approval of the General Courts o...
5,Q3946,Q29,P17,Cantabria,Spain,country,"Until the 13th century, Cantabria was organize..."
6,Q3946,Q29,P17,Cantabria,Spain,country,"As of July 2014, the unemployment rate in Cant..."
7,Q3946,Q29,P17,Cantabria,Spain,country,"In 2007, Cantabria's growth of real GDP was 4...."
8,Q795470,Q162954,P466,Pepsi Center,Denver Nuggets,occupant,"Before the construction of Pepsi Center, the D..."
9,Q795470,Q162954,P466,Pepsi Center,Denver Nuggets,occupant,"However, Denver Nuggets and Pepsi Center owner..."


In [5]:
len(data_openie['sentences'])

7548

In [6]:
len(sent_id_dict)

7548

In [7]:
def get_noun_root(text, basic_root=True):
    doc = nlp(text)
    root = 'N/A'
    for chunk in doc.noun_chunks:
        if chunk.root.dep_ == 'ROOT':
            if basic_root == True:
                root = chunk.root.text
            else:
                root = chunk.text
    return root

In [8]:
def get_entity_tuple_id(e1, rel, e2):
    e1_wiki_id = 'N/A'
    e2_wiki_id = 'N/A'
    e1_root = get_noun_root(e1)
    if e1_root == 'N/A':
        e1_root = e1
    e2_root = get_noun_root(e2, basic_root=False)
    if e2_root == 'N/A':
        e2_root = e2
    sent = e1_root + ' ' + rel + ' ' + e2_root
    
    ent_result, ent_id_dict = enteater.get_entity(sent, detect_property=False)
    ent_rank = [ent[0] for ent in ent_result]
    for rel_item in rel.split():
        if rel_item in ent_rank:
            ent_rank.remove(rel_item)
    
    try:
        e1_idx = ent_rank.index(e1_root)
        # First entity is exist in wikidata
        e1_wiki_id = ent_id_dict[e1_root]
        if len(ent_rank) > 1:
            e1_idx = ent_rank.index(e1_root)
            if e1_idx == 0:
                e2_wiki_id = ent_id_dict[ent_rank[1]]
            else:
                e2_wiki_id = ent_id_dict[ent_rank[0]]
    except ValueError:
        # First entity is not exist in wikidata
        e1_wiki_id = 'N/A'
        if len(ent_rank) > 1:
            e2_wiki_id = ent_id_dict[ent_rank[0]]
    
    return (e1_wiki_id, e1_root, e2_wiki_id, e2_root)

In [9]:
align_data = []
idx = 0
len_openie = len(data_openie['sentences'])
for item in data_openie['sentences']:
    print(idx, 'of', len_openie - 1)
    for key, val_oie in item['extractionMap'].items():
        sent_idx = sent_id_dict[val_oie['sentenceIdx']]
        e1_kb_id = data_kb['e1_id'][sent_idx]
        e2_kb_id = data_kb['e2_id'][sent_idx]
        e1_kb = data_kb['e1_label'][sent_idx]
        e2_kb = data_kb['e2_label'][sent_idx]
        rel_kb = data_kb['rel_label'][sent_idx]
        rel_kb_id = data_kb['rel_id'][sent_idx]
        e1_oie = val_oie['arg1']
        e2_oie = val_oie['arg2']
        rel_oie = val_oie['relation']
        if e1_oie != '' and e2_oie != '' and rel_oie != '':
            e1_oie_id, e1_oie_root, e2_oie_id, e2_oie_root = get_entity_tuple_id(e1_oie, rel_oie, e2_oie)

            if e1_oie_id == e1_kb_id and e2_oie_id == e2_kb_id:
                # same class (similar relation): 0
                align_data.append((e1_kb, rel_kb, rel_kb_id, e2_kb,
                                   e1_oie, rel_oie, e2_oie,
                                   e1_kb_id, e2_kb_id, e1_oie_id, e2_oie_id, e1_oie_root, e2_oie_root,
                                   '0'))
            else:
                align_data.append((e1_kb, rel_kb, rel_kb_id, e2_kb,
                                   e1_oie, rel_oie, e2_oie,
                                   e1_kb_id, e2_kb_id, e1_oie_id, e2_oie_id, e1_oie_root, e2_oie_root,
                                   '1'))
    idx += 1

0 of 7547
1 of 7547
2 of 7547
3 of 7547
4 of 7547
5 of 7547
6 of 7547
7 of 7547
8 of 7547
9 of 7547
10 of 7547
11 of 7547
12 of 7547
13 of 7547
14 of 7547
15 of 7547
16 of 7547
17 of 7547
18 of 7547
19 of 7547
20 of 7547
21 of 7547
22 of 7547
23 of 7547
24 of 7547
25 of 7547
26 of 7547
27 of 7547
28 of 7547
29 of 7547
30 of 7547
31 of 7547
32 of 7547
33 of 7547
34 of 7547
35 of 7547
36 of 7547
37 of 7547
38 of 7547
39 of 7547
40 of 7547
41 of 7547
42 of 7547
43 of 7547
44 of 7547
45 of 7547
46 of 7547
47 of 7547
48 of 7547
49 of 7547
50 of 7547
51 of 7547
52 of 7547
53 of 7547
54 of 7547
55 of 7547
56 of 7547
57 of 7547
58 of 7547
59 of 7547
60 of 7547
61 of 7547
62 of 7547
63 of 7547
64 of 7547
65 of 7547
66 of 7547
67 of 7547
68 of 7547
69 of 7547
70 of 7547
71 of 7547
72 of 7547
73 of 7547
74 of 7547
75 of 7547
76 of 7547
77 of 7547
78 of 7547
79 of 7547
80 of 7547
81 of 7547
82 of 7547
83 of 7547
84 of 7547
85 of 7547
86 of 7547
87 of 7547
88 of 7547
89 of 7547
90 of 7547
91 of 754

692 of 7547
693 of 7547
694 of 7547
695 of 7547
696 of 7547
697 of 7547
698 of 7547
699 of 7547
700 of 7547
701 of 7547
702 of 7547
703 of 7547
704 of 7547
705 of 7547
706 of 7547
707 of 7547
708 of 7547
709 of 7547
710 of 7547
711 of 7547
712 of 7547
713 of 7547
714 of 7547
715 of 7547
716 of 7547
717 of 7547
718 of 7547
719 of 7547
720 of 7547
721 of 7547
722 of 7547
723 of 7547
724 of 7547
725 of 7547
726 of 7547
727 of 7547
728 of 7547
729 of 7547
730 of 7547
731 of 7547
732 of 7547
733 of 7547
734 of 7547
735 of 7547
736 of 7547
737 of 7547
738 of 7547
739 of 7547
740 of 7547
741 of 7547
742 of 7547
743 of 7547
744 of 7547
745 of 7547
746 of 7547
747 of 7547
748 of 7547
749 of 7547
750 of 7547
751 of 7547
752 of 7547
753 of 7547
754 of 7547
755 of 7547
756 of 7547
757 of 7547
758 of 7547
759 of 7547
760 of 7547
761 of 7547
762 of 7547
763 of 7547
764 of 7547
765 of 7547
766 of 7547
767 of 7547
768 of 7547
769 of 7547
770 of 7547
771 of 7547
772 of 7547
773 of 7547
774 of 7547
775 

1346 of 7547
1347 of 7547
1348 of 7547
1349 of 7547
1350 of 7547
1351 of 7547
1352 of 7547
1353 of 7547
1354 of 7547
1355 of 7547
1356 of 7547
1357 of 7547
1358 of 7547
1359 of 7547
1360 of 7547
1361 of 7547
1362 of 7547
1363 of 7547
1364 of 7547
1365 of 7547
1366 of 7547
1367 of 7547
1368 of 7547
1369 of 7547
1370 of 7547
1371 of 7547
1372 of 7547
1373 of 7547
1374 of 7547
1375 of 7547
1376 of 7547
1377 of 7547
1378 of 7547
1379 of 7547
1380 of 7547
1381 of 7547
1382 of 7547
1383 of 7547
1384 of 7547
1385 of 7547
1386 of 7547
1387 of 7547
1388 of 7547
1389 of 7547
1390 of 7547
1391 of 7547
1392 of 7547
1393 of 7547
1394 of 7547
1395 of 7547
1396 of 7547
1397 of 7547
1398 of 7547
1399 of 7547
1400 of 7547
1401 of 7547
1402 of 7547
1403 of 7547
1404 of 7547
1405 of 7547
1406 of 7547
1407 of 7547
1408 of 7547
1409 of 7547
1410 of 7547
1411 of 7547
1412 of 7547
1413 of 7547
1414 of 7547
1415 of 7547
1416 of 7547
1417 of 7547
1418 of 7547
1419 of 7547
1420 of 7547
1421 of 7547
1422 of 7547

1977 of 7547
1978 of 7547
1979 of 7547
1980 of 7547
1981 of 7547
1982 of 7547
1983 of 7547
1984 of 7547
1985 of 7547
1986 of 7547
1987 of 7547
1988 of 7547
1989 of 7547
1990 of 7547
1991 of 7547
1992 of 7547
1993 of 7547
1994 of 7547
1995 of 7547
1996 of 7547
1997 of 7547
1998 of 7547
1999 of 7547
2000 of 7547
2001 of 7547
2002 of 7547
2003 of 7547
2004 of 7547
2005 of 7547
2006 of 7547
2007 of 7547
2008 of 7547
2009 of 7547
2010 of 7547
2011 of 7547
2012 of 7547
2013 of 7547
2014 of 7547
2015 of 7547
2016 of 7547
2017 of 7547
2018 of 7547
2019 of 7547
2020 of 7547
2021 of 7547
2022 of 7547
2023 of 7547
2024 of 7547
2025 of 7547
2026 of 7547
2027 of 7547
2028 of 7547
2029 of 7547
2030 of 7547
2031 of 7547
2032 of 7547
2033 of 7547
2034 of 7547
2035 of 7547
2036 of 7547
2037 of 7547
2038 of 7547
2039 of 7547
2040 of 7547
2041 of 7547
2042 of 7547
2043 of 7547
2044 of 7547
2045 of 7547
2046 of 7547
2047 of 7547
2048 of 7547
2049 of 7547
2050 of 7547
2051 of 7547
2052 of 7547
2053 of 7547

2608 of 7547
2609 of 7547
2610 of 7547
2611 of 7547
2612 of 7547
2613 of 7547
2614 of 7547
2615 of 7547
2616 of 7547
2617 of 7547
2618 of 7547
2619 of 7547
2620 of 7547
2621 of 7547
2622 of 7547
2623 of 7547
2624 of 7547
2625 of 7547
2626 of 7547
2627 of 7547
2628 of 7547
2629 of 7547
2630 of 7547
2631 of 7547
2632 of 7547
2633 of 7547
2634 of 7547
2635 of 7547
2636 of 7547
2637 of 7547
2638 of 7547
2639 of 7547
2640 of 7547
2641 of 7547
2642 of 7547
2643 of 7547
2644 of 7547
2645 of 7547
2646 of 7547
2647 of 7547
2648 of 7547
2649 of 7547
2650 of 7547
2651 of 7547
2652 of 7547
2653 of 7547
2654 of 7547
2655 of 7547
2656 of 7547
2657 of 7547
2658 of 7547
2659 of 7547
2660 of 7547
2661 of 7547
2662 of 7547
2663 of 7547
2664 of 7547
2665 of 7547
2666 of 7547
2667 of 7547
2668 of 7547
2669 of 7547
2670 of 7547
2671 of 7547
2672 of 7547
2673 of 7547
2674 of 7547
2675 of 7547
2676 of 7547
2677 of 7547
2678 of 7547
2679 of 7547
2680 of 7547
2681 of 7547
2682 of 7547
2683 of 7547
2684 of 7547

3239 of 7547
3240 of 7547
3241 of 7547
3242 of 7547
3243 of 7547
3244 of 7547
3245 of 7547
3246 of 7547
3247 of 7547
3248 of 7547
3249 of 7547
3250 of 7547
3251 of 7547
3252 of 7547
3253 of 7547
3254 of 7547
3255 of 7547
3256 of 7547
3257 of 7547
3258 of 7547
3259 of 7547
3260 of 7547
3261 of 7547
3262 of 7547
3263 of 7547
3264 of 7547
3265 of 7547
3266 of 7547
3267 of 7547
3268 of 7547
3269 of 7547
3270 of 7547
3271 of 7547
3272 of 7547
3273 of 7547
3274 of 7547
3275 of 7547
3276 of 7547
3277 of 7547
3278 of 7547
3279 of 7547
3280 of 7547
3281 of 7547
3282 of 7547
3283 of 7547
3284 of 7547
3285 of 7547
3286 of 7547
3287 of 7547
3288 of 7547
3289 of 7547
3290 of 7547
3291 of 7547
3292 of 7547
3293 of 7547
3294 of 7547
3295 of 7547
3296 of 7547
3297 of 7547
3298 of 7547
3299 of 7547
3300 of 7547
3301 of 7547
3302 of 7547
3303 of 7547
3304 of 7547
3305 of 7547
3306 of 7547
3307 of 7547
3308 of 7547
3309 of 7547
3310 of 7547
3311 of 7547
3312 of 7547
3313 of 7547
3314 of 7547
3315 of 7547

3870 of 7547
3871 of 7547
3872 of 7547
3873 of 7547
3874 of 7547
3875 of 7547
3876 of 7547
3877 of 7547
3878 of 7547
3879 of 7547
3880 of 7547
3881 of 7547
3882 of 7547
3883 of 7547
3884 of 7547
3885 of 7547
3886 of 7547
3887 of 7547
3888 of 7547
3889 of 7547
3890 of 7547
3891 of 7547
3892 of 7547
3893 of 7547
3894 of 7547
3895 of 7547
3896 of 7547
3897 of 7547
3898 of 7547
3899 of 7547
3900 of 7547
3901 of 7547
3902 of 7547
3903 of 7547
3904 of 7547
3905 of 7547
3906 of 7547
3907 of 7547
3908 of 7547
3909 of 7547
3910 of 7547
3911 of 7547
3912 of 7547
3913 of 7547
3914 of 7547
3915 of 7547
3916 of 7547
3917 of 7547
3918 of 7547
3919 of 7547
3920 of 7547
3921 of 7547
3922 of 7547
3923 of 7547
3924 of 7547
3925 of 7547
3926 of 7547
3927 of 7547
3928 of 7547
3929 of 7547
3930 of 7547
3931 of 7547
3932 of 7547
3933 of 7547
3934 of 7547
3935 of 7547
3936 of 7547
3937 of 7547
3938 of 7547
3939 of 7547
3940 of 7547
3941 of 7547
3942 of 7547
3943 of 7547
3944 of 7547
3945 of 7547
3946 of 7547

4502 of 7547
4503 of 7547
4504 of 7547
4505 of 7547
4506 of 7547
4507 of 7547
4508 of 7547
4509 of 7547
4510 of 7547
4511 of 7547
4512 of 7547
4513 of 7547
4514 of 7547
4515 of 7547
4516 of 7547
4517 of 7547
4518 of 7547
4519 of 7547
4520 of 7547
4521 of 7547
4522 of 7547
4523 of 7547
4524 of 7547
4525 of 7547
4526 of 7547
4527 of 7547
4528 of 7547
4529 of 7547
4530 of 7547
4531 of 7547
4532 of 7547
4533 of 7547
4534 of 7547
4535 of 7547
4536 of 7547
4537 of 7547
4538 of 7547
4539 of 7547
4540 of 7547
4541 of 7547
4542 of 7547
4543 of 7547
4544 of 7547
4545 of 7547
4546 of 7547
4547 of 7547
4548 of 7547
4549 of 7547
4550 of 7547
4551 of 7547
4552 of 7547
4553 of 7547
4554 of 7547
4555 of 7547
4556 of 7547
4557 of 7547
4558 of 7547
4559 of 7547
4560 of 7547
4561 of 7547
4562 of 7547
4563 of 7547
4564 of 7547
4565 of 7547
4566 of 7547
4567 of 7547
4568 of 7547
4569 of 7547
4570 of 7547
4571 of 7547
4572 of 7547
4573 of 7547
4574 of 7547
4575 of 7547
4576 of 7547
4577 of 7547
4578 of 7547

5133 of 7547
5134 of 7547
5135 of 7547
5136 of 7547
5137 of 7547
5138 of 7547
5139 of 7547
5140 of 7547
5141 of 7547
5142 of 7547
5143 of 7547
5144 of 7547
5145 of 7547
5146 of 7547
5147 of 7547
5148 of 7547
5149 of 7547
5150 of 7547
5151 of 7547
5152 of 7547
5153 of 7547
5154 of 7547
5155 of 7547
5156 of 7547
5157 of 7547
5158 of 7547
5159 of 7547
5160 of 7547
5161 of 7547
5162 of 7547
5163 of 7547
5164 of 7547
5165 of 7547
5166 of 7547
5167 of 7547
5168 of 7547
5169 of 7547
5170 of 7547
5171 of 7547
5172 of 7547
5173 of 7547
5174 of 7547
5175 of 7547
5176 of 7547
5177 of 7547
5178 of 7547
5179 of 7547
5180 of 7547
5181 of 7547
5182 of 7547
5183 of 7547
5184 of 7547
5185 of 7547
5186 of 7547
5187 of 7547
5188 of 7547
5189 of 7547
5190 of 7547
5191 of 7547
5192 of 7547
5193 of 7547
5194 of 7547
5195 of 7547
5196 of 7547
5197 of 7547
5198 of 7547
5199 of 7547
5200 of 7547
5201 of 7547
5202 of 7547
5203 of 7547
5204 of 7547
5205 of 7547
5206 of 7547
5207 of 7547
5208 of 7547
5209 of 7547

5764 of 7547
5765 of 7547
5766 of 7547
5767 of 7547
5768 of 7547
5769 of 7547
5770 of 7547
5771 of 7547
5772 of 7547
5773 of 7547
5774 of 7547
5775 of 7547
5776 of 7547
5777 of 7547
5778 of 7547
5779 of 7547
5780 of 7547
5781 of 7547
5782 of 7547
5783 of 7547
5784 of 7547
5785 of 7547
5786 of 7547
5787 of 7547
5788 of 7547
5789 of 7547
5790 of 7547
5791 of 7547
5792 of 7547
5793 of 7547
5794 of 7547
5795 of 7547
5796 of 7547
5797 of 7547
5798 of 7547
5799 of 7547
5800 of 7547
5801 of 7547
5802 of 7547
5803 of 7547
5804 of 7547
5805 of 7547
5806 of 7547
5807 of 7547
5808 of 7547
5809 of 7547
5810 of 7547
5811 of 7547
5812 of 7547
5813 of 7547
5814 of 7547
5815 of 7547
5816 of 7547
5817 of 7547
5818 of 7547
5819 of 7547
5820 of 7547
5821 of 7547
5822 of 7547
5823 of 7547
5824 of 7547
5825 of 7547
5826 of 7547
5827 of 7547
5828 of 7547
5829 of 7547
5830 of 7547
5831 of 7547
5832 of 7547
5833 of 7547
5834 of 7547
5835 of 7547
5836 of 7547
5837 of 7547
5838 of 7547
5839 of 7547
5840 of 7547

6395 of 7547
6396 of 7547
6397 of 7547
6398 of 7547
6399 of 7547
6400 of 7547
6401 of 7547
6402 of 7547
6403 of 7547
6404 of 7547
6405 of 7547
6406 of 7547
6407 of 7547
6408 of 7547
6409 of 7547
6410 of 7547
6411 of 7547
6412 of 7547
6413 of 7547
6414 of 7547
6415 of 7547
6416 of 7547
6417 of 7547
6418 of 7547
6419 of 7547
6420 of 7547
6421 of 7547
6422 of 7547
6423 of 7547
6424 of 7547
6425 of 7547
6426 of 7547
6427 of 7547
6428 of 7547
6429 of 7547
6430 of 7547
6431 of 7547
6432 of 7547
6433 of 7547
6434 of 7547
6435 of 7547
6436 of 7547
6437 of 7547
6438 of 7547
6439 of 7547
6440 of 7547
6441 of 7547
6442 of 7547
6443 of 7547
6444 of 7547
6445 of 7547
6446 of 7547
6447 of 7547
6448 of 7547
6449 of 7547
6450 of 7547
6451 of 7547
6452 of 7547
6453 of 7547
6454 of 7547
6455 of 7547
6456 of 7547
6457 of 7547
6458 of 7547
6459 of 7547
6460 of 7547
6461 of 7547
6462 of 7547
6463 of 7547
6464 of 7547
6465 of 7547
6466 of 7547
6467 of 7547
6468 of 7547
6469 of 7547
6470 of 7547
6471 of 7547

7026 of 7547
7027 of 7547
7028 of 7547
7029 of 7547
7030 of 7547
7031 of 7547
7032 of 7547
7033 of 7547
7034 of 7547
7035 of 7547
7036 of 7547
7037 of 7547
7038 of 7547
7039 of 7547
7040 of 7547
7041 of 7547
7042 of 7547
7043 of 7547
7044 of 7547
7045 of 7547
7046 of 7547
7047 of 7547
7048 of 7547
7049 of 7547
7050 of 7547
7051 of 7547
7052 of 7547
7053 of 7547
7054 of 7547
7055 of 7547
7056 of 7547
7057 of 7547
7058 of 7547
7059 of 7547
7060 of 7547
7061 of 7547
7062 of 7547
7063 of 7547
7064 of 7547
7065 of 7547
7066 of 7547
7067 of 7547
7068 of 7547
7069 of 7547
7070 of 7547
7071 of 7547
7072 of 7547
7073 of 7547
7074 of 7547
7075 of 7547
7076 of 7547
7077 of 7547
7078 of 7547
7079 of 7547
7080 of 7547
7081 of 7547
7082 of 7547
7083 of 7547
7084 of 7547
7085 of 7547
7086 of 7547
7087 of 7547
7088 of 7547
7089 of 7547
7090 of 7547
7091 of 7547
7092 of 7547
7093 of 7547
7094 of 7547
7095 of 7547
7096 of 7547
7097 of 7547
7098 of 7547
7099 of 7547
7100 of 7547
7101 of 7547
7102 of 7547

In [10]:
with open('../valid_align_filtered.tsv', 'w') as f:
    for item in align_data:
        len_item = len(item) - 1
        for i in range(len_item):
            f.write(str(item[i]) + '\t')
        f.write(str(item[len_item]) + '\n')
f.closed

True

### Generate more positive examples

In [11]:
align_df = pd.read_csv('../valid_align_filtered.tsv', sep='\t', header=None, names=['e1_kb', 'rel_kb', 'rel_kb_id', 'e2_kb', 'e1_oie', 'rel_oie', 'e2_oie', 'e1_kb_id', 'e2_kb_id', 'e1_oie_id', 'e2_oie_id', 'e1_oie_root', 'e2_oie_root', 'label'], quoting=csv.QUOTE_NONE)

In [12]:
align_df

Unnamed: 0,e1_kb,rel_kb,rel_kb_id,e2_kb,e1_oie,rel_oie,e2_oie,e1_kb_id,e2_kb_id,e1_oie_id,e2_oie_id,e1_oie_root,e2_oie_root,label
0,Cantabria,country,P17,Spain,Cantabria,belongs,to the name given to the strip of land between...,Q3946,Q29,Q3946,Q41573,Cantabria,to the name given to the strip of land between...,1
1,Cantabria,country,P17,Spain,Cantabria,"belongs to the Cantabrian Mountains , so called",because of its particularly lush vegetation,Q3946,Q29,Q3946,Q208076,Cantabria,because of its particularly lush vegetation,1
2,Cantabria,country,P17,Spain,"due to the gulf stream , Cantabria , as well a...",has,a much more temperate climate than might be ex...,Q3946,Q29,,Q167466,"due to the gulf stream , Cantabria , as well a...",a much more temperate climate than might be ex...,1
3,Cantabria,country,P17,Spain,Cantabria,has not experienced,much immigration,Q3946,Q29,Q3946,Q131288,Cantabria,much immigration,1
4,Cantabria,country,P17,Spain,the greatest jurisdictional lordships of Canta...,were mainly,under the control of three of the Grandee fami...,Q3946,Q29,Q6680218,Q2025559,lordships,under the control of three of the Grandee fami...,1
5,Cantabria,country,P17,Spain,the King of Spain,signed,the corresponding Organic Law of Autonomy Stat...,Q3946,Q29,Q116,Q928812,King,the corresponding Organic Law,1
6,Cantabria,country,P17,Spain,Cantabria,was organized,in valleys,Q3946,Q29,Q3946,Q3797191,Cantabria,in valleys,1
7,Cantabria,country,P17,Spain,the unemployment rate in Cantabria,is,"19.3 % , compared to 24.47 % in Spain ;",Q3946,Q29,Q1144560,Q746386,rate,19.3 %,1
8,Cantabria,country,P17,Spain,its purchasing power parity,was,"$ 25,326",Q3946,Q29,Q25661,,parity,"$ 25,326",1
9,Cantabria,country,P17,Spain,its purchasing power parity,was,"$ 25,326",Q3946,Q29,Q25661,,parity,"$ 25,326",1


In [13]:
align_df_pos = align_df[align_df.label != 1]

In [14]:
align_df_pos

Unnamed: 0,e1_kb,rel_kb,rel_kb_id,e2_kb,e1_oie,rel_oie,e2_oie,e1_kb_id,e2_kb_id,e1_oie_id,e2_oie_id,e1_oie_root,e2_oie_root,label
101,Uzbekistan,shares border with,P47,Kazakhstan,Uzbekistan,is bordered,by Kazakhstan to the north,Q265,Q232,Q265,Q232,Uzbekistan,by Kazakhstan to the north,0
106,Uzbekistan,shares border with,P47,Kazakhstan,Uzbekistan,is bordering,Kazakhstan and the Aral Sea to the north and n...,Q265,Q232,Q265,Q232,Uzbekistan,Kazakhstan,0
162,hour,has part,P527,minute,the French hour,divided formally,into 100 decimal minutes,Q25235,Q7727,Q25235,Q7727,hour,into 100 decimal minutes,0
278,Desolation,depicts,P180,woman,desolation,represents,"a mysterious , delicate woman hiding her face ...",Q3815359,Q467,Q3815359,Q467,desolation,"a mysterious , delicate woman",0
689,Telemachus,father,P22,Odysseus,Telemachus,is,Odysseus ' infant son,Q192482,Q47231,Q192482,Q47231,Telemachus,Odysseus ' infant son,0
831,Seoul,located in the administrative territorial entity,P131,South Korea,Seoul,is,in the northwest of South Korea,Q8684,Q884,Q8684,Q884,Seoul,in the northwest of South Korea,0
842,Seoul,located in the administrative territorial entity,P131,South Korea,Seoul,is,being the capital of South Korea and highest g...,Q8684,Q884,Q8684,Q884,Seoul,being the capital of South Korea and highest g...,0
848,Seoul,located in the administrative territorial entity,P131,South Korea,Seoul,is connected,to every major city in South Korea by rail,Q8684,Q884,Q8684,Q884,Seoul,to every major city in South Korea by rail,0
1158,Scorzonera,taxon rank,P105,genus,scorzonera,is,a genus of flowering plants in the dandelion t...,Q1140322,Q34740,Q1140322,Q34740,scorzonera,a genus,0
1193,dachshund,subclass of,P279,dog,a sable dachshund,looks somewhat,like a black and tan dog,Q29099,Q144,Q29099,Q144,dachshund,like a black and tan dog,0


In [15]:
len_pos = len(align_df_pos)
for idx, row in align_df_pos.iterrows():
    print(idx, 'of', len_pos)
    e1, e1_id, rel, rel_id, e2, e2_id = row['e1_kb'], row['e1_kb_id'], row['rel_kb'], row['rel_kb_id'], row['e2_kb'], row['e2_kb_id']
    pos_list = align_df_pos[(align_df_pos.rel_kb_id == rel_id) & (align_df_pos.e1_kb_id != e1_id) & (align_df_pos.e2_kb_id != e2_id)]
    if len(pos_list) > 0:
        for idx_l, row_l in pos_list.iterrows():
            align_data.append((e1, rel, rel_id, e2,
                               row_l['e1_oie'], row_l['rel_oie'], row_l['e2_oie'],
                               e1_id, e2_id, row_l['e1_oie_id'], row_l['e2_oie_id'],
                               row_l['e1_oie_root'], row_l['e2_oie_root'],
                               '0'))

101 of 181
106 of 181
162 of 181
278 of 181
689 of 181
831 of 181
842 of 181
848 of 181
1158 of 181
1193 of 181
1194 of 181
1215 of 181
1283 of 181
1484 of 181
1532 of 181
1560 of 181
1583 of 181
1882 of 181
1901 of 181
1903 of 181
2212 of 181
2256 of 181
2267 of 181
2321 of 181
2365 of 181
2397 of 181
2399 of 181
2409 of 181
2498 of 181
2517 of 181
2520 of 181
2968 of 181
2970 of 181
2971 of 181
3068 of 181
3279 of 181
3296 of 181
3312 of 181
3317 of 181
3332 of 181
3379 of 181
3383 of 181
3505 of 181
3681 of 181
3683 of 181
3684 of 181
3722 of 181
3815 of 181
4001 of 181
4038 of 181
4039 of 181
4246 of 181
4318 of 181
4329 of 181
4362 of 181
4372 of 181
4373 of 181
4534 of 181
4631 of 181
4660 of 181
5156 of 181
5302 of 181
5344 of 181
5528 of 181
5562 of 181
5585 of 181
5625 of 181
5627 of 181
5628 of 181
5730 of 181
5738 of 181
5929 of 181
6181 of 181
6276 of 181
6283 of 181
6284 of 181
6341 of 181
6342 of 181
6653 of 181
6728 of 181
6827 of 181
6828 of 181
6859 of 181
6925 of 181


In [16]:
len(align_data)

16995

In [17]:
with open('../valid_align_filtered.tsv', 'w') as f:
    for item in align_data:
        len_item = len(item) - 1
        for i in range(len_item):
            f.write(str(item[i]) + '\t')
        f.write(str(item[len_item]) + '\n')
f.closed

True