In [1]:
import pandas as pd
import csv
import numpy as np

In [2]:
rel_list = pd.read_csv('../relation_data.tsv', sep='\t')

In [3]:
rel_list

Unnamed: 0,id,label,description,aliases,data_type,count
0,P31,instance of,that class of which this subject is a particul...,"distinct element of, distinct individual membe...",wikibase-item,47970502
1,P248,stated in,to be used in the references field to refer to...,"originating source, source of claim",wikibase-item,27277889
2,P1433,published in,larger work that a given work was published in...,"on the tracklist of, part of work, published i...",wikibase-item,19053900
3,P143,imported from Wikimedia project,source of this claim's value; used in referenc...,"from, source, imported from",wikibase-item,15311889
4,P17,country,sovereign state of this item; don't use on humans,"land, sovereign state, state",wikibase-item,10099326
5,P2860,cites,citation from one creative work to another,"bibliographic citation, citation",wikibase-item,6886072
6,P131,located in the administrative territorial entity,the item is located on the territory of the fo...,"region, is in the arrondissement of, administr...",wikibase-item,6702718
7,P21,sex or gender,"sexual identity of subject: male (Q6581097), f...","biological sex, female, gender, gender express...",wikibase-item,4097172
8,P106,occupation,"occupation of a person; see also ""field of wor...","career, craft, employment, job, profession, work",wikibase-item,3503657
9,P921,main subject,primary topic of a work (see also P180: depicts),"describes, about, aboutness, index term, is ab...",wikibase-item,3177781


In [4]:
test_list = pd.read_csv('../test_sentences_filtered.tsv', sep='\t', header=None, names=['e1_id', 'e2_id', 'rel_id', 'e1_label', 'e2_label', 'rel_label', 'sent'], quoting=csv.QUOTE_NONE)

In [5]:
test_list

Unnamed: 0,e1_id,e2_id,rel_id,e1_label,e2_label,rel_label,sent
0,Q692417,Q5369,P641,New York Mets,baseball,sport,The New York Mets are an American professional...
1,Q1009718,Q408,P17,Queenstown,Australia,country,Queenstown is a town in the West Coast region ...
2,Q852190,Q11446,P279,shipwreck,ship,subclass of,A shipwreck is the remains of a ship that has ...
3,Q23027,Q183,P17,Schopp,Germany,country,Schopp is a municipality in the district of Ka...
4,Q456855,Q36180,P106,Curtis Sittenfeld,writer,occupation,Elizabeth Curtis Sittenfeld (born 1975) is an ...
5,Q1152944,Q1860,P1412,John Richard Green,English,"languages spoken, written or signed",John Richard Green (12 December 1837 – 7 March...
6,Q3430887,Q11424,P31,Richard III,film,instance of,Richard III is a 55-minute film adaptation of ...
7,Q1156922,Q11424,P31,Dad,film,instance of,Dad is a 1989 American comedy-drama film writt...
8,Q539555,Q811021,P175,Mirrored,Battles,performer,Mirrored is the debut studio album by American...
9,Q539555,Q811021,P175,Mirrored,Battles,performer,Battles scored their first UK cover feature in...


In [6]:
aliases_dict = {}
for i in range(len(rel_list)):
    aliases = rel_list['aliases'][i]
    aliases_list = []
    if type(aliases) == str:
        aliases_list = rel_list['aliases'][i].split(', ')
    aliases_dict[rel_list['id'][i]] = aliases_list

In [7]:
aliases_dict['P17']

['land', 'sovereign state', 'state']

In [8]:
delete_idx = set()
for i in range(len(test_list)):
    rel_oie = test_list['sent'][i]
    rel_id = test_list['rel_id'][i]
    aliases = aliases_dict.get(rel_id, [])
    if len(aliases) == 0:
        delete_idx.add(i)
    else:
        exist = False
        for alias in aliases:
            if alias in rel_oie:
                exist = True
                break
        if exist == False:
            delete_idx.add(i)

In [9]:
len(delete_idx)

5601

In [10]:
test_gold = test_list.drop(test_list.index[list(delete_idx)])
test_gold = test_gold.reset_index(drop=True)
test_gold['e1_oie'] = np.nan
test_gold['e2_oie'] = np.nan
test_gold['rel_oie'] = np.nan

In [11]:
test_gold

Unnamed: 0,e1_id,e2_id,rel_id,e1_label,e2_label,rel_label,sent,e1_oie,e2_oie,rel_oie
0,Q1009718,Q408,P17,Queenstown,Australia,country,Queenstown is a town in the West Coast region ...,,,
1,Q23027,Q183,P17,Schopp,Germany,country,Schopp is a municipality in the district of Ka...,,,
2,Q3430887,Q11424,P31,Richard III,film,instance of,Richard III is a 55-minute film adaptation of ...,,,
3,Q1156922,Q11424,P31,Dad,film,instance of,Dad is a 1989 American comedy-drama film writt...,,,
4,Q341919,Q778,P131,Acklins,Bahamas,located in the administrative territorial entity,Acklins is an island and district of the Bahamas.,,,
5,Q853258,Q96,P17,Puerto Vallarta,Mexico,country,"However, since that time, Puerto Vallarta has ...",,,
6,Q68197,Q39,P17,Port,Switzerland,country,Port is a municipality in the canton of Bern i...,,,
7,Q68197,Q39,P17,Port,Switzerland,country,"The average income of the over 75,000 CHF grou...",,,
8,Q962196,Q496758,P131,Victor,Ravalli County,located in the administrative territorial entity,Victor is a census-designated place (CDP) in R...,,,
9,Q383771,Q7346,P175,Africa/Brass,John Coltrane,performer,Africa/Brass is the eighth studio album by jaz...,,,


### Run OpenIE

In [12]:
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

In [13]:
len_data = len(test_gold)
for i in range(len_data):
    print(i, 'of', len_data - 1)
    corenlpres = nlp.annotate(test_gold['sent'][i], properties={
            'ssplit.eolonly': True,
            'openie.triple.all_nominals': False,
            'openie.triple.strict': True,
            'openie.max_entailments_per_clause': 500,
            'annotators': 'openie',
            'outputFormat': 'json',
        })
    for o in corenlpres['sentences'][0]['openie']:
        if ((test_gold['e1_label'][i] in o['subject'] or o['subject'] in test_gold['e1_label'][i])
            and (test_gold['e2_label'][i] in o['object'] or o['object'] in test_gold['e2_label'][i])):
            # subject and object extracted by open IE contains the focus entity
            test_gold['e1_oie'][i] = o['subject']
            test_gold['rel_oie'][i] = o['relation']
            test_gold['e2_oie'][i] = o['object']
            break

0 of 1416
1 of 1416


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


2 of 1416
3 of 1416
4 of 1416
5 of 1416
6 of 1416
7 of 1416
8 of 1416
9 of 1416
10 of 1416
11 of 1416
12 of 1416
13 of 1416
14 of 1416
15 of 1416
16 of 1416
17 of 1416
18 of 1416
19 of 1416
20 of 1416
21 of 1416
22 of 1416
23 of 1416
24 of 1416
25 of 1416
26 of 1416
27 of 1416
28 of 1416
29 of 1416
30 of 1416
31 of 1416
32 of 1416
33 of 1416
34 of 1416
35 of 1416
36 of 1416
37 of 1416
38 of 1416
39 of 1416
40 of 1416
41 of 1416
42 of 1416
43 of 1416
44 of 1416
45 of 1416
46 of 1416
47 of 1416
48 of 1416
49 of 1416
50 of 1416
51 of 1416
52 of 1416
53 of 1416
54 of 1416
55 of 1416
56 of 1416
57 of 1416
58 of 1416
59 of 1416
60 of 1416
61 of 1416
62 of 1416
63 of 1416
64 of 1416
65 of 1416
66 of 1416
67 of 1416
68 of 1416
69 of 1416
70 of 1416
71 of 1416
72 of 1416
73 of 1416
74 of 1416
75 of 1416
76 of 1416
77 of 1416
78 of 1416
79 of 1416
80 of 1416
81 of 1416
82 of 1416
83 of 1416
84 of 1416
85 of 1416
86 of 1416
87 of 1416
88 of 1416
89 of 1416
90 of 1416
91 of 1416
92 of 1416
93 of 1

697 of 1416
698 of 1416
699 of 1416
700 of 1416
701 of 1416
702 of 1416
703 of 1416
704 of 1416
705 of 1416
706 of 1416
707 of 1416
708 of 1416
709 of 1416
710 of 1416
711 of 1416
712 of 1416
713 of 1416
714 of 1416
715 of 1416
716 of 1416
717 of 1416
718 of 1416
719 of 1416
720 of 1416
721 of 1416
722 of 1416
723 of 1416
724 of 1416
725 of 1416
726 of 1416
727 of 1416
728 of 1416
729 of 1416
730 of 1416
731 of 1416
732 of 1416
733 of 1416
734 of 1416
735 of 1416
736 of 1416
737 of 1416
738 of 1416
739 of 1416
740 of 1416
741 of 1416
742 of 1416
743 of 1416
744 of 1416
745 of 1416
746 of 1416
747 of 1416
748 of 1416
749 of 1416
750 of 1416
751 of 1416
752 of 1416
753 of 1416
754 of 1416
755 of 1416
756 of 1416
757 of 1416
758 of 1416
759 of 1416
760 of 1416
761 of 1416
762 of 1416
763 of 1416
764 of 1416
765 of 1416
766 of 1416
767 of 1416
768 of 1416
769 of 1416
770 of 1416
771 of 1416
772 of 1416
773 of 1416
774 of 1416
775 of 1416
776 of 1416
777 of 1416
778 of 1416
779 of 1416
780 

1360 of 1416
1361 of 1416
1362 of 1416
1363 of 1416
1364 of 1416
1365 of 1416
1366 of 1416
1367 of 1416
1368 of 1416
1369 of 1416
1370 of 1416
1371 of 1416
1372 of 1416
1373 of 1416
1374 of 1416
1375 of 1416
1376 of 1416
1377 of 1416
1378 of 1416
1379 of 1416
1380 of 1416
1381 of 1416
1382 of 1416
1383 of 1416
1384 of 1416
1385 of 1416
1386 of 1416
1387 of 1416
1388 of 1416
1389 of 1416
1390 of 1416
1391 of 1416
1392 of 1416
1393 of 1416
1394 of 1416
1395 of 1416
1396 of 1416
1397 of 1416
1398 of 1416
1399 of 1416
1400 of 1416
1401 of 1416
1402 of 1416
1403 of 1416
1404 of 1416
1405 of 1416
1406 of 1416
1407 of 1416
1408 of 1416
1409 of 1416
1410 of 1416
1411 of 1416
1412 of 1416
1413 of 1416
1414 of 1416
1415 of 1416
1416 of 1416


In [14]:
test_gold

Unnamed: 0,e1_id,e2_id,rel_id,e1_label,e2_label,rel_label,sent,e1_oie,e2_oie,rel_oie
0,Q1009718,Q408,P17,Queenstown,Australia,country,Queenstown is a town in the West Coast region ...,,,
1,Q23027,Q183,P17,Schopp,Germany,country,Schopp is a municipality in the district of Ka...,Schopp,Germany,is municipality in
2,Q3430887,Q11424,P31,Richard III,film,instance of,Richard III is a 55-minute film adaptation of ...,Richard III,film adaptation,is
3,Q1156922,Q11424,P31,Dad,film,instance of,Dad is a 1989 American comedy-drama film writt...,Dad,1989 American comedy-drama film,is
4,Q341919,Q778,P131,Acklins,Bahamas,located in the administrative territorial entity,Acklins is an island and district of the Bahamas.,Acklins,Bahamas,is island of
5,Q853258,Q96,P17,Puerto Vallarta,Mexico,country,"However, since that time, Puerto Vallarta has ...",,,
6,Q68197,Q39,P17,Port,Switzerland,country,Port is a municipality in the canton of Bern i...,Port,canton in Switzerland located,is municipality in
7,Q68197,Q39,P17,Port,Switzerland,country,"The average income of the over 75,000 CHF grou...",,,
8,Q962196,Q496758,P131,Victor,Ravalli County,located in the administrative territorial entity,Victor is a census-designated place (CDP) in R...,Victor,Ravalli County,is place in
9,Q383771,Q7346,P175,Africa/Brass,John Coltrane,performer,Africa/Brass is the eighth studio album by jaz...,Africa/Brass,studio album by jazz musician John Coltrane re...,is


In [20]:
test_gold.dropna(subset=['e1_oie'], inplace=True)
test_gold = test_gold.reset_index(drop=True)
test_gold['label'] = '0'
test_gold = test_gold[['e1_id', 'rel_id', 'e2_id', 'e1_label', 'rel_label', 'e2_label', 'e1_oie', 'rel_oie', 'e2_oie', 'label']]

In [21]:
test_gold

Unnamed: 0,e1_id,rel_id,e2_id,e1_label,rel_label,e2_label,e1_oie,rel_oie,e2_oie,label
0,Q23027,P17,Q183,Schopp,country,Germany,Schopp,is municipality in,Germany,0
1,Q3430887,P31,Q11424,Richard III,instance of,film,Richard III,is,film adaptation,0
2,Q1156922,P31,Q11424,Dad,instance of,film,Dad,is,1989 American comedy-drama film,0
3,Q341919,P131,Q778,Acklins,located in the administrative territorial entity,Bahamas,Acklins,is island of,Bahamas,0
4,Q68197,P17,Q39,Port,country,Switzerland,Port,is municipality in,canton in Switzerland located,0
5,Q962196,P131,Q496758,Victor,located in the administrative territorial entity,Ravalli County,Victor,is place in,Ravalli County,0
6,Q383771,P175,Q7346,Africa/Brass,performer,John Coltrane,Africa/Brass,is,studio album by jazz musician John Coltrane re...,0
7,Q214763,P159,Q16555,ConocoPhillips,headquarters location,Houston,ConocoPhillips Company,is,energy corporation with its headquarters locat...,0
8,Q35715,P131,Q408,South Australia,located in the administrative territorial entity,Australia,South Australia,is state in,part of Australia,0
9,Q35715,P131,Q408,South Australia,located in the administrative territorial entity,Australia,South Australia,was named as,second lowest performing economy in Australia,0


In [22]:
test_gold.to_csv('../test_align_gold.tsv', sep='\t', index=False)

### Reformat Gold Data

In [23]:
test_gold = pd.read_csv('../test_align_gold_filtered.tsv', sep='\t', quoting=csv.QUOTE_NONE)

In [24]:
test_gold

Unnamed: 0,e1_id,rel_id,e2_id,e1_label,rel_label,e2_label,e1_oie,rel_oie,e2_oie,label
0,Q1426188,P50,Q25161,Matilda,author,Roald Dahl,Matilda,is book by,British writer Roald Dahl,0
1,Q142811,P1376,Q494117,Allentown,capital of,Lehigh County,Allentown,is,county seat of Lehigh County,1
2,Q1516852,P161,Q152208,Taxi,cast member,Gisele BÃ¼ndchen,Taxi,starring,Gisele BÃ¼ndchen,0
3,Q220910,P161,Q20178,Casino,cast member,Joe Pesci,Casino,starring,Joe Pesci,0
4,Q957067,P161,Q131380,Nadine,cast member,Kim Basinger,Nadine,starring,Kim Basinger,0
5,Q4137388,P161,Q9543,Heroes,cast member,Salman Khan,Heroes,starring,Salman Khan,0
6,Q2005056,P161,Q16473,Shopgirl,cast member,Steve Martin,Shopgirl,starring,Steve Martin,0
7,Q2362846,P161,Q103946,Renegades,cast member,Kiefer Sutherland,Renegades,starring,Kiefer Sutherland,0
8,Q3575329,P40,Q534889,Zeus,child,Hercules,Zeus,taught,Hercules,1
9,Q3575329,P40,Q534889,Zeus,child,Hercules,Zeus,accompanies,Hercules,1


In [29]:
test_gold['e1_oie_id'] = ''
test_gold['e2_oie_id'] = ''
test_gold['e1_oie_root'] = test_gold['e1_oie']
test_gold['e2_oie_root'] = test_gold['e2_oie']
data_header = ['e1_label', 'rel_label', 'rel_id', 'e2_label', 'e1_oie', 'rel_oie', 'e2_oie',
               'e1_id', 'e2_id', 'e1_oie_id', 'e2_oie_id',
               'e1_oie_root', 'e2_oie_root', 'label']
test_gold = test_gold[data_header]

In [30]:
test_gold

Unnamed: 0,e1_label,rel_label,rel_id,e2_label,e1_oie,rel_oie,e2_oie,e1_id,e2_id,e1_oie_id,e2_oie_id,e1_oie_root,e2_oie_root,label
0,Matilda,author,P50,Roald Dahl,Matilda,is book by,British writer Roald Dahl,Q1426188,Q25161,,,Matilda,British writer Roald Dahl,0
1,Allentown,capital of,P1376,Lehigh County,Allentown,is,county seat of Lehigh County,Q142811,Q494117,,,Allentown,county seat of Lehigh County,1
2,Taxi,cast member,P161,Gisele BÃ¼ndchen,Taxi,starring,Gisele BÃ¼ndchen,Q1516852,Q152208,,,Taxi,Gisele BÃ¼ndchen,0
3,Casino,cast member,P161,Joe Pesci,Casino,starring,Joe Pesci,Q220910,Q20178,,,Casino,Joe Pesci,0
4,Nadine,cast member,P161,Kim Basinger,Nadine,starring,Kim Basinger,Q957067,Q131380,,,Nadine,Kim Basinger,0
5,Heroes,cast member,P161,Salman Khan,Heroes,starring,Salman Khan,Q4137388,Q9543,,,Heroes,Salman Khan,0
6,Shopgirl,cast member,P161,Steve Martin,Shopgirl,starring,Steve Martin,Q2005056,Q16473,,,Shopgirl,Steve Martin,0
7,Renegades,cast member,P161,Kiefer Sutherland,Renegades,starring,Kiefer Sutherland,Q2362846,Q103946,,,Renegades,Kiefer Sutherland,0
8,Zeus,child,P40,Hercules,Zeus,taught,Hercules,Q3575329,Q534889,,,Zeus,Hercules,1
9,Zeus,child,P40,Hercules,Zeus,accompanies,Hercules,Q3575329,Q534889,,,Zeus,Hercules,1


In [31]:
test_gold.to_csv('../test_align_gold_filtered.tsv', sep='\t', index=False, header=False)