In [1]:
import pandas as pd
import time
import spacy
nlp = spacy.load('en_core_web_md')
from tqdm import tqdm
from wikidata.client import Client
client = Client()

In [2]:
import nltk
import pickle

In [3]:
import enteater

Setting up Enteater [1] Loading Spacy Model ... Done
Setting up Enteater [2] Loading Wikidata ID dictionary ... Done
Setting up Enteater [3] Loading Wikidata-Freebase Mapping table ... Done


In [50]:
version = 'v10'
dt_type = 'valid'
data_kb = pd.read_csv('../' + dt_type + '_sentences.tsv', sep='\t', header=None, names=['e1_id', 'e2_id', 'rel_id', 'e1_label', 'e2_label', 'rel_label', 'sent'])
data_openie = pd.read_csv('../' + dt_type + '_openie.tsv', sep='\t', header=None, names=['has_kb_rel', 'sent_id', 'e1', 'rel', 'e2'])

In [51]:
data_kb

Unnamed: 0,e1_id,e2_id,rel_id,e1_label,e2_label,rel_label,sent
0,Q2489949,Q22006653,P462,Ride a Crooked Trail,color,color,Ride a Crooked Trail Ride a Crooked Trail is a...
1,Q40088,Q23444,P462,Carrara marble,white,color,Carrara marble Carrara marble is a type of whi...
2,Q2080084,Q838368,P462,Wing and a Prayer,black-and-white,color,Wing and a Prayer Wing and a Prayer (also know...
3,Q2514152,Q838368,P462,Terror from the Year 5000,black-and-white,color,Terror from the Year 5000 Terror from the Year...
4,Q17634359,Q838368,P462,Humo de Marihuana,black-and-white,color,Humo de Marihuana Humo de Marihuana () is an A...
5,Q4738474,Q838368,P462,Always on Duty,black-and-white,color,Always on Duty Zu jeder Stunde (English-langua...
6,Q20002488,Q838368,P462,Special Mission,black-and-white,color,Special Mission Im Sonderauftrag (English-lang...
7,Q3222758,Q22006653,P462,The Proud Rebel,color,color,The Proud Rebel The Proud Rebel is a 1958 Amer...
8,Q851395,Q22006653,P462,Pakeezah,colour,color,Meena Kumari's performance as a golden-hearted...
9,Q148807,Q943,P462,Senyera,yellow,color,"Senyera The Senyera (; meaning pennon , standa..."


In [52]:
data_kb.groupby('rel_id').size().sort_values(ascending=False).to_csv('../' + version + '/' + dt_type + '_rel_kb_freq.tsv', sep='\t')

In [53]:
data_openie

Unnamed: 0,has_kb_rel,sent_id,e1,rel,e2
0,1,0,Trail,is,1958 American Eastmancolor Western film shot i...
1,1,0,Crooked Trail,is,1958 Eastmancolor Western film shot in CinemaS...
2,1,0,Crooked Trail,is,1958 Eastmancolor Western film shot
3,1,0,Trail,is,1958 American Eastmancolor Western film shot
4,1,0,Trail,is,1958 Eastmancolor Western film shot
5,1,0,Crooked Trail,is,1958 American Eastmancolor Western film shot
6,1,0,Trail,is,1958 Eastmancolor Western film shot in CinemaS...
7,1,0,Crooked Trail,is,1958 American Eastmancolor Western film shot i...
8,0,0,Crooked Trail Ride,is with,former World War II hero Audie Murphy
9,0,0,1958 American Eastmancolor Western film shot,is in,CinemaScope


In [54]:
rel_aliases = pd.read_csv('../relation_desc_aliases.tsv', sep='\t', header=None, names=['id', 'rel', 'desc', 'len_desc', 'aliases'])
aliases_dict = {}
for i in range(len(rel_aliases)):
    aliases = rel_aliases['aliases'][i]
    if isinstance(aliases, str):
        aliases_dict[rel_aliases['id'][i]] = aliases.split(', ')
    else:
        aliases_dict[rel_aliases['id'][i]] = [rel_aliases['rel'][i]]

In [55]:
aliases_dict['P1057']

['on chromosome']

In [56]:
with open('../../dutta/align_v6.p', 'rb') as fp:
    dutta_dict = pickle.load(fp)

In [57]:
def get_entity_tuple_id(e1, e2, e1_kb, e2_kb):
    e1_wiki_id = ''
    e2_wiki_id = ''
    e1_label = ''
    e2_label = ''
    
    # Get e1 ID
    e1_list = [x for x in enteater.get_wikidata_id(e1) if 'Q' in x]
    if e1_kb in e1_list:
        e1_wiki_id = e1_kb
    else:
        if len(e1_list) >= 1:
            e1_wiki_id = e1_list[0]
    
    # Get e2 ID
    e2_list = [x for x in enteater.get_wikidata_id(e2) if 'Q' in x]
    if e2_kb in e2_list:
        e2_wiki_id = e2_kb
    else:
        if len(e2_list) >= 1:
            e2_wiki_id = e2_list[0]
    
    return (e1_wiki_id, e1_label, e2_wiki_id, e2_label)

In [58]:
def get_relation_label(rel_kb_id, rel_kb, rel_oie):
    if rel_kb == rel_oie or (rel_oie in rel_kb) or (rel_kb in rel_oie):
        return '0'
    
    aliases = aliases_dict[rel_kb_id]
    for alias in aliases:
        alias = alias.lower()
        if alias == rel_oie or (rel_oie in alias) or (alias in rel_oie):
            return '0'
    
    oie_nlp = nlp(rel_oie)
    oie_lemma = ''
    for token in oie_nlp:
        oie_lemma = oie_lemma + ' ' + token.lemma_
    
    if rel_kb == oie_lemma or (oie_lemma in rel_kb) or (rel_kb in oie_lemma):
        return '0'
    
    for alias in aliases:
        alias = alias.lower()
        if alias == oie_lemma or (oie_lemma in alias) or (alias in oie_lemma):
            return '0'
        
#     try:
#         rels_d = dutta_dict[rel_kb_id]
#         for rel in rels_d:
#             rel = rel.lower()
#             if rel == rel_oie or (rel_oie in rel) or (rel in rel_oie):
#                 return '0'

#         for rel in rels_d:
#             rel = rel.lower()
#             if rel == oie_lemma or (oie_lemma in rel) or (rel in oie_lemma):
#                 return '0'
#     except KeyError:
#         return '1'
    
    return '1'

def is_similar_entity(ent_kb, ent_kb_id, ent_oie, ent_oie_id):
    if ent_kb_id == ent_oie_id:
        return True
    elif ent_kb == ent_oie:
        return True
    else:
        diff = nltk.edit_distance(ent_kb, ent_oie)
        if diff < 10 and ((ent_kb in ent_oie) or (ent_oie in ent_kb)):
            return True
        else:
            return False
    
def normalize_entity(ent):
    if not isinstance(ent, str):
        ent = str(ent)
    ent = ent.replace(' \'s', '\'s')
    ent = ent.replace('-', '')
    return ent

In [59]:
align_data = []
sent_idx_list = set()
len_openie = len(data_openie)
for i in tqdm(range(len_openie)):
    sent_idx = data_openie['sent_id'][i]
    e1_kb_id = data_kb['e1_id'][sent_idx]
    e2_kb_id = data_kb['e2_id'][sent_idx]
    e1_kb = normalize_entity(data_kb['e1_label'][sent_idx])
    e2_kb = normalize_entity(data_kb['e2_label'][sent_idx])
    rel_kb = data_kb['rel_label'][sent_idx].lower()
    rel_kb_id = data_kb['rel_id'][sent_idx]
    e1_oie = normalize_entity(data_openie['e1'][i])
    e2_oie = normalize_entity(data_openie['e2'][i])
    rel_oie = data_openie['rel'][i].lower()
    e1_oie_id, e1_oie_root, e2_oie_id, e2_oie_root = get_entity_tuple_id(e1_oie, e2_oie, e1_kb_id, e2_kb_id)
    
    if is_similar_entity(e1_kb, e1_kb_id, e1_oie, e1_oie_id) and is_similar_entity(e2_kb, e2_kb_id, e2_oie, e2_oie_id):
#         label = get_relation_label(rel_kb_id, rel_kb, rel_oie)
#         if label == '0':
        label = '0'
        align_data.append((e1_kb, rel_kb, rel_kb_id, e2_kb,
                           e1_oie, rel_oie, e2_oie,
                           e1_kb_id, e2_kb_id, e1_oie_id, e2_oie_id, e1_oie_root, e2_oie_root,
                           label))
        sent_idx_list.add(sent_idx)
    elif is_similar_entity(e1_kb, e1_kb_id, e1_oie, e1_oie_id) and not is_similar_entity(e2_kb, e2_kb_id, e2_oie, e2_oie_id):
        label = get_relation_label(rel_kb_id, rel_kb, rel_oie)
        if label == '1':
            align_data.append((e1_kb, rel_kb, rel_kb_id, e2_kb,
                               e1_oie, rel_oie, e2_oie,
                               e1_kb_id, e2_kb_id, e1_oie_id, e2_oie_id, e1_oie_root, e2_oie_root,
                               label))
            sent_idx_list.add(sent_idx)
    elif not is_similar_entity(e1_kb, e1_kb_id, e1_oie, e1_oie_id) and is_similar_entity(e2_kb, e2_kb_id, e2_oie, e2_oie_id):
        label = get_relation_label(rel_kb_id, rel_kb, rel_oie)
        if label == '1':
            align_data.append((e1_kb, rel_kb, rel_kb_id, e2_kb,
                               e1_oie, rel_oie, e2_oie,
                               e1_kb_id, e2_kb_id, e1_oie_id, e2_oie_id, e1_oie_root, e2_oie_root,
                               label))
            sent_idx_list.add(sent_idx)

100%|██████████| 829085/829085 [57:19<00:00, 241.08it/s]  


In [60]:
len(align_data), len(sent_idx_list)

(196234, 42874)

In [61]:
with open('../' + version + '/' + dt_type + '_align.tsv', 'w') as f:
    for item in align_data:
        len_item = len(item) - 1
        for i in range(len_item):
            f.write(str(item[i]) + '\t')
        f.write(str(item[len_item]) + '\n')
f.closed

True

### Generate more positive examples

In [62]:
align_df = pd.read_csv('../' + version + '/' + dt_type + '_align.tsv', sep='\t', header=None, names=['e1_kb', 'rel_kb', 'rel_kb_id', 'e2_kb', 'e1_oie', 'rel_oie', 'e2_oie', 'e1_kb_id', 'e2_kb_id', 'e1_oie_id', 'e2_oie_id', 'e1_oie_root', 'e2_oie_root', 'label'])

In [63]:
align_df

Unnamed: 0,e1_kb,rel_kb,rel_kb_id,e2_kb,e1_oie,rel_oie,e2_oie,e1_kb_id,e2_kb_id,e1_oie_id,e2_oie_id,e1_oie_root,e2_oie_root,label
0,Ride a Crooked Trail,color,P462,color,Crooked Trail,is,1958 Eastmancolor Western film shot in CinemaS...,Q2489949,Q22006653,Q7728157,,,,1
1,Ride a Crooked Trail,color,P462,color,Crooked Trail,is,1958 Eastmancolor Western film shot,Q2489949,Q22006653,Q7728157,,,,1
2,Ride a Crooked Trail,color,P462,color,Crooked Trail,is,1958 American Eastmancolor Western film shot,Q2489949,Q22006653,Q7728157,,,,1
3,Ride a Crooked Trail,color,P462,color,Crooked Trail,is,1958 American Eastmancolor Western film shot i...,Q2489949,Q22006653,Q7728157,,,,1
4,Carrara marble,color,P462,white,Carrara marble Carrara marble,is type popular of,white marble,Q40088,Q23444,,,,,1
5,Carrara marble,color,P462,white,marble,is,popular,Q40088,Q23444,Q2080761,Q400146,,,1
6,Carrara marble,color,P462,white,Carrara marble Carrara marble,is type of,white marble,Q40088,Q23444,,,,,1
7,Wing and a Prayer,color,P462,blackandwhite,Wing,is,blackandwhite,Q2080084,Q838368,Q161358,,,,1
8,Wing and a Prayer,color,P462,blackandwhite,Wing,is,blackandwhite,Q2080084,Q838368,Q161358,,,,1
9,Terror from the Year 5000,color,P462,blackandwhite,Terror,is,blackandwhite,Q2514152,Q838368,Q1968139,,,,1


In [64]:
align_df.groupby(['rel_kb_id', 'rel_kb', 'rel_oie', 'label']).size().to_csv('../' + version + '/' + dt_type + '_align_freq.tsv', sep='\t')

In [65]:
align_df.groupby(['label']).size()

label
0     11613
1    184621
dtype: int64

In [66]:
# rel_except = ['P175']
# align_df_pos = align_df[(align_df.label != 1) & (~align_df.rel_kb_id.isin(rel_except))]
align_df_pos = align_df[(align_df.label != 1)]

In [67]:
align_df_pos

Unnamed: 0,e1_kb,rel_kb,rel_kb_id,e2_kb,e1_oie,rel_oie,e2_oie,e1_kb_id,e2_kb_id,e1_oie_id,e2_oie_id,e1_oie_root,e2_oie_root,label
52,Félicette,color,P462,black,Félicette,was,blackandwhite,Q12165555,Q23445,Q12165555,,,,0
94,Smaug,color,P462,red,Smaug,was,considered,Q46302,Q3142,Q46302,,,,0
138,Second Choice,color,P462,blackandwhite,Choice,is,blackandwhite,Q7443191,Q838368,Q1075772,,,,0
148,99 Homes,film editor,P1040,Ramin Bahrani,99 Homes 99 Homes,written by,Bahrani,Q16147760,Q3418487,,,,,0
150,99 Homes,film editor,P1040,Ramin Bahrani,Homes 99 Homes,written by,Bahrani,Q16147760,Q3418487,,,,,0
205,Session 9,film editor,P1040,Brad Anderson,Session,directed by,Brad Anderson,Q577221,Q598238,Q362310,Q598238,,,0
253,Toto Forever,film editor,P1040,Roberto F. Canuto,Toto Forever,'s director is,Roberto F. Canuto,Q17504820,Q17486474,Q17504820,Q17486474,,,0
309,His New Profession,film editor,P1040,Charlie Chaplin,His New Profession,starring,Charlie Chaplin,Q2367759,Q882,Q2367759,Q882,,,0
329,Hooligan Sparrow,film editor,P1040,Nanfu Wang,Hooligan Sparrow,shows,Nanfu Wang,Q24608885,Q24609090,Q24608885,Q24609090,,,0
380,True Lies,film editor,P1040,James Cameron,Lies,written by,James Cameron,Q110397,Q42574,Q623837,Q42574,,,0


In [68]:
align_data = []
len_pos = len(align_df_pos)
for idx, row in tqdm(align_df_pos.iterrows(), total=align_df_pos.shape[0]):
    e1, e1_id, rel, rel_id, e2, e2_id = row['e1_kb'], row['e1_kb_id'], row['rel_kb'], row['rel_kb_id'], row['e2_kb'], row['e2_kb_id']
    pos_list = align_df_pos[(align_df_pos.rel_kb_id == rel_id) & (align_df_pos.e1_kb_id != e1_id) & (align_df_pos.e2_kb_id != e2_id)]
    if len(pos_list) > 0:
        for idx_l, row_l in pos_list.iterrows():
            align_data.append({
                'e1_kb': e1,
                'rel_kb': rel,
                'rel_kb_id': rel_id,
                'e2_kb': e2,
                'e1_oie': row_l['e1_oie'],
                'rel_oie': row_l['rel_oie'],
                'e2_oie': row_l['e2_oie'],
                'e1_kb_id': e1_id,
                'e2_kb_id': e2_id,
                'e1_oie_id': row_l['e1_oie_id'],
                'e2_oie_id': row_l['e2_oie_id'],
                'e1_oie_root': row_l['e1_oie_root'],
                'e2_oie_root': row_l['e2_oie_root'],
                'label': 0
            })

100%|██████████| 11613/11613 [04:56<00:00, 39.12it/s]


In [69]:
align_df = align_df.append(align_data)

In [70]:
align_df.to_csv('../' + version + '/' + dt_type + '_align_all.tsv', sep='\t', header=False, index=False)

In [71]:
len(align_df)

2397308

In [72]:
align_df.groupby(['label']).size()

label
0    2212687
1     184621
dtype: int64