In [None]:
import pandas as pd
import pickle
from tqdm import tqdm
from SPARQLWrapper import SPARQLWrapper, JSON
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

In [None]:
header = [
    'e1_kb', 'rel_kb', 'rel_id', 'e2_kb', 'e1_oie', 'rel_oie', 'e2_oie',
    'e1_kb_id', 'e2_kb_id', 'e1_oie_id', 'e2_oie_id', 'e1_oie_root', 'e2_oie_root', 'label'
]
train = pd.read_csv('../dataset/train_align_all_balanced.tsv', sep='\t', header=None, names=header)
valid = pd.read_csv('../dataset/valid_align_all.tsv', sep='\t', header=None, names=header)
train = train.append(valid, ignore_index=True)
train = train[pd.notnull(train['e1_oie_id']) & pd.notnull(train['e2_oie_id'])]
train = train.reset_index(drop=True)

In [None]:
train

In [None]:
oie_triples = train.groupby(['rel_oie']).size()

In [None]:
oie_triples['written by']

In [None]:
align = pd.DataFrame(columns=['rel_oie', 'K', 'rel_kb', 'e1_type', 'e2_type', 'conf', 'tou'])

In [None]:
with open('rel_list.pickle', 'rb') as file:
    rel_list = pickle.load(file)

def get_relation_list(e1, e2):
    try:
        return rel_list[e1 + ' ' + e2]
    except KeyError:
        try:
            sparql.setQuery("""
            SELECT ?relation WHERE {
                wd:%s ?relation wd:%s .
            }
            """ % (e1, e2))
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
            prop_list = [i['relation']['value'].replace('http://www.wikidata.org/prop/direct/', '') for i in results['results']['bindings']]
            rel_list[e1 + ' ' + e2] = prop_list
            return prop_list
        except OSError:
            sparql.setQuery("""
            SELECT ?relation WHERE {
                wd:%s ?relation wd:%s .
            }
            """ % (e1, e2))
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
            prop_list = [i['relation']['value'].replace('http://www.wikidata.org/prop/direct/', '') for i in results['results']['bindings']]
            rel_list[e1 + ' ' + e2] = prop_list
            return prop_list

In [None]:
with open('entity_type.pickle', 'rb') as file:
    entity_type = pickle.load(file)

In [None]:
def get_entity_type(e):
    try:
        if type(entity_type[e]) == bytes:
            return entity_type[e].decode("utf-8")
        elif type(entity_type[e]) == str:
            return entity_type[e]
        else:
            return str(entity_type[e])
    except KeyError:
        ent_type = '-'
        return ent_type

In [None]:
idx = 0
len_train = len(train)

for i in tqdm(range(len_train)):
    rel_oie = train['rel_oie'][i]
    e1 = train['e1_oie_id'][i]
    e2 = train['e2_oie_id'][i]
    
    e1_type = get_entity_type(e1)
    e2_type = get_entity_type(e2)
    kb_rels = get_relation_list(e1, e2)
    
    for rel in kb_rels:
        align.at[idx, 'rel_oie'] = rel_oie
        align.at[idx, 'rel_kb'] = rel
        align.at[idx, 'e1_type'] = e1_type
        align.at[idx, 'e2_type'] = e2_type
        idx += 1

In [None]:
if len(rel_list) > 0:
    with open('rel_list.pickle', 'wb') as handle:
        pickle.dump(rel_list, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
# if len(entity_type) > 0:
#     with open('entity_type.pickle', 'wb') as handle:
#         pickle.dump(entity_type, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
align

In [None]:
oie_triples_align = align.groupby(['rel_oie', 'e1_type', 'e2_type']).size()
oie_triples_align_total = align.groupby(['rel_oie']).size()

In [None]:
for i in tqdm(range(len(align))):
    rel_oie = align['rel_oie'][i]
    e1_type = align['e1_type'][i]
    e2_type = align['e2_type'][i]
    
    # Calculate K
    K = len(oie_triples_align[rel_oie]) / oie_triples[rel_oie]
    if K > 1:
        K = 1
    align['K'][i] = K
        
    # Calculate conf
    conf = oie_triples_align[rel_oie][e1_type][e2_type] / oie_triples_align_total[rel_oie]
    align['conf'][i] = conf
    
    # Calculate tou
    tou = (1 - K) / conf
    align['tou'][i] = conf

In [None]:
align

In [None]:
align.to_csv('align.tsv', sep='\t', index=False)

### Run Linear Regression

In [69]:
import numpy as np
import pandas as pd
import pickle
from sklearn import linear_model
from tqdm import tqdm

In [2]:
align = pd.read_csv('align.tsv', sep='\t')

In [3]:
align

Unnamed: 0,rel_oie,K,rel_kb,e1_type,e2_type,conf,tou
0,was filmed in,0.012658,P462,film,color,0.000476,0.000476
1,is,0.011210,P462,film,color,0.000468,0.000468
2,is,0.011210,P462,film,color,0.000468,0.000468
3,starring,0.002943,P161,film,human,0.767110,0.767110
4,starring,0.002943,P161,film,human,0.767110,0.767110
5,directed by,0.000823,P57,film,human,0.916603,0.916603
6,directed by,0.000823,P162,film,human,0.916603,0.916603
7,are,0.018613,P462,color scheme,safest web colors,0.001524,0.001524
8,was,0.011766,P31,house cat,taxon,0.000350,0.000350
9,starring,0.002943,P161,film,human,0.767110,0.767110


In [4]:
tou_list = np.array(align['tou']).reshape(-1, 1)
K_list = np.array(align['K']).reshape(-1, 1)

In [5]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(K_list, tou_list)

# Make predictions using the testing set
tou_pred = regr.predict(K_list)

In [6]:
tou_h = []
for i in tqdm(range(len(tou_pred))):
    tou_h.append(tou_pred[i][0])

100%|██████████| 832315/832315 [00:00<00:00, 1907195.50it/s]


In [7]:
align['tou_h'] = tou_h

In [8]:
align

Unnamed: 0,rel_oie,K,rel_kb,e1_type,e2_type,conf,tou,tou_h
0,was filmed in,0.012658,P462,film,color,0.000476,0.000476,0.515746
1,is,0.011210,P462,film,color,0.000468,0.000468,0.514561
2,is,0.011210,P462,film,color,0.000468,0.000468,0.514561
3,starring,0.002943,P161,film,human,0.767110,0.767110,0.507791
4,starring,0.002943,P161,film,human,0.767110,0.767110,0.507791
5,directed by,0.000823,P57,film,human,0.916603,0.916603,0.506054
6,directed by,0.000823,P162,film,human,0.916603,0.916603,0.506054
7,are,0.018613,P462,color scheme,safest web colors,0.001524,0.001524,0.520622
8,was,0.011766,P31,house cat,taxon,0.000350,0.000350,0.515015
9,starring,0.002943,P161,film,human,0.767110,0.767110,0.507791


In [10]:
is_aligned = []
for i in tqdm(range(len(tou_h))):
    thres = align['tou_h'][i]
    val = align['tou'][i]
    if val < thres:
        is_aligned.append(1)
    else:
        is_aligned.append(0)

100%|██████████| 832315/832315 [00:16<00:00, 49683.02it/s]


In [11]:
align['is_aligned'] = is_aligned

In [13]:
align_pos = align[align.is_aligned == 1]

In [30]:
align_pos_grouped = align_pos.groupby(['rel_kb', 'rel_oie']).size()

In [44]:
align_pos_df = align_pos_grouped.to_frame().reset_index()
align_pos_df = align_pos_df.drop(0, axis=1)

In [48]:
align_pos_df

Unnamed: 0,rel_kb,rel_oie
0,P1001,Council of
1,P1001,Court of
2,P1001,branch of
3,P1001,is In
4,P1001,is in
5,P1001,parliament 's
6,P101,has
7,P101,is In
8,P101,is in
9,P101,is professor of


In [66]:
kb_to_oie = {}
for i in range(len(align_pos_df)):
    key = align_pos_df['rel_kb'][i]
    val = align_pos_df['rel_oie'][i]
    if key not in kb_to_oie:
        kb_to_oie[key] = [val]
    else:
        kb_to_oie[key] = kb_to_oie[key] + [val]

In [68]:
kb_to_oie

{'P1001': ['Council of',
  'Court of',
  'branch of',
  'is In',
  'is in',
  "parliament 's"],
 'P101': ['has',
  'is In',
  'is in',
  'is professor of',
  'merged with',
  'of',
  'specializing in',
  'studied'],
 'P102': ['President of',
  'candidate for',
  'founder of',
  'in',
  'is member of',
  'joined',
  'member of',
  'of',
  'raised money for',
  'represented',
  'was member of'],
 'P103': ['appeared in', 'had', 'is', 'is in', 'speak', 'was', 'wrote'],
 'P1037': ['was founded by'],
 'P1038': ['betraying', 'known as'],
 'P1040': ['sequel by', 'starring'],
 'P1049': ['is In'],
 'P105': ['are', 'be', 'became', 'being', 'being of', 'is', 'was'],
 'P1050': ['battling'],
 'P1056': ['distributed', 'is edition of'],
 'P106': ['became',
  'called',
  'debuted episode as',
  'graduating as',
  'has also remained as',
  'has remained as',
  'is',
  'is active as',
  'is considered',
  'is daughter of',
  'is first vintner',
  'is married to',
  'is most active as',
  'is older sister

In [70]:
with open('align.p', 'wb') as fp:
    pickle.dump(kb_to_oie, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [72]:
"'s son is" in kb_to_oie['P40']

True