In [1]:
import pandas as pd
import pickle
from tqdm import tqdm
from SPARQLWrapper import SPARQLWrapper, JSON
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

In [2]:
header = [
    'e1_kb', 'rel_kb', 'rel_id', 'e2_kb', 'e1_oie', 'rel_oie', 'e2_oie',
    'e1_kb_id', 'e2_kb_id', 'e1_oie_id', 'e2_oie_id', 'e1_oie_root', 'e2_oie_root', 'label'
]
train = pd.read_csv('../dataset/train_align_all.tsv', sep='\t', header=None, names=header)
valid = pd.read_csv('../dataset/valid_align_all.tsv', sep='\t', header=None, names=header)
train = train.append(valid, ignore_index=True)
train = train[pd.notnull(train['e1_oie_id']) & pd.notnull(train['e2_oie_id'])]
train = train.reset_index(drop=True)

In [3]:
train

Unnamed: 0,e1_kb,rel_kb,rel_id,e2_kb,e1_oie,rel_oie,e2_oie,e1_kb_id,e2_kb_id,e1_oie_id,e2_oie_id,e1_oie_root,e2_oie_root,label
0,The Card,color,P462,black-and-white,Card,was,released,Q12105566,Q838368,Q12105566,Q7310942,,,1
1,The Card,color,P462,black-and-white,Card,was released in,1952,Q12105566,Q838368,Q12105566,Q5272,,,1
2,The Tingler,color,P462,color,Tingler,was filmed in,black-and-white,Q1557904,Q22006653,Q1557904,Q838368,,,1
3,Flying Disc Man from Mars,color,P462,black-and-white,Man,is,black-and-white,Q5463392,Q838368,Q8441,Q838368,,,1
4,Day the World Ended,color,P462,black-and-white,Ended,is,black-and-white,Q1218963,Q838368,Q19095121,Q838368,,,1
5,Bilwamangal,color,P462,black-and-white,Bilwamangal,is,silent,Q4912051,Q838368,Q4912051,Q7514514,,,1
6,Bilwamangal,color,P462,black-and-white,Bilwamangal,is,black-and-white,Q4912051,Q838368,Q4912051,Q838368,,,1
7,Potoooooooo,color,P462,chestnut,Potoooooooo,bred by,Willoughby Bertie,Q7235103,Q1406070,Q7235103,Q8022210,,,1
8,Potoooooooo,color,P462,chestnut,Potoooooooo,bred in,1773,Q7235103,Q1406070,Q7235103,Q7692,,,1
9,Potoooooooo,color,P462,chestnut,Potoooooooo,bred by,Earl,Q7235103,Q1406070,Q7235103,Q1128240,,,1


In [4]:
oie_triples = train.groupby(['rel_oie']).size()

In [5]:
oie_triples['written by']

1201

In [6]:
align = pd.DataFrame(columns=['rel_oie', 'K', 'rel_kb', 'e1_type', 'e2_type', 'conf', 'tou'])

In [7]:
with open('rel_list.pickle', 'rb') as file:
    rel_list = pickle.load(file)

def get_relation_list(e1, e2):
    try:
        return rel_list[e1 + ' ' + e2]
    except KeyError:
        try:
            sparql.setQuery("""
            SELECT ?relation WHERE {
                wd:%s ?relation wd:%s .
            }
            """ % (e1, e2))
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
            prop_list = [i['relation']['value'].replace('http://www.wikidata.org/prop/direct/', '') for i in results['results']['bindings']]
            rel_list[e1 + ' ' + e2] = prop_list
            return prop_list
        except OSError:
            sparql.setQuery("""
            SELECT ?relation WHERE {
                wd:%s ?relation wd:%s .
            }
            """ % (e1, e2))
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
            prop_list = [i['relation']['value'].replace('http://www.wikidata.org/prop/direct/', '') for i in results['results']['bindings']]
            rel_list[e1 + ' ' + e2] = prop_list
            return prop_list

In [8]:
with open('entity_type.pickle', 'rb') as file:
    entity_type = pickle.load(file)

In [9]:
def get_entity_type(e):
    try:
        if type(entity_type[e]) == bytes:
            return entity_type[e].decode("utf-8")
        elif type(entity_type[e]) == str:
            return entity_type[e]
        else:
            return str(entity_type[e])
    except KeyError:
        ent_type = '-'
        return ent_type

In [10]:
idx = 0
len_train = len(train)

for i in tqdm(range(len_train)):
    rel_oie = train['rel_oie'][i]
    e1 = train['e1_oie_id'][i]
    e2 = train['e2_oie_id'][i]
    
    e1_type = get_entity_type(e1)
    e2_type = get_entity_type(e2)
    kb_rels = get_relation_list(e1, e2)
    
    for rel in kb_rels:
        align.at[idx, 'rel_oie'] = rel_oie
        align.at[idx, 'rel_kb'] = rel
        align.at[idx, 'e1_type'] = e1_type
        align.at[idx, 'e2_type'] = e2_type
        idx += 1

100%|██████████| 297414/297414 [5:22:34<00:00, 15.37it/s]  


In [11]:
if len(rel_list) > 0:
    with open('rel_list.pickle', 'wb') as handle:
        pickle.dump(rel_list, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
if len(entity_type) > 0:
    with open('entity_type.pickle', 'wb') as handle:
        pickle.dump(entity_type, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
align

Unnamed: 0,rel_oie,K,rel_kb,e1_type,e2_type,conf,tou
0,was filmed in,,P462,film,color,,
1,is,,P462,film,color,,
2,is,,P462,film,color,,
3,starring,,P161,film,human,,
4,starring,,P161,film,human,,
5,directed by,,P57,film,human,,
6,directed by,,P162,film,human,,
7,are,,P462,color scheme,safest web colors,,
8,starring,,P161,film,human,,
9,is still seen,,P462,fictional automobile,color,,


In [13]:
oie_triples_align = align.groupby(['rel_oie', 'e1_type', 'e2_type']).size()
oie_triples_align_total = align.groupby(['rel_oie']).size()

In [14]:
for i in tqdm(range(len(align))):
    rel_oie = align['rel_oie'][i]
    e1_type = align['e1_type'][i]
    e2_type = align['e2_type'][i]
    
    # Calculate K
    K = len(oie_triples_align[rel_oie]) / oie_triples[rel_oie]
    if K > 1:
        K = 1
    align['K'][i] = K
        
    # Calculate conf
    conf = oie_triples_align[rel_oie][e1_type][e2_type] / oie_triples_align_total[rel_oie]
    align['conf'][i] = conf
    
    # Calculate tou
    tou = (1 - K) / conf
    align['tou'][i] = conf

100%|██████████| 219086/219086 [34:53<00:00, 104.66it/s]


In [15]:
align

Unnamed: 0,rel_oie,K,rel_kb,e1_type,e2_type,conf,tou
0,was filmed in,0.161905,P462,film,color,0.0204082,0.0204082
1,is,0.0241922,P462,film,color,0.00209581,0.00209581
2,is,0.0241922,P462,film,color,0.00209581,0.00209581
3,starring,0.0051713,P161,film,human,0.957923,0.957923
4,starring,0.0051713,P161,film,human,0.957923,0.957923
5,directed by,0.000747513,P57,film,human,0.86467,0.86467
6,directed by,0.000747513,P162,film,human,0.86467,0.86467
7,are,0.0378657,P462,color scheme,safest web colors,0.0163934,0.0163934
8,starring,0.0051713,P161,film,human,0.957923,0.957923
9,is still seen,0.5,P462,fictional automobile,color,1,1


In [16]:
align.to_csv('align_v6.tsv', sep='\t', index=False)

### Run Linear Regression

In [17]:
import numpy as np
import pandas as pd
import pickle
from sklearn import linear_model
from tqdm import tqdm

In [18]:
align = pd.read_csv('align_v6.tsv', sep='\t')

In [19]:
align

Unnamed: 0,rel_oie,K,rel_kb,e1_type,e2_type,conf,tou
0,was filmed in,0.161905,P462,film,color,0.020408,0.020408
1,is,0.024192,P462,film,color,0.002096,0.002096
2,is,0.024192,P462,film,color,0.002096,0.002096
3,starring,0.005171,P161,film,human,0.957923,0.957923
4,starring,0.005171,P161,film,human,0.957923,0.957923
5,directed by,0.000748,P57,film,human,0.864670,0.864670
6,directed by,0.000748,P162,film,human,0.864670,0.864670
7,are,0.037866,P462,color scheme,safest web colors,0.016393,0.016393
8,starring,0.005171,P161,film,human,0.957923,0.957923
9,is still seen,0.500000,P462,fictional automobile,color,1.000000,1.000000


In [20]:
tou_list = np.array(align['tou']).reshape(-1, 1)
K_list = np.array(align['K']).reshape(-1, 1)

In [21]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(K_list, tou_list)

# Make predictions using the testing set
tou_pred = regr.predict(K_list)

In [22]:
tou_h = []
for i in tqdm(range(len(tou_pred))):
    tou_h.append(tou_pred[i][0])

100%|██████████| 219086/219086 [00:00<00:00, 1039991.13it/s]


In [23]:
align['tou_h'] = tou_h

In [24]:
align

Unnamed: 0,rel_oie,K,rel_kb,e1_type,e2_type,conf,tou,tou_h
0,was filmed in,0.161905,P462,film,color,0.020408,0.020408,0.444117
1,is,0.024192,P462,film,color,0.002096,0.002096,0.355810
2,is,0.024192,P462,film,color,0.002096,0.002096,0.355810
3,starring,0.005171,P161,film,human,0.957923,0.957923,0.343613
4,starring,0.005171,P161,film,human,0.957923,0.957923,0.343613
5,directed by,0.000748,P57,film,human,0.864670,0.864670,0.340777
6,directed by,0.000748,P162,film,human,0.864670,0.864670,0.340777
7,are,0.037866,P462,color scheme,safest web colors,0.016393,0.016393,0.364578
8,starring,0.005171,P161,film,human,0.957923,0.957923,0.343613
9,is still seen,0.500000,P462,fictional automobile,color,1.000000,1.000000,0.660916


In [25]:
is_aligned = []
for i in tqdm(range(len(tou_h))):
    thres = align['tou_h'][i]
    val = align['tou'][i]
    if val < thres:
        is_aligned.append(1)
    else:
        is_aligned.append(0)

100%|██████████| 219086/219086 [00:04<00:00, 51228.57it/s]


In [26]:
align['is_aligned'] = is_aligned

In [27]:
align_pos = align[align.is_aligned == 1]

In [28]:
align_pos_grouped = align_pos.groupby(['rel_kb', 'rel_oie']).size()

In [29]:
align_pos_df = align_pos_grouped.to_frame().reset_index()
align_pos_df = align_pos_df.drop(0, axis=1)

In [30]:
align_pos_df

Unnamed: 0,rel_kb,rel_oie
0,P1001,Council of
1,P1001,branch of
2,P1001,is In
3,P1001,is in
4,P101,graduated in
5,P101,has
6,P101,is In
7,P101,is in
8,P101,is professor of
9,P101,merged with


In [31]:
kb_to_oie = {}
for i in range(len(align_pos_df)):
    key = align_pos_df['rel_kb'][i]
    val = align_pos_df['rel_oie'][i]
    if key not in kb_to_oie:
        kb_to_oie[key] = [val]
    else:
        kb_to_oie[key] = kb_to_oie[key] + [val]

In [32]:
kb_to_oie

{'P1001': ['Council of', 'branch of', 'is In', 'is in'],
 'P101': ['graduated in',
  'has',
  'is In',
  'is in',
  'is professor of',
  'merged with',
  'of',
  'studied',
  'used'],
 'P102': ['President of',
  'founder of',
  'in',
  'is member of',
  'joined',
  'member of',
  'of',
  'represented',
  'was member of'],
 'P103': ['appeared in', 'had', 'is', 'is in', 'speak', 'was', 'wrote'],
 'P1037': ['was founded by'],
 'P1038': ['known as'],
 'P1040': ['starring'],
 'P1049': ['is In'],
 'P105': ['are',
  'be',
  'became',
  'being',
  'is',
  'is listed as',
  'was',
  'was described as',
  'was first described as'],
 'P1056': ['distributed', 'is edition of'],
 'P106': ['became',
  'called',
  'is',
  'is considered',
  'is daughter of',
  'is married to',
  'is older sister of',
  'is sister of',
  'lead',
  'was',
  'was appointed',
  'was born to',
  'was married to',
  'was son of',
  'worked as'],
 'P1066': ['joined'],
 'P108': ['Emeritus at',
  'attended',
  'became at',
  '

In [33]:
with open('align_v6.p', 'wb') as fp:
    pickle.dump(kb_to_oie, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [34]:
"'s son is" in kb_to_oie['P40']

True