In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

In [2]:
def get_head_triplets(entity_id):
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    query = f"""
    SELECT ?subjectLabel ?propertyLabel ?objectLabel ?object   WHERE {{

      SERVICE wikibase:label {{ 
        bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
      }}
      VALUES (?subject) {{(wd:{entity_id})}}  
      ?subject ?predicate ?object .
      ?property wikibase:directClaim ?predicate.

      FILTER(STRSTARTS(STR(?predicate), "http://www.wikidata.org/prop/direct/")) .
      FILTER(STRSTARTS(STR(?object), "http://www.wikidata.org/entity/")) .

    }}
    """
    
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    output_triplets = []

    for result in results["results"]["bindings"]:
        obj_id = result['object']['value'].split("/")[-1]
        subject = result["subjectLabel"]["value"]
        predicate = result["propertyLabel"]["value"]
        object_ = result["objectLabel"]["value"]
    
        output_triplets.append({"subject": subject, "predicate": predicate, "object": object_, "subj_id": entity_id,"obj_id": obj_id})
    
    return output_triplets

get_head_triplets("Q19837")

[{'subject': 'Steve Jobs',
  'predicate': 'place of birth',
  'object': 'San Francisco',
  'subj_id': 'Q19837',
  'obj_id': 'Q62'},
 {'subject': 'Steve Jobs',
  'predicate': 'place of death',
  'object': 'Palo Alto',
  'subj_id': 'Q19837',
  'obj_id': 'Q47265'},
 {'subject': 'Steve Jobs',
  'predicate': 'sex or gender',
  'object': 'male',
  'subj_id': 'Q19837',
  'obj_id': 'Q6581097'},
 {'subject': 'Steve Jobs',
  'predicate': 'father',
  'object': 'Abdulfattah Jandali',
  'subj_id': 'Q19837',
  'obj_id': 'Q12605967'},
 {'subject': 'Steve Jobs',
  'predicate': 'mother',
  'object': 'Joanne Carole Schieble Simpson',
  'subj_id': 'Q19837',
  'obj_id': 'Q28941744'},
 {'subject': 'Steve Jobs',
  'predicate': 'spouse',
  'object': 'Laurene Powell Jobs',
  'subj_id': 'Q19837',
  'obj_id': 'Q3133593'},
 {'subject': 'Steve Jobs',
  'predicate': 'country of citizenship',
  'object': 'United States of America',
  'subj_id': 'Q19837',
  'obj_id': 'Q30'},
 {'subject': 'Steve Jobs',
  'predicate':

In [3]:
def get_tail_triplets(entity_id):
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    query = f"""
    SELECT ?subjectLabel ?propertyLabel ?objectLabel ?subject WHERE {{

      SERVICE wikibase:label {{ 
        bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
      }}

      VALUES (?object) {{(wd:{entity_id})}}  
      ?subject ?predicate ?object .
      ?property wikibase:directClaim ?predicate.

      FILTER(STRSTARTS(STR(?predicate), "http://www.wikidata.org/prop/direct/")) .
      FILTER(STRSTARTS(STR(?object), "http://www.wikidata.org/entity/")) .

    }}
    """
    
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    
    output_triplets = []
    for result in results["results"]["bindings"]:
        subject = result["subjectLabel"]["value"]
        predicate = result["propertyLabel"]["value"]
        object_ = result["objectLabel"]["value"]
        subj_id = result['subject']['value'].split("/")[-1]

        output_triplets.append({"subject": subject, "predicate": predicate, "object": object_, "subj_id": subj_id, "obj_id": entity_id})
    
    return output_triplets

get_tail_triplets("Q19837")

[{'subject': 'Lisa Brennan-Jobs',
  'predicate': 'father',
  'object': 'Steve Jobs',
  'subj_id': 'Q2983236',
  'obj_id': 'Q19837'},
 {'subject': 'Reed Paul',
  'predicate': 'father',
  'object': 'Steve Jobs',
  'subj_id': 'Q20895861',
  'obj_id': 'Q19837'},
 {'subject': 'Erin Sienna',
  'predicate': 'father',
  'object': 'Steve Jobs',
  'subj_id': 'Q20895866',
  'obj_id': 'Q19837'},
 {'subject': 'Eve Jobs',
  'predicate': 'father',
  'object': 'Steve Jobs',
  'subj_id': 'Q20895867',
  'obj_id': 'Q19837'},
 {'subject': 'Laurene Powell Jobs',
  'predicate': 'spouse',
  'object': 'Steve Jobs',
  'subj_id': 'Q3133593',
  'obj_id': 'Q19837'},
 {'subject': 'Abdulfattah Jandali',
  'predicate': 'child',
  'object': 'Steve Jobs',
  'subj_id': 'Q12605967',
  'obj_id': 'Q19837'},
 {'subject': 'Paul Jobs',
  'predicate': 'child',
  'object': 'Steve Jobs',
  'subj_id': 'Q28941300',
  'obj_id': 'Q19837'},
 {'subject': 'Clara Hagopian Jobs',
  'predicate': 'child',
  'object': 'Steve Jobs',
  'subj

In [4]:
jobs_triplets = get_head_triplets("Q19837") + get_tail_triplets("Q19837")
jobs_df = pd.DataFrame(jobs_triplets).drop_duplicates()
jobs_df

Unnamed: 0,subject,predicate,object,subj_id,obj_id
0,Steve Jobs,place of birth,San Francisco,Q19837,Q62
1,Steve Jobs,place of death,Palo Alto,Q19837,Q47265
2,Steve Jobs,sex or gender,male,Q19837,Q6581097
3,Steve Jobs,father,Abdulfattah Jandali,Q19837,Q12605967
4,Steve Jobs,mother,Joanne Carole Schieble Simpson,Q19837,Q28941744
...,...,...,...,...,...
146,Justia Patents inventor ID,Wikidata property example,Steve Jobs,P3874,Q19837
147,Steve Jobs,different from,Steve Jobs,Q18754959,Q19837
148,Steve Jobs Archive,interested in,Steve Jobs,Q124618897,Q19837
149,FaceTime,significant person,Steve Jobs,Q648357,Q19837


In [5]:
# apple_triplets = get_head_triplets("Q312") + get_tail_triplets("Q312")
apple_triplets = get_head_triplets("Q312")
apple_df = pd.DataFrame(apple_triplets).drop_duplicates()
apple_df

Unnamed: 0,subject,predicate,object,subj_id,obj_id
0,Apple,location,United States of America,Q312,Q30
1,Apple,has subsidiary,Apple Store,Q312,Q421253
2,Apple,has subsidiary,Claris,Q312,Q1095605
3,Apple,has subsidiary,Beats Electronics,Q312,Q1961036
4,Apple,has subsidiary,"FileMaker, Inc.",Q312,Q1982831
...,...,...,...,...,...
100,Apple,has works in the collection,Design Museum Gent,Q312,Q1809071
101,Apple,has works in the collection,Museum of Industry,Q312,Q2245203
102,Apple,copyright status as a creator,works protected by copyrights,Q312,Q73555012
103,Apple,external auditor,Ernst & Young LLP,Q312,Q98525448


In [6]:
woznyak_triplets = get_head_triplets("Q483382") + get_tail_triplets("Q483382")
woznyak_df = pd.DataFrame(woznyak_triplets).drop_duplicates()
woznyak_df

Unnamed: 0,subject,predicate,object,subj_id,obj_id
0,Steve Wozniak,place of birth,San Jose,Q483382,Q16553
1,Steve Wozniak,sex or gender,male,Q483382,Q6581097
2,Steve Wozniak,spouse,Janet Hill,Q483382,Q22442739
3,Steve Wozniak,country of citizenship,United States of America,Q483382,Q30
4,Steve Wozniak,country of citizenship,Serbia,Q483382,Q403
...,...,...,...,...,...
84,iWoz,main subject,Steve Wozniak,Q5975708,Q483382
85,HP-65,used by,Steve Wozniak,Q220638,Q483382
86,Pause Giant AI Experiments: An Open Letter,signatory,Steve Wozniak,Q117349926,Q483382
87,The Art of Intrusion,author of foreword,Steve Wozniak,Q3818760,Q483382


In [7]:
pixar_triplets = get_head_triplets("Q127552") + get_tail_triplets("Q127552")
pixar_df = pd.DataFrame(pixar_triplets).drop_duplicates()
pixar_df

Unnamed: 0,subject,predicate,object,subj_id,obj_id
0,Pixar,country,United States of America,Q127552,Q30
1,Pixar,instance of,animation studio,Q127552,Q1107679
2,Pixar,instance of,film production company,Q127552,Q1762059
3,Pixar,instance of,business,Q127552,Q4830453
4,Pixar,founded by,Edwin Catmull,Q127552,Q93161
...,...,...,...,...,...
228,Portal:Pixar,Wikimedia portal's main topic,Pixar,Q15613750,Q127552
229,Template:Pixar,template has topic,Pixar,Q4200363,Q127552
230,Q23662070,used by,Pixar,Q23662070,Q127552
231,Bao,copyright holder,Pixar,Q55070849,Q127552


In [8]:
next_triplets = get_head_triplets("Q308993") + get_tail_triplets("Q308993")
next_df = pd.DataFrame(next_triplets).drop_duplicates()
next_df

Unnamed: 0,subject,predicate,object,subj_id,obj_id
0,NeXT,country,United States of America,Q308993,Q30
1,NeXT,instance of,business,Q308993,Q4830453
2,NeXT,instance of,enterprise,Q308993,Q6881511
3,NeXT,founded by,Steve Jobs,Q308993,Q19837
4,NeXT,owned by,Steve Jobs,Q308993,Q19837
5,NeXT,headquarters location,Redwood City,Q308993,Q505549
6,NeXT,chief executive officer,Steve Jobs,Q308993,Q19837
7,NeXT,industry,computer hardware,Q308993,Q3966
8,NeXT,industry,software industry,Q308993,Q880371
9,NeXT,location of formation,Palo Alto,Q308993,Q47265


In [9]:
all_df = pd.concat([jobs_df, apple_df, woznyak_df, pixar_df, next_df])
all_df

Unnamed: 0,subject,predicate,object,subj_id,obj_id
0,Steve Jobs,place of birth,San Francisco,Q19837,Q62
1,Steve Jobs,place of death,Palo Alto,Q19837,Q47265
2,Steve Jobs,sex or gender,male,Q19837,Q6581097
3,Steve Jobs,father,Abdulfattah Jandali,Q19837,Q12605967
4,Steve Jobs,mother,Joanne Carole Schieble Simpson,Q19837,Q28941744
...,...,...,...,...,...
43,NeXT logo,represents,NeXT,Q99519556,Q308993
44,DB13W3,used by,NeXT,Q595385,Q308993
45,Motorola 56001,used by,NeXT,Q115781990,Q308993
46,Steve Jobs,owner of,NeXT,Q19837,Q308993


In [10]:
all_df = all_df.drop_duplicates()
all_df

Unnamed: 0,subject,predicate,object,subj_id,obj_id
0,Steve Jobs,place of birth,San Francisco,Q19837,Q62
1,Steve Jobs,place of death,Palo Alto,Q19837,Q47265
2,Steve Jobs,sex or gender,male,Q19837,Q6581097
3,Steve Jobs,father,Abdulfattah Jandali,Q19837,Q12605967
4,Steve Jobs,mother,Joanne Carole Schieble Simpson,Q19837,Q28941744
...,...,...,...,...,...
42,macOS,influenced by,NeXT,Q14116,Q308993
43,NeXT logo,represents,NeXT,Q99519556,Q308993
44,DB13W3,used by,NeXT,Q595385,Q308993
45,Motorola 56001,used by,NeXT,Q115781990,Q308993


In [11]:
import requests

def wikidata_id2wikipedia_name(ids):
    
    num_batches = len(ids) // 50 + int(len(ids) % 50 != 0)
    names = {}

    for batch in range(num_batches):
        id_batch = ids[batch*50:batch*50+50]
        id_batch = "|".join(id_batch)
        res = requests.get("https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=sitelinks&ids={}&sitefilter=enwiki".format(id_batch)).json()
        for entity in res['entities']:
            if "sitelinks" in res['entities'][entity] and "enwiki" in res['entities'][entity]["sitelinks"]:
                names[entity] = res['entities'][entity]["sitelinks"]["enwiki"]["title"]
    return names

def get_alternative_labels(entity_id):
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    query = f"""
    SELECT ?item ?itemAltLabel WHERE {{

    VALUES (?subject) {{(wd:{entity_id})}}  
    ?subject skos:altLabel ?itemAltLabel . FILTER (lang(?itemAltLabel) = "en")
 

    }}
    """
    
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    results = results["results"]["bindings"]

    output_labels = []
    for res in results:
        output_labels.append(res['itemAltLabel']['value'])
    
    return output_labels
get_alternative_labels("Q483382")

['Mr. Steve Wozniak',
 'Stephan Gary Wozniak',
 'Stephen Gary Wozniak',
 'Stephen Wozniak',
 'The Woz',
 'Woz']

In [12]:
ids = list(set(list(all_df['subj_id']) + list(all_df['obj_id'])))
wikidata_id2wiki_mapping = wikidata_id2wikipedia_name(ids)
wikidata_id2wiki_mapping

{'Q43845': 'Businessperson',
 'Q5700571': 'Heinz Awards',
 'Q14094': 'IPod Touch',
 'Q1150306': 'Disney Legends',
 'Q2252704': 'Partysaurus Rex',
 'Q2730061': 'Bertrand Serlet',
 'Q21974358': 'Zaltair',
 'Q18002980': 'Lava (2014 film)',
 'Q3818760': 'The Art of Intrusion',
 'Q61740829': 'Kitbull',
 'Q1426558': 'NeXTstation',
 'Q30587923': 'Designed by Apple in California',
 'Q269214': 'John Lasseter',
 'Q47265': 'Palo Alto, California',
 'Q6983889': 'NeXTcube Turbo',
 'Q785706': 'One Man Band (film)',
 'Q168756': 'University of California, Berkeley',
 'Q8257564': 'Category:Apple Inc. people',
 'Q6173448': 'Wikipedia:Vital articles/Level/4',
 'Q80978': 'Hewlett-Packard',
 'Q19673': 'Al Gore',
 'Q7139120': 'Category:Pixar',
 'Q17386294': 'Angus MacLane',
 'Q18810921': 'Chrisann Brennan',
 'Q189471': 'Cupertino, California',
 'Q1193981': 'OpenStep',
 'Q388105': 'Presto (animation software)',
 'Q48971': 'Susan Kare',
 'Q3966': 'Computer hardware',
 'Q42417740': 'Woz U',
 'Q6581097': 'Male 

In [14]:
import time

wikidata_id2alternative_name = {}
for id_ in ids:
    wikidata_id2alternative_name[id_] = get_alternative_labels(id_)
    time.sleep(0.3)

In [15]:
len(wikidata_id2alternative_name)

489

In [16]:
wikidata_id2alternative_name

{'Q67311526': ['obalkyknih.cz', 'Obálky knih CZ'],
 'Q43845': ['dealer',
  'businessman',
  'business person',
  'businesswoman',
  'business man',
  'business woman',
  'business people',
  'businessmen',
  'businesspeople'],
 'Q5700571': ['Heinz Awards'],
 'Q114304773': [],
 'Q14094': ['Apple iPod Touch'],
 'Q1150306': [],
 'Q2252704': [],
 'Q116790970': ['WGS 2015'],
 'Q2730061': [],
 'Q21974358': ['Zaltair 8800'],
 'Q18002980': [],
 'Q3818760': [],
 'Q61740829': [],
 'Q52987978': [],
 'Q1426558': [],
 'Q27910148': [],
 'Q30587923': [],
 'Q54809843': ['.usdz',
  'Universal Scene Description Zipped AR format (USDC)',
  'usdz'],
 'Q269214': ['John A. Lasseter', 'John Alan Lasseter'],
 'Q47265': ['Palo Alto (duplicate)', 'Palo Alto, CA', 'Palo Alto, California'],
 'Q6983889': [],
 'Q785706': [],
 'Q168756': ['U.C. Berkeley',
  'University of California–Berkeley',
  'UCB',
  'Berkeley',
  'Cal',
  'Berkeley University',
  'berkeley.edu',
  'Cal-Berkeley',
  'UC Berkeley',
  'Univ. of Ca

In [24]:
all_df['wiki_subj'] = all_df['subj_id'].apply(lambda x: wikidata_id2wiki_mapping[x] if x in wikidata_id2wiki_mapping else None)
all_df['wiki_obj'] = all_df['obj_id'].apply(lambda x: wikidata_id2wiki_mapping[x] if x in wikidata_id2wiki_mapping else None)

all_df

Unnamed: 0,subject,predicate,object,subj_id,obj_id,wiki_subj,wiki_obj
0,Steve Jobs,place of birth,San Francisco,Q19837,Q62,Steve Jobs,San Francisco
1,Steve Jobs,place of death,Palo Alto,Q19837,Q47265,Steve Jobs,"Palo Alto, California"
2,Steve Jobs,sex or gender,male,Q19837,Q6581097,Steve Jobs,Male gender
3,Steve Jobs,father,Abdulfattah Jandali,Q19837,Q12605967,Steve Jobs,
4,Steve Jobs,mother,Joanne Carole Schieble Simpson,Q19837,Q28941744,Steve Jobs,
...,...,...,...,...,...,...,...
42,macOS,influenced by,NeXT,Q14116,Q308993,MacOS,NeXT
43,NeXT logo,represents,NeXT,Q99519556,Q308993,,NeXT
44,DB13W3,used by,NeXT,Q595385,Q308993,DB13W3,NeXT
45,Motorola 56001,used by,NeXT,Q115781990,Q308993,,NeXT


In [25]:
all_df = all_df.dropna()
all_df

Unnamed: 0,subject,predicate,object,subj_id,obj_id,wiki_subj,wiki_obj
0,Steve Jobs,place of birth,San Francisco,Q19837,Q62,Steve Jobs,San Francisco
1,Steve Jobs,place of death,Palo Alto,Q19837,Q47265,Steve Jobs,"Palo Alto, California"
2,Steve Jobs,sex or gender,male,Q19837,Q6581097,Steve Jobs,Male gender
5,Steve Jobs,spouse,Laurene Powell Jobs,Q19837,Q3133593,Steve Jobs,Laurene Powell Jobs
6,Steve Jobs,country of citizenship,United States of America,Q19837,Q30,Steve Jobs,United States
...,...,...,...,...,...,...,...
39,NeXTcube Turbo,developer,NeXT,Q6983889,Q308993,NeXTcube Turbo,NeXT
40,Category:NeXT,category's main topic,NeXT,Q7214353,Q308993,Category:NeXT,NeXT
41,Doom II,platform,NeXT,Q755186,Q308993,Doom II,NeXT
42,macOS,influenced by,NeXT,Q14116,Q308993,MacOS,NeXT


In [26]:
all_df.to_csv("wikidata_df.csv")

In [27]:
df = pd.read_csv('full_triplets.csv', index_col=0)
df = df.reset_index(drop=True)
# df = df.drop_duplicates()
df

Unnamed: 0,subject,relation,object
0,Steven Paul Jobs,date of birth,"February 24, 1955"
1,Steven Paul Jobs,date of death,"October 5, 2011"
2,Steven Paul Jobs,country of citizenship,United States of America
3,Steven Paul Jobs,occupation,Businessman
4,Steven Paul Jobs,occupation,Inventor
...,...,...,...
286,several other businesses and philanthropic ven...,specialized in,Microprocessor
287,several other businesses and philanthropic ven...,specialized in,technology and pop culture conventions
288,several other businesses and philanthropic ven...,specialized in,technology in K–12 schools
289,several other businesses and philanthropic ven...,specialized in,environmental practices


In [38]:
wikipedia_subjects = list(df.subject.unique())
wikipedia_objects = list(df.object.unique())
wikipedia_entities = list(set(wikipedia_subjects + wikipedia_objects))

wikipedia_relations = list(df.relation.unique())
len(wikipedia_entities), len(wikipedia_relations)

(195, 54)

In [39]:
aligned_triplets = []
for _, row in all_df.iterrows():

    triplet_alternative = {"subject": row["subject"], "relation": row["predicate"], "object": row["object"]}
    
    subject_alternatives = wikidata_id2alternative_name[row['subj_id']] + [row['wiki_subj'], row['subject']]
    
    for name in subject_alternatives:
        if name in wikipedia_entities:
            triplet_alternative['subject'] = name
            break

    object_alternatives = wikidata_id2alternative_name[row['obj_id']] + [row['wiki_obj'], row['object']]

    for name in object_alternatives:
        if name in wikipedia_entities:
            triplet_alternative['object'] = name
            break
    aligned_triplets.append(triplet_alternative)

aligned_df = pd.DataFrame(aligned_triplets)
aligned_df

Unnamed: 0,subject,relation,object
0,Steven Paul Jobs,place of birth,San Francisco
1,Steven Paul Jobs,place of death,Palo Alto
2,Steven Paul Jobs,sex or gender,male
3,Steven Paul Jobs,spouse,Laurene Powell Jobs
4,Steven Paul Jobs,country of citizenship,United States of America
...,...,...,...
470,NeXTcube Turbo,developer,NeXT
471,Category:NeXT,category's main topic,NeXT
472,Doom II,platform,NeXT
473,Mac OS X,influenced by,NeXT


In [58]:
wikidata_subjects = list(aligned_df.subject.unique())
wikidata_objects = list(aligned_df.object.unique())
wikidata_entities = list(set(wikidata_subjects + wikidata_objects))

wikidata_relations = list(aligned_df.relation.unique())
len(wikidata_entities), len(wikidata_relations)

(357, 92)

## Comparing with the composed KG

In [59]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

In [60]:
device = "cuda:5"
tokenizer = AutoTokenizer.from_pretrained('facebook/contriever')
model = AutoModel.from_pretrained('facebook/contriever').to(device)

In [61]:
def mean_pooling(token_embeddings, mask):
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
    return sentence_embeddings


def embed_batch(names):
    inputs = tokenizer(names, padding=True, truncation=True, return_tensors='pt')
    outputs = model(**inputs.to(device))
    embeddings = mean_pooling(outputs[0], inputs['attention_mask'])
    return np.array(embeddings.detach().cpu())

In [62]:
embed_batch(["Steve Jobs", "Stephen Woznyak"]).shape

(2, 768)

In [63]:
all_df.to_csv("wikidata_df.csv")

In [64]:
len(set(wikidata_entities) & set(wikipedia_entities))

62

In [65]:
wikipedia_entities_embedded = embed_batch(wikipedia_entities)
wikipedia_entities_embedded.shape

(195, 768)

In [66]:
wikipedia_relations_embedded = embed_batch(wikipedia_relations)
wikipedia_relations_embedded.shape

(54, 768)

In [67]:
wikidata_entities_embedded = embed_batch(wikidata_entities)
wikidata_entities_embedded.shape

(357, 768)

In [68]:
wikidata_relations_embedded = embed_batch(wikidata_relations)
wikidata_relations_embedded.shape

(92, 768)

In [69]:
from sklearn import metrics

relation_similarity_matrix = metrics.pairwise.cosine_similarity(wikidata_relations_embedded, wikipedia_relations_embedded)
relation_similarity_matrix.shape

(92, 54)

In [70]:
entity_similarity_matrix = metrics.pairwise.cosine_similarity(wikidata_entities_embedded, wikipedia_entities_embedded)
entity_similarity_matrix.shape

(357, 195)

In [71]:
best_entity_pairs = np.argmax(entity_similarity_matrix, axis=0)
len(best_entity_pairs)

195

In [147]:
pedia2data_entity = {}
best_entity_pairs = np.argmax(entity_similarity_matrix, axis=0)

for i, _ in enumerate(wikipedia_entities):
    if entity_similarity_matrix[best_entity_pairs[i]][i] > 0.5:
        pedia2data_entity[wikipedia_entities[i]] = wikidata_entities[best_entity_pairs[i]]
pedia2data_entity

{'iPad': 'iPad',
 '1.5 million shares': 'Nasdaq',
 'Steven Paul Jobs': 'Steven Paul Jobs',
 'Technology company': 'Technology company',
 'Computer graphics division of Lucasfilm': 'Lucasfilm',
 'Toy Story 4': 'Toy Story 4',
 'Apple Watch': 'Apple Watch',
 'NeXTcube': 'NeXTcube',
 'Computers for higher-education and business markets': 'computing',
 "NeXTSTEP operating system's application layer": 'NeXTSTEP',
 'Electronic kit form': 'Music Kit',
 'Finding Dory': 'Finding Dory',
 'iPhone': 'iPhone',
 '50 highest-grossing films of all time': 'film industry',
 'NeXT': 'NeXT',
 'iCloud': 'iPhone',
 'NeXTstation': 'NeXTstation',
 'WALL-E': 'WALL-E',
 'John Lasseter': 'John Lasseter',
 'revenue': 'chief executive officer',
 'iPod': 'iPod',
 'Disney Entertainment': 'Disney',
 'Apple I personal computer': 'Personal computer',
 'Mac OS X': 'Mac OS X',
 'market capitalization': 'Dow Jones Industrial Average',
 'Graphical user interface': 'Mac OS X',
 '$3.2 trillion': 'S&P 500',
 'advertising campa

In [148]:
len(pedia2data_entity)

145

In [149]:
pedia2data_relation = {}
best_relation_pairs = np.argmax(relation_similarity_matrix, axis=0)

for i, _ in enumerate(wikipedia_relations):
    if relation_similarity_matrix[best_relation_pairs[i]][i] > 0.5:
        pedia2data_relation[wikipedia_relations[i]] = wikidata_relations[best_relation_pairs[i]]
pedia2data_relation

{'date of birth': 'place of birth',
 'date of death': 'place of birth',
 'country of citizenship': 'country of citizenship',
 'occupation': 'occupation',
 'notable work': 'notable work',
 'founder of': 'founded by',
 'business partner': 'board member',
 'place of birth': 'place of birth',
 'educated at': 'educated at',
 'instance of': 'instance of',
 'developer': 'developer',
 'influenced': 'influenced by',
 'has part': 'part of',
 'specialized in': 'founded by',
 'funded': 'commissioned by',
 'acquired': 'owned by',
 'died of': 'cause of death',
 'award received': 'award received',
 'headquarters location': 'headquarters location',
 'inception': 'founded by',
 'revenue': 'chief executive officer',
 'returned to': 'educated at',
 'succeeded': 'educated at',
 'largest manufacturing company by': 'manufacturer',
 'largest vendor of': 'manufacturer',
 'market capitalization': 'stock exchange',
 'has': 'industry',
 'developed': 'developer',
 'hosted on': 'distributed by',
 'contrast with': 

In [150]:
len(pedia2data_relation)

37

In [151]:
triplet_pairs = []

for i, row in df.iterrows():
    data_subj = pedia2data_entity[row['subject']] if row['subject'] in pedia2data_entity else None 
    data_obj = pedia2data_entity[row['object']] if row['object'] in pedia2data_entity else None
    data_rel = pedia2data_relation[row['relation']] if row['relation'] in pedia2data_relation else None

    if data_subj and data_obj and data_rel:
        triplet = aligned_df[(aligned_df['subject'] == data_subj) & (aligned_df['object'] == data_obj) & (aligned_df['relation'] == data_rel)]
        if len(triplet) > 0:
            triplet_pairs.append(((row['subject'], row['relation'], row['object']), (triplet.iloc[0,0], triplet.iloc[0,1], triplet.iloc[0,2])))

In [152]:
len(triplet_pairs), triplet_pairs

(19,
 [(('Steven Paul Jobs', 'country of citizenship', 'United States of America'),
   ('Steven Paul Jobs', 'country of citizenship', 'United States of America')),
  (('Steven Paul Jobs', 'occupation', 'Businessman'),
   ('Steven Paul Jobs', 'occupation', 'film producer')),
  (('Steven Paul Jobs', 'occupation', 'Inventor'),
   ('Steven Paul Jobs', 'occupation', 'Inventor')),
  (('Apple Inc.', 'founder of', 'Steven Paul Jobs'),
   ('Apple Inc.', 'founded by', 'Steven Paul Jobs')),
  (('Steven Paul Jobs', 'place of birth', 'San Francisco'),
   ('Steven Paul Jobs', 'place of birth', 'San Francisco')),
  (('Steven Paul Jobs', 'educated at', 'Reed College'),
   ('Steven Paul Jobs', 'educated at', 'Reed College')),
  (('Pixar', 'instance of', 'Animation studio'),
   ('Pixar', 'instance of', 'Animation studio')),
  (('Steven Paul Jobs', 'notable work', 'iMac'),
   ('Steven Paul Jobs', 'notable work', 'iMac')),
  (('Steven Paul Jobs', 'notable work', 'iTunes'),
   ('Steven Paul Jobs', 'notable

In [153]:
wikidata_triplets = list(aligned_df['subject'] +  " " + aligned_df['relation'] + " " + aligned_df['object'])
wikipedia_triplets =  list(df['subject'] +  " " + df['relation'] + " " + df['object'])

In [154]:
wikidata_triplets[:10]

['Steven Paul Jobs place of birth San Francisco',
 'Steven Paul Jobs place of death Palo Alto',
 'Steven Paul Jobs sex or gender male',
 'Steven Paul Jobs spouse Laurene Powell Jobs',
 'Steven Paul Jobs country of citizenship United States of America',
 'Steven Paul Jobs instance of human',
 'Steven Paul Jobs position held chief executive officer',
 'Steven Paul Jobs child Lisa Brennan-Jobs',
 'Steven Paul Jobs child Eve Jobs',
 'Steven Paul Jobs educated at De Anza College']

In [155]:
wikipedia_triplets[:10]

['Steven Paul Jobs date of birth February 24, 1955',
 'Steven Paul Jobs date of death October 5, 2011',
 'Steven Paul Jobs country of citizenship United States of America',
 'Steven Paul Jobs occupation Businessman',
 'Steven Paul Jobs occupation Inventor',
 'Steven Paul Jobs notable work Apple Inc.',
 'Steven Paul Jobs founder of NeXT',
 'Steven Paul Jobs founder of Pixar',
 'Steven Paul Jobs notable work Personal computer revolution',
 'Steven Paul Jobs business partner Steve Wozniak']

In [157]:
set(wikipedia_triplets) & set(wikidata_triplets), len(set(wikipedia_triplets) & set(wikidata_triplets))

({'Apple Inc. headquarters location Cupertino',
  'NeXT headquarters location Redwood City',
  'Pixar headquarters location Emeryville, California',
  'Pixar instance of Animation studio',
  'Steven Paul Jobs country of citizenship United States of America',
  'Steven Paul Jobs educated at Reed College',
  'Steven Paul Jobs notable work iMac',
  'Steven Paul Jobs notable work iTunes',
  'Steven Paul Jobs occupation Inventor',
  'Steven Paul Jobs place of birth San Francisco'},
 10)

In [158]:
wikidata_triplets_embedded = embed_batch(wikidata_triplets)
wikidata_triplets_embedded.shape

(475, 768)

In [159]:
wikipedia_triplets_embedded = embed_batch(wikipedia_triplets)
wikipedia_triplets_embedded.shape

(291, 768)

In [160]:
triplet_similarity_matrix = metrics.pairwise.cosine_similarity(wikidata_triplets_embedded, wikipedia_triplets_embedded)
triplet_similarity_matrix.shape

(475, 291)

In [161]:
best_triplet_pairs = np.argmax(triplet_similarity_matrix, axis=0)
best_triplet_pairs.shape

(291,)

In [174]:
triplet_pairs = []

for i, _ in enumerate(wikipedia_triplets):
    if triplet_similarity_matrix[best_triplet_pairs[i]][i] > 0.5:
        triplet_pairs.append((wikipedia_triplets[i], wikidata_triplets[best_triplet_pairs[i]]))

In [175]:
len(triplet_pairs)

260

In [176]:
aligned_df

Unnamed: 0,subject,relation,object
0,Steven Paul Jobs,place of birth,San Francisco
1,Steven Paul Jobs,place of death,Palo Alto
2,Steven Paul Jobs,sex or gender,male
3,Steven Paul Jobs,spouse,Laurene Powell Jobs
4,Steven Paul Jobs,country of citizenship,United States of America
...,...,...,...
470,NeXTcube Turbo,developer,NeXT
471,Category:NeXT,category's main topic,NeXT
472,Doom II,platform,NeXT
473,Mac OS X,influenced by,NeXT


In [179]:
aligned_df

Unnamed: 0,subject,relation,object
0,Steven Paul Jobs,place of birth,San Francisco
1,Steven Paul Jobs,place of death,Palo Alto
2,Steven Paul Jobs,sex or gender,male
3,Steven Paul Jobs,spouse,Laurene Powell Jobs
4,Steven Paul Jobs,country of citizenship,United States of America
...,...,...,...
470,NeXTcube Turbo,developer,NeXT
471,Category:NeXT,category's main topic,NeXT
472,Doom II,platform,NeXT
473,Mac OS X,influenced by,NeXT


## Comparing with linked names from wikipedia

In [180]:
pairs = []
pair_count = 0
edge_present = 0

counted_pairs = set()

wikipedia_common_triplets = []
wikidata_common_triplets = []


for _, row in aligned_df.iterrows():

    subj_name = row['subject']
    obj_name = row['object']


    if subj_name in wikipedia_entities and obj_name in wikipedia_entities:
        wikidata_common_triplets.append((row['subject'], row['relation'], row['object']))
        # edge_present += len(df[((df['subject'] == row['wiki_subj']) & (df['object'] == row['wiki_obj'])) | ((df['subject'] == row['wiki_obj']) & (df['object'] == row['wiki_subj']))])
        if (subj_name, obj_name) in counted_pairs or (obj_name, subj_name) in counted_pairs:
            continue
        else:
            edge_present += 1
            counted_pairs.add((subj_name, obj_name))

            intersected_triplets = df[((df['subject'] == subj_name) & (df['object'] == obj_name)) | ((df['subject'] == obj_name) & (df['object'] == subj_name))]

            if len(intersected_triplets) > 0:
                pair_count += 1
                for _, row_ in intersected_triplets.iterrows():
                    # wikidata_common_triplets.append((row['wiki_subj'], row['predicate'], row['wiki_obj']))
                    wikipedia_common_triplets.append((row_['subject'], row_['relation'], row_['object']))
                    # print(row['wiki_subj'], row['predicate'], row['wiki_obj'])
                    # print(row_['subject'], row_['relation'], row_['object'])
                    # print()
                # pair_count += len(intersected_triplets)
            
pair_count, pair_count/edge_present, edge_present

(44, 0.5714285714285714, 77)

In [189]:
"Up" in wikipedia_entities

True

In [192]:
df[df['subject'] == 'Up']

Unnamed: 0,subject,relation,object
228,Up,award received,Academy Award
236,Up,award nomination,Academy Award for Best Picture


In [73]:
for pedia_triplet, data_triplet in zip(wikipedia_common_triplets, wikidata_common_triplets):
    print(pedia_triplet, " | ", data_triplet)

('Steven Paul Jobs', 'place of birth', 'San Francisco')  |  ('Steve Jobs', 'place of birth', 'San Francisco')
('Steven Paul Jobs', 'country of citizenship', 'United States of America')  |  ('Steve Jobs', 'country of citizenship', 'United States')
('Steven Paul Jobs', 'educated at', 'Reed College')  |  ('Steve Jobs', 'educated at', 'Reed College')
('Steven Paul Jobs', 'occupation', 'Inventor')  |  ('Steve Jobs', 'occupation', 'Inventor')
('Steven Paul Jobs', 'notable work', 'Apple Inc.')  |  ('Steve Jobs', 'employer', 'Apple Inc.')
('Apple Inc.', 'founder of', 'Steven Paul Jobs')  |  ('Steve Jobs', 'employer', 'Apple Inc.')
('Steven Paul Jobs', 'founder of', 'Apple Inc.')  |  ('Steve Jobs', 'employer', 'Apple Inc.')
('Steven Paul Jobs', 'departed from', 'Apple Inc.')  |  ('Steve Jobs', 'employer', 'Apple Inc.')
('Steven Paul Jobs', 'occupation', 'Apple Inc.')  |  ('Steve Jobs', 'employer', 'Apple Inc.')
('Steven Paul Jobs', 'influenced', 'Apple Inc.')  |  ('Steve Jobs', 'employer', 'App

## Neo4j

In [185]:
from neo4j import GraphDatabase

uri = "neo4j://localhost:7687"
username = "neo4j"
password = "12345678"
driver = GraphDatabase.driver(uri, auth=(username, password))

In [None]:
# def add_node(tx, node_name):
#     tx.run("CREATE (n:WikidataNode {name: $node_name})", node_name=node_name)

# def add_relation(tx, head, tail, relation):
#     query = f"""
#         MATCH (a {{name: $head}}), (b {{name: $tail}})
#         CREATE (a)-[r:{relation}]->(b)
#         RETURN type(r)
#         """
#     result = tx.run(query, head=head, tail=tail, database_='abc')

# def get_node(tx, name):
#     result = tx.run("MATCH (n:WikidataNode {name: $name}) RETURN n.name AS name", name=name)
#     return [record["name"] for record in result]


# for i, row in all_df.iterrows():
#     head = row['subject']
#     tail = row['object']
#     relation = "_".join(row['predicate'].replace("-", "").replace("/", "").replace("'", "").replace(",", "").replace(".", "").split())
#     # print(head, tail, relation)
#     with driver.session() as session:
#         if not session.read_transaction(get_node, head):
#             session.write_transaction(add_node, head)

#         if not session.read_transaction(get_node, tail):
#             session.write_transaction(add_node, tail)
            
#         session.write_transaction(add_relation, head, tail, relation)

  if not session.read_transaction(get_node, head):
  if not session.read_transaction(get_node, tail):
  session.write_transaction(add_relation, head, tail, relation)
  session.write_transaction(add_node, tail)
  session.write_transaction(add_node, head)


In [187]:
def add_node(tx, node_name):
    tx.run("CREATE (n:WikidataNode {name: $node_name})", node_name=node_name)

def add_relation(tx, head, tail, relation):
    query = f"""
        MATCH (a {{name: $head}}), (b {{name: $tail}})
        CREATE (a)-[r:{relation}]->(b)
        RETURN type(r)
        """
    result = tx.run(query, head=head, tail=tail, database_='abc')

def get_node(tx, name):
    result = tx.run("MATCH (n:WikidataNode {name: $name}) RETURN n.name AS name", name=name)
    return [record["name"] for record in result]


for triplet in wikidata_common_triplets:
    head = triplet[0]
    tail = triplet[2]
    relation = "_".join(triplet[1].replace("-", "").replace("/", "").replace("'", "").replace(",", "").replace(".", "").split())
    # print(head, tail, relation)
    with driver.session() as session:
        if not session.read_transaction(get_node, head):
            session.write_transaction(add_node, head)

        if not session.read_transaction(get_node, tail):
            session.write_transaction(add_node, tail)
            
        session.write_transaction(add_relation, head, tail, relation)

  if not session.read_transaction(get_node, head):
  session.write_transaction(add_node, head)
  if not session.read_transaction(get_node, tail):
  session.write_transaction(add_node, tail)
  session.write_transaction(add_relation, head, tail, relation)


In [188]:
def add_node(tx, node_name):
    tx.run("CREATE (n:WikipediaNode {name: $node_name})", node_name=node_name)

def add_relation(tx, head, tail, relation):
    query = f"""
        MATCH (a:WikipediaNode {{name: $head}}), (b:WikipediaNode {{name: $tail}})
        CREATE (a)-[r:{relation}]->(b)
        RETURN type(r)
        """
    result = tx.run(query, head=head, tail=tail, database_='abc')

def get_node(tx, name):
    result = tx.run("MATCH (n:WikipediaNode {name: $name}) RETURN n.name AS name", name=name)
    return [record["name"] for record in result]


for triplet in wikipedia_common_triplets:
    head = triplet[0]
    tail = triplet[2]
    relation = "_".join(triplet[1].replace("-", "").replace("/", "").replace("'", "").replace(",", "").replace(".", "").split())
    # print(head, tail, relation)
    with driver.session() as session:
        if not session.read_transaction(get_node, head):
            session.write_transaction(add_node, head)

        if not session.read_transaction(get_node, tail):
            session.write_transaction(add_node, tail)
            
        session.write_transaction(add_relation, head, tail, relation)

  if not session.read_transaction(get_node, head):
  session.write_transaction(add_node, head)
  if not session.read_transaction(get_node, tail):
  session.write_transaction(add_node, tail)
  session.write_transaction(add_relation, head, tail, relation)


In [186]:
def delete_all(tx):
    tx.run("MATCH (n) DETACH DELETE n")

with driver.session() as session:
    session.execute_write(delete_all)