# Extract all mappings between Wikipedia/DBpedia and Wikidata

The resulting mappings are stored in the file `dbpedia-wikidata.json`.

In [None]:
import gzip

In [None]:
# Build a lookup from English Wikipedia page title (used here as the DBpedia
# resource name) to the Wikidata node found in the same sitelinks row.
EN_WIKI_PREFIX='https://en.wikipedia.org/wiki/'
d2w={}
with gzip.open('sitelinks.tsv.gz', 'rb') as f:
    header=next(f, None)  # skip the column-name line (tolerates an empty file)
    for line in f:
        # Remove only the line terminator before splitting: otherwise the last
        # column of each row still ends in '\n' and can never equal 'en'.
        row=line.decode().rstrip('\r\n').split('\t')
        if len(row)<5:
            # Short/malformed row: report it and move on instead of raising
            # IndexError on the column accesses below.
            print(row)
            continue
        node1=row[1]
        node2=row[3]
        lang=row[4]
        # Keep only English sitelinks whose target really is an en.wikipedia URL.
        if lang!='en' or not node2.startswith(EN_WIKI_PREFIX): continue
        # The page title is whatever follows the URL prefix.
        dbp=node2[len(EN_WIKI_PREFIX):]
        d2w[dbp]=node1

In [None]:
# Number of English Wikipedia page titles that received a Wikidata mapping.
len(d2w)

In [None]:
import json

In [None]:
# Persist the title -> Wikidata lookup as one JSON object.
with open('dbpedia-wikidata.json', 'w') as json_out:
    json.dump(d2w, fp=json_out)

# Extract all DBpedia entities from IMKG in order to replace them

In [None]:
# Collect every identifier with a 'dbo:' or 'dbr:' prefix that appears in the
# first or third column of the IMKG edge file.
file_to_map='projects/tutorial-kypher/temp.tutorial-kypher/memes_wikidata_all.tsv.gz'
to_link=set()
dbpedia_prefixes=('dbo:', 'dbr:')
with gzip.open(file_to_map, 'rb') as f:
    # Show the column names once, then iterate over the data rows.
    header=next(f)
    print(header.decode())
    for raw in f:
        row=raw.decode().strip().split('\t')
        if len(row)<3:
            # Rows with fewer than three fields are reported and skipped.
            print(row)
            continue
        for node in (row[2], row[0]):
            if node.startswith(dbpedia_prefixes):
                to_link.add(node)

In [None]:
# Number of distinct 'dbo:'/'dbr:' identifiers found in the IMKG file.
len(to_link)

In [None]:
# Resolve each collected DBpedia identifier against the Wikipedia-title lookup.
# `can_link` holds the identifiers that resolved; `mappings` maps each of them
# to its Wikidata node.
can_link=set()
mappings={}
for db in to_link:
    # Remove only the leading namespace prefix. A chained str.replace() would
    # also delete 'dbo:'/'dbr:' occurring inside the name itself and so look up
    # the wrong title.
    dbp=db
    for prefix in ('dbo:', 'dbr:'):
        if db.startswith(prefix):
            dbp=db[len(prefix):]
            break
    if dbp in d2w:
        can_link.add(db)
        mappings[db]=d2w[dbp]

In [None]:
# Number of DBpedia identifiers for which a Wikidata mapping was found.
len(can_link)

# Create a new IMKG version by replacing the mapped entities, and discard the relations for the others

In [None]:
# Rewrite the IMKG edge file: mapped DBpedia identifiers in the first/third
# column are replaced by their Wikidata node, and rows that mention a DBpedia
# identifier without a mapping are dropped. Only the first three columns of
# each data row are carried over.
file_to_map='projects/tutorial-kypher/temp.tutorial-kypher/memes_wikidata_all.tsv.gz'
dbpedia_prefixes=('dbo:', 'dbr:')

def _to_wikidata(node):
    """Return the replacement for `node`, or None if its row must be discarded.

    A node present in `mappings` is replaced by its mapped value. A node that
    still carries a 'dbo:'/'dbr:' prefix but has no mapping yields None. Any
    other node is returned unchanged.
    """
    if node in mappings:
        return mappings[node]
    if node.startswith(dbpedia_prefixes):
        return None
    return node

all_rows=[]
discarded=0
with gzip.open(file_to_map, 'rb') as f:
    # The header row is kept as the first output row.
    header=next(f).decode().strip().split('\t')
    all_rows.append(header)
    for line in f:
        row=line.decode().strip().split('\t')

        if len(row)<3:
            # Rows with fewer than three fields are reported and skipped.
            print(row)
            continue
        node1=_to_wikidata(row[0])
        node2=_to_wikidata(row[2])
        if node1 is None or node2 is None:
            # Relation involves an unmappable DBpedia entity: discard it.
            discarded+=1
            continue
        all_rows.append([node1, row[1], node2])
print('discarded rows:', discarded)

In [None]:
# Number of rows collected for the new graph file, including the header row.
len(all_rows)

In [None]:
# Serialise every collected row as a tab-separated, newline-terminated line
# into the gzip-compressed output file.
with gzip.open('projects/tutorial-kypher/temp.tutorial-kypher/memes_wikidata_all_wd.tsv.gz', 'wb') as gz_out:
    gz_out.writelines(('\t'.join(fields) + '\n').encode() for fields in all_rows)

# Use KGTK to compute statistics of the resulting graph

In [None]:
# Run `kgtk graph-statistics` on the rewritten graph file. The invocation passes
# memes_wd_summary.txt as --log-file and meme_wd_stats.tsv as -o, together with
# --output-statistics-only.
# NOTE(review): `!TEMP=` only assigns an empty TEMP inside its own shell call;
# presumably a leftover — confirm before removing.
!TEMP=
!kgtk graph-statistics -i projects/tutorial-kypher/temp.tutorial-kypher/memes_wikidata_all_wd.tsv.gz \
     --log-file projects/tutorial-kypher/temp.tutorial-kypher/memes_wd_summary.txt \
     --output-statistics-only \
     -o projects/tutorial-kypher/temp.tutorial-kypher/meme_wd_stats.tsv

In [None]:
# Display the file that was passed as --log-file to the kgtk command.
!cat projects/tutorial-kypher/temp.tutorial-kypher/memes_wd_summary.txt

# Summary

1. There are 2,625 DBpedia entities in the graph. We can link 2,482 of them.
2. This has a very minor impact on the graph statistics, which is expected because the entities are disconnected both before and after.


# Discussion

1. What should we do with the entities we can map? Can we simply replace them? **Decision:** replace them.
2. What should we do with the entities we cannot map? Should we just discard their relations in the graph? **Decision:** discard them.

# Todos

1. Incorporate the mapping step in Riccardo's notebook.