# Wikidata enrichment

This notebook converts the IMKG graph to KGTK format and queries relevant entities in Wikidata to enrich IMKG with Wikidata knowledge.

## 0. Setup

In [72]:
import os
import os.path

from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher

In [73]:
# Parameters

# Name of the KGTK project; handed to ConfigureKGTK in the setup cell.
project_name = "tutorial-kypher"

# Local folders: where the Wikidata input files live, and where the
# output and temporary files are created.
input_path = "wikidata"
output_path = "projects"

In [74]:
# IMKG files: the N-Triples dumps, all located under `imkg_dir`.
imkg_dir = 'imkg03'
instances_file = '{}/imgflip.seeds.nt'.format(imkg_dir)
templates_file = '{}/template.kym.nt'.format(imkg_dir)
mapping_file = '{}/kym2imgflip.sameAs(m4s).nt'.format(imkg_dir)

In [75]:
# Wikidata files to make available to KGTK. The keys of `additional_files`
# are the aliases that later cells reference as $P31, $items, ... in shell
# commands (see the printed environment variables in the next cell).
big_files = ["label"]

additional_files = dict(
    P31="derived.P31.tsv.gz",
    items="claims.wikibase-item.tsv.gz",
    P1963="derived.P1963computed.count.star.tsv.gz",
    external="claims.external-id.tsv.gz",
    indegree="metadata.in_degree.tsv.gz",
    outdegree="metadata.out_degree.tsv.gz",
    pagerank="metadata.pagerank.directed.tsv.gz",
)

# Set up the KGTK environment for this project.
ck = ConfigureKGTK(big_files)
ck.configure_kgtk(
    input_graph_path=input_path,
    output_path=output_path,
    project_name=project_name,
    additional_files=additional_files,
)

User home: /Users/filipilievski
Current dir: /Users/filipilievski/mcs/imkg
KGTK dir: /Users/filipilievski/mcs
Use-cases dir: /Users/filipilievski/mcs/use-cases


In [76]:
# Show the configured environment variables (paths, graph cache, file
# aliases); the shell cells below reference them as $TEMP, $items, $label, ...
ck.print_env_variables()

STORE: projects/tutorial-kypher/temp.tutorial-kypher/wikidata.sqlite3.db
EXAMPLES_DIR: /Users/filipilievski/mcs/examples
USE_CASES_DIR: /Users/filipilievski/mcs/use-cases
kypher: kgtk query --graph-cache projects/tutorial-kypher/temp.tutorial-kypher/wikidata.sqlite3.db
KGTK_OPTION_DEBUG: false
KGTK_LABEL_FILE: wikidata/labels.en.tsv.gz
TEMP: projects/tutorial-kypher/temp.tutorial-kypher
kgtk: kgtk
KGTK_GRAPH_CACHE: projects/tutorial-kypher/temp.tutorial-kypher/wikidata.sqlite3.db
GRAPH: wikidata
OUT: projects/tutorial-kypher
label: wikidata/labels.en.tsv.gz
P31: wikidata/derived.P31.tsv.gz
items: wikidata/claims.wikibase-item.tsv.gz
P1963: wikidata/derived.P1963computed.count.star.tsv.gz
external: wikidata/claims.external-id.tsv.gz
indegree: wikidata/metadata.in_degree.tsv.gz
outdegree: wikidata/metadata.out_degree.tsv.gz
pagerank: wikidata/metadata.pagerank.directed.tsv.gz


## 1. Import into KGTK

Define namespaces to make the import-ntriples command work:

In [62]:
# Prefix expansions passed to `kgtk import-ntriples` through a namespace file.
# Keys are the namespace URIs wrapped in double quotes, values are the
# prefixes that replace them.
# The FOAF namespace is written in lower case ("http://..."), as in the FOAF
# specification; the previous "Http://..." spelling would not match
# lower-case FOAF URIs if prefix matching is case-sensitive.
# NOTE(review): confirm the .nt files do not themselves use "Http://".
namespaces = {
    '"http://www.wikidata.org/entity/"': 'wde',
    '"https://www.wikidata.org/wiki/"': 'wdp',
    '"https://knowyourmeme.com/memes/"': 'kym',
    '"http://www.w3.org/2000/01/rdf-schema#"': 'rdfs',
    '"http://www.w3.org/1999/02/22-rdf-syntax-ns#"': 'rdf',
    '"http://dbpedia.org/resource/"': 'dbr',
    '"https://meme4.science/"': 'm4s',
    '"http://xmlns.com/foaf/0.1/"': 'foaf',
    '"https://knowyourmeme.com/types/"': 'kymt',
    '"https://dbpedia.org/property/"': 'dbp',
    '"https://dbpedia.org/ontology/"': 'dbo',
    '"https://schema.org/"': 'schema',
    '"https://imgflip.com/i/"': 'imgflipi',
    '"https://imgflip.com/meme/"': 'imgflipmeme',
    '"https://imgflip.com/user/"': 'imgflipuser',
    '"https://imgflip.com/"': 'imgflipr',
}

# Write one `<prefix> prefix_expansion <uri>` edge per namespace, preceded by
# the node1/label/node2 header. The encoding is pinned so the file does not
# depend on the platform default.
prop = 'prefix_expansion'
with open('namespaces.tsv', 'w', encoding='utf-8') as ns_file:
    ns_file.write('node1\tlabel\tnode2\n')
    for uri, prefix in namespaces.items():
        ns_file.write('\t'.join((prefix, prop, uri)) + '\n')

In [63]:
# Print the namespace file written by the previous cell.
!cat namespaces.tsv

node1	label	node2
wde	prefix_expansion	"http://www.wikidata.org/entity/"
wdp	prefix_expansion	"https://www.wikidata.org/wiki/"
kym	prefix_expansion	"https://knowyourmeme.com/memes/"
rdfs	prefix_expansion	"http://www.w3.org/2000/01/rdf-schema#"
rdf	prefix_expansion	"http://www.w3.org/1999/02/22-rdf-syntax-ns#"
dbr	prefix_expansion	"http://dbpedia.org/resource/"
m4s	prefix_expansion	"https://meme4.science/"
foaf	prefix_expansion	"Http://xmlns.com/foaf/0.1/"
kymt	prefix_expansion	"https://knowyourmeme.com/types/"
dbp	prefix_expansion	"https://dbpedia.org/property/"
dbo	prefix_expansion	"https://dbpedia.org/ontology/"
schema	prefix_expansion	"https://schema.org/"
imgflipi	prefix_expansion	"https://imgflip.com/i/"
imgflipmeme	prefix_expansion	"https://imgflip.com/meme/"
imgflipuser	prefix_expansion	"https://imgflip.com/user/"
imgflipr	prefix_expansion	"https://imgflip.com/"


### 1a. Import templates

In [14]:
%%time
# Convert the KYM template N-Triples dump to a KGTK edge file in $TEMP.
# The input path comes from `templates_file`, so this cell follows `imkg_dir`
# instead of a separately hard-coded (and previously older, "imkg02") folder.
kgtk("""
    --progress
    import-ntriples -i "%s"
        --namespace-file "namespaces.tsv"
        -o $TEMP/raw_templates.kgtk.gz
        --namespace-id-use-uuid True
        --build-new-namespaces False
        --output-only-used-namespaces True
        --structured-value-label m4s:structured_value
        --structured-uri-label m4s:structured_uri
        --newnode-prefix node
        --newnode-use-uuid True
    """ % templates_file)

CPU times: user 3.91 ms, sys: 8.1 ms, total: 12 ms
Wall time: 4.71 s


Let's clean up the data a bit:

In [15]:
import pandas

# Paths are derived from the TEMP environment variable (the same $TEMP the
# shell cells use) rather than repeating the project folder by hand.
templates_fn = os.path.join(os.environ['TEMP'], 'raw_templates.kgtk.gz')
templates_out = os.path.join(os.environ['TEMP'], 'clean_templates.kgtk')

# Read every cell as a plain string: without dtype=str / keep_default_na=False
# pandas turns values such as "NA", "null" or "None" into NaN (later written
# out as "nan") and re-formats numeric-looking literals.
# NOTE(review): default quoting still strips surrounding double quotes from
# quoted fields -- confirm whether quoting=csv.QUOTE_NONE is wanted here.
templates_df = pandas.read_csv(templates_fn, sep='\t', dtype=str, keep_default_na=False)

In [64]:
def replace_me(n):
    """Strip the Wikidata prefixes from a single cell value.

    Removes every occurrence of 'wdp:Property:', 'wdp:', 'wdt:' and 'wde:'
    (in that order, so the longer 'wdp:Property:' goes before 'wdp:').
    Occurrences anywhere in the string are removed, not only at the start.

    Args:
        n: the cell value; usually a string, but may be any object
            (e.g. a number or NaN produced by pandas).

    Returns:
        The string with the prefixes removed, or `n` unchanged when it is
        not a string.
    """
    # Explicit type check instead of a bare `except:`, which would also
    # swallow unrelated errors such as KeyboardInterrupt.
    if not isinstance(n, str):
        return n
    for prefix in ('wdp:Property:', 'wdp:', 'wdt:', 'wde:'):
        n = n.replace(prefix, '')
    return n

In [65]:
def clean_df(df, filename):
    """Write a prefix-stripped copy of a three-column edge table to a TSV file.

    Each value of every row is passed through `replace_me`, converted with
    `str()`, and written tab-separated under a `node1 label node2` header.

    Args:
        df: DataFrame whose rows have exactly three values
            (node1, label, node2, in that order).
        filename: path of the output file; it is overwritten.

    Raises:
        ValueError: if a row does not have exactly three values.
    """
    # itertuples keeps each column's own dtype and avoids building a Series
    # per row, unlike iterrows.
    rows = [
        '\t'.join(str(replace_me(value)) for value in (n1, label, n2))
        for n1, label, n2 in df.itertuples(index=False, name=None)
    ]

    # Pin the encoding so non-ASCII text does not depend on the platform
    # default.
    with open(filename, 'w', encoding='utf-8') as out:
        out.write('node1\tlabel\tnode2\n')
        out.writelines(row + '\n' for row in rows)

We now apply this cleaning to the template graph and write the result to a new KGTK file:

In [18]:
# Write the prefix-stripped template edges to `templates_out`.
clean_df(templates_df, templates_out)

In [19]:
# Deduplicate the cleaned template edges into the compressed templates graph.
!kgtk deduplicate -i $TEMP/clean_templates.kgtk -o $TEMP/templates.kgtk.gz

Now that we have imported and deduplicated the templates, let's run some sanity checks to make sure our graph is reasonable.

In [20]:
# Load the whole templates graph for inspection.
kgtk("""cat -i $TEMP/templates.kgtk.gz""")

Unnamed: 0,node1,label,node2
0,Q104005472,P6760,kym:spongegar-primitive-sponge-caveman-spongebob
1,Q104841082,P6760,kym:the-cake-is-a-lie
2,Q104858864,P6760,kym:rule-63
3,Q104968209,P6760,kym:i-am-once-again-asking-for-your-financial-...
4,Q1050827,P6760,kym:waluigi
...,...,...,...
211884,noded9yQUzrarUkHBfCtv8PqJj-999,m4s:structured_uri,http://www.w3.org/2001/XMLSchema#timestamp
211885,noded9yQUzrarUkHBfCtv8PqJj-999,m4s:structured_value,1357981793
211886,rdf,prefix_expansion,http://www.w3.org/1999/02/22-rdf-syntax-ns#
211887,wde,prefix_expansion,http://www.wikidata.org/entity/


How many memes do we have in the graph?

In [21]:
# Count the distinct nodes that have an rdf:type edge pointing to kym:Meme.
!kgtk query -i $TEMP/templates.kgtk.gz \
    --match '(n1)-[r:`rdf:type`]->(:`kym:Meme`)' \
    --return 'count(distinct n1)'

count(DISTINCT graph_20_c1."node1")
1404


What are the relations?

In [22]:
# List the distinct edge labels (relations) used in the templates graph.
!kgtk query -i $TEMP/templates.kgtk.gz \
    --match '()-[r]->()' \
    --return 'distinct r.label as Relation'

Relation
P6760
dbp:confidence
kym:about
kym:added
kym:child
kym:from
kym:last_update_source
kym:origin
kym:parent
kym:sibling
kym:spread
kym:status
kym:tag
kym:title
kym:year
m4s:fromAbout
m4s:fromImage
m4s:fromTags
m4s:structured_uri
m4s:structured_value
prefix_expansion
rdf:type


### 1b. Import instances

Now let's import the instances file into KGTK. This command took around 6 minutes on my laptop, and without any feedback I could not tell whether it was still running or had gotten stuck somewhere.
<font color='red'>A progress bar would really help when working with long-running commands like this one.</font>

In [77]:
%%time
# Convert the ImgFlip instance N-Triples dump to a KGTK edge file in $TEMP.
# The input path comes from `instances_file` rather than being repeated here,
# so changing `imkg_dir` is enough to switch IMKG releases.
kgtk("""
    import-ntriples -i "%s"
        --namespace-file "namespaces.tsv"
        -o $TEMP/raw_instances.kgtk.gz
        --namespace-id-use-uuid True
        --build-new-namespaces False
        --output-only-used-namespaces True
        --structured-value-label m4s:structured_value
        --structured-uri-label m4s:structured_uri
        --newnode-prefix node
        --newnode-use-uuid True
    """ % instances_file)

CPU times: user 16.4 ms, sys: 20.7 ms, total: 37.1 ms
Wall time: 41 s


In [78]:
# Paths are derived from the TEMP environment variable (the same $TEMP the
# shell cells use) rather than repeating the project folder by hand.
instances_fn = os.path.join(os.environ['TEMP'], 'raw_instances.kgtk.gz')
instances_out = os.path.join(os.environ['TEMP'], 'clean_instances.kgtk')

# Read every cell as a plain string: without dtype=str / keep_default_na=False
# pandas turns values such as "NA", "null" or "None" into NaN (later written
# out as "nan") and re-formats numeric-looking literals.
# NOTE(review): default quoting still strips surrounding double quotes from
# quoted fields -- confirm whether quoting=csv.QUOTE_NONE is wanted here.
instances_df = pandas.read_csv(instances_fn, sep='\t', dtype=str, keep_default_na=False)

In [79]:
# Write the prefix-stripped instance edges to `instances_out`.
clean_df(instances_df, instances_out)

In [80]:
# Deduplicate the cleaned instance edges into the compressed instances graph.
!kgtk deduplicate -i $TEMP/clean_instances.kgtk -o $TEMP/instances.kgtk.gz

In [81]:
# Preview the first rows of the deduplicated instances graph.
kgtk("""head -i $TEMP/instances.kgtk.gz""")

Unnamed: 0,node1,label,node2
0,imgflipi,prefix_expansion,https://imgflip.com/i/
1,imgflipi:1001ub,imgflipr:alt_text,Captain Picard Facepalm | WIFE POSTS ON FB TH...
2,imgflipi:1001ub,imgflipr:image_url,https://i.imgflip.com/1001ub.jpg
3,imgflipi:1001ub,imgflipr:template,imgflipmeme:Captain-Picard-Facepalm
4,imgflipi:1001ub,imgflipr:templateId,1509839
5,imgflipi:1001ub,imgflipr:template_title,Captain Picard Facepalm
6,imgflipi:1001ub,imgflipr:title,Captain Picard Facepalm
7,imgflipi:1001ub,imgflipr:upvote_count,5
8,imgflipi:1001ub,imgflipr:view_count,3780
9,imgflipi:1001ub,m4s:fromCaption,Q11661


Let's validate that the import worked well:

In [82]:
# List the distinct edge labels (relations) used in the instances graph.
!kgtk query -i $TEMP/instances.kgtk.gz \
    --match '()-[r]->()' \
    --return 'distinct r.label as Relation'

Relation
prefix_expansion
imgflipr:alt_text
imgflipr:image_url
imgflipr:template
imgflipr:templateId
imgflipr:template_title
imgflipr:title
imgflipr:upvote_count
imgflipr:view_count
m4s:fromCaption
imgflipr:author


### 1c. Import mappings between KYM and ImgFlip

In [61]:
%%time
# Convert the KYM-to-ImgFlip mapping N-Triples file to a KGTK edge file in
# $TEMP, using the same namespace file as the other imports.
# NOTE(review): the input path below ("imkg03/kym2imgflip.nt") differs from
# `mapping_file` defined in the IMKG-files cell
# ('imkg03/kym2imgflip.sameAs(m4s).nt') -- confirm which file is intended.
kgtk("""
    import-ntriples --debug -i "imkg03/kym2imgflip.nt"  
        --namespace-file "namespaces.tsv"
        -o $TEMP/raw_mappings.kgtk.gz
        --namespace-id-use-uuid True 
        --build-new-namespaces False
        --output-only-used-namespaces True 
        --structured-value-label m4s:structured_value 
        --structured-uri-label m4s:structured_uri 
        --newnode-prefix node 
        --newnode-use-uuid True
    """)

CPU times: user 3.21 ms, sys: 11.5 ms, total: 14.7 ms
Wall time: 722 ms


In [54]:
# Paths are derived from the TEMP environment variable (the same $TEMP the
# shell cells use) rather than repeating the project folder by hand.
mappings_fn = os.path.join(os.environ['TEMP'], 'raw_mappings.kgtk.gz')
# The output name must be exactly "clean_mappings.kgtk": the deduplicate cell
# reads $TEMP/clean_mappings.kgtk, so the former trailing "." made it miss
# the cleaned file.
mappings_out = os.path.join(os.environ['TEMP'], 'clean_mappings.kgtk')

# Read every cell as a plain string: without dtype=str / keep_default_na=False
# pandas turns values such as "NA", "null" or "None" into NaN (later written
# out as "nan") and re-formats numeric-looking literals.
# NOTE(review): default quoting still strips surrounding double quotes from
# quoted fields -- confirm whether quoting=csv.QUOTE_NONE is wanted here.
mappings_df = pandas.read_csv(mappings_fn, sep='\t', dtype=str, keep_default_na=False)

In [55]:
# Write the prefix-stripped mapping edges to `mappings_out`.
clean_df(mappings_df, mappings_out)

In [56]:
# Deduplicate the cleaned mapping edges into the compressed mappings graph.
!kgtk deduplicate -i $TEMP/clean_mappings.kgtk -o $TEMP/mappings.kgtk.gz

In [58]:
# Preview the first rows of the deduplicated mappings graph.
kgtk("""head -i $TEMP/mappings.kgtk.gz""")

Unnamed: 0,node1,label,node2


## 2. Enrich with Wikidata now

### 2a. Data where memes are subjects or objects

We start with relations where the meme Qnode is a subject in Wikidata:

In [None]:
# For every node that has a P6760 edge in the templates graph, collect all of
# its outgoing edges from the Wikidata item claims ($items).
!kgtk query -i $items -i $TEMP/templates.kgtk.gz \
    --match 'templates: (meme_qid)-[:P6760]->(), \
            item: (meme_qid)-[mrel]->(mval)' \
    --return 'meme_qid as node1, mrel.label as label, mval as node2' \
    -o $TEMP/wikidata_sub.kgtk.gz

In [None]:
!kgtk query -i $TEMP/wikidata_sub.kgtk.gz \
    --match '(n1)-[r]->()' \
    --return 'count (n1)'

We get information for 1,397 memes as subjects. What about memes as objects?

In [None]:
# For every node that has a P6760 edge in the templates graph, collect all
# Wikidata item claims ($items) in which that node is the object (node2).
!kgtk query -i $items -i $TEMP/templates.kgtk.gz \
    --match 'templates: (meme_qid)-[:P6760]->(), \
            item: (mval)-[mrel]->(meme_qid)' \
    --return 'mval as node1, mrel.label as label, meme_qid as node2' \
    -o $TEMP/wikidata_obj.kgtk.gz

Combine and deduplicate:

In [None]:
# Concatenate the subject-side and object-side results and drop duplicate edges.
!kgtk cat -i $TEMP/wikidata_sub.kgtk.gz -i $TEMP/wikidata_obj.kgtk.gz / deduplicate -o $TEMP/wikidata_memes.kgtk.gz

In [None]:
# Count the edges in the combined meme statements (one per matched edge).
!kgtk query -i $TEMP/wikidata_memes.kgtk.gz \
    --match '(n1)-[]->()' \
    --return 'count(n1)'

In [None]:
# Show up to 10 edges from the combined meme statements.
!kgtk query -i $TEMP/wikidata_memes.kgtk.gz \
    --match '(n1)-[]->()' \
    --limit 10

We now combine Wikidata information with the original graph:

In [None]:
# Merge the Wikidata meme statements with the templates graph and drop duplicate edges.
!kgtk cat -i $TEMP/wikidata_memes.kgtk.gz -i $TEMP/templates.kgtk.gz / deduplicate -o $TEMP/templates_with_wd.kgtk.gz

### 2b. Data about Wikidata entities from the combined graph

Next, let's obtain data about the other Qnodes that are not memes. Here, we want the Wikidata relations between pairs of Qnodes that both occur in our graph. So, we query Wikidata for statements where both node1 and node2 are in our graph:

In [None]:
# Wikidata statements whose node1 and node2 both occur as subjects (node1)
# in the combined graph ("ss" = subject/subject).
!kgtk query -i $items -i $TEMP/templates_with_wd.kgtk.gz \
    --match 'item: (qnode1)-[mrel]->(qnode2), \
            template: (qnode1)-[]->(), (qnode2)-[]->()' \
    --return 'qnode1 as node1, mrel.label as label, qnode2 as node2' \
    / deduplicate -o $TEMP/ss.tsv.gz

In [None]:
# Wikidata statements whose node1 occurs as a subject and whose node2 occurs
# as an object in the combined graph ("so" = subject/object).
!kgtk query -i $items -i $TEMP/templates_with_wd.kgtk.gz \
    --match 'item: (qnode1)-[mrel]->(qnode2), \
            template: (qnode1)-[]->(), ()-[]->(qnode2)' \
    --return 'qnode1 as node1, mrel.label as label, qnode2 as node2' \
    / deduplicate -o $TEMP/so.tsv.gz

In [None]:
# Wikidata statements whose node1 and node2 both occur as objects (node2)
# in the combined graph ("oo" = object/object).
!kgtk query -i $items -i $TEMP/templates_with_wd.kgtk.gz \
    --match 'item: (qnode1)-[mrel]->(qnode2), \
            template: ()-[]->(qnode1), ()-[]->(qnode2)' \
    --return 'qnode1 as node1, mrel.label as label, qnode2 as node2' \
    / deduplicate -o $TEMP/oo.tsv.gz

In [None]:
# Wikidata statements whose node1 occurs as an object and whose node2 occurs
# as a subject in the combined graph ("os" = object/subject).
!kgtk query -i $items -i $TEMP/templates_with_wd.kgtk.gz \
    --match 'item: (qnode1)-[mrel]->(qnode2), \
            template: ()-[]->(qnode1), (qnode2)-[]->()' \
    --return 'qnode1 as node1, mrel.label as label, qnode2 as node2' \
    / deduplicate -o $TEMP/os.tsv.gz

In [None]:
# Count the edges in the subject/subject result.
!kgtk query -i $TEMP/ss.tsv.gz \
    --match '(n1)-[r]->()' \
    --return 'count (n1)'

In [None]:
# Count the edges in the subject/object result.
!kgtk query -i $TEMP/so.tsv.gz \
    --match '(n1)-[r]->()' \
    --return 'count (n1)'

In [None]:
# Count the edges in the object/subject result.
!kgtk query -i $TEMP/os.tsv.gz \
    --match '(n1)-[r]->()' \
    --return 'count (n1)'

In [None]:
# Count the edges in the object/object result.
!kgtk query -i $TEMP/oo.tsv.gz \
    --match '(n1)-[r]->()' \
    --return 'count (n1)'

In [None]:
# Show up to 10 edges from the object/object result.
!kgtk query -i $TEMP/oo.tsv.gz \
    --match '(n1)-[r]->()' \
    --limit 10

In [None]:
# Concatenate the four pairwise results and drop duplicate edges.
!kgtk cat -i $TEMP/ss.tsv.gz $TEMP/so.tsv.gz $TEMP/os.tsv.gz $TEMP/oo.tsv.gz / deduplicate -o $TEMP/wikidata_ent.kgtk.gz

In [None]:
# Count the edges in the combined entity statements.
!kgtk query -i $TEMP/wikidata_ent.kgtk.gz \
    --match '(n1)-[]->(n2)' \
    --return 'count(n1)'

## 3. Create label file

In [83]:
# Collect the label edges ($label) of every node that appears as node1 or
# node2 of a templates edge, and drop duplicates.
!kgtk query -i $label -i $TEMP/templates.kgtk.gz --force \
            --match 'templates: (n1)-[]->(n2), \
                label: (n)-[r]->(l)' \
            --where 'n1=n OR n2=n' \
            --return 'n as node1, r.label as label, l as node2' / deduplicate \
            -o $TEMP/labelfile_templates.kgtk.gz 

In [84]:
!wc -l $TEMP/labelfile_templates.kgtk.gz

     342 projects/tutorial-kypher/temp.tutorial-kypher/labelfile_templates.kgtk.gz


In [None]:
# Inspect the template label file built above. The merged labelfile.kgtk.gz
# is only created by the last cell of the notebook, so reading it here would
# fail on a fresh top-to-bottom run.
kgtk("""cat -i $TEMP/labelfile_templates.kgtk.gz""")

In [85]:
# Collect the label edges ($label) of every node that appears as node1 or
# node2 of an instances edge, and drop duplicates.
!kgtk query -i $label -i $TEMP/instances.kgtk.gz --force \
            --match 'instances: (n1)-[]->(n2), \
                label: (n)-[r]->(l)' \
            --where 'n1=n OR n2=n' \
            --return 'n as node1, r.label as label, l as node2' / deduplicate \
            -o $TEMP/labelfile_instances.kgtk.gz 

In [86]:
!wc -l $TEMP/labelfile_instances.kgtk.gz

    1042 projects/tutorial-kypher/temp.tutorial-kypher/labelfile_instances.kgtk.gz


In [87]:
!kgtk cat -i $TEMP/labelfile_templates.kgtk.gz -i $TEMP/labelfile_instances.kgtk.gz -o $TEMP/labelfile.kgtk.gz