# Wikidata enrichment

This notebook converts the IMKG graph to KGTK format and queries relevant entities in Wikidata to enrich IMKG with Wikidata knowledge.

## 0. Setup

In [11]:
import os
import os.path

from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher

In [12]:
# Parameters

# Folders on the local machine:
#   input_path   - folder holding the Wikidata files that are queried below
#   output_path  - folder where the output and temporary files are created
#   project_name - name of the project sub-folder created inside output_path
input_path = "wikidata"
output_path = "projects"
project_name = "tutorial-kypher"

In [13]:
# Locations of the IMKG N-Triples dumps, all relative to one release folder.
imkg_dir = 'imkg04'
instances_file = '{}/imgflip.nt'.format(imkg_dir)
templates_file = '{}/kym_all.nt'.format(imkg_dir)
mapping_file = '{}/kym2imgflip.sameAs(m4s).nt'.format(imkg_dir)

In [14]:
# Wikidata files to register with KGTK: `big_files` is handed to the
# ConfigureKGTK constructor, `additional_files` maps a short alias to the
# file name of each extra file.
big_files = ["label"]

additional_files = dict(
    P31="derived.P31.tsv.gz",
    items="claims.wikibase-item.tsv.gz",
    P1963="derived.P1963computed.count.star.tsv.gz",
    external="claims.external-id.tsv.gz",
    indegree="metadata.in_degree.tsv.gz",
    outdegree="metadata.out_degree.tsv.gz",
    pagerank="metadata.pagerank.directed.tsv.gz",
)

ck = ConfigureKGTK(big_files)
ck.configure_kgtk(
    input_graph_path=input_path,
    output_path=output_path,
    project_name=project_name,
    additional_files=additional_files,
)

User home: /Users/filipilievski
Current dir: /Users/filipilievski/mcs/imkg
KGTK dir: /Users/filipilievski/mcs
Use-cases dir: /Users/filipilievski/mcs/use-cases


In [15]:
# Show the variables (paths and command aliases) that ConfigureKGTK has set up.
ck.print_env_variables()

KGTK_GRAPH_CACHE: projects/tutorial-kypher/temp.tutorial-kypher/wikidata.sqlite3.db
USE_CASES_DIR: /Users/filipilievski/mcs/use-cases
EXAMPLES_DIR: /Users/filipilievski/mcs/examples
kypher: kgtk query --graph-cache projects/tutorial-kypher/temp.tutorial-kypher/wikidata.sqlite3.db
OUT: projects/tutorial-kypher
kgtk: kgtk
KGTK_LABEL_FILE: wikidata/labels.en.tsv.gz
TEMP: projects/tutorial-kypher/temp.tutorial-kypher
GRAPH: wikidata
KGTK_OPTION_DEBUG: false
STORE: projects/tutorial-kypher/temp.tutorial-kypher/wikidata.sqlite3.db
label: wikidata/labels.en.tsv.gz
P31: wikidata/derived.P31.tsv.gz
items: wikidata/claims.wikibase-item.tsv.gz
P1963: wikidata/derived.P1963computed.count.star.tsv.gz
external: wikidata/claims.external-id.tsv.gz
indegree: wikidata/metadata.in_degree.tsv.gz
outdegree: wikidata/metadata.out_degree.tsv.gz
pagerank: wikidata/metadata.pagerank.directed.tsv.gz


## 1. Import into KGTK

Define namespaces to make the import-ntriples command work:

In [29]:
# Namespace URI -> prefix. The URIs carry literal double quotes because they
# are written verbatim as node2 values of the namespace file below.
namespaces={'"http://www.wikidata.org/entity/"': 'wde',
           '"https://www.wikidata.org/wiki/"': 'wdp',
           '"https://knowyourmeme.com/memes/"': 'kym',
           '"http://www.w3.org/2000/01/rdf-schema#"': 'rdfs',
           '"http://www.w3.org/1999/02/22-rdf-syntax-ns#"': 'rdf',
           '"http://dbpedia.org/resource/"': 'dbr',
           '"https://meme4.science/"': 'm4s',
           # Fixed: the scheme was spelled "Http", which cannot match the
           # lower-case FOAF namespace URI used in RDF data.
           '"http://xmlns.com/foaf/0.1/"': 'foaf',
           '"https://knowyourmeme.com/types/"': 'kymt',
           '"https://dbpedia.org/property/"': 'dbp',
           '"https://dbpedia.org/ontology/"': 'dbo',
           '"https://schema.org/"': 'schema',
           '"https://imgflip.com/i/"': 'imgflipi',
           '"https://imgflip.com/meme/"': 'imgflipmeme',
           '"https://imgflip.com/user/"': 'imgflipuser',
           '"https://imgflip.com/"': 'imgflipr',
           '"http://www.w3.org/2004/02/skos/core#"': 'skos'}

# Write one `<prefix> prefix_expansion <uri>` edge per namespace; this file is
# passed to import-ntriples through --namespace-file.
prop='prefix_expansion'
with open('namespaces.tsv', 'w') as w:
    w.write('node1\tlabel\tnode2\n')
    for k,v in namespaces.items():
        triple=[v, prop, k]
        w.write('\t'.join(triple) + '\n')

In [30]:
# Inspect the generated namespace file.
!cat namespaces.tsv

node1	label	node2
wde	prefix_expansion	"http://www.wikidata.org/entity/"
wdp	prefix_expansion	"https://www.wikidata.org/wiki/"
kym	prefix_expansion	"https://knowyourmeme.com/memes/"
rdfs	prefix_expansion	"http://www.w3.org/2000/01/rdf-schema#"
rdf	prefix_expansion	"http://www.w3.org/1999/02/22-rdf-syntax-ns#"
dbr	prefix_expansion	"http://dbpedia.org/resource/"
m4s	prefix_expansion	"https://meme4.science/"
foaf	prefix_expansion	"Http://xmlns.com/foaf/0.1/"
kymt	prefix_expansion	"https://knowyourmeme.com/types/"
dbp	prefix_expansion	"https://dbpedia.org/property/"
dbo	prefix_expansion	"https://dbpedia.org/ontology/"
schema	prefix_expansion	"https://schema.org/"
imgflipi	prefix_expansion	"https://imgflip.com/i/"
imgflipmeme	prefix_expansion	"https://imgflip.com/meme/"
imgflipuser	prefix_expansion	"https://imgflip.com/user/"
imgflipr	prefix_expansion	"https://imgflip.com/"
skos	prefix_expansion	"http://www.w3.org/2004/02/skos/core#"


### 1a. Import templates

In [31]:
%%time
# Convert the KYM templates N-Triples file to KGTK, using the prefixes from
# namespaces.tsv; the result is written to $TEMP/raw_templates.kgtk.gz.
# NOTE(review): the input path repeats the value of `templates_file` defined
# earlier in the notebook - keep the two in sync.
kgtk("""
    --progress
    import-ntriples -i "imkg04/kym_all.nt"  
        --namespace-file "namespaces.tsv"    
        -o $TEMP/raw_templates.kgtk.gz
        --namespace-id-use-uuid True 
        --build-new-namespaces False
        --output-only-used-namespaces True 
        --structured-value-label m4s:structured_value 
        --structured-uri-label m4s:structured_uri 
        --newnode-prefix node 
        --newnode-use-uuid True
    """)

CPU times: user 14.8 ms, sys: 19.9 ms, total: 34.7 ms
Wall time: 36.7 s


Let's clean up the data a bit:

In [32]:
import pandas

# Build the paths from the TEMP folder configured by ConfigureKGTK (the same
# $TEMP the kgtk commands use) instead of hard-coding the project name, so the
# cell keeps working when the parameters at the top of the notebook change.
# NOTE(review): read_csv applies its default NA and quote handling here, so
# cells such as "NA" or "null" are read as NaN - confirm this is acceptable.
templates_fn = os.path.join(os.environ['TEMP'], 'raw_templates.kgtk.gz')
templates_df = pandas.read_csv(templates_fn, sep='\t')
templates_out = os.path.join(os.environ['TEMP'], 'clean_templates.kgtk')

In [33]:
def replace_me(n):
    """Remove the Wikidata namespace prefixes from a value.

    Every occurrence of 'wdp:Property:', 'wdp:', 'wdt:' and 'wde:' is deleted,
    in that order, so e.g. 'wde:Q42' becomes 'Q42' and 'wdp:Property:P31'
    becomes 'P31'.

    Args:
        n: A cell value; normally a string, but numbers or NaN may occur.

    Returns:
        The string without the prefixes, or ``n`` unchanged when it is not a
        string.
    """
    # Explicit type check instead of a bare `except:` that hid every error.
    if not isinstance(n, str):
        return n
    # 'wdp:Property:' must be removed before the shorter 'wdp:'.
    for prefix in ('wdp:Property:', 'wdp:', 'wdt:', 'wde:'):
        n = n.replace(prefix, '')
    return n

In [34]:
def clean_df(df, filename):
    """Write a three-column edge table to a KGTK file without Wikidata prefixes.

    Every cell is passed through ``replace_me`` and then converted with
    ``str``, so non-string cells (numbers, NaN) are written as their string
    representation.

    Args:
        df: DataFrame whose three columns are taken, in order, as node1,
            label and node2 (the column names themselves are ignored).
        filename: Path of the tab-separated output file; it is overwritten.

    Raises:
        ValueError: If ``df`` has rows but not exactly three columns.
    """
    if len(df.index) > 0 and df.shape[1] != 3:
        raise ValueError('expected 3 columns (node1, label, node2), got %d' % df.shape[1])

    # Write UTF-8 explicitly rather than relying on the platform default,
    # since the values contain non-ASCII text.
    with open(filename, 'w', encoding='utf-8') as w:
        w.write('node1\tlabel\tnode2\n')
        # itertuples avoids building a Series per row (as iterrows does), and
        # writing each row immediately avoids keeping a second copy of the
        # whole table in memory.
        for values in df.itertuples(index=False, name=None):
            w.write('\t'.join(str(replace_me(v)) for v in values) + '\n')

We do some cleaning of the graph

In [35]:
# Strip the Wikidata prefixes from the templates and write them to templates_out.
clean_df(templates_df, templates_out)

In [36]:
# Deduplicate the cleaned templates file into the compressed templates graph.
!kgtk deduplicate -i $TEMP/clean_templates.kgtk -o $TEMP/templates.kgtk.gz

Now that we have imported and deduplicated the templates, let's run some sanity checks to make sure our graph is reasonable.

In [37]:
# Display the deduplicated templates graph.
kgtk("""cat -i $TEMP/templates.kgtk.gz""")

Unnamed: 0,node1,label,node2
0,kym,prefix_expansion,https://knowyourmeme.com/memes/
1,kym:%CD%A1-%CD%9C%CA%96-%CD%A1-lenny-face,m4s:about,( ͡° ͜ʖ ͡°) is an emoticon created with unicod...
2,kym:%CD%A1-%CD%9C%CA%96-%CD%A1-lenny-face,m4s:added,nodeDW9ARPJSRQYEUQn4RyWQMF-16625
3,kym:%CD%A1-%CD%9C%CA%96-%CD%A1-lenny-face,m4s:from,Ylilauta
4,kym:%CD%A1-%CD%9C%CA%96-%CD%A1-lenny-face,m4s:fromAbout,Q238330
...,...,...,...
914937,rdf,prefix_expansion,http://www.w3.org/1999/02/22-rdf-syntax-ns#
914938,rdfs,prefix_expansion,http://www.w3.org/2000/01/rdf-schema#
914939,skos,prefix_expansion,http://www.w3.org/2004/02/skos/core#
914940,wde,prefix_expansion,http://www.wikidata.org/entity/


How many memes do we have in the graph?

In [38]:
# Count the distinct nodes that have an rdf:type edge to kym:Meme.
!kgtk query -i $TEMP/templates.kgtk.gz \
    --match '(n1)-[r:`rdf:type`]->(:`kym:Meme`)' \
    --return 'count(distinct n1)'

count(DISTINCT graph_40_c1."node1")
12585


What are the relations?

In [39]:
# List the distinct edge labels that occur in the templates graph.
!kgtk query -i $TEMP/templates.kgtk.gz \
    --match '()-[r]->()' \
    --return 'distinct r.label as Relation'

Relation
m4s:about
m4s:added
m4s:from
m4s:fromAbout
m4s:fromImage
m4s:last_update_source
m4s:origin
m4s:spread
m4s:status
m4s:structured_uri
m4s:structured_value
m4s:tag
m4s:title
m4s:year
prefix_expansion
rdf:type
rdfs:seeAlso
skos:broader
skos:narrower


### 1b. Import instances

Now let's import the instances file into KGTK. This command takes around 6 minutes on my laptop and gives no feedback while it runs, so be patient if it looks stuck.

In [40]:
%%time
# Convert the ImgFlip instances N-Triples file to KGTK, using the prefixes from
# namespaces.tsv; the result is written to $TEMP/raw_instances.kgtk.gz.
# NOTE(review): the input path repeats the value of `instances_file` defined
# earlier in the notebook - keep the two in sync.
kgtk("""
    import-ntriples -i "imkg04/imgflip.nt"  
        --namespace-file "namespaces.tsv"
        -o $TEMP/raw_instances.kgtk.gz
        --namespace-id-use-uuid True 
        --build-new-namespaces False
        --output-only-used-namespaces True 
        --structured-value-label m4s:structured_value 
        --structured-uri-label m4s:structured_uri 
        --newnode-prefix node 
        --newnode-use-uuid True
    """)

CPU times: user 116 ms, sys: 88.2 ms, total: 204 ms
Wall time: 5min 39s


In [41]:
# Build the paths from the TEMP folder configured by ConfigureKGTK (the same
# $TEMP the kgtk commands use) instead of hard-coding the project name, so the
# cell keeps working when the parameters at the top of the notebook change.
instances_fn = os.path.join(os.environ['TEMP'], 'raw_instances.kgtk.gz')
instances_df = pandas.read_csv(instances_fn, sep='\t')
instances_out = os.path.join(os.environ['TEMP'], 'clean_instances.kgtk')

In [42]:
# Strip the Wikidata prefixes from the instances and write them to instances_out.
clean_df(instances_df, instances_out)

In [43]:
# Deduplicate the cleaned instances file into the compressed instances graph.
!kgtk deduplicate -i $TEMP/clean_instances.kgtk -o $TEMP/instances.kgtk.gz

In [44]:
# Peek at the first rows of the instances graph.
kgtk("""head -i $TEMP/instances.kgtk.gz""")

Unnamed: 0,node1,label,node2
0,imgflipi,prefix_expansion,https://imgflip.com/i/
1,imgflipi:100035,imgflipr:alt_text,I Should Buy A Boat Cat | I SHOULD WATCH MAD ...
2,imgflipi:100035,imgflipr:image_url,https://i.imgflip.com/100035.jpg
3,imgflipi:100035,imgflipr:template,imgflipmeme:I-Should-Buy-A-Boat-Cat
4,imgflipi:100035,imgflipr:templateId,1367068
5,imgflipi:100035,imgflipr:template_title,I Should Buy A Boat Cat
6,imgflipi:100035,imgflipr:title,After watching the Oscars for 20 minutes and s...
7,imgflipi:100035,imgflipr:upvote_count,1
8,imgflipi:100035,imgflipr:view_count,2437
9,imgflipi:100035,m4s:fromCaption,Q3275141


Let's validate that the import worked well:

In [45]:
# List the distinct edge labels that occur in the instances graph.
!kgtk query -i $TEMP/instances.kgtk.gz \
    --match '()-[r]->()' \
    --return 'distinct r.label as Relation'

Relation
prefix_expansion
imgflipr:alt_text
imgflipr:image_url
imgflipr:template
imgflipr:templateId
imgflipr:template_title
imgflipr:title
imgflipr:upvote_count
imgflipr:view_count
m4s:fromCaption
imgflipr:author


### 1c. Import mappings between KYM and ImgFlip

In [74]:
%%time
# Convert the mappings N-Triples file to KGTK, using the prefixes from
# namespaces.tsv; the result is written to $TEMP/raw_mappings.kgtk.gz.
# NOTE(review): this reads imkg04/mappings.nt, whereas `mapping_file` defined
# earlier points to imkg04/kym2imgflip.sameAs(m4s).nt - confirm which file is
# intended. The --debug flag also looks like a debugging leftover.
kgtk("""
    import-ntriples --debug -i "imkg04/mappings.nt"  
        --namespace-file "namespaces.tsv"
        -o $TEMP/raw_mappings.kgtk.gz
        --namespace-id-use-uuid True 
        --build-new-namespaces False
        --output-only-used-namespaces True 
        --structured-value-label m4s:structured_value 
        --structured-uri-label m4s:structured_uri 
        --newnode-prefix node 
        --newnode-use-uuid True
    """)

Starting 'import-ntriples' on pid 14171.

CPU times: user 5.41 ms, sys: 77.5 ms, total: 82.9 ms
Wall time: 853 ms


In [75]:
# Build the paths from the TEMP folder configured by ConfigureKGTK (the same
# $TEMP the kgtk commands use) instead of hard-coding the project name, so the
# cell keeps working when the parameters at the top of the notebook change.
mappings_fn = os.path.join(os.environ['TEMP'], 'raw_mappings.kgtk.gz')
mappings_df = pandas.read_csv(mappings_fn, sep='\t')
mappings_out = os.path.join(os.environ['TEMP'], 'clean_mappings.kgtk')

In [76]:
# Strip the Wikidata prefixes from the mappings and write them to mappings_out.
clean_df(mappings_df, mappings_out)

In [77]:
# Deduplicate the cleaned mappings file into the compressed mappings graph.
!kgtk deduplicate -i $TEMP/clean_mappings.kgtk -o $TEMP/mappings.kgtk.gz

In [80]:
# Display the deduplicated mappings graph.
kgtk("""cat -i $TEMP/mappings.kgtk.gz""")

Unnamed: 0,node1,label,node2
0,Q104005472,P6760,kym:spongegar-primitive-sponge-caveman-spongebob
1,Q104841082,P6760,kym:the-cake-is-a-lie
2,Q104858864,P6760,kym:rule-63
3,Q104968209,P6760,kym:i-am-once-again-asking-for-your-financial-...
4,Q1050827,P6760,kym:waluigi
...,...,...,...
483,imgflipr:memetemplate/Zombie-Overly-Attached-G...,m4s:templateOf,kym:overly-attached-girlfriend
484,kym,prefix_expansion,https://knowyourmeme.com/memes/
485,m4s,prefix_expansion,https://meme4.science/
486,wde,prefix_expansion,http://www.wikidata.org/entity/


## 2. Enrich with Wikidata now

### 2a. Data where memes are subjects or objects

We start with relations where the meme Qnode is a subject in Wikidata:

In [88]:
# For every node with a P6760 edge in the mappings graph (the meme Qnodes),
# fetch all edges of the Wikidata items file in which it is the subject.
!kgtk query -i $items -i $TEMP/mappings.kgtk.gz \
    --match 'mapping: (meme_qid)-[:P6760]->(), \
            item: (meme_qid)-[mrel]->(mval)' \
    --return 'meme_qid as node1, mrel.label as label, mval as node2' \
    -o $TEMP/wikidata_sub.kgtk.gz

ERROR! Session/line number was not unique in database. History logging moved to new session 473


In [106]:
# Count the distinct subjects among the retrieved statements.
!kgtk query -i $TEMP/wikidata_sub.kgtk.gz \
    --match '(n1)-[r]->()' \
    --return 'count (distinct n1)'

count(DISTINCT graph_36_c1."node1")
240


We get information for 240 distinct memes as subjects. What about memes as objects?

In [90]:
# For every node with a P6760 edge in the mappings graph (the meme Qnodes),
# fetch all edges of the Wikidata items file in which it is the object.
!kgtk query -i $items -i $TEMP/mappings.kgtk.gz \
    --match 'mapping: (meme_qid)-[:P6760]->(), \
            item: (mval)-[mrel]->(meme_qid)' \
    --return 'mval as node1, mrel.label as label, meme_qid as node2' \
    -o $TEMP/wikidata_obj.kgtk.gz

Combine and deduplicate:

In [91]:
# Merge the subject and object statements and drop duplicates.
!kgtk cat -i $TEMP/wikidata_sub.kgtk.gz -i $TEMP/wikidata_obj.kgtk.gz / deduplicate -o $TEMP/wikidata_memes.kgtk.gz

In [92]:
# Count the edges in the merged Wikidata meme statements.
!kgtk query -i $TEMP/wikidata_memes.kgtk.gz \
    --match '(n1)-[]->()' \
    --return 'count(n1)'

count(graph_31_c1."node1")
1587


In [93]:
# Merge templates, mappings and instances (without Wikidata data) and drop duplicates.
!kgtk cat -i $TEMP/templates.kgtk.gz -i $TEMP/mappings.kgtk.gz -i $TEMP/instances.kgtk.gz / deduplicate -o $TEMP/combined.kgtk.gz

We now combine Wikidata information with the original graph:

In [94]:
# Merge the Wikidata meme statements with templates, mappings and instances and drop duplicates.
!kgtk cat -i $TEMP/wikidata_memes.kgtk.gz -i $TEMP/templates.kgtk.gz -i $TEMP/mappings.kgtk.gz -i $TEMP/instances.kgtk.gz / deduplicate -o $TEMP/combined_with_wd.kgtk.gz

### 2b. Data about Wikidata entities from the combined graph

Next, let's obtain data about the other Qnodes that are not memes. Here, we want to get relations for Qnode pairs in our graph in Wikidata. So, we query Wikidata for statements where both node1 and node2 are in our graph:

In [95]:
# Step 1: for every edge of the combined graph emit two "member of set1" edges,
# one for its node1 and one for its node2 (--multi 2), then deduplicate and
# add ids. Step 2: feed that node list in as `gnodes` and keep the edges of
# the Wikidata items file whose subject and object both occur in it.
!kgtk query -i $TEMP/combined_with_wd.kgtk.gz \
    --match '(x)-->(y)' \
     --return 'x as node1, "member" as label, "set1" as node2, y as node1, "member" as label, "set1" as node2' \
     --multi 2 \
     / deduplicate / add-id / \
     query -i - --as gnodes --idx mode:valuegraph -i $items --idx mode:graph \
     --match 'item:  (x)-[r]->(y), \
              gnodes: (x)-->(), \
                      (y)-->()' \
    --return 'distinct x, r.label, y' \
    -o $TEMP/wikidata_ent.kgtk.gz

In [96]:
# Show the last 30 lines of the result.
# NOTE(review): `gzcat` is not available on every platform; `gzip -dc` is a portable alternative.
!gzcat $TEMP/wikidata_ent.kgtk.gz | tail -30

Q998999	P495	Q16
Q998999	P641	Q159992
Q999	P189	Q30
Q999	P279	Q19605
Q999	P31	Q11344
Q999	P361	Q19605
Q999	P61	Q173028
Q999159	P1192	Q5503
Q999159	P131	Q171689
Q999159	P17	Q142
Q999159	P30	Q46
Q999159	P31	Q928830
Q999259	P2341	Q11772
Q999278	P279	Q205555
Q9993	P361	Q21204
Q9993	P361	Q8229
Q999587	P136	Q5937792
Q999587	P407	Q1860
Q999587	P495	Q30
Q999587	P674	Q15220681
Q999587	P7937	Q8261
Q999587	P840	Q1297
Q999591	P136	Q591990
Q999591	P31	Q105543609
Q999591	P7937	Q7366
Q999646	P2283	Q1420
Q99997	P17	Q55
Q99997	P31	Q532
Q999981	P1269	Q162297
Q999981	P1889	Q1147070


In [97]:
# Count the edges retrieved between entities of our graph.
!kgtk query -i $TEMP/wikidata_ent.kgtk.gz \
    --match '(n1)-[]->(n2)' \
    --return 'count(n1)'

count(graph_42_c1."node1")
504781


## 3. Merge all in IMKG


In [103]:
# Merge all statements retrieved from Wikidata into a single file.
# NOTE(review): wd.kgtk.gz is not read again in the cells visible here.
!kgtk cat -i $TEMP/wikidata_memes.kgtk.gz -i $TEMP/wikidata_ent.kgtk.gz / deduplicate -o $TEMP/wd.kgtk.gz

In [104]:
# Add the entity-to-entity Wikidata statements to the combined graph to obtain the final IMKG file.
!kgtk cat -i $TEMP/combined_with_wd.kgtk.gz -i $TEMP/wikidata_ent.kgtk.gz / deduplicate -o $TEMP/imkg.kgtk.gz

In [105]:
# Display the final IMKG graph.
kgtk("""cat -i $TEMP/imkg.kgtk.gz""")

Unnamed: 0,node1,label,node2
0,P1651,P10726,Q63412991
1,P1651,P1855,Q5230628
2,P1651,P9073,Q866
3,P21,P1629,Q290
4,P21,P1855,Q1395624
...,...,...,...
16549805,rdf,prefix_expansion,http://www.w3.org/1999/02/22-rdf-syntax-ns#
16549806,rdfs,prefix_expansion,http://www.w3.org/2000/01/rdf-schema#
16549807,skos,prefix_expansion,http://www.w3.org/2004/02/skos/core#
16549808,wde,prefix_expansion,http://www.wikidata.org/entity/


## 4. Create label file (optional, for visualization)

In [None]:
# Collect the label edges of every node that occurs as node1 or node2 of an
# edge in the templates graph, then drop duplicates.
!kgtk query -i $label -i $TEMP/templates.kgtk.gz --force \
            --match 'templates: (n1)-[]->(n2), \
                label: (n)-[r]->(l)' \
            --where 'n1=n OR n2=n' \
            --return 'n as node1, r.label as label, l as node2' / deduplicate \
            -o $TEMP/labelfile_templates.kgtk.gz 

In [None]:
!wc -l $TEMP/labelfile_templates.kgtk.gz

In [None]:
# Inspect the template labels. The merged labelfile.kgtk.gz is only written by
# the final `kgtk cat` cell of this section, so at this point the templates
# label file is the one that exists.
kgtk("""cat -i $TEMP/labelfile_templates.kgtk.gz""")

In [None]:
# Collect the label edges of every node that occurs as node1 or node2 of an
# m4s:fromCaption edge in the instances graph, then drop duplicates.
!kgtk query -i $label -i $TEMP/instances.kgtk.gz --force \
            --match 'instances: (n1)-[:`m4s:fromCaption`]->(n2), \
                label: (n)-[r]->(l)' \
            --where 'n1=n OR n2=n' \
            --return 'n as node1, r.label as label, l as node2' / deduplicate \
            -o $TEMP/labelfile_instances.kgtk.gz 

In [None]:
!wc -l $TEMP/labelfile_instances.kgtk.gz

In [None]:
!kgtk cat -i $TEMP/labelfile_templates.kgtk.gz -i $TEMP/labelfile_instances.kgtk.gz -o $TEMP/labelfile.kgtk.gz