In [1]:
import json, csv
from csv import writer, reader

## Functions for the Pipeline

### Helper Functions

In [2]:
def save2json(filename, dump):
    """Serialize ``dump`` as JSON into ``filename``, replacing any existing file.

    Args:
        filename: Path of the file to (over)write.
        dump: Any object ``json.dump`` can serialize.

    Raises:
        TypeError: Propagated from ``json.dump`` when ``dump`` contains a
            value that is not JSON-serializable.
        OSError: Propagated from ``open`` when the file cannot be created.
    """
    # The context manager closes the handle even if json.dump raises, which the
    # previous open()/close() pair did not guarantee.
    with open(filename, "w") as out_file:
        json.dump(dump, out_file, indent = 6)

In [3]:
def save2csv(filename, header, data):
    """Write a header row followed by all rows of ``data`` to a CSV file.

    Fields are comma-separated. A field is quoted only when it needs to be
    (``csv.QUOTE_MINIMAL``), and the quote character is ``|`` rather than the
    csv module's default ``"``.

    Args:
        filename: Path of the CSV file to (over)write.
        header: Sequence of column names, written as the first row.
        data: Iterable of rows; each row is a sequence of field values.
    """
    with open(filename, 'w', newline='') as handle:
        out = csv.writer(
            handle,
            delimiter=',',
            quotechar='|',
            quoting=csv.QUOTE_MINIMAL,
        )
        out.writerow(header)
        out.writerows(data)

## Extract Metadata from KYM (main source)

### Helper Functions

In [4]:
def get_kym_metadata(infile,memefile,outfile):
    """Copy the meme records whose URL is listed in a CSV file to a new JSON file.

    For every row of ``infile`` (in file order) all records of ``memefile``
    whose ``'url'`` equals the row's second column are appended to the output,
    so a URL listed twice is emitted twice.

    Args:
        infile: CSV file; the URL is read from the second column (``row[1]``).
        memefile: JSON file containing a list of dicts, matched on ``'url'``.
        outfile: Path of the JSON file written through ``save2json``.
    """
    with open(memefile) as memes:
        data = json.load(memes)

    # Index the records by URL once instead of rescanning the whole list for
    # every CSV row. Records sharing a URL keep their original relative order.
    by_url = {}
    for meme in data:
        by_url.setdefault(meme.get('url'), []).append(meme)

    dump = [ ]
    with open(infile, newline='') as csvfile:
        for row in reader(csvfile):
            # Blank or single-column lines carry no URL; skip them instead of
            # failing with IndexError.
            if len(row) < 2:
                continue
            dump.extend(by_url.get(row[1], ()))
    save2json(outfile,dump)

In [5]:
def get_textual_entities(infile,entity_file,outfile,mapping_file='dbpedia-wikidata.json'):
    """Collect the Spotlight annotations of the URLs listed in a CSV file.

    Each URL found in both ``infile`` and ``entity_file`` is emitted once, in
    order of first appearance. Its record gets a ``'url'`` key, and every item
    of its ``"Resources"`` list is enriched with:

    * ``'QID'`` - the Wikidata page URL, when the DBpedia resource name is a
      key of the mapping file;
    * ``'@typeList'`` - the ``@types`` string split on commas, with the
      ``DUL:``/``Wikidata:``/``Schema:`` prefixes expanded, DBpedia entries
      removed, and any entry that is a key of the mapping replaced by its
      Wikidata page URL.

    Resources whose ``@types`` is not a string get an empty ``'@typeList'``
    and are reported in ``errors.csv``.

    Args:
        infile: CSV file; the URL is read from the second column (``row[1]``).
        entity_file: JSON object keyed by URL.
        outfile: Path of the JSON file written through ``save2json``.
        mapping_file: JSON object mapping DBpedia names to Wikidata ids.
            Defaults to the previously hard-coded ``'dbpedia-wikidata.json'``.

    Side effects:
        Rewrites ``errors.csv`` in the working directory (header only when
        there is nothing to report).
    """
    # Loading file with dbpedia entities
    with open(entity_file) as efile:
        data = json.load(efile)
    # Mapping DBPedia entities to Wikidata
    with open(mapping_file) as jsonfile:
        db2wdmapping = json.load(jsonfile)

    memes_entities = []
    errors = []
    # A set of plain URL strings. The previous list stored one-element lists,
    # so the membership test never matched and duplicates slipped through.
    seen = set()
    with open(infile, newline='') as csvfile:
        for row in reader(csvfile):
            # Blank or single-column lines carry no URL.
            if len(row) < 2:
                continue
            url = row[1]
            if url not in data or url in seen:
                continue
            seen.add(url)
            entry = data[url]
            entry['url'] = url
            for r in entry.get("Resources") or []:
                res = r['@URI'].replace('http://dbpedia.org/resource/','')
                if res in db2wdmapping:
                    r['QID'] = 'https://www.wikidata.org/wiki/'+db2wdmapping[res]
                raw_types = r.get("@types")
                if isinstance(raw_types, str):
                    r["@typeList"] = _spotlight_type_list(raw_types, db2wdmapping)
                else:
                    # Previously this path reused the type list of the prior
                    # resource (or raised NameError on the first one).
                    r["@typeList"] = []
                    errors.append([url, res, raw_types])
            memes_entities.append(entry)
    save2json(outfile, memes_entities)
    save2csv('errors.csv',['kym','entity','@types'],errors)


def _spotlight_type_list(raw_types, db2wdmapping):
    """Expand a comma-separated ``@types`` string into a list of type identifiers."""
    expanded = (
        raw_types
        .replace("DUL:", "http://www.loa-cnr.it/ontologies/DOLCE-Lite#")
        .replace("Wikidata:", "https://www.wikidata.org/wiki/")
        .replace("Schema:", "https://schema.org/")
        .split(",")
    )
    # Drop empty fragments and every DBpedia entry.
    kept = [t for t in expanded if t and 'DBpedia' not in t]
    # Build a new list: rebinding the loop variable, as the old code did,
    # never changed the stored values.
    return [
        'https://www.wikidata.org/wiki/'+db2wdmapping[t] if t in db2wdmapping else t
        for t in kept
    ]

In [6]:
def get_vision_entities(infile,entity_file,outfile):
    """Collect the vision annotations of the URLs listed in a CSV file.

    Each URL found in both ``infile`` and ``entity_file`` is emitted once, in
    order of first appearance, with the URL stored under the record's
    ``'url'`` key.

    Args:
        infile: CSV file; the URL is read from the second column (``row[1]``).
        entity_file: JSON object keyed by URL.
        outfile: Path of the JSON file written through ``save2json``.
    """
    with open(entity_file) as memes:
        data = json.load(memes)

    vision = []
    # A set of plain URL strings. The previous list stored one-element lists,
    # so the membership test never matched and duplicates slipped through.
    seen = set()
    with open(infile, newline='') as csvfile:
        for row in reader(csvfile):
            # Blank or single-column lines carry no URL.
            if len(row) < 2:
                continue
            url = row[1]
            if url in data and url not in seen:
                seen.add(url)
                entry = data[url]
                entry['url'] = url
                vision.append(entry)
    save2json(outfile, vision)

### Input Files

In [7]:
# JSON file passed to get_kym_metadata as `memefile`; that function iterates it and compares each entry's 'url'.
kym_memes = 'kym.memes.json'

In [8]:
# JSON file passed to get_textual_entities as `entity_file`; it is looked up by the URL read from each CSV row.
kym_spotlight_entities = 'kym.spotlight.json'

In [9]:
# JSON file passed to get_vision_entities as `entity_file`; it is looked up by the URL read from each CSV row.
kym_vision = 'kym.vision.json'


### Getting the subset that is present in Wikidata

In [10]:
# CSV that selects the subset: the extraction functions read the URL from its second column (row[1]).
# NOTE(review): presumably the first column holds the Wikidata entity - confirm against the file.
wikidata_kym_aliases = 'aliases.csv'

In [11]:
# Writes 'wiki.kym.memes.json', the file extract_siblings reads later in this notebook.
get_kym_metadata(wikidata_kym_aliases,kym_memes,'wiki.'+kym_memes)

### Get the Spotlight entities associated with the KYM subset from Wikidata

In [12]:
# Writes 'wiki.kym.spotlight.json'; the function also reads 'dbpedia-wikidata.json' and rewrites 'errors.csv' in the working directory.
get_textual_entities(wikidata_kym_aliases,kym_spotlight_entities,'wiki.'+kym_spotlight_entities)

### Get the Google Vision entities associated with the KYM subset from Wikidata

In [13]:
# Writes 'wiki.kym.vision.json'.
get_vision_entities(wikidata_kym_aliases,kym_vision,'wiki.'+kym_vision)

# Collect siblings

In [14]:
# CSV written by extract_siblings (columns: meme, sibling) and then reused as the URL list for the sibling pass.
siblings_file = 'siblings.csv'

In [15]:
def extract_siblings(infile, outfile):
    """Write every (meme URL, sibling) pair found in a JSON file to a CSV file.

    Args:
        infile: JSON file containing a list of dicts. Entries without a
            ``'siblings'`` key are skipped; for the others, ``'url'`` and each
            item of ``'siblings'`` form one output row.
        outfile: Path of the CSV file written through ``save2csv`` with the
            header ``meme,sibling``.
    """
    with open(infile) as inF:
        data2filter = json.load(inF)

    # Built in a single pass. The previous `siblings = siblings + [[...]]`
    # copied the whole list for every pair.
    siblings = [
        [m['url'], s]
        for m in data2filter
        if 'siblings' in m
        for s in m['siblings']
    ]
    save2csv(outfile,['meme',"sibling"],siblings)

In [16]:
# The input name equals 'wiki.' + kym_memes, i.e. the file written by the first get_kym_metadata call.
extract_siblings('wiki.kym.memes.json', siblings_file)

## Add Meme Information from KYM

In [17]:
# In siblings.csv the second column is the sibling, so this selects the siblings' records; writes 'siblings.kym.memes.json'.
get_kym_metadata(siblings_file,kym_memes,'siblings.'+kym_memes)

## Add Spotlight Entities

In [18]:
# Writes 'siblings.kym.spotlight.json' and rewrites 'errors.csv', replacing the one from the earlier call.
get_textual_entities(siblings_file,kym_spotlight_entities,'siblings.'+kym_spotlight_entities)

## Add Vision Entities

In [19]:
# Writes 'siblings.kym.vision.json'.
get_vision_entities(siblings_file,kym_vision,'siblings.'+kym_vision)

# Conversion to RDF

In [21]:
# Runs mapper.jar from the working directory; presumably -m names the mapping file and -o the output file - confirm against the mapper's usage text.
# NOTE(review): the recorded output of this cell is `^C`, so the saved run was interrupted.
!java -jar mapper.jar -m mappings.ttl -o $(pwd)/imkg.nt

^C


In [27]:
# NOTE(review): this previews test.nt, while the mapper cell writes imkg.nt and the graph is later parsed from imkg.nt - confirm which file is intended.
!head test.nt

<http://www.wikidata.org/entity/Q112977858> <https://www.wikidata.org/wiki/Property:P6760> <https://knowyourmeme.com/memes/yao-ming-face-bitch-please>.
<http://www.wikidata.org/entity/Q113126434> <https://www.wikidata.org/wiki/Property:P6760> <https://knowyourmeme.com/memes/soy-boy-face-soyjak>.
<http://www.wikidata.org/entity/Q54> <https://www.wikidata.org/wiki/Property:P6760> <https://knowyourmeme.com/memes/all-your-base-are-belong-to-us>.
<http://www.wikidata.org/entity/Q56> <https://www.wikidata.org/wiki/Property:P6760> <https://knowyourmeme.com/memes/lolcats>.
<http://www.wikidata.org/entity/Q149> <https://www.wikidata.org/wiki/Property:P6760> <https://knowyourmeme.com/memes/nyan-cat>.
<http://www.wikidata.org/entity/Q62141> <https://www.wikidata.org/wiki/Property:P6760> <https://knowyourmeme.com/memes/facepalm>.
<http://www.wikidata.org/entity/Q88870> <https://www.wikidata.org/wiki/Property:P6760> <https://knowyourmeme.com/memes/polandball>.
<http://www.wikidata.org/entity/Q13927

In [28]:
! pip install rdflib



In [42]:
import rdflib

In [43]:
from rdflib import Graph as RDFGraph

In [44]:
rg = RDFGraph()
# Loads the N-Triples file from the working directory; as the cell's last expression, parse()'s return value is what the cell displays.
rg.parse('./imkg.nt', format='nt')

<Graph identifier=N7cc607ed587d43508fe6e48889aadf1c (<class 'rdflib.graph.Graph'>)>

In [45]:
# Size of the parsed graph; for an rdflib Graph, len() counts its triples.
len(rg)

148731