In [83]:
### IMPORTS ###
import rdflib
import pickle
import os
import urllib.parse
import csv

In [63]:
### INSTANCE RETRIEVAL ###

def get_lmdb_instances(graph: rdflib.Graph, cls: str, unquote: bool):
    """Collect LMDB instances of a class that carry an owl:sameAs link to DBpedia.

    Runs a SPARQL query on ``graph`` that selects every subject typed
    ``lmdbm:<cls>`` together with each of its ``owl:sameAs`` targets whose
    string form matches the regex ``dbpedia.org``.

    Args:
        graph: Graph to query.
        cls: Local name of the class, appended to the ``lmdbm:`` prefix
            (e.g. ``'film'``). It is interpolated into the query text
            verbatim, so it must come from trusted code, not user input.
        unquote: Selects the return shape, see below.

    Returns:
        If ``unquote`` is true: a list of ``(subject_uri, link_uri)`` string
        tuples, with ``link_uri`` percent-decoded.
        If ``unquote`` is false: a list of subject URI strings only (the
        links are dropped).
        In both cases there is one entry per query result row.

    NOTE(review): the two return shapes differ; this looks unintentional but
    is preserved so that existing callers keep working -- confirm before
    unifying.
    """
    # `rdf:` is declared explicitly so the query does not depend on whatever
    # prefixes happen to be bound on the graph's namespace manager.
    results = graph.query("""
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            PREFIX lmdbm: <http://data.linkedmdb.org/resource/movie/>
            PREFIX owl: <http://www.w3.org/2002/07/owl#>
            SELECT ?sub ?link WHERE {{
                ?sub rdf:type lmdbm:{} .
                ?sub owl:sameAs ?link .
                FILTER REGEX(STR(?link), 'dbpedia.org') .
            }}""".format(cls))
    if unquote:
        return [(str(row[0]), urllib.parse.unquote(str(row[1]))) for row in results]
    return [str(row[0]) for row in results]

def get_dbp_instances(graph: rdflib.Graph, cls: str, unquote: bool):
    """Return the subject URIs of all instances of a DBpedia ontology class.

    Queries ``graph`` for every subject that is ``a dbo:<cls>`` and returns
    the subjects as a list of strings, one per result row. When ``unquote``
    is true each URI is percent-decoded with ``urllib.parse.unquote``;
    otherwise the URIs are returned as stored.
    """
    sparql = """
                PREFIX dbo: <http://dbpedia.org/ontology/>
                SELECT ?sub WHERE {{
                    ?sub a dbo:{} .
            }}""".format(cls)
    subject_uris = [str(row[0]) for row in graph.query(sparql)]
    if not unquote:
        return subject_uris
    return [urllib.parse.unquote(uri) for uri in subject_uris]

In [4]:
### LOAD FILM GRAPHS ###
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
# `with` closes each file handle as soon as the graph has been deserialised.
with open(os.path.join('Data', 'LMDB', 'FilmGraph.pkl'), 'rb') as lmdb_pickle:
    lmdb_film_graph = pickle.load(lmdb_pickle)
with open(os.path.join("Data", 'LodALot', 'DBPediaFilmGraph.pkl'), 'rb') as dbp_pickle:
    dbp_film_graph = pickle.load(dbp_pickle)

In [64]:
### GET FILM INSTANCES TO COMPARE ###
# (LMDB film URI, percent-decoded DBpedia link) pairs taken from the LMDB graph.
lmdb_instances = get_lmdb_instances(lmdb_film_graph, 'film', True)
# Just the DBpedia side of every pair, in the same order.
lmdb_to_dbp = [pair[1] for pair in lmdb_instances]

# Percent-decoded URIs of every subject typed dbo:Film in the DBpedia graph.
dbp_instances = get_dbp_instances(dbp_film_graph, 'Film', True)

In [82]:
### GET INTERSECTION OF INSTANCES ###
# DBpedia URIs that are linked from LMDB and also typed as films in DBpedia.
matched_links = set(lmdb_to_dbp) & set(dbp_instances)
# Keep the (LMDB URI, DBpedia link) pairs whose link is in that intersection.
matched_instances = {pair for pair in lmdb_instances
                     if pair[1] in matched_links}

In [140]:
### WRITE RELEVANT PROPERTIES TO CSV ###
### SOME OF THE MOVIES HAVE MORE THAN ONE RUNTIME REGISTERED; WE PICK ONE ARBITRARILY FOR CONVENIENCE ###
### TOTAL DOUBLE COUNTS 112 / 9582 ###
title_predicate = rdflib.URIRef('http://purl.org/dc/terms/title')
runtime_predicate = rdflib.URIRef('http://data.linkedmdb.org/resource/movie/runtime')

errors = []  # one UniquenessError per film that has several runtime values
rows = []
# Sorted so the CSV row order is reproducible: iteration order of a set of
# strings can differ from one interpreter run to the next.
for instance, link in sorted(matched_instances):
    subject = rdflib.URIRef(instance)
    title = lmdb_film_graph.value(subject=subject,
                                  predicate=title_predicate,
                                  object=None, any=False)
    try:
        runtime = lmdb_film_graph.value(subject=subject,
                                        predicate=runtime_predicate,
                                        object=None, any=False)
    except rdflib.exceptions.UniquenessError as e:
        print('WARNING: Multiple values found')
        errors.append(e)
        # Without this fallback `runtime` would keep the previous film's value
        # (or be undefined on the first iteration). any=True returns one of
        # the available values instead of raising.
        runtime = lmdb_film_graph.value(subject=subject,
                                        predicate=runtime_predicate,
                                        object=None, any=True)
    rows.append({
        'LMDB_URI': str(instance),
        'LMDB_DB_link': str(link),
        'LMDB_title': str(title),
        # -1 marks films without any runtime triple.
        'LMDB_runtime': int(float(runtime)) if runtime is not None else -1
    })

# The file is opened only after all rows were built, so a failure above does
# not truncate an existing CSV; `with` closes it even if writing fails.
# UTF-8 is set explicitly so non-ASCII titles do not depend on the platform's
# default encoding.
with open(os.path.join('Data', 'PropertyMatching', 'LMDBProperties.csv'),
          'w', newline='', encoding='utf-8') as data_file:
    writer = csv.DictWriter(data_file, fieldnames=['LMDB_URI', 'LMDB_DB_link', 'LMDB_title', 'LMDB_runtime'])
    writer.writeheader()
    writer.writerows(rows)

