In [7]:
### IMPORTS ###
import rdflib
import pickle
import os
import urllib.parse

In [8]:
### SIMILARITY MEASURES ###

def conjuction(first: set, second: set) -> set:
    """Return the elements that occur in both ``first`` and ``second``."""
    common = first.intersection(second)
    return common

def disjunction(first: set, second: set) -> set:
    """Return every element that occurs in ``first``, in ``second``, or in both."""
    combined = first.union(second)
    return combined
def difference(first: set, second: set) -> set:
    """Compute the set of elements of the first collection that are not in the second."""
    return first.difference(second)

def compute_sim(first: set, second: set, sim_type: str) -> tuple:
    """Compute the similarity of 2 classes from their instance sets.

    Args:
        first: Instances of the first class.
        second: Instances of the second class.
        sim_type: Name of the similarity measure. Only ``"jaccard"`` is
            implemented.

    Returns:
        A tuple ``(sim, conj, disj, f_diff, s_diff)`` holding the similarity
        score, the intersection, the union, ``first - second`` and
        ``second - first``.

    Raises:
        ValueError: If ``sim_type`` is not an implemented measure.
    """
    # Reject unknown measures before doing any set arithmetic.
    if sim_type != "jaccard":
        raise ValueError('Similarity measure not implemented: {}'.format(sim_type))

    conj = first.intersection(second)
    disj = first.union(second)
    f_diff = first.difference(second)
    s_diff = second.difference(first)

    # Two empty sets have an empty union; treat them as identical (1.0)
    # instead of failing with ZeroDivisionError.
    sim = len(conj) / len(disj) if disj else 1.0

    return sim, conj, disj, f_diff, s_diff

In [9]:
### INSTANCE RETRIEVAL ###

def lmdb_to_dbp_links(graph: rdflib.Graph, cls: str, unquote: bool):
    """Get all owl:sameAs URIs that link instances of an LMDB class to DBpedia.

    Args:
        graph: Graph to run the SPARQL query against.
        cls: Local name of the class inside the ``lmdbm:`` namespace
            (``http://data.linkedmdb.org/resource/movie/``). It is
            interpolated verbatim into the query text.
        unquote: If true, percent-decode every URI with
            ``urllib.parse.unquote``; otherwise return the URIs as stored.

    Returns:
        A list with one URI string per result row. Only ``owl:sameAs``
        objects whose string form matches the regex ``dbpedia.org`` are kept.
    """
    # ``a`` is the SPARQL keyword for rdf:type. Using it (as dbp_subjects
    # does) keeps the query independent of an ``rdf:`` prefix that this
    # query never declares.
    results = graph.query("""
            PREFIX lmdbm: <http://data.linkedmdb.org/resource/movie/>
            PREFIX owl: <http://www.w3.org/2002/07/owl#>
            SELECT ?obj WHERE {{
                ?sub a lmdbm:{} .
                ?sub owl:sameAs ?obj .
                FILTER REGEX(STR(?obj), 'dbpedia.org') .
            }}""".format(cls))

    # Each result row is indexed positionally; column 0 is ?obj.
    links = [str(row[0]) for row in results]
    if unquote:
        links = [urllib.parse.unquote(link) for link in links]
    return links

def dbp_subjects(graph: rdflib.Graph, cls: str, unquote: bool):
    """Collect the URI of every subject typed as the given DBpedia ontology class.

    ``cls`` is the local name inside the ``dbo:`` namespace and is inserted
    verbatim into the query. With ``unquote`` set, each URI is percent-decoded
    via ``urllib.parse.unquote`` before being returned.
    """
    sparql = """
                PREFIX dbo: <http://dbpedia.org/ontology/>
                SELECT ?sub WHERE {{
                    ?sub a dbo:{} .
            }}""".format(cls)

    # Column 0 of each result row is ?sub.
    uris = [str(row[0]) for row in graph.query(sparql)]
    if not unquote:
        return uris
    return [urllib.parse.unquote(uri) for uri in uris]

In [10]:
### LOAD FILM GRAPHS ###
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
# The files are opened in ``with`` blocks so the handles are closed once loading finishes.
with open(os.path.join('Data', 'LMDB', 'FilmGraph.pkl'), 'rb') as lmdb_film_file:
    lmdb_film_graph = pickle.load(lmdb_film_file)
with open(os.path.join("Data", 'LodALot', 'DBPediaFilmGraph.pkl'), 'rb') as dbp_film_file:
    dbp_film_graph = pickle.load(dbp_film_file)

In [11]:
### GET FILM INSTANCES TO COMPARE ###
# Both URI lists are requested percent-decoded (unquote=True).
lmdb_dbp_film_same_as = lmdb_to_dbp_links(
    lmdb_film_graph, cls='film', unquote=True
)
dbp_film_subj = dbp_subjects(
    dbp_film_graph, cls='Film', unquote=True
)

In [12]:
### COMPARE FILM INSTANCES ###
# Jaccard similarity between the DBpedia URIs linked from LMDB films and the
# DBpedia Film subjects, plus the sizes of the overlap and of each difference.
sim_film, conj_film, disj_film, lmdb_dbp_film, dbp_lmdb_film = compute_sim(
    set(lmdb_dbp_film_same_as), set(dbp_film_subj), 'jaccard'
)
print(' SIMILARITY: ', sim_film)
print('CONJUNCTION: ', len(conj_film))
print('DISJUNCTION: ', len(disj_film))
# The backslash is escaped: a bare '\ ' is an invalid escape sequence and
# triggers a SyntaxWarning on recent Python versions. The printed text is unchanged.
print(' LMDB \\ DBP: ', len(lmdb_dbp_film))
print(' DBP \\ LMDB: ', len(dbp_lmdb_film))

SIMILARITY:  0.030823180147241953
CONJUNCTION:  9282
DISJUNCTION:  301137
 LMDB \ DBP:  1243
 DBP \ LMDB:  290612


In [13]:
### LOAD ACTOR GRAPHS ###
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
# The files are opened in ``with`` blocks so the handles are closed once loading finishes.
with open(os.path.join('Data', 'LMDB', 'ActorGraph.pkl'), 'rb') as lmdb_actor_file:
    lmdb_actor_graph = pickle.load(lmdb_actor_file)
with open(os.path.join("Data", 'LodALot', 'DBPediaActorGraph.pkl'), 'rb') as dbp_actor_file:
    dbp_actor_graph = pickle.load(dbp_actor_file)

In [14]:
### GET ACTOR INSTANCES TO COMPARE ###
# Both URI lists are requested percent-decoded (unquote=True).
lmdb_dbp_actor_same_as = lmdb_to_dbp_links(
    lmdb_actor_graph, cls='actor', unquote=True
)
dbp_actor_subj = dbp_subjects(
    dbp_actor_graph, cls='Actor', unquote=True
)

In [15]:
### COMPARE ACTOR INSTANCES ###
# Jaccard similarity between the DBpedia URIs linked from LMDB actors and the
# DBpedia Actor subjects, plus the sizes of the overlap and of each difference.
sim_actor, conj_actor, disj_actor, lmdb_dbp_actor, dbp_lmdb_actor = compute_sim(
    set(lmdb_dbp_actor_same_as), set(dbp_actor_subj), 'jaccard'
)
print('SIMILARITY: ', sim_actor)
print('CONJUNCTION: ', len(conj_actor))
print('DISJUNCTION: ', len(disj_actor))
# The backslash is escaped: a bare '\ ' is an invalid escape sequence and
# triggers a SyntaxWarning on recent Python versions. The printed text is unchanged.
print(' LMDB \\ DBP: ', len(lmdb_dbp_actor))
print(' DBP \\ LMDB: ', len(dbp_lmdb_actor))

SIMILARITY:  0.008031509140735292
CONJUNCTION:  1354
DISJUNCTION:  168586
 LMDB \ DBP:  560
 DBP \ LMDB:  166672
