## Initial embedding pipeline
Prerequisites:<br>
A DBpedia subset served through a SPARQL endpoint, and the metacritic-movies dataset cleaned in the previous notebook.

Purpose:<br>
This pipeline builds knowledge graph embeddings for movie entities in the metacritic-movies dataset and trains a binary classifier to predict movie ratings (good/bad).

Hint:
Make sure to select the Poetry kernel. If it does not show up, try reloading your editor.

In [2]:
import pandas as pd
import numpy as np
import os.path
import pickle
import json
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from rdflimeConfig import dbpediaLocation, movieLocation

### Initialize the DBpedia and metacritic-movies datasets

In [3]:
# Load the cleaned movie dataset (tab-separated) from the configured location
movie_path = os.path.join(movieLocation, "movies_fixed.tsv")
movieFull = pd.read_csv(movie_path, sep="\t")

# Positional split: the first 1600 rows form the training part, the remainder the test part
movieTrain = movieFull.iloc[:1600]
movieTest = movieFull.iloc[1600:]

# DBpedia URIs of all movies, in dataset order
movies = movieFull["DBpedia_URI"].tolist()

# Sanity check of the dataset structure: show every row whose URI mentions "Matrix"
movieFull[movieFull["DBpedia_URI"].str.contains("Matrix")]

Unnamed: 0,Wikidata_URI15,Movie,Release date,DBpedia_URI,label,id,rating,DBpedia_URI15,YAGO_URI15,DBpedia_URI15_Base32
92,http://www.wikidata.org/entity/Q83495,The Matrix,3/31/1999 0:00,http://dbpedia.org/resource/The_Matrix,good,1693,73,http://dbpedia.org/resource/The_Matrix,http://yago-knowledge.org/resource/The_Matrix,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6V...
1154,http://www.wikidata.org/entity/Q189600,The Matrix Reloaded,5/15/2003 0:00,http://dbpedia.org/resource/The_Matrix_Reloaded,good,755,62,http://dbpedia.org/resource/The_Matrix_Reloaded,http://yago-knowledge.org/resource/The_Matrix_...,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6V...


### Build DBpedia embeddings

In [3]:
# Fit one RDF2Vec transformer per (Word2Vec architecture, vector size) combination
# and save it under movieLocation. Combinations whose file already exists are skipped,
# so re-running this cell only builds what is missing.
for algo in (0, 1):  # 0: CBOW, 1: SG
    algo_name = "sg" if algo else "cbow"

    for vsize in (50, 100, 200):  # Vector size of embeddings
        print(algo, vsize)

        transformer_path = os.path.join(movieLocation, f"rdf2vec_transformer_{algo_name}_{vsize}")
        if not os.path.exists(transformer_path):
            dbpedia = KG(dbpediaLocation, skip_verify=False, mul_req=False)

            random_walker = RandomWalker(
                max_walks=22,
                max_depth=2,
                with_reverse=True,
                n_jobs=8,
                md5_bytes=None,
            )
            transformer = RDF2VecTransformer(
                Word2Vec(negative=25, sg=algo, vector_size=vsize),
                walkers=[random_walker],
                verbose=1,
            )

            # Steps performed explicitly below:
            #   transformer.get_walks(KG, entities)  -> walks
            #   transformer.fit(walks)               -> fits the embedder on the walks
            #   transformer.transform(KG, entities)  -> (embeddings, literals)
            # transformer.fit_transform(KG, entities) is noted by the author as the
            # one-call alternative that also yields the embeddings.
            walks = transformer.get_walks(dbpedia, list(movies))
            transformer.fit(walks)
            embeddings, literals = transformer.transform(dbpedia, list(movies))

            transformer.save(transformer_path)

        

0 50
0 100
0 200


100%|██████████| 2000/2000 [08:54<00:00,  3.75it/s]


Extracted 927694 walks for 2000 entities (534.4617s)
Fitted 927694 walks (120.4660s)
1 50


100%|██████████| 2000/2000 [09:14<00:00,  3.61it/s]


Extracted 927689 walks for 2000 entities (555.2162s)
Fitted 927689 walks (248.2377s)
1 100


100%|██████████| 2000/2000 [09:15<00:00,  3.60it/s]


Extracted 927688 walks for 2000 entities (557.1990s)
Fitted 927688 walks (260.9383s)
1 200


100%|██████████| 2000/2000 [09:39<00:00,  3.45it/s] 


Extracted 927686 walks for 2000 entities (592.8399s)
Fitted 927686 walks (283.8501s)


### Store embeddings in a format compatible with the Evaluation Framework

In [20]:
# Export the embeddings of every stored transformer as plain-text vector files:
# one line per entity, "<URI> <v1> <v2> ...", separated by single spaces.

# Load the URI fixes; each entry is read through its "fix" and "original" keys.
with open(os.path.join(movieLocation, "datasetFixes.json"), "r") as f:
    fixes = json.load(f)

# Index the fixes by their "fix" URI once, instead of scanning the whole list for
# every entity of every configuration. setdefault keeps the first entry per URI,
# which matches the result of a first-match linear search.
# (assumes the "fix" values are strings and therefore hashable — TODO confirm)
fix_by_uri = {}
for entry in fixes:
    fix_by_uri.setdefault(entry["fix"], entry)

for algo in ["cbow", "sg"]:
    for vsize in [50, 100, 200]:  # Vector size of embeddings

        # NOTE: pickle.load can execute arbitrary code; only load transformer files
        # that were written by this pipeline.
        with open(os.path.join(movieLocation, f"rdf2vec_transformer_{algo}_{vsize}"), "rb") as file:
            transformer: RDF2VecTransformer = pickle.load(file)

        with open(os.path.join(movieLocation, f"embeddings_{algo}_{vsize}"), "w") as file:
            for index, embedding in enumerate(transformer._embeddings):

                # "Unfix" URL and replace with the version that the Evaluation Framework by Pellegrino et al. understands
                # (assumes transformer._embeddings is ordered like `movies` — TODO confirm)
                movie = movies[index]
                fix = fix_by_uri.get(movie)
                if fix:
                    movie = fix["original"]

                # Write embedding to file
                line = f"{movie} {' '.join(map(str, embedding))}\n"
                file.write(line)

## Evaluation

### Learn classifier on embeddings

In [4]:
# For every stored RDF2Vec transformer: report the cross-validated accuracy of an SVM
# on the movie labels, then fit a classifier on the training split and pickle it.

# Number of rows in the training split (movieTrain is the leading slice of movieFull)
n_train = len(movieTrain)

for algo in [0, 1]:  # 0: CBOW, 1: SG
    algo_name = "sg" if algo else "cbow"

    for vsize in [50, 100, 200]:  # Vector size of embeddings
        pt = os.path.join(movieLocation, f"rdf2vec_transformer_{algo_name}_{vsize}")
        if not os.path.exists(pt):
            # No transformer has been stored for this configuration; skip it
            continue

        # NOTE: pickle.load can execute arbitrary code; only load files written by this pipeline.
        with open(pt, "rb") as file:
            transformer = pickle.load(file)

        # 10-fold cross-validated accuracy over all movies. Accuracy scoring only calls
        # predict(), so probability estimates (which add an internal 5-fold calibration
        # to every fit) are left disabled for this estimator.
        scores = cross_val_score(
            SVC(C=100), transformer._embeddings, movieFull.label, cv=10, scoring="accuracy"
        )
        print(algo, vsize, round(np.mean(scores), 2))

        # Classifier that is persisted: fitted on the training split only, with
        # probability estimates enabled on the stored model.
        # NOTE(review): the split is positional; confirm the row order of the dataset
        # gives a representative training/test partition.
        pc = os.path.join(movieLocation, f"embedding_classifier_{algo_name}_{vsize}")
        clf = SVC(C=100, probability=True)
        clf.fit(transformer._embeddings[:n_train], movieTrain.label)
        with open(pc, "wb") as file:
            pickle.dump(clf, file)
            

0 50 0.82
0 100 0.84
0 200 0.85
1 50 0.74
1 100 0.76
1 200 0.79


### Evaluation using Evaluation-Framework by Pellegrino et al.
- Might need to run this in a separate project due to dependency conflicts

In [6]:

# poetry add evaluation-framework
from evaluation_framework.manager import FrameworkManager

# Run the "Classification" task of the Evaluation Framework on every exported
# embedding file; each configuration gets its own result directory.
for algo in ("cbow", "sg"):
    for vsize in (50, 100, 200):  # Vector size of embeddings
        run_name = f"{algo}_{vsize}"
        vector_file = os.path.join(movieLocation, f"embeddings_{run_name}")
        result_dir = os.path.join(movieLocation, f"embedding_evaluation_{run_name}")

        evaluation_manager = FrameworkManager()
        evaluation_manager.evaluate(
            vector_file,
            tasks=["Classification"],
            parallel=False,
            debugging_mode=False,
            vector_size=vsize,
            result_directory_path=result_dir,
        )


In [None]:
# Build a summary table with one row per embedding configuration and one column per
# classifier model, holding the best score (as a percentage, two decimals).
# NOTE(review): "comparison.csv" is read relative to the working directory and is
# presumably produced from the Evaluation Framework runs — confirm its origin.
results = pd.read_csv("comparison.csv", sep=" ")

# Collect plain dict rows and create the DataFrame once at the end; growing a
# DataFrame with pd.concat inside the loop copies all previous rows on every step.
rows = []
for algo in ["cbow", "sg"]:
    for vsize in [50, 100, 200]:
        strategy = f"{algo}_{vsize}"

        # Rows whose test name contains this configuration (plain substring match)
        matching = results[results.test_name.str.contains(strategy, regex=False)]

        # Best score per model; only score_value is aggregated because it is the
        # only column read afterwards.
        best_scores = matching.groupby("model")["score_value"].max()

        row = {"strategy": strategy}
        for m in ["NB", "KNN", "SVM", "C45"]:
            # Raises KeyError if the model has no result for this configuration
            row[m] = round(best_scores.loc[m] * 100, 2)
        rows.append(row)

tab = pd.DataFrame(rows)

tab

### Experiments

In [None]:
# Inspect the walks stored on the transformer.
# NOTE(review): `transformer` is whichever object an earlier cell assigned last —
# confirm it is the model you intend to inspect.
walks = transformer._walks

print(f"Number of entities with walks: {len(walks)}")

print(movies[0])
print(f"Walks per entity: {len(walks[0])}")
print("First walk of first entity:")
print(walks[0][0])

# Print the first (up to) three elements of every walk of the entity at position 1692.
# Slicing instead of indexing avoids an IndexError for walks shorter than three elements.
# NOTE(review): the dataset preview lists "The Matrix" at dataframe index 92 with id 1693 —
# verify that position 1692 is the intended entity.
for walk in walks[1692]:
    print(*walk[:3])

# Distance(Matrix, Matrix Reloaded) < Distance(Matrix, The Batman)?
# Can we represent the embeddings in a 2D space for visualization? -> See examples


In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Visualise the embeddings of the test split in 2D: colour encodes the label
# (green = "good", red otherwise), marker size grows with the squared distance of
# the rating from 50.

# The test part starts right after the training rows
n_train = len(movieTrain)
test_entities = transformer._entities[n_train:]

# Reduce the dimensions of entity embeddings to represent them in a 2D plane.
X_tsne = TSNE(random_state=42).fit_transform(transformer._embeddings[n_train:])

# Look up the dataset row of every plotted entity once (first matching row per URI)
test_rows = [movieTest[movieTest.DBpedia_URI == e].iloc[0] for e in test_entities]
colors = ["#00ff00" if row.label == "good" else "#ff0000" for row in test_rows]
sizes = [abs(50 - row.rating) ** 2 for row in test_rows]

# Plot the embeddings of entities in a 2D plane, annotating them.
f = plt.figure(figsize=(200, 80))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=colors, s=sizes)
# Labels must come from the same slice as the plotted points; zipping against the
# full entity list would attach the names of the first entities to the test points.
for x, y, t in zip(X_tsne[:, 0], X_tsne[:, 1], test_entities):
    plt.annotate(t.split("/")[-1], (x, y))

# Display the graph with a title, removing the axes for better readability.
plt.title("pyRDF2Vec", fontsize=4)
plt.axis("off")
plt.show()

#f.savefig("figure.pdf", bbox_inches='tight')


'#ff0000'

In [102]:
# Show the dataset ordered by rating, lowest-rated movies first
movieFull.sort_values("rating", ascending=True)

Unnamed: 0,Wikidata_URI15,Movie,Release date,DBpedia_URI,label,id,rating,DBpedia_URI15,YAGO_URI15,DBpedia_URI15_Base32
1999,http://www.wikidata.org/entity/Q2067539,Bio-Dome,01/12/1996 00:00,http://dbpedia.org/resource/Bio-Dome,bad,1600,1,http://dbpedia.org/resource/Bio-Dome,http://yago-knowledge.org/resource/Bio-Dome,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6Q...
1997,http://www.wikidata.org/entity/Q232248,Two of a Kind,12/16/1983 0:00,http://dbpedia.org/resource/Two_of_a_Kind_(198...,bad,1598,5,http://dbpedia.org/resource/Two_of_a_Kind_(198...,,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6V...
1998,http://www.wikidata.org/entity/Q7943531,Vulgar,4/26/2002 0:00,http://dbpedia.org/resource/Vulgar_(film),bad,1599,5,http://dbpedia.org/resource/Vulgar_(film),http://yago-knowledge.org/resource/Vulgar_(film),NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6V...
399,http://www.wikidata.org/entity/Q3079834,Baby Geniuses,03/12/1999 00:00,http://dbpedia.org/resource/Baby_Geniuses,bad,2000,6,http://dbpedia.org/resource/Baby_Geniuses,http://yago-knowledge.org/resource/Baby_Geniuses,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6Q...
1995,http://www.wikidata.org/entity/Q475136,Miss March,3/13/2009 0:00,http://dbpedia.org/resource/Miss_March,bad,1596,7,http://dbpedia.org/resource/Miss_March,http://yago-knowledge.org/resource/Miss_March,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6T...
...,...,...,...,...,...,...,...,...,...,...
403,http://www.wikidata.org/entity/Q511914,Hoop Dreams,10/14/1994 0:00,http://dbpedia.org/resource/Hoop_Dreams,good,4,98,http://dbpedia.org/resource/Hoop_Dreams,http://yago-knowledge.org/resource/Hoop_Dreams,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6S...
404,http://www.wikidata.org/entity/Q216006,Pan's Labyrinth,12/29/2006 0:00,http://dbpedia.org/resource/Pan's_Labyrinth,good,5,98,http://dbpedia.org/resource/Pan's_Labyrinth,http://yago-knowledge.org/resource/Pan's_Labyr...,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6U...
402,http://www.wikidata.org/entity/Q681225,Army of Shadows,4/28/2006 0:00,http://dbpedia.org/resource/Army_of_Shadows,good,3,99,http://dbpedia.org/resource/Army_of_Shadows,http://yago-knowledge.org/resource/Army_of_Sha...,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6Q...
401,http://www.wikidata.org/entity/Q47703,The Godfather,03/11/1972 00:00,http://dbpedia.org/resource/The_Godfather,good,2,100,http://dbpedia.org/resource/The_Godfather,http://yago-knowledge.org/resource/The_Godfather,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6V...
