## Initial embedding pipeline
Prerequisites:<br>
DBpedia subset hosted with a SPARQL endpoint, metacritic-movies dataset cleaned in previous notebook.

Purpose:<br>
This pipeline builds knowledge graph embeddings for movie entities in the metacritic-movies dataset and trains a binary classifier to predict movie ratings (good/bad).

(Hint)
Make sure to select the poetry kernel. If it does not show up, try to reload your editor.

In [11]:
import pandas as pd
import numpy as np
import os.path
import pickle
import json
from pathlib import Path
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from rdflimeConfig import dbpediaLocation, movieLocation

### Initiate DBpedia and metacritic-movie datasets

In [5]:
# Load the tab-separated movie dataset from the configured movie directory
movieFull = pd.read_csv(os.path.join(movieLocation, "movies_fixed.tsv"), sep="\t")

# Positional split: the first 400 rows are held out for testing, the remainder is used for training
movieTest = movieFull.iloc[:400]
movieTrain = movieFull.iloc[400:]

# DBpedia IRIs of all movies, in dataset order
movies = movieFull["DBpedia_URI"].tolist()

# Check dataset structure
movieFull[movieFull["DBpedia_URI"].str.contains("Matrix")]

Unnamed: 0,Wikidata_URI15,Movie,Release date,DBpedia_URI,label,id,rating,DBpedia_URI15,YAGO_URI15,DBpedia_URI15_Base32
92,http://www.wikidata.org/entity/Q83495,The Matrix,3/31/1999 0:00,http://dbpedia.org/resource/The_Matrix,good,1693,73,http://dbpedia.org/resource/The_Matrix,http://yago-knowledge.org/resource/The_Matrix,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6V...
1154,http://www.wikidata.org/entity/Q189600,The Matrix Reloaded,5/15/2003 0:00,http://dbpedia.org/resource/The_Matrix_Reloaded,good,755,62,http://dbpedia.org/resource/The_Matrix_Reloaded,http://yago-knowledge.org/resource/The_Matrix_...,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6V...


### Build DBpedia embeddings

Train and store a PyRDF2Vec transformer with various parameter settings.

In [14]:
# Train one RDF2Vec transformer per (algorithm, vector size) combination and
# store it in the "transformers" folder below movieLocation.
for algo in [0,1]: # 0: CBOW, 1: SG
    for vsize in [50, 100, 200]: # Vector size of embeddings
        print(algo, vsize)

        # The same path is used for the cache check and for saving. Previously the
        # check looked directly in movieLocation while the file was saved in the
        # "transformers" subfolder, so existing transformers were never skipped.
        targetPath = os.path.join(movieLocation, "transformers")
        transformerPath = os.path.join(targetPath, f"rdf2vec_transformer_{'sg' if algo else 'cbow'}_{vsize}")

        if os.path.exists(transformerPath):
            print("Skipping transformer, as it already exists.")
            continue

        dbpedia = KG(dbpediaLocation, skip_verify=False, mul_req=False)

        transformer = RDF2VecTransformer(
            Word2Vec(negative=25, sg=algo, vector_size=vsize),
            walkers=[RandomWalker(max_walks=22, max_depth=2, with_reverse=True, n_jobs=8, md5_bytes=None)],
            verbose=1
        )

        # Extract walks for all movies, fit the embedder on them, then compute the
        # embeddings. Later cells read the result from transformer._embeddings.
        walks = transformer.get_walks(dbpedia, movies[:])
        transformer.fit(walks)
        embeddings, literals = transformer.transform(dbpedia, movies[:])

        Path(targetPath).mkdir(parents=True, exist_ok=True)
        transformer.save(transformerPath)

0 50


100%|██████████| 2000/2000 [02:58<00:00, 11.20it/s]


Extracted 967877 walks for 2000 entities (178.8813s)
Fitted 967877 walks (33.0955s)
0 100


100%|██████████| 2000/2000 [03:07<00:00, 10.68it/s]


Extracted 967877 walks for 2000 entities (187.6276s)
Fitted 967877 walks (37.3635s)
0 200


100%|██████████| 2000/2000 [03:20<00:00,  9.98it/s]


Extracted 967877 walks for 2000 entities (200.8808s)
Fitted 967877 walks (52.4809s)
1 50


100%|██████████| 2000/2000 [03:29<00:00,  9.55it/s]


Extracted 967877 walks for 2000 entities (209.7185s)
Fitted 967877 walks (89.9783s)
1 100


100%|██████████| 2000/2000 [03:28<00:00,  9.60it/s]


Extracted 967877 walks for 2000 entities (208.8070s)
Fitted 967877 walks (105.5400s)
1 200


100%|██████████| 2000/2000 [03:22<00:00,  9.86it/s]


Extracted 967877 walks for 2000 entities (203.2874s)
Fitted 967877 walks (151.5139s)


### Store embeddings that are compatible with the Evaluation Framework
As a next step, we would like to evaluate the quality of our generated embeddings. However, we first need to convert them to a format that is readable by [GEval](https://github.com/mariaangelapellegrino/Evaluation-Framework), the graph embedding evaluation framework by Pellegrino et al.

This entails two steps: First, apply the fixes from 1_DBpediaFixes.ipynb in reverse, i.e. "unfix" the IRIs to be compatible with the framework. Second, store the embeddings in the required CSV-like format.

In [15]:
# Load the list of IRI fixes; each entry is read via its "fix" and "original" keys below
with open(os.path.join(movieLocation, "datasetFixes.json"), "r") as fixesFile:
    fixes = json.load(fixesFile)

# Index the fixes by their fixed IRI once instead of scanning the whole list for
# every embedding. setdefault keeps the FIRST entry per IRI, which matches the
# previous first-match search.
fixByIri = {}
for entry in fixes:
    fixByIri.setdefault(entry["fix"], entry)

targetPath = os.path.join(movieLocation, "embeddings")
Path(targetPath).mkdir(parents=True, exist_ok=True)

for algo in ["cbow", "sg"]:
    for vsize in [50, 100, 200]: # Vector size of embeddings

        # NOTE: transformers are pickled objects; only load files created by this pipeline.
        transformer = RDF2VecTransformer.load(
            os.path.join(movieLocation, "transformers", f"rdf2vec_transformer_{algo}_{vsize}")
        )

        with open(os.path.join(targetPath, f"embeddings_{algo}_{vsize}"), "w") as file:
            # Embeddings are paired with the movie IRIs by position
            # (assumes transformer._embeddings is in the same order as `movies` - TODO confirm)
            for index, embedding in enumerate(transformer._embeddings):

                # "Unfix" IRI and replace with the version that the Evaluation Framework by Pellegrino et al. understands
                movie = movies[index]
                fix = fixByIri.get(movie)
                if fix: movie = fix["original"]

                # Write one line per entity: IRI followed by the space-separated vector components
                file.write(f"{movie} {' '.join(map(str, embedding))}\n")

## Evaluation

### Learn final classifier on embeddings
Testing with the framework by Pellegrino et al. (see below) reveals that SVC with C=100 delivers high accuracy on the given task (predicting movie quality). We therefore train and store such a classifier for every embedding variant that was trained. 

In [17]:
# Train and persist one SVC per embedding variant.
targetPath = os.path.join(movieLocation, "classifiers")
Path(targetPath).mkdir(parents=True, exist_ok=True)

for algo in [0,1]: # 0: CBOW, 1: SG
    for vsize in [50,100,200]: # Vector size of embeddings

        variant = f"{'sg' if algo else 'cbow'}_{vsize}"
        transformerPath = os.path.join(movieLocation, "transformers", f"rdf2vec_transformer_{variant}")
        transformer = RDF2VecTransformer.load(transformerPath)

        # probability=True makes SVC fit its probability estimates on internally
        # shuffled data; a fixed random_state keeps the stored classifier
        # reproducible across runs.
        clf = SVC(C=100, probability=True, random_state=42)

        # train-test split as seen in 0_MovieDataSetExploration.ipynb
        # (embeddings from position 400 onwards are paired with the labels of movieTrain)
        clf.fit(transformer._embeddings[400:], movieTrain.label)

        with open(os.path.join(targetPath, f"svc_100_{variant}"), "wb") as file:
            pickle.dump(clf, file)
            

### Evaluation using Evaluation-Framework by Pellegrino et al.
- Due to dependency conflicts, the evaluation cannot be run directly in this environment.
- Instead, use poetry to manage a separate environment for the evaluation framework.

In [29]:
# Path of the "evaluation" folder inside the embeddings directory
# (presumably intended for the evaluation results - TODO confirm, it is not used by the cells shown here)
evalPath = os.path.join(movieLocation, "embeddings", "evaluation")

In [None]:

# poetry add evaluation-framework
from evaluation_framework.manager import FrameworkManager

# Run the classification task of the evaluation framework for every embedding variant.
for algo in ["cbow", "sg"]:
    for vsize in [50, 100, 200]: # Vector size of embeddings
        # The embedding files are written to the "embeddings" subfolder of
        # movieLocation, so they have to be read from there (the previous path
        # pointed at movieLocation itself, where no such file is created).
        p = os.path.join(movieLocation, "embeddings", f"embeddings_{algo}_{vsize}")

        evaluation_manager = FrameworkManager()
        evaluation_manager.evaluate(
            p,
            tasks=["Classification"],
            parallel=False,
            debugging_mode=False,
            vector_size=vsize,
            result_directory_path=os.path.join(movieLocation, f"embedding_evaluation_{algo}_{vsize}")
        )


In [None]:
# Summarise the evaluation results: one row per embedding variant, one column per
# classifier, each holding the best score in percent (rounded to two decimals).
results = pd.read_csv("comparison.csv", sep=" ")

rows = []
for algo in ["cbow", "sg"]:
    for vsize in [50, 100, 200]:
        # Best score per model among all tests whose name contains this variant.
        # Only score_value is aggregated, as it is the only column used below.
        best = results[results.test_name.str.contains(f"{algo}_{vsize}")] \
            .groupby("model")["score_value"] \
            .max()

        row = {"strategy": f"{algo}_{vsize}"}
        for m in ["NB", "KNN", "SVM", "C45"]:
            row[m] = round(best[m]*100, 2)
        rows.append(row)

# Build the table once from all records instead of concatenating inside the loop
tab = pd.DataFrame(rows)

tab

### Experiments

In [None]:
# Inspect the walks stored on the transformer that is currently in scope
walks = transformer._walks

print(f"Number of entities with walks: {len(walks)}")

# First entity: its IRI, how many walks it has, and its first walk
print(movies[0])
print(f"Walks per entity: {len(walks[0])}")
print("First walk of first entity:")
print(walks[0][0])

# First three elements of every walk of the entity at position 1692
for walk in walks[1692]:
    first, second, third = walk[0], walk[1], walk[2]
    print(first, second, third)

# Distance(Matrix, Matrix Reloaded) < Distance(Matrix, The Batman)?
# Can we represent the embeddings in a 2D space for visualization? -> See examples


In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Only the entities from this position onwards are plotted; the same slice is
# used for points, colors, sizes and labels so that they stay aligned.
plotStart = 1600
plotEntities = transformer._entities[plotStart:]

# Reduce the dimensions of entity embeddings to represent them in a 2D plane.
X_tsne = TSNE(random_state=42).fit_transform(transformer._embeddings[plotStart:])

# Look up label and rating by IRI in the full dataset. movieTest only holds the
# first 400 rows and therefore does not contain the entities plotted here.
# drop_duplicates keeps the first row per IRI, like the previous `.iloc[0]` lookup.
movieByUri = movieFull.drop_duplicates(subset="DBpedia_URI").set_index("DBpedia_URI")

# Green for "good" movies, red otherwise; marker size grows with the squared distance of the rating from 50
colors = ["#00ff00" if movieByUri.at[e, "label"] == "good" else "#ff0000" for e in plotEntities]
sizes = [abs(50 - movieByUri.at[e, "rating"])**2 for e in plotEntities]

# Plot the embeddings of entities in a 2D plane, annotating them.
f = plt.figure(figsize=(200, 80))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=colors, s=sizes)
for x, y, t in zip(X_tsne[:, 0], X_tsne[:, 1], plotEntities):
    # Label each point with the last path segment of its IRI
    plt.annotate(t.split("/")[-1], (x, y))

# Display the graph with a title, removing the axes for better readability.
plt.title("pyRDF2Vec", fontsize=4)
plt.axis("off")
plt.show()

#f.savefig("figure.pdf", bbox_inches='tight')
