## Initial embedding pipeline
Prerequisites:<br>
DBpedia subset hosted with a SPARQL endpoint, relevant datasets (e.g. metacritic-movies) cleaned in previous notebook.

Purpose:<br>
This pipeline builds knowledge graph embeddings for entities of interest (e.g. movies) and trains a classifier to predict the target variable.

(Hint)
Make sure to select the poetry kernel. If it does not show up, try to reload your editor.

In [2]:
import pandas as pd
import os.path
import pickle
import json
from pathlib import Path
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker
from sklearn.svm import SVC
from evaluation_framework.manager import FrameworkManager
from rdflimeConfig import dbpediaLocation, dataLocation, datasets, load_dataset, split_dataset

### Build DBpedia embeddings

Train and store a PyRDF2Vec transformer with various parameter settings.

In [4]:
for cfg in datasets:
    for algo in [0,1]: # 0: CBOW, 1: SG
        for vsize in [50, 100, 200]: # Vector size of embeddings
            print(cfg["name"], algo, vsize)

            dataset, entities = load_dataset(cfg)
            datasetLocation = cfg["location"]
            
            targetPath = os.path.join(datasetLocation, "transformers")
            targetFile = f"rdf2vec_transformer_{'sg' if algo else 'cbow'}_{vsize}"
            if os.path.exists(os.path.join(targetPath, targetFile)):
                print(f"Skipping transformer, as it already exists.")
                continue
            
            dbpedia = KG(dbpediaLocation, skip_verify=False, mul_req=False)
            
            transformer = RDF2VecTransformer(
                Word2Vec(sg=algo, vector_size=vsize), # negative = 25
                walkers=[RandomWalker(max_walks=1, max_depth=4, with_reverse=False, n_jobs=2, md5_bytes=None)], # max_walks = 22, max_depth = 2
                verbose=1
            )

            walks = transformer.get_walks(dbpedia, entities)
            transformer.fit(walks)
            embeddings, literals = transformer.transform(dbpedia, entities)

            Path(targetPath).mkdir(parents=True, exist_ok=True)
            transformer.save(os.path.join(targetPath, targetFile))

metacritic-movies 0 50
Skipping transformer, as it already exists.
metacritic-movies 0 100
Skipping transformer, as it already exists.
metacritic-movies 0 200
Skipping transformer, as it already exists.
metacritic-movies 1 50
Skipping transformer, as it already exists.
metacritic-movies 1 100
Skipping transformer, as it already exists.
metacritic-movies 1 200
Skipping transformer, as it already exists.
metacritic-albums 0 50
Skipping transformer, as it already exists.
metacritic-albums 0 100


KeyboardInterrupt: 

### Store embeddings that are compatible to the Evaluation Framework
In a next step, we would like to evaluate the quality of our generated embeddings. However, we first need to convert to a format that is readable by [GEval](https://github.com/mariaangelapellegrino/Evaluation-Framework), the graph embedding framework by Pellegrino et al.

This entails two steps: First, apply the fixes from 1_DBpediaFixes.ipynb in reverse, i.e. "unfix" the IRIs to be compatible with the framework. Second, store the embeddings in the required CSV-like format.

In [7]:
for cfg in datasets:
    datasetLocation = cfg["location"]
    dataset, entities = load_dataset(cfg)

    for algo in ["cbow", "sg"]:
        for vsize in [50, 100, 200]: # Vector size of embeddings

            with open(os.path.join(datasetLocation, "transformers", f"rdf2vec_transformer_{algo}_{vsize}"), "rb") as file:
                transformer: RDF2VecTransformer = pickle.load(file)

            targetPath = os.path.join(datasetLocation, "..", "embeddings")
            Path(targetPath).mkdir(parents=True, exist_ok=True)

            with open(os.path.join(targetPath, f"embeddings_{algo}_{vsize}"), "a") as file:
                for index, embedding in enumerate(transformer._embeddings):

                    # "Unfix" IRI and replace with the version that the Evaluation Framework by Pellegrino et al. understands
                    entity = entities[index]
                    entity = dataset[dataset[cfg["columns"]["uri_fixed"]] == entity][cfg["columns"]["uri_geval"]].iloc[0].replace(" ", "+")        

                    # Write embedding to file
                    line = f"{entity} {' '.join(map(str,embedding))}\n"
                    file.write(line)

FileNotFoundError: [Errno 2] No such file or directory: '../data/metacritic-albums/transformers/rdf2vec_transformer_cbow_100'

## Evaluation

### Evaluation using Evaluation-Framework by Pellegrino et al.
Load each of our embedding versions and run the classification task on the movie dataset.

In [8]:
evalPath = os.path.join(dataLocation, "embeddings", "evaluation")
Path(evalPath).mkdir(parents=True, exist_ok=True)

for algo in ["cbow", "sg"]:
    for vsize in [50, 100, 200]: # Vector size of embeddings
        print(algo, vsize)

        embeddingPath = os.path.join(dataLocation, "embeddings", f"embeddings_{algo}_{vsize}")
        
        evaluation_manager = FrameworkManager()
        evaluation_manager.evaluate(
            embeddingPath,
            tasks=["Classification"],
            parallel=False,
            debugging_mode=False,
            vector_size=vsize,
            result_directory_path=os.path.join(evalPath, f"geval_result_{algo}_{vsize}")
        )

cbow 50
Start evaluation...
Classification finished
0:02:51
cbow 100
Start evaluation...
Classification finished
0:02:18
cbow 200
Start evaluation...
Classification finished
0:04:23
sg 50
Start evaluation...
Classification finished
0:01:30
sg 100
Start evaluation...
Classification finished
0:02:40
sg 200
Start evaluation...
Classification finished
0:05:07


In [9]:
# Move the results file to the proper location
!mv comparison.csv $evalPath

In [10]:
results = pd.read_csv(os.path.join(evalPath, "comparison.csv"), sep=" ")
tab = pd.DataFrame()

for algo in ["cbow", "sg"]:
    for vsize in [50, 100, 200]:
        r = results[results.test_name.str.contains(f"{algo}_{vsize}")] \
            .groupby("model") \
            .max() \
            .reset_index()

        row = {"strategy": f"{algo}_{vsize}"}
        for m in ["NB", "KNN", "SVM", "C45"]:
            row[m] = round(r[r.model==m].iloc[0]["score_value"]*100, 2)
        tab = pd.concat([tab, pd.DataFrame([row])], ignore_index=True)

tab

Unnamed: 0,strategy,NB,KNN,SVM,C45
0,cbow_50,77.02,86.37,93.54,75.39
1,cbow_100,75.66,89.4,94.97,79.61
2,cbow_200,76.85,89.69,95.44,81.45
3,sg_50,82.76,81.44,87.96,72.92
4,sg_100,82.55,81.9,91.64,70.99
5,sg_200,82.33,82.28,92.88,71.76


### Learn final classifier on embeddings
Testing with the framework by Pellegrino et al. (see above) reveals that SVC with C=100 delivers high accuracy on the given task (predicting movie quality). We therefore train and store such a classifier for every embedding variant that was trained. 

In [10]:
for cfg in datasets:
    for algo in [0,1]: # 0: CBOW, 1: SG
        for vsize in [50,100,200]: # Vector size of embeddings

            print(cfg["name"], algo, vsize)
            dataset, entities = load_dataset(cfg)
            train, test = split_dataset(dataset, cfg)
            datasetLocation = cfg["location"]
            label_col = cfg["columns"]["label"]
        
            transformerPath = os.path.join(datasetLocation, "transformers", f"rdf2vec_transformer_{'sg' if algo else 'cbow'}_{vsize}")
            transformer = RDF2VecTransformer.load(transformerPath)

            clf = SVC(C=100, probability=True)

            # train-test split as seen in 0_MovieDataSetExploration.ipynb
            train_partition = cfg["train_partition"]
            clf.fit(transformer._embeddings[200:1800], dataset[200:1800][label_col]) #test[label_col])

            test_partition = cfg["test_partition"]
            pred = clf.predict(transformer._embeddings[1800:])

            from sklearn.metrics import accuracy_score
            s = accuracy_score(dataset[1800:][label_col], pred)
            print(s)


            targetPath = os.path.join(datasetLocation, "classifiers")
            Path(targetPath).mkdir(parents=True, exist_ok=True)
            
            with open(os.path.join(targetPath, f"svc_100_{'sg' if algo else 'cbow'}_{vsize}" ), "wb") as file:
                pickle.dump(clf, file)

metacritic-movies 0 50
0.97
metacritic-movies 0 100
0.95
metacritic-movies 0 200
0.965
metacritic-movies 1 50
0.95
metacritic-movies 1 100
0.925
metacritic-movies 1 200
0.92
metacritic-albums 0 50


ValueError: Found input variables with inconsistent numbers of samples: [1392, 1400]

## Experiments

In [35]:
walks = transformer._walks

print(f"Number of entities with walks: {len(walks)}")
print(f"Walks per entity: {len(walks[0])}")
print(f"First walk of first entity:")
print(walks[0][0])

# Distance(Matrix, Matrix Reloaded) < Distance(Matrix, The Batman)?
# Can we represent the embeddings in a 2D space for visualization? -> See examples

Number of entities with walks: 2000
Walks per entity: 484
First walk of first entity:
('http://dbpedia.org/resource/Category:Romanian_films_by_genre', 'http://www.w3.org/2004/02/skos/core#broader', 'http://dbpedia.org/resource/Category:Romanian_drama_films', 'http://purl.org/dc/terms/subject', 'http://dbpedia.org/resource/4_Months,_3_Weeks_and_2_Days', 'http://purl.org/dc/terms/subject', 'http://dbpedia.org/resource/Category:European_Film_Awards_winners_(films)', 'http://www.w3.org/2004/02/skos/core#prefLabel', 'European Film Awards winners (films)')


In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

transformer = RDF2VecTransformer.load(os.path.join(movieLocation, "transformers", "rdf2vec_transformer_cbow_200"))

# Reduce the dimensions of entity embeddings to represent them in a 2D plane.
X_tsne = TSNE(random_state=42).fit_transform(transformer._embeddings[:])

colors = list(map(lambda e: "#00ff00" if movieFull[movieFull.DBpedia_URI==e].iloc[0].label == "good" else "#ff0000", transformer._entities[:]))
sizes = list(map(lambda e: abs(50-movieFull[movieFull.DBpedia_URI==e].iloc[0].rating)**2, transformer._entities[:]))

# Plot the embeddings of entities in a 2D plane, annotating them.
f = plt.figure(figsize=(200, 80))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=colors, s=sizes)
for x, y, t in zip(X_tsne[:, 0], X_tsne[:, 1], transformer._entities):
    plt.annotate(t.split("/")[-1], (x, y))

# Display the graph with a title, removing the axes for better readability.
plt.title("pyRDF2Vec", fontsize=4)
plt.axis("off")
#plt.show()

f.savefig(os.path.join(movieLocation, "figure.pdf"), bbox_inches='tight')