## Initial embedding pipeline
Prerequisites:<br>
A DBpedia subset hosted behind a SPARQL endpoint, and the relevant datasets (e.g. metacritic-movies) cleaned in the previous notebook.

Purpose:<br>
This pipeline builds knowledge graph embeddings for entities of interest (e.g. movies) and trains a classifier to predict the target variable.

(Hint)
Make sure to select the poetry kernel. If it does not show up, try to reload your editor.

In [1]:
import json
import os.path
import pickle
import shutil
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from evaluation_framework.manager import FrameworkManager
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker
from sklearn.manifold import TSNE
from sklearn.svm import SVC

from rdflimeConfig import dbpediaLocation, datasets, load_dataset, split_dataset

### Check datasets

In [3]:
# Sanity check: load every configured dataset and show its first three rows.
for cfg in datasets:
    dataset, entities = load_dataset(cfg)
    preview = dataset.head(3)
    display(preview)

Unnamed: 0,Wikidata_URI15,Movie,Release date,DBpedia_URI,label,id,rating,DBpedia_URI15,YAGO_URI15,DBpedia_URI15_Base32,DBpedia_URI16
0,http://www.wikidata.org/entity/Q238245,"4 Months, 3 Weeks and 2 Days",1/23/2008 0:00,"http://dbpedia.org/resource/4_Months,_3_Weeks_...",good,1601,97,"http://dbpedia.org/resource/4_Months,_3_Weeks_...",,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6N...,"http://dbpedia.org/resource/4_Months,_3_Weeks_..."
1,http://www.wikidata.org/entity/Q170035,Ratatouille,6/29/2007 0:00,http://dbpedia.org/resource/Ratatouille_(film),good,1602,96,http://dbpedia.org/resource/Ratatouille_(film),http://yago-knowledge.org/resource/Ratatouille...,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6U...,http://dbpedia.org/resource/Ratatouille_(film)
2,http://www.wikidata.org/entity/Q1752684,Killer of Sheep,3/30/2007 0:00,http://dbpedia.org/resource/Killer_of_Sheep,good,1603,94,http://dbpedia.org/resource/Killer_of_Sheep,http://yago-knowledge.org/resource/Killer_of_S...,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6S...,http://dbpedia.org/resource/Killer_of_Sheep


Unnamed: 0,Wikidata_URI15,id,album,artist,date,rating,DBpedia_URI,label,DBpedia_URI15,YAGO_URI15,DBpedia_URI15_Base32,DBpedia_URI16
0,http://www.wikidata.org/entity/Q7913484,1.0,Van Lear Rose,Loretta Lynn,27-Apr-04,97.0,http://dbpedia.org/resource/Van_Lear_Rose,good,http://dbpedia.org/resource/Van_Lear_Rose,http://yago-knowledge.org/resource/Van_Lear_Rose,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6V...,http://dbpedia.org/resource/Van_Lear_Rose
1,http://www.wikidata.org/entity/Q1036873,2.0,Histoire de Melody Nelson,Serge Gainsbourg,24-Mar-09,96.0,http://dbpedia.org/resource/Histoire_de_Melody...,good,http://dbpedia.org/resource/Histoire_de_Melody...,http://yago-knowledge.org/resource/Histoire_de...,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6S...,http://dbpedia.org/resource/Histoire_de_Melody...
2,http://www.wikidata.org/entity/Q378166,3.0,Stankonia,Outkast,31-Oct-00,95.0,http://dbpedia.org/resource/Stankonia,good,http://dbpedia.org/resource/Stankonia,http://yago-knowledge.org/resource/Stankonia,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6U...,http://dbpedia.org/resource/Stankonia


Unnamed: 0,Wikidata_URI15,id,Company,Industry,Country,rating,Sales,Profits,Assets,Rank,DBpedia_URI,label,DBpedia_URI15,YAGO_URI15,DBpedia_URI15_Base32,DBpedia_URI16
0,http://www.wikidata.org/entity/Q26463,1,Industrial and Commercial Bank of China,Major Banks,China,237.3,134.8,37.8,2813.5,1,http://dbpedia.org/resource/Industrial_and_Com...,high,http://dbpedia.org/resource/Industrial_and_Com...,,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6S...,http://dbpedia.org/resource/Industrial_and_Com...
1,http://www.wikidata.org/entity/Q26299,2,China Construction Bank,Regional Banks,China,202.0,113.1,30.6,2241.0,2,http://dbpedia.org/resource/China_Construction...,high,http://dbpedia.org/resource/China_Construction...,http://yago-knowledge.org/resource/China_Const...,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6Q...,http://dbpedia.org/resource/China_Construction...
2,http://www.wikidata.org/entity/Q192314,3,JPMorgan Chase,Major Banks,United States,191.4,108.2,21.3,2359.1,3,http://dbpedia.org/resource/JPMorgan_Chase,high,http://dbpedia.org/resource/JPMorgan_Chase,http://yago-knowledge.org/resource/JPMorgan_Chase,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6S...,http://dbpedia.org/resource/JPMorgan_Chase


Unnamed: 0,Wikidata_URI15,id,city_name,DBpedia_URI,rating,label,DBpedia_URI15,YAGO_URI15,DBpedia_URI15_Base32,DBpedia_URI16
0,http://www.wikidata.org/entity/Q24639,1.0,Vancouver,http://dbpedia.org/resource/Vancouver,106.0,high,http://dbpedia.org/resource/Vancouver,http://yago-knowledge.org/resource/Vancouver,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6V...,http://dbpedia.org/resource/Vancouver
1,http://www.wikidata.org/entity/Q72,2.0,Zurich,http://dbpedia.org/resource/Zurich,106.0,high,http://dbpedia.org/resource/Zürich,,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6W...,http://dbpedia.org/resource/Zürich
2,http://www.wikidata.org/entity/Q1741,3.0,Vienna,http://dbpedia.org/resource/Vienna,106.0,high,http://dbpedia.org/resource/Vienna,http://yago-knowledge.org/resource/Vienna,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6V...,http://dbpedia.org/resource/Vienna


Unnamed: 0,Wikidata_URI15,FICE,College_name,State,Type,Average_salary_full_professors,Average_salary_associate_professors,Average_salary_assistant_professors,rating,Average_compensation_full_professor,...,Number_of_instructors,Number_of_faculty_all_ranks,DBpedia_URI,label,label_comp,id,DBpedia_URI15,YAGO_URI15,DBpedia_URI15_Base32,DBpedia_URI16
0,http://www.wikidata.org/entity/Q1565621,1061,Alaska Pacific University,AK,IIB,454,382,362.0,382,567,...,4,32,http://dbpedia.org/resource/Alaska_Pacific_Uni...,medium,medium,1,http://dbpedia.org/resource/Alaska_Pacific_Uni...,http://yago-knowledge.org/resource/Alaska_Paci...,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6Q...,http://dbpedia.org/resource/Alaska_Pacific_Uni...
1,http://www.wikidata.org/entity/Q1285262,1063,University of Alaska-Fairbanks,AK,I,686,560,432.0,508,914,...,40,404,http://dbpedia.org/resource/University_of_Alas...,high,high,2,http://dbpedia.org/resource/University_of_Alas...,http://yago-knowledge.org/resource/University_...,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6V...,http://dbpedia.org/resource/University_of_Alas...
2,http://www.wikidata.org/entity/Q94279,1065,University of Alaska-Southeast,AK,IIA,533,494,329.0,415,716,...,9,70,http://dbpedia.org/resource/University_of_Alas...,medium,medium,3,http://dbpedia.org/resource/University_of_Alas...,http://yago-knowledge.org/resource/University_...,NB2HI4B2F4XWIYTQMVSGSYJON5ZGOL3SMVZW65LSMNSS6V...,http://dbpedia.org/resource/University_of_Alas...


### Build DBpedia embeddings

Train and store a PyRDF2Vec transformer with various parameter settings.

In [4]:
# Train one RDF2Vec transformer per dataset, algorithm and vector size, and store each
# of them under <dataset location>/transformers. Already stored transformers are skipped.
for cfg in datasets:
    datasetLocation = cfg["location"]
    targetPath = os.path.join(datasetLocation, "transformers")

    # The dataset does not depend on algo/vsize, so it is loaded once per dataset.
    dataset, entities = load_dataset(cfg)

    for algo in [0, 1]: # 0: CBOW, 1: SG
        for vsize in [50, 100, 200]: # Vector size of embeddings
            print(cfg["name"], algo, vsize)

            # Check the very path that save() writes to below. Checking the dataset
            # root instead would never match, so every transformer would be retrained.
            transformerPath = os.path.join(targetPath, f"rdf2vec_transformer_{'sg' if algo else 'cbow'}_{vsize}")
            if os.path.exists(transformerPath):
                print("Skipping transformer, as it already exists.")
                continue

            dbpedia = KG(dbpediaLocation, skip_verify=False, mul_req=False)

            transformer = RDF2VecTransformer(
                Word2Vec(sg=algo, vector_size=vsize), # negative = 25
                walkers=[RandomWalker(max_walks=500, max_depth=4, with_reverse=False, n_jobs=8, md5_bytes=None)], # max_walks = 22, max_depth = 2
                verbose=1
            )

            # Extract the walks, train the embedder on them, then embed the entities.
            walks = transformer.get_walks(dbpedia, entities)
            transformer.fit(walks)
            embeddings, literals = transformer.transform(dbpedia, entities)

            Path(targetPath).mkdir(parents=True, exist_ok=True)
            transformer.save(transformerPath)

metacritic-movies 0 50


ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

### Store embeddings that are compatible with the Evaluation Framework
As a next step, we would like to evaluate the quality of our generated embeddings. However, we first need to convert them to a format that is readable by [GEval](https://github.com/mariaangelapellegrino/Evaluation-Framework), the graph embedding evaluation framework by Pellegrino et al.

This entails two steps: First, apply the fixes from 1_DBpediaFixes.ipynb in reverse, i.e. "unfix" the IRIs to be compatible with the framework. Second, store the embeddings in the required CSV-like format.

In [15]:
# Write the embeddings of every stored transformer to a text file with one line per
# entity: "<IRI> <v1> <v2> ...". IRIs that were fixed earlier are replaced by their
# original form so that the Evaluation Framework by Pellegrino et al. recognises them.
for cfg in datasets:
    datasetLocation = cfg["location"]
    _, entities = load_dataset(cfg)

    with open(os.path.join(datasetLocation, "datasetFixes.json"), "r") as f:
        fixes = json.load(f)

    # Map each fixed IRI to its original once per dataset instead of scanning the
    # whole fix list for every entity. setdefault keeps the first entry per IRI,
    # i.e. the same entry a first-match search over the list would return.
    originalByFix = {}
    for entry in fixes:
        originalByFix.setdefault(entry["fix"], entry["original"])

    targetPath = os.path.join(datasetLocation, "embeddings")
    Path(targetPath).mkdir(parents=True, exist_ok=True)

    for algo in ["cbow", "sg"]:
        for vsize in [50, 100, 200]: # Vector size of embeddings

            # Same loader as in the other cells of this notebook; the files are the
            # pickles this notebook wrote itself, never load untrusted files this way.
            transformer = RDF2VecTransformer.load(
                os.path.join(datasetLocation, "transformers", f"rdf2vec_transformer_{algo}_{vsize}")
            )

            with open(os.path.join(targetPath, f"embeddings_{algo}_{vsize}"), "w") as file:
                for index, embedding in enumerate(transformer._embeddings):

                    # "Unfix" the IRI; entities without a fix are written unchanged.
                    # NOTE(review): assumes transformer._embeddings is ordered like
                    # `entities` - confirm.
                    entity = entities[index]
                    entity = originalByFix.get(entity, entity)

                    # Write embedding to file
                    file.write(f"{entity} {' '.join(map(str, embedding))}\n")

## Evaluation

### Evaluation using Evaluation-Framework by Pellegrino et al.
Load each of our embedding versions and run the classification task on the movie dataset.

In [3]:
# Look up the movie dataset location in the shared config, so that this cell does not
# depend on a `movieLocation` variable that no cell of this notebook defines.
movieLocation = next(c["location"] for c in datasets if c["name"] == "metacritic-movies")

evalPath = os.path.join(movieLocation, "embeddings", "evaluation")
Path(evalPath).mkdir(parents=True, exist_ok=True)

# Run the classification task once per exported embedding variant.
for algo in ["cbow", "sg"]:
    for vsize in [50, 100, 200]: # Vector size of embeddings
        print(algo, vsize)

        embeddingPath = os.path.join(movieLocation, "embeddings", f"embeddings_{algo}_{vsize}")

        evaluation_manager = FrameworkManager()
        evaluation_manager.evaluate(
            embeddingPath,
            tasks=["Classification"],
            parallel=False,
            debugging_mode=False,
            vector_size=vsize,
            result_directory_path=os.path.join(evalPath, f"geval_result_{algo}_{vsize}")
        )

Start evaluation...
Classification finished
0:01:49
Start evaluation...
Classification finished
0:03:10
Start evaluation...
Classification finished
0:06:05
Start evaluation...
Classification finished
0:02:02
Start evaluation...
Classification finished
0:03:28
Start evaluation...
Classification finished
0:06:44


In [5]:
# Move the results file (comparison.csv in the working directory, presumably written
# by the evaluation above) to the proper location. shutil.move replaces `!mv` so the
# cell also works for paths containing spaces and without a POSIX shell, and it raises
# an error instead of failing silently when the file is missing.
shutil.move("comparison.csv", os.path.join(evalPath, "comparison.csv"))

In [10]:
# Summarise the evaluation: one row per embedding variant, one column per classifier,
# each cell holding the best score (in percent, two decimals) found for that pair.
results = pd.read_csv(os.path.join(evalPath, "comparison.csv"), sep=" ")

# Collect plain records and build the frame once; growing a DataFrame with
# pd.concat inside the loop copies all previous rows on every iteration.
rows = []
for algo in ["cbow", "sg"]:
    for vsize in [50, 100, 200]:
        strategy = f"{algo}_{vsize}"

        # Highest score_value per model among the rows whose test name contains
        # this variant. Only score_value is aggregated, the other columns are unused.
        best = (
            results[results.test_name.str.contains(strategy)]
            .groupby("model")["score_value"]
            .max()
        )

        row = {"strategy": strategy}
        for m in ["NB", "KNN", "SVM", "C45"]:
            # Raises KeyError if the results contain no row for this model.
            row[m] = round(best[m] * 100, 2)
        rows.append(row)

tab = pd.DataFrame(rows)
tab

Unnamed: 0,strategy,NB,KNN,SVM,C45
0,cbow_50,75.35,80.64,86.04,67.65
1,cbow_100,77.65,81.15,89.29,68.2
2,cbow_200,76.88,83.17,91.04,68.54
3,sg_50,69.76,72.28,78.82,57.86
4,sg_100,70.9,73.66,83.78,57.04
5,sg_200,71.42,72.53,85.7,57.98


### Learn final classifier on embeddings
Testing with the framework by Pellegrino et al. (see above) reveals that SVC with C=100 delivers high accuracy on the given task (predicting movie quality). We therefore train and store such a classifier for every embedding variant that was trained. 

In [13]:
# Train an SVC (C=100) on the training embeddings of every stored transformer and
# pickle it to <dataset location>/classifiers/svc_100_<algo>_<vsize>.
for cfg in datasets:
    datasetLocation = cfg["location"]
    label_col = cfg["columns"]["label"]

    # Dataset, split and partition bounds are the same for every embedding variant,
    # so they are computed once per dataset rather than once per variant.
    # NOTE(review): assumes split_dataset is deterministic - confirm.
    dataset, entities = load_dataset(cfg)
    train, test = split_dataset(dataset, cfg)

    # train-test split as seen in 0_MovieDataSetExploration.ipynb
    train_partition = cfg["train_partition"]

    targetPath = os.path.join(datasetLocation, "classifiers")
    Path(targetPath).mkdir(parents=True, exist_ok=True)

    for algo in [0, 1]: # 0: CBOW, 1: SG
        for vsize in [50, 100, 200]: # Vector size of embeddings
            variant = f"{'sg' if algo else 'cbow'}_{vsize}"

            transformerPath = os.path.join(datasetLocation, "transformers", f"rdf2vec_transformer_{variant}")
            transformer = RDF2VecTransformer.load(transformerPath)

            # probability=True shuffles the data internally for the probability
            # estimates; the fixed seed makes the stored classifier reproducible.
            clf = SVC(C=100, probability=True, random_state=42)

            # NOTE(review): assumes the embedding slice lines up row by row with
            # `train` - confirm against split_dataset.
            clf.fit(transformer._embeddings[train_partition[0]:train_partition[1]], train[label_col])

            with open(os.path.join(targetPath, f"svc_100_{variant}"), "wb") as file:
                pickle.dump(clf, file)

## Experiments

In [35]:
# Load the transformer to inspect explicitly. Otherwise this cell would use whatever
# `transformer` the classifier loop above happened to load last.
# NOTE(review): the movie cbow_200 variant is chosen to match the plot cell below and
# the movie walk in the recorded output - confirm this is the intended one.
movieLocation = next(c["location"] for c in datasets if c["name"] == "metacritic-movies")
transformer = RDF2VecTransformer.load(os.path.join(movieLocation, "transformers", "rdf2vec_transformer_cbow_200"))

walks = transformer._walks

print(f"Number of entities with walks: {len(walks)}")
print(f"Walks per entity: {len(walks[0])}")
print("First walk of first entity:")
print(walks[0][0])

# Open questions:
# Distance(Matrix, Matrix Reloaded) < Distance(Matrix, The Batman)?
# Can we represent the embeddings in a 2D space for visualization? -> See examples

Number of entities with walks: 2000
Walks per entity: 484
First walk of first entity:
('http://dbpedia.org/resource/Category:Romanian_films_by_genre', 'http://www.w3.org/2004/02/skos/core#broader', 'http://dbpedia.org/resource/Category:Romanian_drama_films', 'http://purl.org/dc/terms/subject', 'http://dbpedia.org/resource/4_Months,_3_Weeks_and_2_Days', 'http://purl.org/dc/terms/subject', 'http://dbpedia.org/resource/Category:European_Film_Awards_winners_(films)', 'http://www.w3.org/2004/02/skos/core#prefLabel', 'European Film Awards winners (films)')


In [None]:
# Resolve the movie dataset from the shared config; no cell of this notebook defines
# `movieLocation` or `movieFull`, so they are set up here.
# NOTE(review): assumes the frame returned by load_dataset is the full movie dataset
# and that its DBpedia_URI column holds the IRIs the transformer was trained on - confirm.
movieCfg = next(c for c in datasets if c["name"] == "metacritic-movies")
movieLocation = movieCfg["location"]
movieFull, _ = load_dataset(movieCfg)

transformer = RDF2VecTransformer.load(os.path.join(movieLocation, "transformers", "rdf2vec_transformer_cbow_200"))

# Reduce the dimensions of entity embeddings to represent them in a 2D plane.
X_tsne = TSNE(random_state=42).fit_transform(transformer._embeddings[:])

# Index the metadata by IRI once instead of filtering the frame twice per entity.
# drop_duplicates keeps the first row per IRI, i.e. the row `.iloc[0]` would select.
movieByUri = movieFull.drop_duplicates("DBpedia_URI").set_index("DBpedia_URI")

# Green for movies labelled "good", red otherwise; the marker area grows with the
# squared distance of the rating from 50.
colors = ["#00ff00" if movieByUri.at[e, "label"] == "good" else "#ff0000" for e in transformer._entities]
sizes = [abs(50 - movieByUri.at[e, "rating"]) ** 2 for e in transformer._entities]

# Plot the embeddings of entities in a 2D plane, annotating them.
fig, ax = plt.subplots(figsize=(200, 80))
ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=colors, s=sizes)
for x, y, t in zip(X_tsne[:, 0], X_tsne[:, 1], transformer._entities):
    # Label each point with the last path segment of its IRI.
    ax.annotate(t.split("/")[-1], (x, y))

# Give the graph a title and remove the axes for better readability.
ax.set_title("pyRDF2Vec", fontsize=4)
ax.axis("off")

fig.savefig(os.path.join(movieLocation, "figure.pdf"), bbox_inches='tight')