# Metrics

This notebook contains 4 of the 5 metrics used to evaluate and compare the different embeddings.

## M1 Downstream Classification

This metric can be found in the notebook "LSTM.ipynb" as it turned out to be a micro-project in itself :-)

## M2 Clustering by Subsystem

The basic idea of this metric is clustering / classification in the embedding vector space.
To implement, sklearn KNN classification with cosine distance as a distance measure is used. 

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_distances.html


## M3 Word Analogy

Here, word analogies in the form 

a:b::c:d

e.g. King:Man::Queen:Woman

or in vector addition: King-Man+Woman=Queen 

are employed as a metric. The basic idea is to count how many word analogies from a predefined list are found in the different embeddings.


## M4 Word Similarity List

This is a well-known intrinsic measure for word embeddings. I adapt it here to the domain specific realm of automotive engineering. For this purpose I compiled a list of wordpairs like the well known "wordsim353" and asked colleagues to provide a similarity measure between 0 and 10. This human similarity score is then (Pearson-) correlated to the cosine-similarity of the embeddings.


## M5 Visualizations

Visualizations are not a quantitative metric; however, they provide (with limitations) qualitative insights into the properties of embeddings. Here I employ t-SNE, one of the most commonly used algorithms for this purpose.

In [None]:
import pickle
import keras
import pandas as pd
import pickle
import gensim
import fasttext
from gensim.models import FastText, Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from scipy.stats import pearsonr


import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

from matplotlib import pyplot as plt

%matplotlib inline

In [None]:
# Load the three embeddings that are compared throughout this notebook.

PRETRAINED_FT_VEC = '../WordEmbeddings/FastText/cc.de.300.vec'
DOMAIN_FT_MODEL = "../WordEmbeddings/ft_ds.model"
DOMAIN_W2V_MODEL = "../WordEmbeddings/w2v_ds.model"

# Pretrained FastText vectors, read from a word2vec-format text file
model_ft = KeyedVectors.load_word2vec_format(PRETRAINED_FT_VEC)

# Domain specific FastText model
model_ds_ft = FastText.load(DOMAIN_FT_MODEL)

# Domain specific word2vec model
model_ds_w2v = Word2Vec.load(DOMAIN_W2V_MODEL)


In [None]:
# Sanity checks: vocabulary size of each embedding, then one sample vector.
for embedding in (model_ds_w2v, model_ds_ft, model_ft):
    print(len(embedding.wv.vocab))

# Last expression of the cell, so the vector is shown as the cell output.
model_ft.get_vector('Zylinder')

# M2: Subsystem Classification in Vector Space

In [None]:
# M2 Clustering by subsystem
#
# Every list below holds the words of one vehicle subsystem; each list is one
# class for the KNN classifier.

engine = ['motor', 'einspritzung', 'zündung', "zündkerze", "wasserpumpe", "nockenwelle", "kurbelwelle", "starter", "generator"]
# "gang" used to be listed twice. A duplicated sample is its own nearest
# neighbour twice over and therefore always "votes" for itself, so it is kept
# only once.
transmission = ['getriebe', "gang", "schlupf", "kupplung", "einlegen", "zahnrad", "automatik", "schaltgetriebe"]
control_unit = ["steuergerät", "funktion", "applikation", "regler", "tsk", "architektur", "variable", "parameter"]
vehicle = ["can", "kombiinstrument", "gaspedal", "fahrwerk", "chassis", "pedal", "mil", "esp", "abs", "lenkrad"]
exhaust = ["lambdasonde", "dpf", "opf", "katalysator", "abgas", "emission", "nox"]

classes = [engine, transmission, control_unit, vehicle, exhaust]
models = [model_ds_ft, model_ds_w2v, model_ft]

# Words and labels are both derived from `classes`, so they cannot drift apart
# when a word list is edited. Labels are 1-based, in the order of `classes`.
data = [word for members in classes for word in members]
y_train = [label for label, members in enumerate(classes, start=1) for _ in members]

# Vectors are read through `.wv`, the same access path the similarity metric
# (M4) uses; indexing the model object directly is deprecated in gensim.
X_train = [model_ds_ft.wv[word] for word in data]

knnc = KNeighborsClassifier(metric='cosine', n_neighbors=3)
knnc.fit(X_train, y_train)

# NOTE(review): the classifier is scored on the very words it was fitted on, so
# every word is one of its own 3 nearest neighbours. The accuracy is therefore
# optimistic; a leave-one-out evaluation would be a stricter variant.
y_pred = knnc.predict(X_train)

print(accuracy_score(y_train, y_pred))

def check_existence(list, model):
    """Report which words of a class word list are missing from an embedding.

    Parameters
    ----------
    list : iterable of str
        Words to look up. The name shadows the builtin but is kept so that
        existing keyword callers continue to work.
    model : object exposing ``wv.vocab``
        Embedding whose vocabulary is checked.

    Returns
    -------
    bool
        True if every word is in the vocabulary (also for an empty word list),
        False otherwise. One line is printed per missing word.
    """
    missing = [word for word in list if word not in model.wv.vocab]
    for word in missing:
        # The class is identified by the word list that was passed in. The
        # message previously read a global loop variable `c`, which only
        # existed if the caller happened to loop with exactly that name.
        print("{} in Class {} not found in Embedding {}".format(word, list, model))
    return not missing

# Vocabulary coverage of the class word lists can be inspected per embedding
# with check_existence(word_list, model). The former nested loop over `models`
# and `classes` only executed `pass` and has been removed.

# Named `conf_matrix` rather than `cm`: `cm` is also the alias under which
# matplotlib.cm is imported for the t-SNE plot, and rebinding it here would
# break that plot when cells are re-run out of order.
conf_matrix = confusion_matrix(y_train, y_pred)

print(conf_matrix)
plt.imshow(conf_matrix, cmap='binary')
# confusion_matrix puts the true classes on the rows and the predictions on the columns.
plt.xlabel("predicted class")
plt.ylabel("true class");

# M3: Word Analogy

In [None]:
# M3 Word Analogy
# Each quartet reads a : b :: c : d.
# gensim query syntax for reference: (positive=['woman', 'king'], negative=['man'])

analogies = [
    ["diesel", "dieselmotor", "benzin", "ottomotor"],
    ["motor", "motorsteuergerät", "getriebe", "getriebesteuergerät"],
    ["benzin", "zündkerze", "diesel", "selbstzündung"],
    ["wasser", "wasserpumpe", "benzin", "benzinpumpe"],
    ["benzin", "tank", "strom", "batterie"],
    ["can", "botschaft", "flexray", "pdu"],
]

# Individual names are kept for interactive use; they refer to the same list
# objects that are stored in `analogies`.
a1, a2, a3, a4, a5, a6 = analogies

# Pretty print for a single result: print("{}: {:.4f}".format(*result[0]))

def analogy(quartet, model=None, topn=10):
    """Return the embedding's candidates for the fourth word of an analogy.

    A quartet ``[a, b, c, d]`` reads ``a : b :: c : d``, so the missing word is
    queried as ``b - a + c`` (``positive=[b, c]``, ``negative=[a]``). The
    previous implementation queried ``a + b - c``, which does not correspond
    to the analogy and therefore could not be expected to return ``d``.

    Parameters
    ----------
    quartet : sequence of str
        At least the first three words of the analogy; ``quartet[3]`` (the
        expected answer) is not used here.
    model : optional
        Embedding to query: either a model exposing ``.wv`` or a vectors object
        providing ``most_similar`` itself. Defaults to the domain specific
        word2vec model, which is what this function has always returned
        results for.
    topn : int, optional
        Number of candidates to return (10 is the gensim default).

    Returns
    -------
    list of (str, float)
        Candidate words with their similarity, as returned by ``most_similar``.
    """
    a, b, c = quartet[0], quartet[1], quartet[2]
    if model is None:
        model = model_ds_w2v
    # Accept both full models (vectors live under `.wv`) and bare vector objects.
    vectors = getattr(model, "wv", model)
    return vectors.most_similar(positive=[b, c], negative=[a], topn=topn)
# Evaluate every analogy and count how often the expected fourth word is among
# the returned candidates; that count is the M3 metric.
analogy_hits = 0
for a in analogies:
    try:
        candidates = analogy(a)
    except KeyError as missing_word:
        # A word without a vector cannot be queried; the analogy counts as
        # "not found" instead of aborting the whole evaluation.
        print("{}: skipped ({})".format(a, missing_word))
        continue
    print(candidates)
    if a[3] in [word for word, _ in candidates]:
        analogy_hits += 1

print("Analogies found: {} of {}".format(analogy_hits, len(analogies)))

# M4: Word Similarity

In [None]:
# Where the word pairs and their human similarity scores come from:
# "disk" reads the two TSV files, "list" uses the values hard-coded below.
DATA_SOURCE = "list"

if DATA_SOURCE == "disk":
    # Loading data from disk: the first two columns of the pair file are the
    # two words of a pair; the first column of the score file is the score.
    df_word_pairs = pd.read_csv('../Data/sim_list_N_o.tsv', delimiter ='\t',header = None)
    df_human_score = pd.read_csv('../Data/human_score.tsv', header = None)
    e1 = [term.lower() for term in df_word_pairs[0]]
    e2 = [term.lower() for term in df_word_pairs[1]]
    words = list(zip(e1, e2))
    human_score = list(df_human_score[0])

elif DATA_SOURCE == "list":
    # Loading data from list.
    # Full candidate list; it is not referenced anywhere else in this notebook.
    words_original=[
        ("riemenstartergenerator","rsg"),("motorsteuergerät","msg"),("haltemanagementsystem","hms"),
        ("batterymanagementsystem","bms"),("stateofcharge","soc"),("konzernschaltprogramm","ksp"),
        ("hochvoltkoordinator","hvk"),("geschwindigkeitsregelanlage","gra"),("triebstrangkoordinator","tsk"),
        ("functionondemand","fod"),("automatictransmissionfluid","atf"),("on boarddiagnose","obd"),("dieselpartikelfilter","dpf"),
        ("ottopartikelfilter","opf"),("electronicstabilitycontrol","esc"),("antriebssteuergeraet","asg"),("vorderachse","va"),
        ("intelligenterparkassistent","ipa"),("praediktivereffizienzassistent","pea"),("intelligentspeedassistent","isa"),
        ("Motor","Zylinder"),("tsk","laengsbeschleunigungsregler"),("Drehzahl","Schub"),("Hybrid","phev"),("Brennstoffzelle","Anode"),
        ("Brennstoffzelle","Kathode"),("kraftstoff","diesel"),("getriebe","k0"),("Display","Anzeige"),("Monitoring","Überwachung"),
        ("Wasserstoff","H2"),("motor","vkm"),("EMaschine","Strom"),("Bremspedal","Kriechen"),("Emissionen","nox"),("Druckventil","entllueftung"),
        ("fahrpedal","moment"),("wählhebel","einlegen"),("kupplung","getriebe"),("dl382","schaltung"),("phev","cbev"),("pumpe","öl"),
        ("wärme","heizen"),("saugrohrdruck","drosseln"),("drehzahl","anzeige"),("drehzahl","antrieb"),("leerlauf","schaltung"),
        ("mqb","golf"),("meb","bev"),("entprellung","signal"),("brennstoffzelle","purgeventil"),("adpcus","signal"),
        ("startstopp","leerlauf"),("drehmoment","achsfunktionalität"),("rampe","momentengradient"),("rekuperation","potential"),
        ("asg","msg"),("antrieb","leistung"),("sekundaerachse","ankoppeln"),("waehlhebel","kupplung")]

    # Pairs that are actually evaluated; human_score[i] belongs to words[i].
    words=[("motorsteuergerät","msg"),
       ("motor","zylinder"),
       ("drehzahl","schub"),
       ("brennstoffzelle","anode"),
       ("kraftstoff","diesel"),
       ("display","anzeige"),
       ("monitoring","überwachung"),
       ("emissionen","nox"),
       ("kupplung","getriebe"),
       ("pumpe","oel"),
       ("wärme","heizen"),
       ("drehzahl","anzeige"),
       ("drehzahl","antrieb"),
       ("leerlauf","schaltung"),
       ("asg","msg"),
       ("antrieb","leistung")]
    human_score = (9, 6, 3, 6, 7, 8, 8, 7, 6, 5, 7, 4, 5, 1, 5, 6)

else:
    # Previously an unknown value left `words` / `human_score` undefined or
    # silently reused whatever an earlier run had left in the kernel.
    raise ValueError("Unknown DATA_SOURCE {!r}; expected 'disk' or 'list'".format(DATA_SOURCE))

# The correlation pairs scores and word pairs by position, so both must line up.
if len(words) != len(human_score):
    raise ValueError("{} word pairs but {} human scores".format(len(words), len(human_score)))

In [None]:
def _pair_similarities(vectors):
    """Similarity of every pair in `words` as computed by `vectors.similarity`."""
    return [vectors.similarity(first, second) for (first, second) in words]


# 1) Embedding similarity for every word pair, per embedding.
similarity_ds_ft = _pair_similarities(model_ds_ft.wv)
similarity_ds_w2v = _pair_similarities(model_ds_w2v.wv)
similarity_ft = _pair_similarities(model_ft.wv)

# 2) Pearson correlation with the human scores; element [0] of each result is
#    the value that gets reported below.
score_ds_ft = pearsonr(similarity_ds_ft, human_score)
score_ds_w2v = pearsonr(similarity_ds_w2v, human_score)
score_ft = pearsonr(similarity_ft, human_score)

# 3) Report.
for template, result in (
    ("Score with domain specific FastText: {}", score_ds_ft),
    ("Score with domain specific word2vec: {}", score_ds_w2v),
    ("Score with general FastText: {}", score_ft),
):
    print(template.format(result[0]))

# M5: Visualizations

In [None]:
# Embedding whose neighbourhoods are visualised.
model = model_ds_ft

keys1 = ['benzin', 'diesel']
keys2 = ['motor', 'getriebe']

# For every key word collect its 20 most similar words and their vectors.
# embedding_clusters[i] / word_clusters[i] belong to keys2[i].
embedding_clusters = []
word_clusters = []
for word in keys2:
    # Deliberately not named `words`: that name holds the M4 word pairs, and
    # overwriting it made the M4 similarity cell fail when re-run after this one.
    # Lookups go through `.wv`, as in M4; the model-level shortcuts are deprecated.
    neighbour_words = [similar_word for similar_word, _ in model.wv.most_similar(word, topn=20)]
    embedding_clusters.append([model.wv[similar_word] for similar_word in neighbour_words])
    word_clusters.append(neighbour_words)

In [None]:
from sklearn.manifold import TSNE
import numpy as np

# Stack the clusters into one array: n clusters, m words per cluster, k dimensions.
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape

tsne_model_en_2d = TSNE(
    perplexity=12,
    n_components=2,
    init='pca',
    n_iter=3500,
    random_state=32,
)

# t-SNE takes a flat (samples, features) matrix; afterwards the 2-D points are
# regrouped per cluster.
flat_embeddings = embedding_clusters.reshape(n * m, k)
projected_2d = tsne_model_en_2d.fit_transform(flat_embeddings)
embeddings_en_2d = np.array(projected_2d).reshape(n, m, 2)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot 2-D word clusters, one colour per cluster, and annotate each word.

    Parameters
    ----------
    title : str
        Figure title.
    labels : sequence of str
        One legend label per cluster.
    embedding_clusters : sequence of array-like
        Per cluster, the 2-D coordinates of its words (column 0 is x, column 1 is y).
    word_clusters : sequence of sequence of str
        Per cluster, the words belonging to the coordinates, in the same order.
    a : float
        Alpha (opacity) of the scatter markers.
    filename : str, optional
        If given, the figure is also saved there as PNG.
    """
    plt.figure(figsize=(12, 9))
    # The colormap is taken from `plt.cm` instead of a global `cm` alias: `cm`
    # is also used for the confusion matrix in the M2 cell, so the alias is not
    # reliable across cell re-runs.
    colors = plt.cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        points = np.asarray(embeddings)
        x = points[:, 0]
        y = points[:, 1]
        # `color=` instead of `c=`: a single RGBA row passed as `c` is ambiguous
        # and would be read as four data values if a cluster had four points.
        plt.scatter(x, y, color=color, alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=12)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()

# The clusters were built from `keys2`, so `keys2` also provides the legend
# labels. The call previously passed an undefined name `keys` and raised a
# NameError on a fresh kernel.
title = "t_SNE: 20 neighbours of '{}' and '{}' with domain specific FastText".format(keys2[0], keys2[1])

tsne_plot_similar_words(title, keys2, embeddings_en_2d, word_clusters, 0.7,
                        'similar_words.png')