In [None]:
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from os.path import join
from scipy.sparse import coo_matrix
import logging
# Grab the root logger (getLogger() with no name) and lower its threshold to
# DEBUG so that debug-level records are not filtered out at the root.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [None]:
cd ../src

In [None]:
from lib.propagation.label_spreading import label_spreading
from lib.phase8 import evaluate

In [None]:
def get_train_test(path):
    """Load the train and test synset assignments stored under `path`.

    Reads `y_train.txt` and `y_test.txt` from the directory `path`. Both files
    are parsed as space-separated tables with a header row: the first column
    becomes the index and the "synset" column is read as int32.

    Args:
        path: directory that contains `y_train.txt` and `y_test.txt`.

    Returns:
        A `(train, test)` tuple of pandas Series, each holding the "synset"
        column of the corresponding file, indexed by the file's first column.
    """
    def _load_synsets(filename):
        # Same parsing options for both files; only the file name differs.
        table = pd.read_table(
            join(path, filename),
            sep=" ",
            dtype={"synset": np.int32},
            index_col=0,
        )
        return table["synset"]

    return _load_synsets("y_train.txt"), _load_synsets("y_test.txt")

def baseline_synset_vector(word_vectors, words, k):
    """Return the `k` entries most similar to the given group of words.

    Delegates to `word_vectors.most_similar`, passing `words` as the positive
    examples and `k` as `topn`.

    Args:
        word_vectors: object exposing `most_similar(positive=..., topn=...)`
            (a gensim `KeyedVectors` in this notebook).
        words: words forwarded as the `positive` argument.
        k: number of neighbours to request.

    Returns:
        Whatever `most_similar` returns, i.e. a list of `(word, similarity)`
        pairs.
    """
    neighbours = word_vectors.most_similar(positive=words, topn=k)
    return neighbours  # [(word, similarity)]

In [None]:
# Number of nearest neighbours requested per synset; forwarded as `topn` to
# `most_similar` through `baseline_synset_vector`.
k = 200

# Alternative configuration (disabled): fastText embeddings.
# fastText
#embeddings_file = "<PROJECT_DIR>/02_word_embeddings/<DATE>/main.txt"
#thesaurus_sampled_list = [
#    "<PROJECT_DIR>/05_thesaurus_sampled/<DATE>",
#    "<PROJECT_DIR>/05_thesaurus_sampled/<DATE>",
#    "<PROJECT_DIR>/05_thesaurus_sampled/<DATE>"
#]

# Active configuration: word2vec embeddings (text format) and the directories
# holding the sampled thesaurus splits (each must contain y_train.txt / y_test.txt).
# word2vec
embeddings_file = "<PROJECT_DIR>/02_word_embeddings/<DATE>/main.txt"
thesaurus_sampled_list = [
    "<PROJECT_DIR>/05_thesaurus_sampled/<DATE>",
    "<PROJECT_DIR>/05_thesaurus_sampled/<DATE>",
    "<PROJECT_DIR>/05_thesaurus_sampled/<DATE>"
]

word_vectors = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

# List of every word known to the embedding model.
# gensim >= 4 removed `KeyedVectors.vocab` (accessing it raises AttributeError)
# in favour of `key_to_index`; older releases only have `.vocab`. Try the new
# attribute first and fall back so the notebook runs on both.
try:
    vocab = list(word_vectors.key_to_index)
except AttributeError:
    vocab = list(word_vectors.vocab.keys())

In [None]:
# One evaluation result per sampled thesaurus directory.
stats = []

for thesaurus_sampled in thesaurus_sampled_list:
    y_train, y_test = get_train_test(thesaurus_sampled)

    # Walk the groups once and keep (synset id, member words) in iteration
    # order. Node indices, node names and labels are all derived from this one
    # list, so the node at position `idx` is guaranteed to carry the label of
    # the synset whose neighbours are attached to it.
    synset_members = [
        (synset_id, group.index.tolist())
        for synset_id, group in y_train.groupby(y_train)
    ]
    synset_ids = [synset_id for synset_id, _ in synset_members]
    print("unique synsets", len(synset_ids))

    # Graph nodes: one node per synset first, followed by one node per vocabulary word.
    axis = ["synset_{}".format(synset_id) for synset_id in synset_ids] + vocab
    # Synset nodes are labelled with their synset id, word nodes with -1
    # (presumably the "unlabelled" marker expected by label_spreading -- confirm).
    labels = synset_ids + [-1] * len(vocab)

    # Name -> node position lookup. `setdefault` keeps the first occurrence of a
    # name, which is what `axis.index(name)` would return, but in O(1) instead
    # of a linear scan over the whole vocabulary for every single edge.
    node_index = {}
    for position, name in enumerate(axis):
        node_index.setdefault(name, position)

    # COO triplets of the similarity graph.
    rows, cols, data = [], [], []
    for idx, (_, words) in enumerate(synset_members):
        synset_predictions = baseline_synset_vector(word_vectors, words, k)  # [(word, similarity)]

        neighbours = [node_index[word] for word, _ in synset_predictions]
        similarities = [similarity for _, similarity in synset_predictions]

        # Add every edge in both directions so the adjacency matrix is symmetric.
        rows += [idx] * len(neighbours) + neighbours
        cols += neighbours + [idx] * len(neighbours)
        data += similarities + similarities

        if idx % 100 == 0:
            print(idx)

    print("edge number", len(data))

    # Building from plain lists also works when there are no edges at all.
    graph = coo_matrix((data, (rows, cols)), shape=(len(axis), len(axis)))

    # Propagate baseline graph with various propagation parameters
    predictions, confidences, top3_classes = label_spreading(
        graph.tocsr(),
        labels,
        {"alpha": 0.2, "iter": 15}
    )

    # Empty frame indexed by node name; prediction columns are attached below.
    df_labels = pd.DataFrame(axis).set_index(0)
    df_labels.index = df_labels.index.rename("word")

    df_predicted = (
        df_labels
        .assign(y_pred=predictions)
        .assign(y_conf=confidences)
        .assign(y_top3_classes=top3_classes)
        .iloc[len(synset_ids):]  # remove the synset rows
    )
    print(df_predicted.describe())
    print(df_predicted.head())

    # Evaluate performance of propagated graph
    # Words without a training label get -1 in y_train; y_test is joined after
    # the fill and therefore keeps NaN for words without a test label.
    # NOTE(review): the `downcast` argument of fillna is deprecated in recent
    # pandas releases; revisit when upgrading pandas.
    df_evaluation = (
        df_predicted
        .join(y_train, how="left")
        .fillna(-1, downcast="infer")
        .rename(columns={"synset": "y_train"})
        .join(y_test, how="left")
        .rename(columns={"synset": "y_test"})
    )

    df_evaluation = df_evaluation[["y_train", "y_pred", "y_conf", "y_top3_classes", "y_test"]]
    stats.append(evaluate(df_evaluation))

stats