In [None]:
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from os.path import join
from scipy.sparse import coo_matrix
import logging
import math
from collections import defaultdict
# Grab the root logger (getLogger() without a name) and lower its threshold
# to DEBUG.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [None]:
cd ../..

In [None]:
from lib.phase8 import evaluate

In [None]:
def get_train_test(path):
    """Load the train and test label series stored under ``path``.

    Reads ``y_train.txt`` and ``y_test.txt`` as space-separated tables, uses
    the first column of each file as the index and parses the ``synset``
    column as int32.

    Returns:
        A ``(train, test)`` tuple holding the ``synset`` column of each file.
    """
    def _read_labels(filename):
        # Both files share the same layout, so one reader serves both.
        table = pd.read_table(
            join(path, filename),
            sep=" ",
            dtype={"synset": np.int32},
            index_col=0,
        )
        return table["synset"]

    return _read_labels("y_train.txt"), _read_labels("y_test.txt")

def baseline_synset_vector(word_vectors, words, k):
    """Return the ``k`` entries ``word_vectors`` ranks as most similar to ``words``.

    ``words`` is forwarded as the ``positive`` argument and ``k`` as ``topn``
    of ``word_vectors.most_similar``; its result is returned unchanged.
    """
    neighbours = word_vectors.most_similar(positive=words, topn=k)  # [(word, similarity)]
    return neighbours

def row_check_top3(row):
    """Tell whether the true synset is among a row's predicted classes.

    Returns 0 when ``row["y_top3_classes"]`` is not a list (the row has no
    prediction); otherwise the result of
    ``row["y_test"] in row["y_top3_classes"]``.
    """
    candidates = row["y_top3_classes"]
    return row["y_test"] in candidates if isinstance(candidates, list) else 0

def row_check_top1(row):
    """Tell whether the best-ranked predicted class equals the true synset.

    Returns 0 when ``row["y_top3_classes"]`` is not a list (the row has no
    prediction) or is an empty list (nothing to compare against); otherwise
    the result of ``row["y_test"] == row["y_top3_classes"][0]``.
    """
    candidates = row["y_top3_classes"]
    # An empty list carries no top-1 candidate; treat it like a missing
    # prediction instead of letting candidates[0] raise IndexError.
    if not isinstance(candidates, list) or not candidates:
        return 0

    return row["y_test"] == candidates[0]

def row_check_no_prediction(row):
    """Flag rows without a prediction.

    Returns 1 when ``row["y_top3_classes"]`` is not a list, 0 when it is.
    """
    return 0 if isinstance(row["y_top3_classes"], list) else 1

In [None]:
# Number of nearest neighbours requested from the embedding for every synset.
k = 200

# fastText
#embeddings_file = "<PROJECT_DIR>/02_word_embeddings/<DATE>/main.txt"
#thesaurus_sampled_list = [
#    "<PROJECT_DIR>/05_thesaurus_sampled/<DATE>",
#    "<PROJECT_DIR>/05_thesaurus_sampled/<DATE>",
#    "<PROJECT_DIR>/05_thesaurus_sampled/<DATE>"
#]

# word2vec
embeddings_file = "<PROJECT_DIR>/02_word_embeddings/<DATE>/main.txt"
thesaurus_sampled_list = [
    "<PROJECT_DIR>/05_thesaurus_sampled/<DATE>",
    "<PROJECT_DIR>/05_thesaurus_sampled/<DATE>",
    "<PROJECT_DIR>/05_thesaurus_sampled/<DATE>"
]

# The embeddings are read from the text (non-binary) word2vec format.
word_vectors = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

# gensim >= 4.0 removed KeyedVectors.vocab in favour of key_to_index, while
# gensim 3.x only offers .vocab; try the new attribute first so the cell runs
# on either version.
try:
    vocab = list(word_vectors.key_to_index)
except AttributeError:
    vocab = list(word_vectors.vocab.keys())

In [None]:
# Evaluate the nearest-neighbour baseline on every sampled thesaurus split and
# collect one dict of metrics per split.
stats = []

for thesaurus_sampled in thesaurus_sampled_list:
    y_train, y_test = get_train_test(thesaurus_sampled)
    grouped = y_train.groupby(y_train)  # index entries grouped by their synset id
    synsets = grouped.groups
    print("unique synsets", len(synsets))

    # word -> [(synset_id, similarity)], one entry per synset that proposed the word
    suggestions = defaultdict(list)

    for idx, (synset_id, group) in enumerate(grouped):
        synset_predictions = baseline_synset_vector(word_vectors, group.index.tolist(), k) # [(word, similarity)]

        for word, similarity in synset_predictions:
            suggestions[word].append((synset_id, similarity))

        if idx % 100 == 0:
            print(idx)  # coarse progress indicator

    # Per word, keep only the ids of the three most similar synsets, best first.
    suggestions_sorted_top3 = defaultdict(list)
    for word, suggestion_list in suggestions.items():
        ranked = sorted(suggestion_list, key=lambda x: x[1], reverse=True)  # sort descending
        suggestions_sorted_top3[word] = [synset_id for synset_id, conf in ranked[:3]]

    print(len(suggestions_sorted_top3))

    # Naming the columns in the constructor keeps this working when no
    # suggestions were produced; assigning two names to .columns of an empty,
    # zero-column frame raises a length-mismatch error.
    preds = pd.DataFrame(
        list(suggestions_sorted_top3.items()),
        columns=["word", "y_top3_classes"],
    ).set_index("word")
    print(preds.head())

    # Left join on the index: test entries without any suggestion end up with
    # NaN in y_top3_classes, which the row_check_* helpers treat as "no prediction".
    evaluation = y_test.to_frame().join(preds)
    evaluation.columns = ["y_test", "y_top3_classes"]
    print(evaluation.head())

    within_top3_mean = evaluation.apply(row_check_top3, axis=1).mean()
    top1_mean = evaluation.apply(row_check_top1, axis=1).mean()
    no_predictions = evaluation.apply(row_check_no_prediction, axis=1).sum()

    stats.append({
        "within_top3_mean": within_top3_mean,
        "top1_mean": top1_mean,
        "no_predictions": no_predictions
    })

print(stats)
# Per-metric average over all splits (np.mean cannot reduce a list of dicts).
print(pd.DataFrame(stats).mean())