All submissions were made with the original file. This notebook covers only the case p = 1, norm_p = 2.

In [1]:
import re
import os
import json
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from collections import defaultdict

# Merging data from NKRYa into one JSON file

In [2]:
# Template for the per-word input files; `{0}` is filled with the word index.
data_files_format = (
    'data/RuShiftEval/test/eval_23_True_True/'
    '2-3_{0}.data'
)
# How many per-word files to read (indices 0 .. words_num - 1).
words_num = 99

In [3]:
def load_words(data_files_format, words_num=99):
    """Load the per-word example files and concatenate them into one list.

    Args:
        data_files_format: Path template with one positional placeholder
            (``{0}``) that is filled with the word index.
        words_num: Number of files to read; indices ``0 .. words_num - 1``
            are loaded in increasing order.

    Returns:
        A single flat list with the items of every file, in file order.

    Raises:
        OSError: If one of the files cannot be opened.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    all_words = []

    for word_id in range(words_num):
        current_file = data_files_format.format(word_id)
        # Read as UTF-8 explicitly: the examples contain Cyrillic text, and the
        # platform default encoding is not UTF-8 everywhere.
        with open(current_file, encoding='utf-8') as f:
            # Assumes each file holds a JSON array of example records.
            all_words.extend(json.load(f))

    return all_words

In [45]:
# Read every configured per-word file into one flat list of examples.
all_words = load_words(data_files_format, words_num)

len(all_words)

9149

In [46]:
# Spot-check a single merged example to inspect the record layout.
all_words[200]

{'id': 'test.scd_2.31',
 'lemma': 'апостол',
 'pos': 'NOUN',
 'sentence1': 'Ложные пророки ее и апостолы -- Юнг-Штиллинг, Эккартсгаузен, Гион, Беме, Лабзин, Госнер, Фесслер, методисты, гернгутеры"...',
 'sentence2': 'Муки совести, трагедийное чувство вины приобретают у Петрухи едва ли не христианский характер и отделяют его от остальных "апостолов" нового мира, идущих "без имени святого".',
 'start1': 20,
 'end1': 28,
 'start2': 123,
 'end2': 132,
 'grp': 'COMPARE'}

In [47]:
# Persist the merged examples as one human-readable JSON file; non-ASCII
# characters are written as-is rather than as \u escapes.
with open('data/RuShiftEval/test/eval_23.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(all_words, indent=4, ensure_ascii=False))

# Getting average score per word

In [196]:
# Model outputs for every example pair (read by the scoring cells below).
preds_filename = (
    'data/RuShiftEval/RuShiftEvalPreds/large/'
    'preds_eval_23.json'
)
# Template for the per-word input files; `{0}` is filled with the word index.
data_files_format = (
    'data/RuShiftEval/test/eval_23_True_True/'
    '2-3_{0}.data'
)
# Destination of the per-word mean scores computed in this section.
output_epochs_filename = (
    'data/RuShiftEval/RuShiftEvalPreds/'
    'words_23.json'
)

In [197]:
# Keep the raw list under its own name so `all_words` is only ever bound to
# the id -> example mapping in this section.
word_examples = load_words(data_files_format, words_num=words_num)
# Index the examples by id so each prediction can be joined to its lemma.
all_words = {word['id']: word for word in word_examples}
# Duplicate ids would silently overwrite each other in the dict; fail loudly.
assert len(all_words) == len(word_examples), 'duplicate example ids in input files'

len(all_words)

9149

In [198]:
class VectorsDotPredictor:
    """Scores a pair of vectors by their dot product.

    With ``normalize=True`` each vector is first divided by its own norm of
    order ``norm_ord``; for ``norm_ord=2`` the score is then the cosine
    similarity of the two vectors.

    Args:
        threshold: Decision boundary used by ``predict``. May be left as
            ``None`` when only ``predict_proba`` is needed.
        normalize: Whether to scale each vector to unit norm before scoring.
        norm_ord: ``ord`` argument passed to ``np.linalg.norm`` for the
            normalization.
    """

    def __init__(self, threshold=None, normalize=True, norm_ord=2):
        self.threshold = threshold
        self.normalize = normalize
        self.norm_ord = norm_ord

    def predict(self, out_vector_1, out_vector_2):
        """Return whether the score of the pair is strictly above ``threshold``.

        Raises:
            TypeError: If no threshold was configured.
        """
        if self.threshold is None:
            # Comparing a number with None raises TypeError anyway; keep the
            # exception type but make the cause explicit.
            raise TypeError('predict() requires a threshold; got threshold=None')
        return self.predict_proba(out_vector_1, out_vector_2) > self.threshold

    def predict_proba(self, out_vector_1, out_vector_2):
        """Return the (optionally normalized) dot product of the two vectors.

        Despite the name, the result is a raw similarity score and not a
        probability. Inputs are expected to be 1-D sequences of equal length
        and are never modified.
        """
        out_vector_1 = np.asarray(out_vector_1)
        out_vector_2 = np.asarray(out_vector_2)

        if self.normalize:
            # Out-of-place division: works for integer input (in-place `/=`
            # cannot store floats into an integer array) and leaves the
            # caller's arrays untouched.
            out_vector_1 = out_vector_1 / np.linalg.norm(out_vector_1, ord=self.norm_ord)
            out_vector_2 = out_vector_2 / np.linalg.norm(out_vector_2, ord=self.norm_ord)

        return np.dot(out_vector_1, out_vector_2)

In [199]:
class VectorsDistPredictor:
    """Scores a pair of vectors by the norm of their difference.

    With ``normalize=True`` each vector is first divided by its own norm of
    order ``norm_ord``; the distance is then the norm of order ``dist_ord``
    of the difference between the two vectors.

    Args:
        threshold: Decision boundary used by ``predict``. May be left as
            ``None`` when only ``predict_proba`` is needed.
        normalize: Whether to scale each vector to unit norm before scoring.
        norm_ord: ``ord`` argument passed to ``np.linalg.norm`` for the
            normalization.
        dist_ord: ``ord`` argument passed to ``np.linalg.norm`` for the
            distance; falls back to ``norm_ord`` when ``None``.
    """

    def __init__(self, threshold=None, normalize=True, norm_ord=2, dist_ord=None):
        self.threshold = threshold
        self.normalize = normalize
        self.norm_ord = norm_ord
        # By default measure the distance with the same norm as the scaling.
        self.dist_ord = norm_ord if dist_ord is None else dist_ord

    def predict(self, out_vector_1, out_vector_2):
        """Return whether the distance of the pair is strictly below ``threshold``.

        Raises:
            TypeError: If no threshold was configured.
        """
        if self.threshold is None:
            # Comparing a number with None raises TypeError anyway; keep the
            # exception type but make the cause explicit.
            raise TypeError('predict() requires a threshold; got threshold=None')
        return self.predict_proba(out_vector_1, out_vector_2) < self.threshold

    def predict_proba(self, out_vector_1, out_vector_2):
        """Return the distance between the (optionally normalized) vectors.

        Despite the name, the result is a raw distance and not a probability.
        Inputs are expected to be 1-D sequences of equal length and are never
        modified.
        """
        out_vector_1 = np.asarray(out_vector_1)
        out_vector_2 = np.asarray(out_vector_2)

        if self.normalize:
            # Out-of-place division: works for integer input (in-place `/=`
            # cannot store floats into an integer array) and leaves the
            # caller's arrays untouched.
            out_vector_1 = out_vector_1 / np.linalg.norm(out_vector_1, ord=self.norm_ord)
            out_vector_2 = out_vector_2 / np.linalg.norm(out_vector_2, ord=self.norm_ord)

        return np.linalg.norm(out_vector_1 - out_vector_2, ord=self.dist_ord)

In [200]:
# Read the predictions as UTF-8 explicitly instead of relying on the
# platform default encoding.
with open(preds_filename, encoding='utf-8') as f:
    eval_samples = json.load(f)

In [201]:
# Scorer for this run: vectors scaled to unit L2 norm, compared with the L1
# distance. Alternative scorers are VectorsDistPredictor without `dist_ord`
# (distance order then equals `norm_ord`) and VectorsDotPredictor.
predictor = VectorsDistPredictor(threshold=None, normalize=True, norm_ord=2, dist_ord=1)
word_to_dists = defaultdict(list)

# Collect the per-example distances under the lemma the example belongs to.
for sample in eval_samples:
    current_lemma = all_words[sample['id']]['lemma']
    dist = predictor.predict_proba(sample['context_output1'], sample['context_output2'])
    word_to_dists[current_lemma].append(dist)

len(word_to_dists)

99

In [202]:
# One score per lemma: the mean of all distances collected for it.
word_to_score = {word: np.mean(dists) for word, dists in word_to_dists.items()}

len(word_to_score)

99

In [203]:
# Save the lemma -> mean score mapping as readable JSON; lemmas are written
# as-is rather than as \u escapes.
with open(output_epochs_filename, 'w', encoding='utf-8') as f:
    f.write(json.dumps(word_to_score, indent=4, ensure_ascii=False))

# Getting final results

In [204]:
# Per-word score files, one per pair (12, 13, 23), read back below.
output_epochs_12_filename = (
    'data/RuShiftEval/RuShiftEvalPreds/'
    'words_12.json'
)
output_epochs_13_filename = (
    'data/RuShiftEval/RuShiftEvalPreds/'
    'words_13.json'
)
output_epochs_23_filename = (
    'data/RuShiftEval/RuShiftEvalPreds/'
    'words_23.json'
)

# Destination of the final tab-separated prediction file.
prediction_filename = (
    'data/RuShiftEval/RuShiftEvalPreds/post_preds/'
    'prediction_large_glm_l1n2_v2.tsv'
)

In [205]:
def load_scores(filename):
    """Load a JSON score file and return its parsed content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The score files are written as UTF-8 with non-ASCII keys, so read them
    # back as UTF-8 instead of relying on the platform default encoding.
    with open(filename, encoding='utf-8') as f:
        return json.load(f)

In [206]:
# Load the three per-word score files in the order 12, 13, 23.
scores_12, scores_13, scores_23 = (
    load_scores(filename)
    for filename in (
        output_epochs_12_filename,
        output_epochs_13_filename,
        output_epochs_23_filename,
    )
)

len(scores_12), len(scores_13), len(scores_23)

(99, 99, 99)

In [207]:
with open(prediction_filename, 'w', encoding='utf-8') as f:
    for word in scores_12:
        # The stored scores are inverted, so a larger mean distance yields a
        # smaller output value.
        score_12, score_13, score_23 = (
            1 / scores[word] for scores in (scores_12, scores_13, scores_23)
        )
        # Column order in the output is 12, 23, 13 (not 12, 13, 23).
        f.write(f'{word}\t{score_12}\t{score_23}\t{score_13}\n')