In [70]:
import re
import os
import json
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from collections import defaultdict

In [71]:
# Seed Python's `random` module so that the random.shuffle of the samples is reproducible.
random.seed(42)

# Fitting a linear model on top of the XLM-R context vectors

In [72]:
# Prediction files for the two RuSemShift dev splits; each is read with json.load
# by construct_preds and joined to the data samples on 'id'.
preds_large_1_filename = 'data/RuShiftEval/RuSemShiftBemXLMR/XLMR_large/preds_dev.rusemshift_1.data'
preds_large_2_filename = 'data/RuShiftEval/RuSemShiftBemXLMR/XLMR_large/preds_dev.rusemshift_2.data'

In [73]:
# Path prefixes without extension: construct_data_samples appends '.data' and '.gold'.
data_1_files = 'data/RuSemShift/dev.rusemshift_1'
data_2_files = 'data/RuSemShift/dev.rusemshift_2'

In [74]:
def construct_preds(pred_filename):
    """Load predictions from a JSON file.

    Args:
        pred_filename: Path to the JSON file to read.

    Returns:
        The decoded JSON content, unchanged.
    """
    # Explicit UTF-8: without it the decoding depends on the platform's
    # default text encoding.
    with open(pred_filename, encoding='utf-8') as f:
        return json.load(f)

In [75]:
def construct_data_samples(data_files):
    """Load data samples and attach their gold scores.

    Args:
        data_files: Path prefix; ``data_files + '.data'`` holds the samples and
            ``data_files + '.gold'`` holds the labels (both JSON lists of dicts
            carrying an ``'id'`` key).

    Returns:
        The list of sample dicts from the ``.data`` file, each extended in
        place with a ``'score'`` key taken from the label with the same id.

    Raises:
        KeyError: If a sample id has no entry in the ``.gold`` file.
    """
    # Explicit UTF-8: the files contain non-ASCII text and the default
    # encoding of open() is platform dependent.
    with open(data_files + '.data', encoding='utf-8') as f_data, \
            open(data_files + '.gold', encoding='utf-8') as f_labels:
        data_json = json.load(f_data)
        labels_json = json.load(f_labels)

    labels_dict = {sample['id']: sample for sample in labels_json}
    for sample in data_json:
        sample['score'] = labels_dict[sample['id']]['score']

    return data_json

In [76]:
def construct_samples(data_files, pred_filename):
    """Join gold-labelled data samples with their predictions by id.

    Args:
        data_files: Path prefix passed to ``construct_data_samples``.
        pred_filename: Path passed to ``construct_preds``.

    Returns:
        The list of sample dicts, each updated in place with every key of the
        prediction that shares its ``'id'`` (prediction values win on clashes).

    Raises:
        KeyError: If a sample id has no matching prediction.
    """
    raw_preds = construct_preds(pred_filename)
    merged = construct_data_samples(data_files)

    pred_by_id = {entry['id']: entry for entry in raw_preds}

    for item in merged:
        item.update(pred_by_id[item['id']])

    return merged

In [77]:
# Build the labelled samples (with predictions attached) for both dev splits.
samples_1 = construct_samples(data_1_files, preds_large_1_filename)
samples_2 = construct_samples(data_2_files, preds_large_2_filename)

# Pool both splits and shuffle in place; the order depends on the state of the
# `random` module at this point.
samples = samples_1 + samples_2
random.shuffle(samples)

# Show one merged sample.
samples[0]

{'id': 'dev.rusemshift_1.3201',
 'lemma': 'публика',
 'pos': 'NOUN',
 'sentence1': 'Я не касаюсь персонажей той хищной и дрянной пьесы, которая держала на привязи жалкое воображение зрителей чрезмерными прыжками и сатанинскими преступлениями, очевидно, смакуемыми известного рода публикой, выносящей отсюда азарт и идеал свой.',
 'sentence2': 'Я ужо писал тебе, что на первом представлении было такое возбуждение в публике и за сценой, какого отродясь не видел суфлер, служивший в театре 32 года.',
 'start1': 196,
 'end1': 204,
 'start2': 71,
 'end2': 78,
 'grp': 'COMPARE',
 'score': 4,
 'context_output1': [-0.13911877572536469,
  -0.02875070832669735,
  0.07912805676460266,
  -0.0393800288438797,
  -0.05979243665933609,
  0.056471794843673706,
  -0.27775463461875916,
  0.08017101883888245,
  -0.09742088615894318,
  0.024649333208799362,
  -0.11390726268291473,
  0.03020729124546051,
  0.04740726947784424,
  0.0282105952501297,
  -0.09196290373802185,
  -0.13751354813575745,
  -0.2135793417

# Constructing variables

In [78]:
class VectorsDotPredictor:
    """Scores a pair of vectors by their dot product.

    With ``normalize=True`` each vector is first divided by its ``norm_ord``
    norm, so ``norm_ord=2`` gives the cosine similarity.
    """

    def __init__(self, threshold=None, normalize=True, norm_ord=2):
        """Store the configuration.

        Args:
            threshold: Cut-off used by ``predict``; must be set before calling
                ``predict``. Not used by ``predict_proba``.
            normalize: Whether to scale each vector to unit norm first.
            norm_ord: ``ord`` passed to ``np.linalg.norm`` when normalizing.
        """
        self.threshold = threshold
        self.normalize = normalize
        self.norm_ord = norm_ord

    def predict(self, out_vector_1, out_vector_2):
        """Return whether the score is strictly greater than ``threshold``."""
        return self.predict_proba(out_vector_1, out_vector_2) > self.threshold

    def predict_proba(self, out_vector_1, out_vector_2):
        """Return the (optionally normalized) dot product of the two vectors.

        Args:
            out_vector_1: First vector (sequence of numbers or 1-D array).
            out_vector_2: Second vector, same length as the first.

        Returns:
            The sum of the element-wise products.
        """
        out_vector_1 = np.array(out_vector_1)
        out_vector_2 = np.array(out_vector_2)

        if self.normalize:
            # Not in-place: `/=` on an integer array cannot store the float
            # quotient and raises a casting error.
            out_vector_1 = out_vector_1 / np.linalg.norm(out_vector_1, ord=self.norm_ord)
            out_vector_2 = out_vector_2 / np.linalg.norm(out_vector_2, ord=self.norm_ord)

        return sum(out_vector_1 * out_vector_2)

In [79]:
class VectorsDistPredictor:
    """Scores a pair of vectors by the ``norm_ord`` norm of their difference.

    With ``normalize=True`` each vector is first divided by its own
    ``norm_ord`` norm.
    """

    def __init__(self, threshold=None, normalize=True, norm_ord=2):
        """Store the configuration.

        Args:
            threshold: Cut-off used by ``predict``; must be set before calling
                ``predict``. Not used by ``predict_proba``.
            normalize: Whether to scale each vector to unit norm first.
            norm_ord: ``ord`` passed to ``np.linalg.norm`` for both the
                normalization and the distance itself.
        """
        self.threshold = threshold
        self.normalize = normalize
        self.norm_ord = norm_ord

    def predict(self, out_vector_1, out_vector_2):
        """Return whether the distance is strictly less than ``threshold``."""
        return self.predict_proba(out_vector_1, out_vector_2) < self.threshold

    def predict_proba(self, out_vector_1, out_vector_2):
        """Return the distance between the (optionally normalized) vectors.

        Args:
            out_vector_1: First vector (sequence of numbers or 1-D array).
            out_vector_2: Second vector, same length as the first.

        Returns:
            ``np.linalg.norm(v1 - v2, ord=norm_ord)``.
        """
        out_vector_1 = np.array(out_vector_1)
        out_vector_2 = np.array(out_vector_2)

        if self.normalize:
            # Not in-place: `/=` on an integer array cannot store the float
            # quotient and raises a casting error.
            out_vector_1 = out_vector_1 / np.linalg.norm(out_vector_1, ord=self.norm_ord)
            out_vector_2 = out_vector_2 / np.linalg.norm(out_vector_2, ord=self.norm_ord)

        return np.linalg.norm(out_vector_1 - out_vector_2, ord=self.norm_ord)

In [80]:
# Similarity/distance scorers used as features of the linear model.
# The list order fixes the feature column order produced by construct_dataset,
# so it must not change between fitting and prediction.
# No thresholds are set: only predict_proba is called on these objects here.
predictors = [
    VectorsDotPredictor(normalize=True, norm_ord=2),
    VectorsDotPredictor(normalize=True, norm_ord=1),
    VectorsDotPredictor(normalize=False),
    VectorsDistPredictor(normalize=True, norm_ord=2),
    VectorsDistPredictor(normalize=False, norm_ord=2),
    VectorsDistPredictor(normalize=False, norm_ord=1),
    VectorsDistPredictor(normalize=True, norm_ord=1),
]

In [81]:
def construct_dataset(samples, predictors):
    """Turn samples into a feature matrix and a target vector.

    Args:
        samples: Iterable of dicts with ``'context_output1'``,
            ``'context_output2'`` and ``'score'`` keys.
        predictors: Objects exposing ``predict_proba(vector_1, vector_2)``;
            each contributes one feature column, in list order.

    Returns:
        Tuple ``(x, y)`` of NumPy arrays: one row of predictor scores per
        sample, and the matching ``'score'`` values.
    """
    feature_rows = []
    targets = []

    for record in samples:
        row = [
            scorer.predict_proba(record['context_output1'], record['context_output2'])
            for scorer in predictors
        ]
        feature_rows.append(row)
        targets.append(record['score'])

    return np.array(feature_rows), np.array(targets)

In [82]:
# One row of predictor scores per sample; y holds the gold scores.
x, y = construct_dataset(samples, predictors)

# Drop samples whose gold score is 0.
# NOTE(review): presumably 0 marks an unusable/undecided annotation — confirm.
mask = y != 0
x, y = x[mask], y[mask]

x.shape

(7561, 7)

In [83]:
from sklearn.model_selection import train_test_split

In [84]:
# Hold out 10% of the samples for validation; random_state is fixed for reproducibility.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=42)

x_train.shape

(6804, 7)

# Fitting the model

In [85]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from scipy import stats

In [86]:
# Grid-search the Ridge regularisation strength on the training split.
# GridSearchCV gets no `scoring`/`cv` arguments, so scikit-learn's defaults are used
# for model selection, while the reported metric is the Spearman correlation.
# NOTE(review): the grid starts at alpha=0 (plain least squares); the scikit-learn
# Ridge documentation advises against alpha=0 for numerical reasons — confirm intended.
# NOTE(review): features are not rescaled before fitting — confirm intended.
lr = Ridge()
lr_params = {
    'alpha': np.linspace(0, 2, 100)
}
model = GridSearchCV(lr, lr_params)

model.fit(x_train, y_train);

# Spearman rank correlation between gold scores and predictions on the training split.
y_train_preds = model.best_estimator_.predict(x_train)
stats.spearmanr(y_train, y_train_preds)

SpearmanrResult(correlation=0.2840672735750184, pvalue=1.8150781094450998e-126)

In [87]:
# Selected hyper-parameters; a value on the edge of the alpha grid means the search
# range did not bracket the optimum.
model.best_params_

{'alpha': 0.0}

In [88]:
# Spearman rank correlation between gold scores and predictions on the held-out split.
y_val_preds = model.best_estimator_.predict(x_val)
stats.spearmanr(y_val, y_val_preds)

SpearmanrResult(correlation=0.31578930187520443, pvalue=5.43765319494285e-19)

In [89]:
# Fitted weights, one per entry of `predictors` (same order as the feature columns).
model.best_estimator_.coef_

array([ 3.33106001e+01,  4.38728583e+00,  4.97818965e-03,  1.43810026e+02,
       -4.12945561e+00,  4.76721439e-03, -4.67973535e+00])

# Test predictions

In [110]:
# Configuration for scoring the evaluation data.
# words_num: number of per-word data files read by load_words (indices 0..words_num-1).
words_num = 99
# JSON file with the predictions for the evaluation samples.
preds_filename = 'data/RuShiftEval/RuShiftEvalPreds/large_raw/preds_eval_23.json'
# Template for the per-word data files; {0} is replaced by the word index.
data_files_format = 'data/RuShiftEval/test/eval_23_True_True/2-3_{0}.data'
# Destination of the per-word mean scores written as JSON.
output_epochs_filename = 'data/RuShiftEval/RuShiftEvalPreds/words_lr_23.json'

In [111]:
def load_words(data_files_format, words_num=99):
    """Read and concatenate the per-word example files.

    Args:
        data_files_format: Path template; ``format(word_id)`` is applied for
            every ``word_id`` in ``range(words_num)``.
        words_num: Number of files to read.

    Returns:
        A single list with the examples of all files, in file order.
    """
    all_words = []

    for word_id in range(words_num):
        current_file = data_files_format.format(word_id)
        # Explicit UTF-8: the default encoding of open() is platform dependent.
        with open(current_file, encoding='utf-8') as f:
            all_words.extend(json.load(f))

    return all_words

In [112]:
# Index every evaluation example by its id (a later duplicate id replaces an earlier one).
all_words = {
    entry['id']: entry
    for entry in load_words(data_files_format, words_num=words_num)
}

len(all_words)

9149

In [113]:
# Load the predictions for the evaluation samples.
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(preds_filename, encoding='utf-8') as f:
    eval_samples = json.load(f)

In [114]:
def model_predict(test_sample, predictors, model):
    """Predict the score of a single sample.

    Args:
        test_sample: Dict with ``'context_output1'`` and ``'context_output2'``.
        predictors: Objects exposing ``predict_proba(vector_1, vector_2)``;
            their scores, in list order, form the feature row.
        model: Object whose ``predict`` accepts a list of feature rows.

    Returns:
        The first element of ``model.predict`` applied to the one-row batch.
    """
    feature_row = [
        scorer.predict_proba(
            test_sample['context_output1'], test_sample['context_output2']
        )
        for scorer in predictors
    ]

    # The model works on batches, so wrap the row and unwrap the single result.
    batch_prediction = model.predict([feature_row])
    return batch_prediction[0]

In [115]:
# Spot-check: predicted score for a single evaluation sample.
model_predict(eval_samples[5], predictors, model.best_estimator_)

3.2919615479765483

In [116]:
# Collect the predicted scores of all evaluation samples, grouped by lemma.
word_to_scores = defaultdict(list)

for sample in eval_samples:
    score = model_predict(sample, predictors, model.best_estimator_)
    # The lemma is looked up through the sample id in the id-indexed data examples.
    current_lemma = all_words[sample['id']]['lemma']
    word_to_scores[current_lemma].append(score)
    
# Number of distinct lemmas that received at least one score.
len(word_to_scores)

99

In [117]:
# Collapse each lemma's list of predicted scores into its mean.
word_to_score = {
    lemma: np.mean(lemma_scores)
    for lemma, lemma_scores in word_to_scores.items()
}

len(word_to_score)

99

In [118]:
# Persist the per-word mean scores as UTF-8 JSON; ensure_ascii=False keeps the
# non-ASCII words readable instead of \u-escaped.
with open(output_epochs_filename, 'w', encoding='utf-8') as f:
    json.dump(word_to_score, f, indent=4, ensure_ascii=False)

In [119]:
# Display the per-word mean scores.
word_to_score

{'авторитет': 2.8686360790309733,
 'амбиция': 3.1273112429138483,
 'апостол': 3.1170421316394368,
 'благодарность': 3.1961669311507226,
 'блин': 2.744671681290989,
 'блондин': 3.105699484602379,
 'брат': 2.989800243306872,
 'бригада': 3.063579508897924,
 'веер': 3.0578853607333314,
 'век': 3.1611447845429486,
 'вызов': 2.8840723306393845,
 'головка': 3.0722445188625063,
 'грех': 2.893680830822051,
 'дух': 2.763324268575263,
 'дядька': 3.3252550608435296,
 'дядя': 3.2996940106350308,
 'железо': 3.039660611382552,
 'жесть': 3.1190348504721492,
 'живот': 2.8634635303734064,
 'заблуждение': 3.4601635026634394,
 'издательство': 3.3758760288951257,
 'итальянец': 3.043515557959687,
 'кабан': 3.173578166209397,
 'карман': 3.1680920954725327,
 'крушение': 3.1980859968286786,
 'крыша': 3.183495511798262,
 'кулиса': 3.295881400449638,
 'лечение': 2.8930080152827515,
 'линейка': 2.832185401645147,
 'лишение': 3.482014676070394,
 'локоть': 3.190366346683601,
 'любовник': 3.184057968901263,
 'любовь

# Getting final prediction

In [120]:
# Per-word score files for the three comparisons. The *_23 path is the same file
# that is written above as `output_epochs_filename`.
# NOTE(review): the *_12 and *_13 files are not written in the visible cells —
# presumably produced by re-running the test-prediction cells with other paths; confirm.
output_epochs_12_filename = 'data/RuShiftEval/RuShiftEvalPreds/words_lr_12.json'
output_epochs_13_filename = 'data/RuShiftEval/RuShiftEvalPreds/words_lr_13.json'
output_epochs_23_filename = 'data/RuShiftEval/RuShiftEvalPreds/words_lr_23.json'

# Destination of the combined tab-separated prediction file.
prediction_filename = 'data/RuShiftEval/RuShiftEvalPreds/MLM_preds/prediction_2.tsv'

In [121]:
def load_scores(filename):
    """Load a word-to-score mapping from a JSON file.

    Args:
        filename: Path to the JSON file to read.

    Returns:
        The decoded JSON content, unchanged.
    """
    # The score files are written as UTF-8 with ensure_ascii=False, so they
    # must be decoded as UTF-8 regardless of the platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        return json.load(f)

In [122]:
# Load the per-word scores of the three comparisons.
scores_12 = load_scores(output_epochs_12_filename)
scores_13 = load_scores(output_epochs_13_filename)
scores_23 = load_scores(output_epochs_23_filename)

# Sanity check: the three files should cover the same number of words.
len(scores_12), len(scores_13), len(scores_23)

(99, 99, 99)

In [123]:
# Write one tab-separated row per word: the word followed by its scores from
# scores_12, scores_23 and scores_13, in that column order.
# NOTE(review): the 12 / 23 / 13 column order presumably follows the required
# submission format — confirm.
# Words are taken from scores_12; a word missing from scores_23 or scores_13 raises KeyError.
with open(prediction_filename, 'w', encoding='utf-8') as f:
    for word, score_12 in scores_12.items():
        f.write(f'{word}\t{score_12}\t{scores_23[word]}\t{scores_13[word]}\n')