In [1]:
import re
import os
import json
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm.auto import tqdm

In [2]:
# Seed Python's `random` module so that the random.shuffle of the pooled
# samples further down yields the same order on every run.
random.seed(42)

# Fitting a linear model on top of the XLM-R base and large context-vector features

In [3]:
# Prediction files for the two dev splits, as named by the "large" model directory.
# They are parsed as JSON by construct_preds.
preds_large_1_filename = 'data/RuShiftEval/RuSemShiftBemXLMR/XLMR_large_raw/preds_dev.rusemshift_1.data'
preds_large_2_filename = 'data/RuShiftEval/RuSemShiftBemXLMR/XLMR_large_raw/preds_dev.rusemshift_2.data'

# Same two dev splits, as named by the "base" model directory.
preds_base_1_filename = 'data/RuShiftEval/RuSemShiftBemXLMR/XLMR_base_raw/preds_dev.rusemshift_1.data'
preds_base_2_filename = 'data/RuShiftEval/RuSemShiftBemXLMR/XLMR_base_raw/preds_dev.rusemshift_2.data'

In [4]:
# Path stems (no extension) of the two dev splits; construct_data_samples
# appends '.data' for the samples and '.gold' for the labels.
data_1_files = 'data/RuSemShift/dev.rusemshift_1'
data_2_files = 'data/RuSemShift/dev.rusemshift_2'

In [5]:
def construct_preds(pred_filename):
    """Load a predictions file and return its parsed JSON content.

    The file is opened as UTF-8 explicitly so that parsing does not depend on
    the platform's default locale encoding.

    Args:
        pred_filename: Path to a JSON file.

    Returns:
        The deserialized JSON value. Callers in this notebook index each
        element by 'id', 'context_output1' and 'context_output2'.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(pred_filename, encoding='utf-8') as f:
        return json.load(f)

In [6]:
def construct_data_samples(data_files):
    """Load the samples of one split and attach the gold score to each.

    Reads `<data_files>.data` (samples) and `<data_files>.gold` (labels), both
    as UTF-8 JSON, and joins them on the 'id' field. The encoding is passed
    explicitly because the sample sentences are non-ASCII text and the default
    encoding of `open` is platform dependent.

    Args:
        data_files: Path stem shared by the '.data' and '.gold' files.

    Returns:
        The list parsed from the '.data' file; every element gets an extra
        'score' key copied from the label entry with the same 'id'.

    Raises:
        KeyError: If a sample id has no entry in the '.gold' file, or the
            matching label entry has no 'score'.
    """
    with open(data_files + '.data', encoding='utf-8') as f_data, \
            open(data_files + '.gold', encoding='utf-8') as f_labels:
        data_json = json.load(f_data)
        labels_json = json.load(f_labels)

    # Index labels by id so the join is a single dict lookup per sample.
    labels_dict = {label['id']: label for label in labels_json}
    for sample in data_json:
        sample['score'] = labels_dict[sample['id']]['score']

    return data_json

In [7]:
def construct_samples(data_files, pred_large_filename, pred_base_filename):
    """Join labelled samples with the context outputs of both prediction files.

    Args:
        data_files: Path stem passed to construct_data_samples.
        pred_large_filename: Predictions file whose outputs are stored under
            the 'large_*' keys.
        pred_base_filename: Predictions file whose outputs are stored under
            the 'base_*' keys.

    Returns:
        The list from construct_data_samples, each element extended in place
        with 'base_context_output1/2' and 'large_context_output1/2'.

    Raises:
        KeyError: If a sample id is missing from either predictions file.
    """
    large_entries = construct_preds(pred_large_filename)
    base_entries = construct_preds(pred_base_filename)
    samples = construct_data_samples(data_files)

    # Index both prediction lists by id for the per-sample join.
    large_by_id = {entry['id']: entry for entry in large_entries}
    base_by_id = {entry['id']: entry for entry in base_entries}

    for sample in samples:
        large_entry = large_by_id[sample['id']]
        base_entry = base_by_id[sample['id']]

        # (destination key, source entry, source key), copied in this order.
        transfers = (
            ('base_context_output1', base_entry, 'context_output1'),
            ('base_context_output2', base_entry, 'context_output2'),
            ('large_context_output1', large_entry, 'context_output1'),
            ('large_context_output2', large_entry, 'context_output2'),
        )
        for target_key, entry, source_key in transfers:
            sample[target_key] = entry[source_key]

    return samples

In [8]:
# Build the labelled, feature-carrying samples for each dev split.
samples_1 = construct_samples(data_1_files, preds_large_1_filename, preds_base_1_filename)
samples_2 = construct_samples(data_2_files, preds_large_2_filename, preds_base_2_filename)

# Pool both splits and shuffle in place; the order depends on the state of the
# `random` module, which is seeded at the top of the notebook.
samples = samples_1 + samples_2
random.shuffle(samples)

# Show one merged sample to inspect the available fields.
samples[0]

{'id': 'dev.rusemshift_1.3201',
 'lemma': 'публика',
 'pos': 'NOUN',
 'sentence1': 'Я не касаюсь персонажей той хищной и дрянной пьесы, которая держала на привязи жалкое воображение зрителей чрезмерными прыжками и сатанинскими преступлениями, очевидно, смакуемыми известного рода публикой, выносящей отсюда азарт и идеал свой.',
 'sentence2': 'Я ужо писал тебе, что на первом представлении было такое возбуждение в публике и за сценой, какого отродясь не видел суфлер, служивший в театре 32 года.',
 'start1': 196,
 'end1': 204,
 'start2': 71,
 'end2': 78,
 'grp': 'COMPARE',
 'score': 4,
 'base_context_output1': [-0.08449062705039978,
  -0.004835515283048153,
  0.002801349852234125,
  0.014338865876197815,
  -0.12707017362117767,
  0.10699810832738876,
  0.05359426140785217,
  -0.028332728892564774,
  -0.04114815592765808,
  -0.2508789002895355,
  0.05741122364997864,
  -0.021813496947288513,
  0.7542309165000916,
  -0.07511778175830841,
  0.09063654392957687,
  0.011449511162936687,
  0.025

# Constructing variables

In [9]:
class VectorsDotPredictor:
    """Scores a pair of vectors by the dot product of (optionally normalized) vectors.

    With `normalize=True` and `norm_ord=2` the score is the cosine similarity.
    A zero-norm vector is not special-cased: normalizing it divides by zero
    and the score becomes NaN.
    """

    def __init__(self, threshold=None, normalize=True, norm_ord=2):
        # Cut-off used by `predict`; it must be set before `predict` is called.
        self.threshold = threshold
        # Whether each vector is divided by its own norm before the product.
        self.normalize = normalize
        # `ord` argument forwarded to np.linalg.norm when normalizing.
        self.norm_ord = norm_ord

    def predict(self, out_vector_1, out_vector_2):
        """Return whether the score is strictly greater than `threshold`.

        Raises:
            TypeError: If no threshold was configured.
        """
        if self.threshold is None:
            # Comparing against None already failed with a TypeError; raise it
            # up front with a message that names the actual problem.
            raise TypeError('threshold must be set before calling predict()')
        return self.predict_proba(out_vector_1, out_vector_2) > self.threshold

    def predict_proba(self, out_vector_1, out_vector_2):
        """Return the dot-product score of the two vectors.

        Args:
            out_vector_1: First vector (sequence of numbers or ndarray).
            out_vector_2: Second vector, same length as the first.

        Returns:
            Sum over the first axis of the element-wise product, computed
            after normalization when `normalize` is set.
        """
        vec_1 = np.asarray(out_vector_1)
        vec_2 = np.asarray(out_vector_2)

        if self.normalize:
            # Out-of-place division: `/=` on an integer array raises a casting
            # error, and this also guarantees the caller's data is untouched.
            vec_1 = vec_1 / np.linalg.norm(vec_1, ord=self.norm_ord)
            vec_2 = vec_2 / np.linalg.norm(vec_2, ord=self.norm_ord)

        # Vectorized reduction instead of the Python-level builtin sum; the
        # result can differ from sequential summation in the last bits only.
        return np.sum(vec_1 * vec_2, axis=0)

In [10]:
class VectorsDistPredictor:
    """Scores a pair of vectors by the norm of their difference.

    The same `norm_ord` is used both for the optional per-vector normalization
    and for the distance itself. A zero-norm vector is not special-cased:
    normalizing it divides by zero and the score becomes NaN.
    """

    def __init__(self, threshold=None, normalize=True, norm_ord=2):
        # Cut-off used by `predict`; it must be set before `predict` is called.
        self.threshold = threshold
        # Whether each vector is divided by its own norm before the distance.
        self.normalize = normalize
        # `ord` argument forwarded to np.linalg.norm (normalization and distance).
        self.norm_ord = norm_ord

    def predict(self, out_vector_1, out_vector_2):
        """Return whether the distance is strictly smaller than `threshold`.

        Raises:
            TypeError: If no threshold was configured.
        """
        if self.threshold is None:
            # Comparing against None already failed with a TypeError; raise it
            # up front with a message that names the actual problem.
            raise TypeError('threshold must be set before calling predict()')
        return self.predict_proba(out_vector_1, out_vector_2) < self.threshold

    def predict_proba(self, out_vector_1, out_vector_2):
        """Return the distance between the two vectors.

        Args:
            out_vector_1: First vector (sequence of numbers or ndarray).
            out_vector_2: Second vector, same length as the first.

        Returns:
            np.linalg.norm of the difference, computed after normalization
            when `normalize` is set.
        """
        vec_1 = np.asarray(out_vector_1)
        vec_2 = np.asarray(out_vector_2)

        if self.normalize:
            # Out-of-place division: `/=` on an integer array raises a casting
            # error, and this also guarantees the caller's data is untouched.
            vec_1 = vec_1 / np.linalg.norm(vec_1, ord=self.norm_ord)
            vec_2 = vec_2 / np.linalg.norm(vec_2, ord=self.norm_ord)

        return np.linalg.norm(vec_1 - vec_2, ord=self.norm_ord)

In [11]:
# Similarity / distance functions used as feature extractors. Each one is
# applied to both the base and the large vector pair, so the 7 entries below
# produce 14 feature columns; their order here fixes the column order.
predictors = [
    VectorsDotPredictor(normalize=True, norm_ord=2),
    VectorsDotPredictor(normalize=True, norm_ord=1),
    # norm_ord is only used for normalization in the dot predictor, so the
    # default is irrelevant here.
    VectorsDotPredictor(normalize=False),
    VectorsDistPredictor(normalize=True, norm_ord=2),
    VectorsDistPredictor(normalize=False, norm_ord=2),
    VectorsDistPredictor(normalize=False, norm_ord=1),
    VectorsDistPredictor(normalize=True, norm_ord=1),
]

In [12]:
def construct_dataset(samples, predictors):
    """Turn labelled samples into a feature matrix and a target vector.

    For every sample, each predictor's `predict_proba` is evaluated first on
    the base vector pair and then on the large vector pair, giving two
    consecutive columns per predictor.

    Args:
        samples: Iterable of dicts with the four '*_context_output*' keys and
            a 'score' key.
        predictors: Iterable of objects exposing `predict_proba(v1, v2)`.

    Returns:
        Tuple `(x, y)` of numpy arrays: the per-sample feature rows and the
        per-sample scores.
    """
    vector_pairs = (
        ('base_context_output1', 'base_context_output2'),
        ('large_context_output1', 'large_context_output2'),
    )

    feature_rows = []
    targets = []

    for sample in samples:
        feature_rows.append([
            predictor.predict_proba(sample[first_key], sample[second_key])
            for predictor in predictors
            for first_key, second_key in vector_pairs
        ])
        targets.append(sample['score'])

    return np.array(feature_rows), np.array(targets)

In [13]:
x, y = construct_dataset(samples, predictors)

# Keep only rows whose gold score is non-zero.
# NOTE(review): presumably 0 marks an annotation without a usable judgment - confirm.
mask = y != 0
x, y = x[mask], y[mask]

x.shape

(7561, 14)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 5% of the rows for validation; random_state fixes the split.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.05, random_state=24)

x_train.shape

(7182, 14)

# Fitting the model

In [17]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from scipy import stats

In [18]:
# Ridge regression on the similarity features; alpha is chosen by grid search.
lr = Ridge(random_state=42)
lr_params = {
    # NOTE(review): the grid starts at alpha=0, i.e. unregularized least
    # squares; the scikit-learn Ridge docs advise against alpha=0 - consider
    # starting the grid at a small positive value.
    'alpha': np.linspace(0, 2, 100)
}
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score rather than by the Spearman correlation reported
# below - confirm this is intended.
model = GridSearchCV(lr, lr_params)

model.fit(x_train, y_train);

# Spearman rank correlation between gold and predicted scores on the training split.
y_train_preds = model.best_estimator_.predict(x_train)
stats.spearmanr(y_train, y_train_preds)

SpearmanrResult(correlation=0.3812070582463536, pvalue=3.70860469645347e-247)

In [19]:
# Spearman rank correlation on the held-out split. With GridSearchCV's default
# refit, model.predict delegates to the refitted best estimator, i.e. the same
# model that was used for the training-split predictions.
y_val_preds = model.predict(x_val)
stats.spearmanr(y_val, y_val_preds)

SpearmanrResult(correlation=0.36720490366018177, pvalue=1.5284006774138044e-13)

# Test predictions

In [42]:
# Configuration of one test-prediction run; all paths below select the "23" files.
# NOTE(review): the final section also reads words_lr_all_12.json and
# words_lr_all_13.json, presumably written by re-running this section with the
# matching paths - confirm, since a single Run All only produces the "23" file.
# Number of per-word data files; load_words formats indices 0..words_num-1 into the path.
words_num = 99
preds_base_filename = 'data/RuShiftEval/RuShiftEvalPreds/base_raw/preds_eval_23.json'
preds_large_filename = 'data/RuShiftEval/RuShiftEvalPreds/large_raw/preds_eval_23.json'
data_files_format = 'data/RuShiftEval/test/eval_23_True_True/2-3_{0}.data'
# Destination of the per-lemma mean scores computed below.
output_epochs_filename = 'data/RuShiftEval/RuShiftEvalPreds/words_lr_all_23.json'

In [43]:
def load_words(data_files_format, words_num=99):
    """Load and concatenate the per-word example files.

    Files are opened as UTF-8 explicitly: the examples contain non-ASCII text
    and the default encoding of `open` is platform dependent.

    Args:
        data_files_format: Path template with one positional `{0}` field that
            receives the word index.
        words_num: Number of files to read; indices 0..words_num-1 are used.

    Returns:
        One flat list with the elements of every file, in index order.

    Raises:
        OSError: If one of the files cannot be opened.
    """
    all_words = []

    for word_id in range(words_num):
        current_file = data_files_format.format(word_id)
        with open(current_file, encoding='utf-8') as f:
            all_words.extend(json.load(f))

    return all_words

In [44]:
# Index every loaded test example by its id; the name is bound directly to
# the lookup dict instead of first holding the raw list.
all_words = {
    example['id']: example
    for example in load_words(data_files_format, words_num=words_num)
}

len(all_words)

9149

In [45]:
def construct_test_samples(pred_large_filename, pred_base_filename):
    """Pair up the context outputs of both prediction files for unlabelled data.

    Args:
        pred_large_filename: Predictions file whose outputs are stored under
            the 'large_*' keys; its ids drive the iteration.
        pred_base_filename: Predictions file whose outputs are stored under
            the 'base_*' keys.

    Returns:
        A list with one dict per id of the large predictions, holding 'id',
        'base_context_output1/2' and 'large_context_output1/2'.

    Raises:
        KeyError: If an id of the large predictions is missing from the base
            predictions.
    """
    large_entries = construct_preds(pred_large_filename)
    base_entries = construct_preds(pred_base_filename)

    large_by_id = {entry['id']: entry for entry in large_entries}
    base_by_id = {entry['id']: entry for entry in base_entries}

    def _merge(pred_id, large_entry):
        # Combine the two entries that share `pred_id` into one flat sample.
        base_entry = base_by_id[pred_id]
        return {
            'id': pred_id,
            'base_context_output1': base_entry['context_output1'],
            'base_context_output2': base_entry['context_output2'],
            'large_context_output1': large_entry['context_output1'],
            'large_context_output2': large_entry['context_output2'],
        }

    return [_merge(pred_id, large_entry) for pred_id, large_entry in large_by_id.items()]

In [46]:
# Unlabelled evaluation samples: one dict per id found in the large predictions file.
eval_samples = construct_test_samples(preds_large_filename, preds_base_filename)

In [47]:
def sample_test_data(test_sample, predictors):
    """Compute the feature row of a single sample.

    Each predictor's `predict_proba` is evaluated first on the base vector
    pair and then on the large vector pair, giving two consecutive values per
    predictor.

    Args:
        test_sample: Dict with the four '*_context_output*' keys.
        predictors: Iterable of objects exposing `predict_proba(v1, v2)`.

    Returns:
        A flat list with `2 * len(predictors)` feature values.
    """
    vector_pairs = (
        ('base_context_output1', 'base_context_output2'),
        ('large_context_output1', 'large_context_output2'),
    )

    return [
        predictor.predict_proba(test_sample[first_key], test_sample[second_key])
        for predictor in predictors
        for first_key, second_key in vector_pairs
    ]

In [48]:
# Sanity check: predicted score for a single, arbitrarily chosen sample (index 5).
model.predict([sample_test_data(eval_samples[5], predictors)])[0]

3.371121948465138

In [49]:
# Predicted scores grouped by lemma.
word_to_scores = defaultdict(list)
# Feature rows, kept in the same order as eval_samples.
test_data = []

for sample in eval_samples:
    sample_data = sample_test_data(sample, predictors)
    test_data.append(sample_data)
    
# One batched prediction call for all samples; scores[i] belongs to eval_samples[i].
scores =  model.predict(test_data)
    
for i, sample in enumerate(eval_samples):
    # The prediction samples carry no lemma, so it is looked up by id in all_words.
    current_lemma = all_words[sample['id']]['lemma']
    word_to_scores[current_lemma].append(scores[i])
    
len(word_to_scores)

99

In [50]:
# Collapse the per-example predictions into one mean score per lemma.
# A comprehension keeps its loop names local, so the `scores` array produced by
# the batched prediction is no longer overwritten by the last lemma's list.
word_to_score = {
    lemma: np.mean(lemma_scores)
    for lemma, lemma_scores in word_to_scores.items()
}

len(word_to_score)

99

In [51]:
# Persist the per-lemma scores as UTF-8 JSON; ensure_ascii=False writes the
# lemma keys as-is instead of \u escapes.
with open(output_epochs_filename, 'w', encoding='utf-8') as f:
    json.dump(word_to_score, f, indent=4, ensure_ascii=False)

In [52]:
# Display the lemma -> mean score mapping.
# NOTE(review): this prints every entry; consider showing only a few items.
word_to_score

{'авторитет': 2.8996426445855628,
 'амбиция': 3.040346278298691,
 'апостол': 3.3083070042349827,
 'благодарность': 3.285061457660676,
 'блин': 2.4787318151495223,
 'блондин': 3.2640779977091468,
 'брат': 2.948716040009842,
 'бригада': 3.142243913464319,
 'веер': 2.906197814689192,
 'век': 3.073071961609001,
 'вызов': 2.7116834586400183,
 'головка': 2.9213167776561697,
 'грех': 2.960911926611907,
 'дух': 2.5794583701908373,
 'дядька': 3.45353519155353,
 'дядя': 3.41761053264227,
 'железо': 2.862860816567027,
 'жесть': 2.8992341458606545,
 'живот': 2.8596282078318085,
 'заблуждение': 3.443762540780672,
 'издательство': 3.469267983058717,
 'итальянец': 3.059460815716713,
 'кабан': 3.0617332894284806,
 'карман': 3.1648686175182816,
 'крушение': 3.0685939231007797,
 'крыша': 3.204024814892273,
 'кулиса': 3.118335991156573,
 'лечение': 3.0784975964710424,
 'линейка': 2.6542950840703923,
 'лишение': 3.326066205126783,
 'локоть': 3.0988934572811178,
 'любовник': 3.1886190657684956,
 'любовь': 

# Getting final prediction

In [54]:
# Per-lemma score files combined into the final prediction.
# NOTE(review): only the "23" path matches the file written earlier in this
# notebook; the "12" and "13" files are presumably left over from runs with
# other settings - confirm they exist and are up to date before running.
output_epochs_12_filename = 'data/RuShiftEval/RuShiftEvalPreds/words_lr_all_12.json'
output_epochs_13_filename = 'data/RuShiftEval/RuShiftEvalPreds/words_lr_all_13.json'
output_epochs_23_filename = 'data/RuShiftEval/RuShiftEvalPreds/words_lr_all_23.json'

# Destination of the combined tab-separated prediction file.
prediction_filename = 'data/RuShiftEval/RuShiftEvalPreds/MLM_preds/prediction_3_v2.tsv'

In [55]:
def load_scores(filename):
    """Load a per-lemma score file and return its parsed JSON content.

    The score files in this notebook are written as UTF-8 with non-ASCII keys
    left unescaped, so they are read back as UTF-8 explicitly instead of with
    the platform's default encoding.

    Args:
        filename: Path to a JSON file.

    Returns:
        The deserialized JSON value (a lemma -> score mapping for the files
        written by this notebook).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(filename, encoding='utf-8') as f:
        return json.load(f)

In [56]:
# Load the three per-lemma score mappings.
scores_12 = load_scores(output_epochs_12_filename)
scores_13 = load_scores(output_epochs_13_filename)
scores_23 = load_scores(output_epochs_23_filename)

# The three sizes should agree: the writer below looks every lemma of
# scores_12 up in the other two mappings.
len(scores_12), len(scores_13), len(scores_23)

(99, 99, 99)

In [57]:
# Write one tab-separated row per lemma of scores_12, with the columns
# word, score_12, score_23, score_13 (in that order).
# NOTE(review): the 12 / 23 / 13 column order is presumably dictated by the
# submission format - confirm. A lemma missing from scores_23 or scores_13
# raises KeyError.
with open(prediction_filename, 'w', encoding='utf-8') as f:
    for word, score_12 in scores_12.items():
        f.write(f'{word}\t{score_12}\t{scores_23[word]}\t{scores_13[word]}\n')