In [3]:
!python3.7 -m pip install transformers==4.9.2

You should consider upgrading via the '/usr/local/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from collections import defaultdict

%matplotlib inline

In [2]:
%config Completer.use_jedi = False

In [3]:
import pandas as pd
import numpy as np
# from transformers import GPT2LMHeadModel, GPT2Tokenizer


# model_name_or_path = "sberbank-ai/rugpt3large_based_on_gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
# model = GPT2LMHeadModel.from_pretrained(model_name_or_path).cuda()

In [4]:
# Directory prefix for the dataset and bundle files loaded in the following cells
# (relative to the notebook's working directory).
BASE_PATH = "data/"

In [5]:
# Both CSV files are pipe-delimited.
# NOTE: the `right_answer_id` column of the auxiliary EN->RU file is not clean
# (mixed '2.0' / '0,0' spellings, NaNs and stray header rows, see the Counter
# output further down); it is normalised later by `_fix_input`.
train_data = pd.read_csv(BASE_PATH + 'rus_train_dataset.csv', encoding='utf-8', sep='|')
en_ru_aux_data = pd.read_csv(BASE_PATH + 'en_to_rus_train_dataset.csv', encoding='utf-8', sep='|')

In [6]:
%%time
# Per-question bundle for the in-domain training set (indexed later as
# bundle[idx]['q'] / ['a1'] / ['a2'] / ['a3']). It is loaded with
# allow_pickle=True, i.e. the file may be unpickled: only load it from a trusted source.
train_bundle = np.load(BASE_PATH + 'train_dataset_bundle.npy', allow_pickle=True)

CPU times: user 493 ms, sys: 762 ms, total: 1.25 s
Wall time: 1.3 s


In [7]:
%%time
# Bundle for the auxiliary EN->RU dataset, same access pattern as `train_bundle`.
# Loaded with allow_pickle=True: only load it from a trusted source.
ext_bundle = np.load(BASE_PATH + 'en_ru_aux_dataset_bundle.npy', allow_pickle=True)

CPU times: user 454 ms, sys: 897 ms, total: 1.35 s
Wall time: 2.05 s


In [8]:
from collections import Counter
# Tally the raw label spellings of the auxiliary dataset to see what has to be cleaned.
Counter(en_ru_aux_data["right_answer_id"].values)

Counter({'2.0': 626,
         '0.0': 390,
         '1.0': 626,
         nan: 69,
         '0,0': 210,
         'right_answer_id': 2,
         '1,0': 14,
         '2,0': 3,
         '1.0.': 1})

In [9]:
def evaluate(generator, pipe, aggregator):
    """Run `pipe` on every split produced by `generator` and hand each result to `aggregator`.

    Splits are processed lazily and strictly in order: the result of one split is
    aggregated before the next split is requested. Nothing is returned; the
    outcome lives in whatever state `aggregator` keeps.
    """
    for fold_result in map(pipe, generator):
        aggregator(fold_result)

In [10]:
from scipy.stats import norm

def p_val(x):
    """Return the two-sided tail probability of `x` under a standard normal distribution."""
    lower_tail = norm.cdf(-x)
    upper_tail = norm.cdf(x)
    return 2 * min(lower_tail, upper_tail)


class ScoreAggregator:
    """Accumulates (score, baseline) pairs, one per evaluated split, and summarises them."""

    def __init__(self):
        self._scores = []      # model metric of every split, in arrival order
        self._baselines = []   # baseline metric of the same splits

    def __call__(self, update):
        """Record one `(score, baseline)` pair."""
        score, baseline = update
        self._scores.append(score)
        self._baselines.append(baseline)

    def calc_info(self):
        """Summarise the recorded splits.

        Returns a dict with the mean/std of the scores, the mean/std of the
        per-split difference `score - baseline`, and a "p_value" obtained by
        passing `mean_diff / (std_diff + 1e-9)` to `p_val`.

        NOTE(review): the statistic is not scaled by the number of splits, so this
        is not the usual z-test of the mean difference - confirm this is intended.
        """
        diff = np.array(self._scores) - np.array(self._baselines)
        diff_mean = np.mean(diff)
        diff_std = np.std(diff)
        summary = {
            "mean_score": np.mean(self._scores),
            "std_score": np.std(self._scores),
            "mean_diff": diff_mean,
            "std_diff": diff_std,
            # the 1e-9 keeps the ratio finite when all differences are identical
            "p_value": p_val(diff_mean / (diff_std + 1e-9)),
        }
        return summary


In [11]:
train_raw_targets = train_data["right_answer_id"].values
def extract_targets_in_domain(idxs: np.array, is_pairwise: bool) -> np.array:
    """Return the targets of the in-domain questions selected by `idxs`.

    In plain mode the raw right-answer ids from `train_raw_targets` are returned.
    In pairwise mode the result is a flat 0/1 vector with three slots per
    question, where the slot of the right answer is set to 1.
    """
    selected = train_raw_targets[idxs]
    if is_pairwise:
        one_hot = np.zeros(len(idxs) * 3)
        for row, label in enumerate(selected):
            # slots 3*row .. 3*row+2 belong to question `row`
            one_hot[row * 3 + label] = 1
        return one_hot
    return selected

In [12]:
# The "4k" EN->RU extension set: the dataset rows and the matching bundle.
# Paths go through BASE_PATH like the other dataset loads, so the data directory
# is configured in one place. allow_pickle=True: only load trusted files.
tr_ext_4K_dataset = np.load(BASE_PATH + "en_ru_4k_dataset.npy", allow_pickle=True)
tr_ext_4K_bundle = np.load(BASE_PATH + "en_ru_4k_bundle.npy", allow_pickle=True)

In [13]:
# The last column of the 4K dataset is used as the right-answer id of each row.
tr_ext_4K_targets = tr_ext_4K_dataset[:, -1]
tr_ext_4K_targets

array([2, 1, 0, ..., 2, 0, 0], dtype=object)

In [362]:
# RuBQ records. Each record is unpacked positionally: the first five fields form
# the dataset row, the second-to-last field is the right-answer id and the last
# field is the bundle. Paths go through BASE_PATH like the other dataset loads.
# allow_pickle=True: only load trusted files.
rubq_qat_pairs = np.load(BASE_PATH + "rubq_qat_pairs.npy", allow_pickle=True)
rubq_bundle = [x[-1] for x in rubq_qat_pairs]
rubq_targets = np.array([x[-2] for x in rubq_qat_pairs])
# Prepend a running row id so the rows line up with the other datasets' layout.
rubq_dataset = np.array([[i] + list(x[:5]) for i, x in enumerate(rubq_qat_pairs)])
rubq_predictions_by_model_v1 = np.load(BASE_PATH + "rubq_predictions_by_model_v1.npy", allow_pickle=True)

In [492]:
def _fix_input(x):
    if x in ['0.0', '1.0', '2.0', '0,0', '1,0', '2,0', '1.0.']:
        return float(x[0])
    return -1

# Cleaned labels of the auxiliary CSV; rows whose label could not be parsed get -1.
prepared_en_ru_targets = np.array([_fix_input(x) for x in en_ru_aux_data["right_answer_id"].values])

# Labels of the combined external set, in the order: auxiliary CSV, 4K set, RuBQ.
full_ext_targets = np.concatenate([
    prepared_en_ru_targets, 
                                   tr_ext_4K_targets, rubq_targets])

def extract_targets_ext(is_pairwise: bool) -> np.array:
    """Build the pairwise target vector for the whole external set.

    The result has three 0/1 slots per entry of `full_ext_targets`; the slot of
    the right answer is 1. Entries labelled -1 keep all three slots at 0.
    Only the pairwise layout is supported (asserted).
    """
    assert is_pairwise
    n_questions = full_ext_targets.shape[0]
    one_hot = np.zeros(n_questions * 3)
    for row, raw_label in enumerate(full_ext_targets):
        label = int(raw_label)
        if label != -1:
            one_hot[3 * row + label] = 1
    return one_hot

In [16]:
# Stored "model v1" predictions for the in-domain, auxiliary and 4K sets. They are
# indexed as [3 * question + answer] when features are built. Paths go through
# BASE_PATH like the other dataset loads. allow_pickle=True: only load trusted files.
train_predictions_by_model_v1 = np.load(BASE_PATH + "original_train_predictions_by_model_v1.npy", allow_pickle=True)
ext_predictions_by_model_v1 = np.load(BASE_PATH + "ext_predictions_by_model_v1.npy", allow_pickle=True)
tr_ext_4K_predictions_by_model_v1 = np.load(BASE_PATH + "tr_ext_4K_predictions_by_model_v1.npy", allow_pickle=True)

In [493]:
# Combined external set. All three concatenations use the same source order
# (auxiliary CSV, 4K set, RuBQ) so that predictions, bundles and dataset rows
# stay aligned with each other.
full_ext_predictions_by_model_v1 = np.concatenate([
    ext_predictions_by_model_v1, 
    tr_ext_4K_predictions_by_model_v1, rubq_predictions_by_model_v1])
full_ext_bundle = np.concatenate([
    ext_bundle,
    tr_ext_4K_bundle, rubq_bundle])
# NOTE(review): assumes the three datasets have the same number of columns - confirm.
full_ext_dataset = np.concatenate([
    en_ru_aux_data.values,
    tr_ext_4K_dataset, rubq_dataset])

In [155]:
# Collect one flattened question vector per item of the auxiliary bundle;
# these vectors are what the PCA below is fitted on.
ext_q_embs = []

for b in ext_bundle:
    # last entry of the first element stored under 'embed', flattened to 1-D
    e = b['q']['embed'][0][-1].reshape(-1)
    ext_q_embs.append(e)
ext_q_embs = np.array(ext_q_embs)

In [156]:
from sklearn.decomposition import PCA

In [159]:
# Positional 90/10 split (no shuffling): the first 90% of the vectors are used to
# fit the PCA, the remaining 10% to check how well it generalises.
N = len(ext_q_embs)
M = int(N * 0.9)
ext_q_embs_train = ext_q_embs[:M]
ext_q_embs_val = ext_q_embs[M:]

In [217]:
# Fit a 32-component PCA on the training part only, then project both parts.
pca = PCA(n_components=32, random_state=0)
ext_q_embs_train_proj = pca.fit_transform(ext_q_embs_train)
ext_q_embs_val_proj = pca.transform(ext_q_embs_val)

In [262]:
# Persist the fitted PCA object through np.save.
# NOTE(review): a non-array object is presumably stored pickled inside a 0-d object
# array, so reading it back would need np.load(..., allow_pickle=True).item() - confirm.
np.save("models/pca_h32_v1", pca)

In [218]:
# Mean squared reconstruction error of the PCA on the vectors it was fitted on.
np.mean((pca.inverse_transform(ext_q_embs_train_proj) - ext_q_embs_train) ** 2)

0.04108772

In [219]:
# Same reconstruction error on the held-out 10%, to check the projection generalises.
np.mean((pca.inverse_transform(ext_q_embs_val_proj) - ext_q_embs_val) ** 2)

0.047490757

In [494]:
def make_emb_random_proj(emb: np.array):
    """Project `emb` with the module-level matrix `random_proj` and L1-normalise the result.

    NOTE(review): `random_proj` is not defined in the visible part of the notebook;
    it has to exist before this function is called.
    """
    projected = np.matmul(emb, random_proj)
    # in-place scaling so that the absolute values sum to 1
    projected /= np.abs(projected).sum()
    return projected

def make_pca_proj(emb: np.array):
    """Project a single vector with the module-level fitted `pca` and L1-normalise it.

    `emb` is wrapped into a one-row batch for `pca.transform`; the flattened
    projection is scaled in place so that its absolute values sum to 1.
    """
    single_row_batch = [emb]
    projected = pca.transform(single_row_batch).reshape(-1)
    projected /= np.abs(projected).sum()
    return projected

def extract_year(s: str):
    """Return the first standalone 4-digit number in `s` as a float, or -1 if there is none.

    `s` is converted with `str()` first, so non-string input is accepted.
    A run of digits counts only if it is exactly four characters long
    (e.g. "12345" is skipped).

    Digits are detected with `str.isdecimal()`, not `str.isdigit()`:
    `isdigit()` also accepts characters such as superscripts ("²") that
    `float()` cannot parse, which made the conversion raise `ValueError`.
    """
    run = []
    # The trailing space is a sentinel that flushes a digit run ending the string.
    for ch in str(s) + " ":
        if ch.isdecimal():
            run.append(ch)
            continue
        if len(run) == 4:
            return float(''.join(run))
        run = []
    return -1


def extract_features_in_domain_v1(bundle: np.array, idxs: np.array, reset_cache: bool, is_pairwise: bool, is_train: bool) -> np.array:
    """Build one feature row per (question, answer option) for the questions in `idxs`.

    Every question contributes three consecutive rows (answers 'a1', 'a2', 'a3').
    Each row is the concatenation of:
      * the L1-normalised PCA projection of the question vector,
      * the mean absolute difference between question and answer vectors,
      * the L1-normalised PCA projection of the answer vector,
      * the answer likelihood, standardised across the three answers,
      * the stored model-v1 prediction for this (question, answer) pair,
      * a flag that is 1 when this answer has the highest likelihood.

    Parameters
    ----------
    bundle : indexable of per-question dicts with keys 'q', 'a1', 'a2', 'a3'.
    idxs : question positions in `bundle` (also used to index the prediction table).
    reset_cache, is_pairwise : accepted for interface compatibility; not used here.
    is_train : if True predictions come from `train_predictions_by_model_v1`,
        otherwise from `full_ext_predictions_by_model_v1`.

    Returns
    -------
    Array with 3 * len(idxs) rows (an empty 1-D array when `idxs` is empty).
    """
    rows = []

    for idx in idxs:
        item = bundle[idx]
        # question vector: last entry of the first element stored under 'embed'
        q_emb = item['q']['embed'][0][-1].reshape(-1)
        q_proj = make_pca_proj(q_emb)

        answers = (item['a1'], item['a2'], item['a3'])
        ll = np.array([answer['likelihood'] for answer in answers])
        answer_embs = np.array([answer['embed'][0][-1].reshape(-1) for answer in answers])

        # Standardise within the question.
        # NOTE(review): three identical likelihoods give std == 0 and therefore NaNs.
        ll = (ll - np.mean(ll)) / np.std(ll)
        # the most likely answer is the same for all three rows of the question
        best_answer = np.argmax(ll)

        for j in range(3):
            a_emb = answer_embs[j]
            prediction = train_predictions_by_model_v1[3 * idx + j] if is_train else full_ext_predictions_by_model_v1[3 * idx + j]
            rows.append(np.concatenate([
                q_proj,
                np.array([np.mean(np.abs(q_emb - a_emb))]),
                make_pca_proj(a_emb),
                np.array([ll[j], prediction, j == best_answer]),
            ]))
    return np.array(rows)

In [495]:
train_data.values

array([[0, 'Как называется половина основного времени матча в футболе?',
        'Тайм', 'Период', 'Гейм', 0],
       [1,
        'Какая из указанных команд чаще других становилась чемпионом мира по хоккею?',
        'Россия', 'Англия', 'Чехия', 0],
       [2,
        'В каком году мужская сборная СССР по баскетболу впервые выиграла золото на Олимпийских играх?',
        '1972', '1980', '1988', 0],
       ...,
       [4058, 'В каком году была выпущена приставка «Playstation 2»?',
        '2000', '2001', '1999', 0],
       [4059, 'Как называлась добыча мёда диких пчёл на Руси?',
        'Бортничество', 'Капище', 'Полюдье', 0],
       [4060, 'Какого произведения НЕ было у Л. Н. Толстого?',
        '«Оловянный солдатик»', '«Хаджи-Мурат»', '«Казаки»', 0]],
      dtype=object)

In [496]:
# Smoke check on a single question (index 2): expect three rows, one per answer option.
f = extract_features_in_domain_v1(train_bundle, [2], True, True, True)
f.shape, f[0]

((3, 68),
 array([-0.11351538, -0.02693948,  0.06229229, -0.10466813, -0.04197856,
        -0.01103976, -0.02884177,  0.03987112, -0.04965616, -0.08312124,
         0.03852581,  0.03369633,  0.03872566,  0.00550216, -0.021699  ,
         0.00924887, -0.01096463,  0.01409951,  0.03005068, -0.00094993,
        -0.00216449,  0.04341423,  0.02227794, -0.01268454,  0.01205025,
         0.03482549,  0.03769698, -0.01937344,  0.00161218, -0.01009029,
         0.00910751, -0.02931618,  0.36769983,  0.03501877, -0.01560779,
        -0.00335252, -0.06607332, -0.03694912, -0.03568615, -0.04999945,
         0.0421305 ,  0.02423212, -0.02678693,  0.07623202,  0.07491807,
         0.0018204 ,  0.01541569, -0.03876211, -0.00327033,  0.0339863 ,
         0.03316834, -0.01695172,  0.02052337,  0.06128677,  0.02366902,
         0.02891698, -0.03665592,  0.02161785,  0.0435885 ,  0.0198328 ,
         0.02516922,  0.02372486,  0.00428901,  0.01493454, -0.04542952,
        -0.4280453 , -0.9240756 ,  0.    

In [466]:
import catboost

In [497]:
def fit_predict_with_catboost(model, X_train, X_val, X_test, y_train, y_val, val_ratio: float, is_pairwise: bool, weights:np.array):
    """Fit a CatBoost model on the training rows and return its predictions for `X_test`.

    In pairwise mode every three consecutive training rows are put into one
    group (one question with its three answers) via `group_id`.

    `X_val`, `y_val` and `val_ratio` are accepted for interface compatibility
    with the other fit/predict helpers; no evaluation set is used.
    `weights` are per-row training weights passed to the pool.
    """
    # NOTE(review): assumes the number of training rows is a multiple of 3 in pairwise mode.
    train_group_id = np.repeat(np.arange(X_train.shape[0] // 3), 3) if is_pairwise else None
    train_pool = catboost.Pool(X_train, y_train, group_id=train_group_id, weight=weights)
    model.fit(train_pool)
    return model.predict(X_test)

def calc_metric(y_true, y_score, is_pairwise: bool):
    """Return the accuracy of `y_score` against `y_true`.

    Plain mode: fraction of positions where the two arrays are equal.
    Pairwise mode: both flat arrays are viewed as rows of three values (one row
    per question) and the positions of the row maxima are compared.
    """
    if is_pairwise:
        scores_by_question = y_score.reshape(-1, 3)
        truth_by_question = y_true.reshape(-1, 3)
        chosen = np.argmax(scores_by_question, axis=1)
        correct = np.argmax(truth_by_question, axis=1)
        return np.mean(correct == chosen)
    return np.mean(y_true == y_score)

In [498]:
class PiperV1:
    """One cross-validation step: build features, fit a fresh model, score it against the baseline.

    Parameters
    ----------
    model_fn : callable taking the split counter and returning a new, unfitted model.
    val_ratio : fraction of the train/val part of each split set aside as validation.
    is_pairwise : whether targets use the three-slots-per-question layout.
    ext_weight : down-weighting factor for the external training rows.
    fit_predict_with_model : callable with the signature of `fit_predict_with_catboost`.
    global_dataset_ids : maps the positional indices of a split to row ids of the
        in-domain dataset.
    """

    def __init__(self, model_fn, val_ratio: float, is_pairwise: bool, ext_weight:float, fit_predict_with_model, global_dataset_ids: np.array):
        self.model_fn = model_fn
        self.models = []          # one fitted model per processed split
        self.reset_cache = True   # forwarded to the feature extractor
        self.is_pairwise = is_pairwise
        self.val_ratio = val_ratio
        self.ext_weight = ext_weight
        self.fit_predict_with_model = fit_predict_with_model
        self.global_dataset_ids = global_dataset_ids
        self.iter = 0             # split counter; also used as the random seed
        self._ext_cache = None    # (X_ext, y_ext), built on first use

    def _split_train_val(self, train_val_idx, seed):
        """Randomly split `train_val_idx` into (train_idx, val_idx).

        `int(len(train_val_idx) * val_ratio)` ids are drawn for validation from a
        permutation seeded with `seed` (the global numpy seed is set, as before).
        Both parts keep the relative order of `train_val_idx`, so with
        `val_ratio == 0` the training ids are returned unchanged.
        """
        np.random.seed(seed)
        order = np.arange(len(train_val_idx))
        np.random.shuffle(order)
        n_val = int(len(train_val_idx) * self.val_ratio)
        # The permutation used to be computed but never applied, so the
        # "random" validation part was always the first n_val ids.
        val_pos = np.sort(order[:n_val])
        train_pos = np.sort(order[n_val:])
        return train_val_idx[train_pos], train_val_idx[val_pos]

    def _ext_features_and_targets(self):
        """Return (X_ext, y_ext) for the whole external set, computed once per instance.

        They do not depend on the split, so they are not rebuilt for every fold.
        """
        if self._ext_cache is None:
            X_ext = extract_features_in_domain_v1(
                full_ext_bundle,
                np.arange(len(full_ext_targets)),
                self.reset_cache,
                self.is_pairwise,
                False,
            )
            y_ext = extract_targets_ext(self.is_pairwise)
            self._ext_cache = (X_ext, y_ext)
        return self._ext_cache

    def pipe(self, _split):
        """Process one `(train_val_positions, test_positions)` split.

        Returns `(metric, base_metric)`: the accuracy of the freshly fitted model
        and the accuracy of the baseline on the test part of the split.
        """
        train_val_idx, test_idx = _split
        train_val_idx = self.global_dataset_ids[train_val_idx]
        test_idx = self.global_dataset_ids[test_idx]

        seed = self.iter
        self.models.append(self.model_fn(seed))
        train_idx, val_idx = self._split_train_val(train_val_idx, seed)
        self.iter += 1

        X_ext, y_ext = self._ext_features_and_targets()

        X_train = extract_features_in_domain_v1(train_bundle, train_idx, self.reset_cache, self.is_pairwise, True)
        X_val = extract_features_in_domain_v1(train_bundle, val_idx, self.reset_cache, self.is_pairwise, True)
        X_test = extract_features_in_domain_v1(train_bundle, test_idx, self.reset_cache, self.is_pairwise, True)

        y_train = extract_targets_in_domain(train_idx, self.is_pairwise)
        y_val = extract_targets_in_domain(val_idx, self.is_pairwise)
        y_test = extract_targets_in_domain(test_idx, self.is_pairwise)

        train_weights = np.ones_like(y_train)
        # Row order matches the external set: ext_bundle, tr_ext_4K_bundle, rubq_bundle.
        # NOTE(review): the first two sources end up weighted ext_weight ** 2 and
        # RuBQ ext_weight - confirm the double scaling is intended.
        ext_weights = np.concatenate([np.ones((len(ext_bundle) + len(tr_ext_4K_bundle)) * 3) * self.ext_weight,
                                      np.ones(len(rubq_bundle) * 3)]) * self.ext_weight

        # The external rows are always appended to the in-domain training rows.
        train_weights = np.concatenate([train_weights, ext_weights], axis=0)
        X_train = np.concatenate([X_train, X_ext], axis=0)
        y_train = np.concatenate([y_train, y_ext], axis=0)

        y_test_pred = self.fit_predict_with_model(self.models[-1], X_train, X_val, X_test, y_train, y_val, self.val_ratio, self.is_pairwise, train_weights)
        # Baseline (sber-gpt3): the last feature column flags the answer with the
        # highest likelihood.
        y_test_baseline = X_test[:, -1]
        metric = calc_metric(y_test, y_test_pred, self.is_pairwise)
        base_metric = calc_metric(y_test, y_test_baseline, self.is_pairwise)
        return (metric, base_metric)

In [499]:
from sklearn.linear_model import LinearRegression, LogisticRegression

In [500]:
def fit_predict_with_sklearn(model, X_train, X_val, X_test, y_train, y_val, val_ratio: float, is_pairwise: bool, weights:np.array):
    """Fit a scikit-learn style estimator and return its predictions for `X_test`.

    Only `model`, `X_train`, `y_train` and `X_test` are used. The remaining
    parameters (including `weights`, which is not passed to `fit`) exist so the
    function is interchangeable with `fit_predict_with_catboost`.
    """
    estimator = model
    estimator.fit(X_train, y_train)
    test_predictions = estimator.predict(X_test)
    return test_predictions

In [501]:
from sklearn.model_selection import RepeatedStratifiedKFold
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [502]:
# Global hold-out: shuffle all in-domain row positions with a fixed seed and set
# the first 10% aside. Cross-validation below only sees `train_global_idx`;
# `val_global_idx` is used for the final check of the model ensemble.
GLOBAL_TEST_RATIO = 0.1
np.random.seed(42)
ids = np.arange(len(train_data))
np.random.shuffle(ids)
print(ids[:10])
N = int(len(train_data) * GLOBAL_TEST_RATIO)
train_global_idx = ids[N:]
val_global_idx = ids[:N]

[3286 2463 4058  746 1979  534  166 2620 3205 3664]


In [None]:
# Cross-validated experiment on the non-held-out part of the in-domain data.
score_aggregator = ScoreAggregator()
# Row ids the CV splits refer to (the splitter yields positions into this array).
global_dataset_ids = np.arange(train_data.shape[0])[train_global_idx]
piper = PiperV1(
    # a fresh CatBoost ranker per split, seeded with the split counter
    lambda n_split: catboost.CatBoost({
        "loss_function":"QueryRMSE",
        "rsm":0.3,
        "random_seed": n_split,
        "max_bin": 32,
        "n_estimators": 1000}),
    0.,  # val_ratio: no validation part is split off
    True,  # is_pairwise
    0.01,  # ext_weight
    fit_predict_with_catboost,
    global_dataset_ids
)

# 3 repeats x 3 folds = 9 splits, stratified by the right-answer id.
evaluate(
    generator=RepeatedStratifiedKFold(n_repeats=3, n_splits=3, random_state=13).split(
        global_dataset_ids,
        train_raw_targets[train_global_idx]
    ),
    pipe=piper.pipe,
    aggregator=score_aggregator
)

In [431]:
# Save every per-split model; the file name carries its 1-based position and the total count.
for i, m in enumerate(piper.models):
    m.save_model("models/cb_model_v1_{}_out_of_{}".format(i+1, len(piper.models)))

In [251]:
score_aggregator.calc_info() # 37.3 for 500 it

{'mean_score': 0.37328101447928325,
 'std_score': 0.008979662688540868,
 'mean_diff': 0.018972619111229033,
 'std_diff': 0.012395365464154137,
 'p_value': 0.12586287112651248}

In [504]:
score_aggregator.calc_info() # 

{'mean_score': 0.3726407984986243,
 'std_score': 0.009915737520026517,
 'mean_diff': 0.01833240313057006,
 'std_diff': 0.012346703234910505,
 'p_value': 0.13759645389105363}

In [451]:
score_aggregator.calc_info()

{'mean_score': 0.3726407984986243,
 'std_score': 0.009915737520026517,
 'mean_diff': 0.01833240313057006,
 'std_diff': 0.012346703234910505,
 'p_value': 0.13759645389105363}

In [435]:
score_aggregator.calc_info()

{'mean_score': 0.34527667283459196,
 'std_score': 0.015831845922636752,
 'mean_diff': -0.009031722533462246,
 'std_diff': 0.016623245791967542,
 'p_value': 0.5869103209204434}

In [460]:
score_aggregator._scores

[0.35684987694831827,
 0.3817733990147783,
 0.3850574712643678,
 0.3724364232977851,
 0.3768472906403941,
 0.37110016420361247,
 0.38392124692370794,
 0.36042692939244664,
 0.3620689655172414]

In [458]:
np.std(score_aggregator._scores), np.min(score_aggregator._scores)

(0.009966793166767428, 0.35684987694831827)

In [459]:
np.std(score_aggregator._baselines), np.min(score_aggregator._baselines)

(0.005675757668625942, 0.3440065681444992)

In [428]:
piper.models[-1].best_score_

{'learn': {'QueryRMSE': 0.4114848185417093}}

In [76]:
m = piper.models[-1]

In [438]:
# Features of the global hold-out rows (never seen during cross-validation).
global_test = extract_features_in_domain_v1(train_bundle, val_global_idx, True, True, True)

In [449]:
# Ensemble check on the global hold-out: score it with every per-split model,
# then take the element-wise median across models.
res = []

for m in piper.models:
    scores = m.predict(global_test)
    # one row per question, one column per answer option
    scores = scores.reshape(-1, 3)
    res.append(scores)
res = np.array(res)
global_test_pred = np.median(res, axis=0)

y_global_test = extract_targets_in_domain(val_global_idx, True)
metric = calc_metric(y_global_test, global_test_pred, True)
metric

0.4014778325123153

In [283]:
# Export the held-out rows for later inspection.
# NOTE(review): .loc is label-based; it selects the intended rows only while
# train_data keeps its default 0..n-1 index - confirm.
train_data.loc[val_global_idx].to_csv("data/global_val_dataset.csv")

In [444]:
y_global_test.shape

(1218,)

In [445]:
global_test_pred.shape

(406, 3)

In [450]:
[(x,y) for x,y in zip(np.argmax(y_global_test.reshape(-1,3),axis=1), global_test_pred)]

[(2, array([ 0.01412552,  0.01579307, -0.05706615])),
 (2, array([0.01256603, 0.02836872, 0.00798458])),
 (0, array([0.02716267, 0.00381815, 0.07961208])),
 (2, array([ 0.04498007, -0.03872567, -0.03266267])),
 (0, array([-0.02860388,  0.01581549, -0.06030325])),
 (2, array([ 0.08058833, -0.08315102, -0.05639211])),
 (2, array([ 0.02954279, -0.00264727, -0.01093597])),
 (0, array([ 0.02764234, -0.00946021,  0.04710662])),
 (2, array([-0.00348881, -0.03033365, -0.0187319 ])),
 (2, array([0.00855746, 0.05284259, 0.0371246 ])),
 (2, array([-0.01130463, -0.03204767, -0.05780448])),
 (2, array([-0.01928139,  0.0983702 , -0.01715747])),
 (1, array([0.02937252, 0.03327462, 0.01647447])),
 (2, array([0.00283855, 0.06703682, 0.04309594])),
 (0, array([0.03938226, 0.02194227, 0.04999485])),
 (0, array([-0.08499498, -0.12336161,  0.04435601])),
 (0, array([ 0.01372693,  0.07065571, -0.02964317])),
 (1, array([ 0.0155695 , -0.00117963,  0.0576145 ])),
 (1, array([0.02179518, 0.03174753, 0.0272986 