In [3]:
!python3.7 -m pip install transformers==4.9.2

You should consider upgrading via the '/usr/local/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from collections import defaultdict

%matplotlib inline

In [2]:
%config Completer.use_jedi = False

In [3]:
import pandas as pd
import numpy as np
# from transformers import GPT2LMHeadModel, GPT2Tokenizer


# model_name_or_path = "sberbank-ai/rugpt3large_based_on_gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
# model = GPT2LMHeadModel.from_pretrained(model_name_or_path).cuda()

In [4]:
# Directory prefix (with trailing slash) that is prepended to data file names.
BASE_PATH = "data/"

In [5]:
# Load the in-domain training table and the auxiliary table; both files are '|'-delimited UTF-8.
train_data = pd.read_csv(BASE_PATH + 'rus_train_dataset.csv', encoding='utf-8', sep='|')
en_ru_aux_data = pd.read_csv(BASE_PATH + 'en_to_rus_train_dataset.csv', encoding='utf-8', sep='|')

In [6]:
%%time
# allow_pickle=True unpickles Python objects stored in the file, so only load bundles from a trusted source.
train_bundle = np.load(BASE_PATH + 'train_dataset_bundle.npy', allow_pickle=True)

CPU times: user 493 ms, sys: 762 ms, total: 1.25 s
Wall time: 1.3 s


In [7]:
%%time
# allow_pickle=True unpickles Python objects stored in the file, so only load bundles from a trusted source.
ext_bundle = np.load(BASE_PATH + 'en_ru_aux_dataset_bundle.npy', allow_pickle=True)

CPU times: user 454 ms, sys: 897 ms, total: 1.35 s
Wall time: 2.05 s


In [8]:
from collections import Counter
# Inspect the raw label values of the auxiliary table; the displayed counts show
# several spellings of the same label, missing values, and stray header rows.
Counter(en_ru_aux_data["right_answer_id"].values)

Counter({'2.0': 626,
         '0.0': 390,
         '1.0': 626,
         nan: 69,
         '0,0': 210,
         'right_answer_id': 2,
         '1,0': 14,
         '2,0': 3,
         '1.0.': 1})

In [9]:
def evaluate(generator, pipe, aggregator):
    """Run ``pipe`` on every split from ``generator`` and pass each result to ``aggregator``.

    Splits are consumed lazily and in order: the result for one split is handed
    to ``aggregator`` before the next split is drawn. Nothing is returned; the
    outcome lives in whatever state ``aggregator`` keeps.
    """
    for outcome in map(pipe, generator):
        aggregator(outcome)

In [10]:
from scipy.stats import norm

def p_val(x):
    """Return twice the smaller tail probability of ``x`` under a standard normal."""
    lower_tail = norm.cdf(-x)
    upper_tail = norm.cdf(x)
    return 2 * min(lower_tail, upper_tail)

class ScoreAggregator:
    """Collects ``(score, baseline)`` pairs, one per evaluation split, and summarises them."""

    def __init__(self):
        self._scores, self._baselines = [], []

    def __call__(self, update):
        """Record one ``(score, baseline)`` pair."""
        new_score, new_baseline = update
        self._scores.append(new_score)
        self._baselines.append(new_baseline)

    def calc_info(self):
        """Summarise the recorded pairs.

        Returns a dict with the mean/std of the scores, the mean/std of the
        per-split difference ``score - baseline``, and ``p_value`` computed by
        ``p_val`` from ``mean_diff / (std_diff + 1e-9)``.

        NOTE(review): the ratio uses the standard deviation of the differences,
        not the standard error (std / sqrt(n)) — confirm this is intended.
        """
        diff = np.array(self._scores) - np.array(self._baselines)
        mean_diff = np.mean(diff)
        std_diff = np.std(diff)
        # The small constant keeps the ratio finite when all differences are equal.
        ratio = mean_diff / (std_diff + 1e-9)
        return {
            "mean_score": np.mean(self._scores),
            "std_score": np.std(self._scores),
            "mean_diff": mean_diff,
            "std_diff": std_diff,
            "p_value": p_val(ratio),
        }


In [11]:
# Raw label column of the in-domain training table; read by extract_targets_in_domain.
train_raw_targets = train_data["right_answer_id"].values
def extract_targets_in_domain(idxs: np.array, is_pairwise: bool) -> np.array:
    """Return targets for the training rows selected by ``idxs``.

    Without ``is_pairwise`` the raw labels ``train_raw_targets[idxs]`` are
    returned. With ``is_pairwise`` each selected row becomes three consecutive
    slots in a flat array, with a 1 in the slot given by the row's label and
    0 elsewhere.
    """
    if is_pairwise:
        one_hot = np.zeros(3 * len(idxs))
        for group, label in enumerate(train_raw_targets[idxs]):
            one_hot[group * 3 + label] = 1
        return one_hot
    return train_raw_targets[idxs]

In [12]:
# Extra 4K dataset and its bundle, loaded through BASE_PATH like the other data files.
# allow_pickle=True unpickles Python objects, so only load these files from a trusted source.
tr_ext_4K_dataset = np.load(BASE_PATH + "en_ru_4k_dataset.npy", allow_pickle=True)
tr_ext_4K_bundle = np.load(BASE_PATH + "en_ru_4k_bundle.npy", allow_pickle=True)

In [13]:
# Labels of the 4K dataset are taken from its last column.
tr_ext_4K_targets = tr_ext_4K_dataset[:, -1]
tr_ext_4K_targets

array([2, 1, 0, ..., 2, 0, 0], dtype=object)

In [14]:
def _fix_input(x):
    if x in ['0.0', '1.0', '2.0', '0,0', '1,0', '2,0', '1.0.']:
        return float(x[0])
    return -1

# Normalise the auxiliary labels (-1 marks rows whose label could not be recognised).
prepared_en_ru_targets = np.array([_fix_input(x) for x in en_ru_aux_data["right_answer_id"].values])

# Auxiliary labels first, then the 4K labels.
# NOTE(review): this order must match how the ext bundles/predictions are concatenated — confirm.
full_ext_targets = np.concatenate([prepared_en_ru_targets, tr_ext_4K_targets])

def extract_targets_ext(is_pairwise: bool) -> np.array:
    """Build the flat pairwise target vector for ``full_ext_targets``.

    Every entry of ``full_ext_targets`` yields three consecutive slots with a 1
    in the slot named by its label. Entries whose label is -1 keep three zeros.
    Only the pairwise layout is supported (asserted).
    """
    assert is_pairwise
    n_groups = full_ext_targets.shape[0]
    one_hot = np.zeros(n_groups * 3)
    for group, raw_label in enumerate(full_ext_targets):
        label = int(raw_label)
        if label != -1:
            one_hot[3 * group + label] = 1
    return one_hot

In [15]:
# Seeded Gaussian random projection matrix with 1536 rows and H columns.
# In the visible cells it is only referenced by make_emb_random_proj.
H = 32
np.random.seed(0)
random_proj = np.random.normal(size=(1536, H))
# Sanity check: the mean of the sampled entries.
np.mean(random_proj)

-0.003485747948384979

In [16]:
# Precomputed "model v1" predictions for the train, auxiliary and 4K sets,
# loaded through BASE_PATH like the other data files.
# allow_pickle=True unpickles Python objects, so only load these files from a trusted source.
train_predictions_by_model_v1 = np.load(BASE_PATH + "original_train_predictions_by_model_v1.npy", allow_pickle=True)
ext_predictions_by_model_v1 = np.load(BASE_PATH + "ext_predictions_by_model_v1.npy", allow_pickle=True)
tr_ext_4K_predictions_by_model_v1 = np.load(BASE_PATH + "tr_ext_4K_predictions_by_model_v1.npy", allow_pickle=True)

In [21]:
# Stack the auxiliary data first and the 4K data second, for both predictions and bundles.
# NOTE(review): full_ext_targets must be concatenated in the same order — confirm.
full_ext_predictions_by_model_v1 = np.concatenate([ext_predictions_by_model_v1, tr_ext_4K_predictions_by_model_v1])
full_ext_bundle = np.concatenate([ext_bundle, tr_ext_4K_bundle])

In [155]:
# Flattened question embedding (last entry of the first 'embed' element) for every auxiliary item.
ext_q_embs = np.array([
    item['q']['embed'][0][-1].reshape(-1)
    for item in ext_bundle
])

In [156]:
from sklearn.decomposition import PCA

In [159]:
# Positional 90/10 split of the question embeddings: the first M rows train, the rest validate.
N = len(ext_q_embs)
M = int(N * 0.9)
ext_q_embs_train, ext_q_embs_val = ext_q_embs[:M], ext_q_embs[M:]

In [217]:
# Fit a 32-component PCA on the training part only, then project the held-out part with it.
# NOTE(review): 32 equals H defined earlier — presumably intentional; confirm.
pca = PCA(n_components=32, random_state=0)
ext_q_embs_train_proj = pca.fit_transform(ext_q_embs_train)
ext_q_embs_val_proj = pca.transform(ext_q_embs_val)

In [218]:
# Mean squared reconstruction error on the part the PCA was fitted on.
np.mean((pca.inverse_transform(ext_q_embs_train_proj) - ext_q_embs_train) ** 2)

0.04108772

In [219]:
# Mean squared reconstruction error on the held-out part.
np.mean((pca.inverse_transform(ext_q_embs_val_proj) - ext_q_embs_val) ** 2)

0.047490757

In [236]:
def make_emb_random_proj(emb: np.array):
    """Project ``emb`` with ``random_proj`` and scale the result to unit L1 norm."""
    projected = np.matmul(emb, random_proj)
    l1_norm = np.abs(projected).sum()
    return projected / l1_norm

def make_pca_proj(emb: np.array):
    """Project a single embedding with the fitted ``pca`` and scale it to unit L1 norm."""
    projected = pca.transform([emb]).reshape(-1)
    return projected / np.sum(np.abs(projected))


def extract_features_in_domain_v1(bundle: np.array, idxs: np.array, reset_cache: bool=True, is_pairwise: bool=False, is_train: bool=True) -> np.array:
    """Build one feature row per (question, candidate answer) pair.

    For every index in ``idxs`` three rows are emitted (answers ``a1``, ``a2``,
    ``a3``), each laid out as::

        [ PCA projection of the question embedding,
          mean absolute difference between question and answer embeddings,
          PCA projection of the answer embedding,
          standardised answer likelihood,
          precomputed model-v1 prediction for this (item, answer),
          1 if this answer has the highest likelihood of the three else 0 ]

    Args:
        bundle: indexable collection of items with keys ``'q'``, ``'a1'``,
            ``'a2'``, ``'a3'``.
        idxs: item indices into ``bundle``; also used to address the
            prediction table at ``3 * idx + j``.
        reset_cache: unused; kept so existing call sites keep working.
        is_pairwise: unused; kept so existing call sites keep working.
        is_train: selects ``train_predictions_by_model_v1`` when true,
            ``full_ext_predictions_by_model_v1`` otherwise. It must match the
            bundle that is passed in.

    Returns:
        Array with ``3 * len(idxs)`` rows (an empty array for empty ``idxs``).
    """
    answer_keys = ('a1', 'a2', 'a3')
    # Only the selected table is looked up, exactly one of the two globals is needed.
    predictions = train_predictions_by_model_v1 if is_train else full_ext_predictions_by_model_v1

    rows = []
    for idx in idxs:
        item = bundle[idx]
        question_emb = item['q']['embed'][0][-1].reshape(-1)
        question_proj = make_pca_proj(question_emb)

        ll = np.array([item[key]['likelihood'] for key in answer_keys])
        answer_embs = np.array([item[key]['embed'][0][-1].reshape(-1) for key in answer_keys])

        # Standardise the three likelihoods. If they are exactly equal the
        # standard deviation is 0; emit zeros instead of dividing 0 by 0.
        ll_std = np.std(ll)
        if ll_std == 0:
            ll = np.zeros(len(ll))
        else:
            ll = (ll - np.mean(ll)) / ll_std
        best_answer = np.argmax(ll)

        for j in range(3):
            answer_emb = answer_embs[j]
            rows.append(np.concatenate([
                question_proj,
                np.array([np.mean(np.abs(question_emb - answer_emb))]),
                make_pca_proj(answer_emb),
                np.array([ll[j], predictions[3 * idx + j], j == best_answer]),
            ]))
    return np.array(rows)

In [237]:
# Smoke-check the feature extractor on the first training item.
# train_bundle has to be paired with the train prediction table, hence is_train=True.
f = extract_features_in_domain_v1(train_bundle, [0], reset_cache=True, is_pairwise=True, is_train=True)
f.shape, f[0]

((3, 68),
 array([-1.25097914e-01, -4.40770919e-02, -6.33244460e-02, -3.00110278e-02,
         3.31132167e-02,  1.55699046e-03, -4.61391210e-02,  4.91796536e-02,
         4.53230453e-03,  5.19091071e-03,  2.95666851e-02, -4.19176789e-02,
         5.46434388e-02,  1.99204987e-02, -1.82127392e-02, -1.73031577e-02,
         8.15590222e-04,  1.53523215e-02, -2.66761652e-02,  1.24470212e-02,
        -2.60671311e-02,  5.20211985e-02,  8.47667758e-03, -1.64939470e-02,
        -2.25043793e-02,  3.12857765e-02,  3.15408804e-02, -4.42142555e-02,
         7.09321428e-02, -1.33227832e-02, -1.48099100e-02, -2.92529448e-02,
         5.09109557e-01,  6.80187324e-02, -2.21136950e-02, -8.13530869e-02,
        -9.97428135e-04, -2.03629977e-02, -8.84313876e-03, -3.64948562e-02,
         4.44431439e-02,  5.76733541e-02,  1.77590755e-02,  6.22163666e-02,
         3.35112129e-02,  8.40264272e-03,  3.57924634e-02, -2.24515967e-02,
        -1.36650380e-02,  3.40634625e-02,  5.39864289e-02, -8.66375284e-03,
  

In [238]:
import catboost

In [239]:
def fit_predict_with_catboost(model, X_train, X_val, X_test, y_train, y_val, val_ratio: float, is_pairwise: bool, weights:np.array):
    """Fit ``model`` on a weighted CatBoost pool and return its predictions for ``X_test``.

    Args:
        model: object exposing ``fit(pool)`` and ``predict(X)``.
        X_train, y_train: training rows and targets. In pairwise mode rows are
            assumed to come in consecutive triples; each triple gets its own
            group id.
        X_val, y_val, val_ratio: accepted for interface compatibility with
            other ``fit_predict_*`` helpers; not used (no eval set is passed).
        X_test: rows to predict.
        is_pairwise: when true, attach group ids to the training pool.
        weights: per-row weights for the training pool.

    Returns:
        Whatever ``model.predict(X_test)`` returns.
    """
    if is_pairwise:
        # One group per question: rows 3k, 3k+1, 3k+2 share group id k.
        train_group_id = np.repeat(np.arange(X_train.shape[0] // 3), 3)
    else:
        train_group_id = None
    train_pool = catboost.Pool(X_train, y_train, group_id=train_group_id, weight=weights)
    model.fit(train_pool)
    return model.predict(X_test)

def calc_metric(y_true, y_score, is_pairwise: bool):
    """Accuracy of ``y_score`` against ``y_true``.

    In pairwise mode both inputs are flat arrays of consecutive triples; a
    triple counts as correct when the argmax of its scores equals the argmax of
    its targets. Otherwise the inputs are compared element by element.
    """
    if is_pairwise:
        score_groups = y_score.reshape(-1, 3)
        true_groups = y_true.reshape(-1, 3)
        expected = np.argmax(true_groups, axis=1)
        predicted = np.argmax(score_groups, axis=1)
        return np.mean(expected == predicted)
    return np.mean(y_true == y_score)

In [240]:
class PiperV1:
    """One cross-validation step: build features, fit a fresh model, score it against a baseline.

    Args:
        model_fn: callable ``model_fn(iteration) -> model``; called once per split.
        val_ratio: fraction of each train fold set aside as validation rows.
        is_pairwise: passed through to the target/metric helpers and the fitter.
        ext_weight: sample weight given to every external (ext) training row;
            in-domain rows get weight 1.
        fit_predict_with_model: callable with the ``fit_predict_with_*``
            signature used in this notebook; returns predictions for the test rows.
        global_dataset_ids: array mapping split-local positions to row ids of
            the in-domain training set.

    Attributes:
        models: every model created so far, in split order.
        iter: number of splits processed; also used as the per-split seed.
    """

    def __init__(self, model_fn, val_ratio: float, is_pairwise: bool, ext_weight:float, fit_predict_with_model, global_dataset_ids: np.array):
        self.model_fn = model_fn
        self.models = []
        self.reset_cache = True
        self.is_pairwise = is_pairwise
        self.val_ratio = val_ratio
        self.ext_weight = ext_weight
        self.fit_predict_with_model = fit_predict_with_model
        self.global_dataset_ids = global_dataset_ids
        self.iter = 0
        # (X_ext, y_ext), built on first use; it does not depend on the split.
        self._ext_cache = None

    def _ext_data(self):
        """Return ``(X_ext, y_ext)`` for the external data, computing them only once."""
        if self._ext_cache is None:
            X_ext = extract_features_in_domain_v1(
                full_ext_bundle,
                np.arange(len(full_ext_targets)),
                self.reset_cache,
                self.is_pairwise,
                False
            )
            y_ext = extract_targets_ext(self.is_pairwise)
            self._ext_cache = (X_ext, y_ext)
        return self._ext_cache

    def pipe(self, _split):
        """Process one ``(train_val_idx, test_idx)`` split.

        Returns:
            ``(metric, base_metric)``: the fitted model's score on the test
            rows and the score of the baseline taken from the last feature
            column.
        """
        train_val_idx, test_idx = _split
        train_val_idx = self.global_dataset_ids[train_val_idx]
        test_idx = self.global_dataset_ids[test_idx]

        self.models.append(self.model_fn(self.iter))

        # Seeds the global NumPy RNG per split (kept for reproducibility of existing runs).
        np.random.seed(self.iter)
        self.iter += 1
        ids = np.arange(len(train_val_idx))
        np.random.shuffle(ids)
        N = int(len(train_val_idx) * self.val_ratio)
        # Use the shuffled positions so validation rows are a random subset rather
        # than simply the first N rows. Positions are sorted within each part so
        # row order is preserved (with val_ratio == 0 the train set is unchanged).
        val_idx = train_val_idx[np.sort(ids[:N])]
        train_idx = train_val_idx[np.sort(ids[N:])]

        # NOTE(review): ext groups whose label is unknown (-1) appear to stay in
        # the training data with all-zero targets — confirm this is intended.
        X_ext, y_ext = self._ext_data()

        X_train = extract_features_in_domain_v1(train_bundle, train_idx, self.reset_cache, self.is_pairwise, True)
        X_val = extract_features_in_domain_v1(train_bundle, val_idx, self.reset_cache, self.is_pairwise, True)
        X_test = extract_features_in_domain_v1(train_bundle, test_idx, self.reset_cache, self.is_pairwise, True)

        y_train = extract_targets_in_domain(train_idx, self.is_pairwise)
        y_val = extract_targets_in_domain(val_idx, self.is_pairwise)
        y_test = extract_targets_in_domain(test_idx, self.is_pairwise)

        train_weights = np.ones_like(y_train)
        ext_weights = np.ones_like(y_ext) * self.ext_weight

        # Append the down-weighted external rows to the in-domain training rows.
        train_weights = np.concatenate([train_weights, ext_weights], axis=0)
        X_train = np.concatenate([X_train, X_ext], axis=0)
        y_train = np.concatenate([y_train, y_ext], axis=0)

        y_test_pred = self.fit_predict_with_model(self.models[-1], X_train, X_val, X_test, y_train, y_val, self.val_ratio, self.is_pairwise, train_weights)
        # Baseline: the last feature column (original note: "is sber-gpt3").
        y_test_baseline = X_test[:, -1]
        metric = calc_metric(y_test, y_test_pred, self.is_pairwise)
        base_metric = calc_metric(y_test, y_test_baseline, self.is_pairwise)
        return (metric, base_metric)

In [241]:
from sklearn.linear_model import LinearRegression, LogisticRegression

In [242]:
def fit_predict_with_sklearn(model, X_train, X_val, X_test, y_train, y_val, val_ratio: float, is_pairwise: bool, weights:np.array):
    """Fit a scikit-learn style ``model`` on the training data and predict ``X_test``.

    Only ``X_train``, ``y_train`` and ``X_test`` are used. The validation
    arguments, ``is_pairwise`` and ``weights`` are accepted for signature
    compatibility and ignored; in particular the fit is unweighted.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions

In [243]:
from sklearn.model_selection import RepeatedStratifiedKFold
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [244]:
# Seeded shuffle of all training row ids; the first GLOBAL_TEST_RATIO share is
# held out (val_global_idx), the remainder is used for cross-validation.
GLOBAL_TEST_RATIO = 0.1
np.random.seed(42)
ids = np.arange(len(train_data))
np.random.shuffle(ids)
print(ids[:10])
N = int(len(train_data) * GLOBAL_TEST_RATIO)
val_global_idx, train_global_idx = ids[:N], ids[N:]

[3286 2463 4058  746 1979  534  166 2620 3205 3664]


In [None]:
# Repeated stratified 3x3 cross-validation over the non-held-out rows, with a
# fresh CatBoost ranker per split (seeded by the split number).
score_aggregator = ScoreAggregator()
global_dataset_ids = np.arange(len(train_data))[train_global_idx]

piper = PiperV1(
    model_fn=lambda n_split: catboost.CatBoost({
        "loss_function": "QueryRMSE",
        "rsm": 0.3,
        "random_seed": n_split,
        "max_bin": 32,
        "n_estimators": 1000,
    }),
    val_ratio=0,            # no validation rows are split off
    is_pairwise=True,
    ext_weight=0.01,        # weight of each external training row
    fit_predict_with_model=fit_predict_with_catboost,
    global_dataset_ids=global_dataset_ids,
)

cv_splits = RepeatedStratifiedKFold(n_repeats=3, n_splits=3, random_state=13).split(
    global_dataset_ids,
    train_raw_targets[train_global_idx],
)
evaluate(generator=cv_splits, pipe=piper.pipe, aggregator=score_aggregator)

0:	learn: 0.4711395	total: 5.72ms	remaining: 5.71s
1:	learn: 0.4710314	total: 10.3ms	remaining: 5.13s
2:	learn: 0.4708790	total: 14.2ms	remaining: 4.73s
3:	learn: 0.4707728	total: 18ms	remaining: 4.49s
4:	learn: 0.4707164	total: 22ms	remaining: 4.38s
5:	learn: 0.4706053	total: 25.7ms	remaining: 4.25s
6:	learn: 0.4704613	total: 29.6ms	remaining: 4.2s
7:	learn: 0.4703471	total: 33.3ms	remaining: 4.12s
8:	learn: 0.4702702	total: 37.3ms	remaining: 4.11s
9:	learn: 0.4701766	total: 41ms	remaining: 4.05s
10:	learn: 0.4700386	total: 45.6ms	remaining: 4.1s
11:	learn: 0.4698655	total: 49.3ms	remaining: 4.06s
12:	learn: 0.4696724	total: 53.5ms	remaining: 4.06s
13:	learn: 0.4695043	total: 57ms	remaining: 4.02s
14:	learn: 0.4694239	total: 61.1ms	remaining: 4.01s
15:	learn: 0.4692768	total: 64.7ms	remaining: 3.98s
16:	learn: 0.4691239	total: 68.8ms	remaining: 3.98s
17:	learn: 0.4690210	total: 72.4ms	remaining: 3.95s
18:	learn: 0.4688614	total: 76.4ms	remaining: 3.95s
19:	learn: 0.4688017	total: 82ms

In [251]:
score_aggregator.calc_info() # mean score of about 37.3% for the 500-iteration run

{'mean_score': 0.37328101447928325,
 'std_score': 0.008979662688540868,
 'mean_diff': 0.018972619111229033,
 'std_diff': 0.012395365464154137,
 'p_value': 0.12586287112651248}

In [163]:
score_aggregator.calc_info() # result for the 1K-iteration run

{'mean_score': 0.3738264141364478,
 'std_score': 0.010596926147310963,
 'mean_diff': 0.019518018768393555,
 'std_diff': 0.014825118673130074,
 'p_value': 0.18798933923295036}

In [65]:
# NOTE(review): execution count is out of sequence; the configuration behind this output is not shown.
score_aggregator.calc_info()

{'mean_score': 0.3789692133374536,
 'std_score': 0.01007919934625741,
 'mean_diff': 0.023883142772314424,
 'std_diff': 0.01694510193500251,
 'p_value': 0.15870445996045657}

In [1183]:
# NOTE(review): execution count is out of sequence; the configuration behind this output is not shown.
score_aggregator.calc_info()

{'mean_score': 0.3814319645659511,
 'std_score': 0.011065598922396114,
 'mean_diff': 0.026345894000811932,
 'std_diff': 0.01658368026740085,
 'p_value': 0.11213631720066593}

In [252]:
# Per-split model scores recorded by the aggregator.
score_aggregator._scores

[0.36833470057424117,
 0.38752052545155996,
 0.38752052545155996,
 0.36177194421657094,
 0.3727422003284072,
 0.36617405582922824,
 0.3634126333059885,
 0.3760262725779967,
 0.3760262725779967]

In [253]:
# Spread and worst case of the model scores across splits.
np.std(score_aggregator._scores), np.min(score_aggregator._scores)

(0.008979662688540868, 0.36177194421657094)

In [254]:
# Spread and worst case of the baseline scores across splits.
np.std(score_aggregator._baselines), np.min(score_aggregator._baselines)

(0.005675757668625942, 0.3440065681444992)

In [216]:
# Best recorded metric values of the most recently fitted model.
piper.models[-1].best_score_

{'learn': {'QueryRMSE': 0.40951366224807756}}

In [76]:
# Keep a handle on the most recently created model.
m = piper.models[-1]

In [150]:
# Features for the globally held-out rows (in-domain bundle, train prediction table).
global_test = extract_features_in_domain_v1(
    train_bundle,
    val_global_idx,
    reset_cache=True,
    is_pairwise=True,
    is_train=True,
)

In [151]:
# Ensemble over all cross-validation models: sum their per-answer scores on the
# held-out rows and measure pairwise accuracy of the summed scores.
res = np.array([
    model.predict(global_test).reshape(-1, 3)
    for model in piper.models
])
global_test_pred = res.sum(axis=0)

y_global_test = extract_targets_in_domain(val_global_idx, True)
metric = calc_metric(y_global_test, global_test_pred, is_pairwise=True)
metric

0.39408866995073893