In [6]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from collections import defaultdict

%matplotlib inline

In [7]:
%config Completer.use_jedi = False

In [8]:
BASE_PATH = "data/"

In [11]:
train_data = pd.read_csv(BASE_PATH + 'rus_train_dataset.csv', encoding='utf-8', sep='|')
en_ru_aux_data = pd.read_csv(BASE_PATH + 'en_to_rus_train_dataset.csv', encoding='utf-8', sep='|')

In [12]:
train_raw_targets = train_data["right_answer_id"].values

In [9]:
%%time
train_bundle = np.load(BASE_PATH + 'train_dataset_bundle.npy', allow_pickle=True)

CPU times: user 538 ms, sys: 899 ms, total: 1.44 s
Wall time: 1.79 s


In [10]:
%%time
ext_bundle = np.load(BASE_PATH + 'en_ru_aux_dataset_bundle.npy', allow_pickle=True)

CPU times: user 362 ms, sys: 676 ms, total: 1.04 s
Wall time: 1.3 s


In [13]:
def _fix_input(x):
    if x in ['0.0', '1.0', '2.0', '0,0', '1,0', '2,0', '1.0.']:
        return float(x[0])
    return -1

prepared_en_ru_targets = np.array([_fix_input(x) for x in en_ru_aux_data["right_answer_id"].values])

In [14]:
import torch
from torch import nn
from torch import optim

from tensorboardX import SummaryWriter

import numpy as np

In [15]:
class Dataset:
    def __init__(self, question, answer, y):
        self.question = question
        self.answer = answer
        self.y = y

In [17]:
# nn.BCEWithLogitsLoss()()

In [18]:
def learn(train, val, model_ff, epochs=5, batch_size=64, shuffle=True, freq=10,lr=1e-3,criterion = nn.BCEWithLogitsLoss()): 
    writer = SummaryWriter()
    
    np.random.seed(1)
    ids_nn = np.arange(train.y.shape[0])
    
#     reshape_to_last = lambda x: torch.reshape(x, [np.prod(x.shape[:-1]), x.shape[-1]])

    optimizer = optim.Adam(model_ff.parameters(), lr=lr)

    time_for_print_loss = lambda i: (i + 1) % freq == 0
    
    n_iter = 0
    

    for epoch in np.arange(epochs):
        np.random.shuffle(ids_nn)

        model_ff.train(True)

        for b in np.arange(0, train.y.shape[0], batch_size):
            X_batch = torch.FloatTensor(train.question[ids_nn[b:b+batch_size]])
            y_batch = torch.FloatTensor(train.y[ids_nn[b:b+batch_size]])
            a_batch = torch.FloatTensor(train.answer[ids_nn[b:b+batch_size]])

            optimizer.zero_grad()
            y_pred_logits = model_ff(X_batch, a_batch)

            loss = criterion(y_pred_logits, y_batch)
            loss.backward()

            optimizer.step()

            if (b // batch_size + 1) % freq == 0:
                print('train loss in %d epoch in %d batch: %.5f' %
                  (epoch + 1, b // batch_size + 1, loss.item()))
                
                writer.add_scalar('data/train_loss', loss.item(), n_iter)
                writer.add_scalar('data/epoch', epoch + 1, n_iter)
                writer.add_scalar('data/batch', b // batch_size + 1, n_iter)

                val_loss = 0
                val_acc = 0
                its = 0
                model_ff.train(False)
                for b in np.arange(0, val.y.shape[0], batch_size):
                    its += 1
                    X_batch = torch.FloatTensor(val.question[b:b+batch_size])
                    y_batch = torch.FloatTensor(val.y[b:b+batch_size])
                    a_batch = torch.FloatTensor(val.answer[b:b+batch_size])
                    with torch.no_grad():
                        y_pred_logits = model_ff(X_batch, a_batch)
                    loss = criterion(y_pred_logits, y_batch)
#                     print(X_batch.shape, y_batch.shape, y_pred_logits.shape)
                    s_pred = torch.argmax(y_pred_logits.reshape(-1, 3), dim=1)
                    s_true = torch.argmax(y_batch.reshape(-1, 3), dim=1)
                    acc = torch.mean((s_pred == s_true).type(torch.FloatTensor))
                    val_loss += loss.item()
                    val_acc += acc.item()
                val_loss /= its
                val_acc /= its
                print('val loss in %d epoch: %.5f' % (epoch + 1, val_loss))
                print('val acc in %d epoch: %.5f' % (epoch + 1, val_acc))
                
                writer.add_scalar('data/val_loss', val_loss, n_iter)
                writer.add_scalar('data/val_acc', val_acc, n_iter)
                n_iter += 1
                
def apply(val, model_ff, batch_size=33, criterion = nn.BCEWithLogitsLoss()):
    val_loss = 0
    val_acc = 0
    its = 0
    model_ff.train(False)
    for b in np.arange(0, val.y.shape[0], batch_size):
        its += 1
        X_batch = torch.FloatTensor(val.question[b:b+batch_size])
        y_batch = torch.FloatTensor(val.y[b:b+batch_size])
        a_batch = torch.FloatTensor(val.answer[b:b+batch_size])
        with torch.no_grad():
            y_pred_logits = model_ff(X_batch, a_batch)
        loss = criterion(y_pred_logits, y_batch)
#                     print(X_batch.shape, y_batch.shape, y_pred_logits.shape)
        s_pred = torch.argmax(y_pred_logits.reshape(-1, 3), dim=1)
        s_true = torch.argmax(y_batch.reshape(-1, 3), dim=1)
        acc = torch.mean((s_pred == s_true).type(torch.FloatTensor))
        val_loss += loss.item()
        val_acc += acc.item()
    val_loss /= its
    val_acc /= its
    return val_loss, val_acc

In [19]:
ids = np.arange(len(ext_bundle))
np.random.seed(0)
np.random.shuffle(ids)
print(ids[:10])
N = int(len(ext_bundle) * 0.2)
val_ids = ids[:N]
train_ids = ids[N:]

[ 768  704  726  215 1158  813  436 1611  148 1423]


In [20]:
def get_embed(bundle: dict)->np.array:
    e = bundle['embed'][0]
    return np.concatenate([np.max(e, axis=0), e[-1]],axis=0).reshape(-1)  # last + max pooling

def make_dataset(bundle, targets, ids) -> Dataset:
    assert len(bundle) == len(targets)
    Q,A,Y = [],[],[]
    for idx in ids:
        q = get_embed(bundle[idx]['q'])
        a = []
        for j in range(3):
            a = get_embed(bundle[idx]['a{}'.format(j + 1)])
            Q.append(q)
            A.append(a)
            Y.append([targets[idx] == j])
    Q = np.array(Q, dtype=np.float32)
    A = np.array(A, dtype=np.float32)
    Y = np.array(Y, dtype=np.float32)
    return Dataset(Q, A, Y)

In [21]:
trainD = make_dataset(ext_bundle, prepared_en_ru_targets, train_ids)
valD = make_dataset(ext_bundle, prepared_en_ru_targets, val_ids)

In [22]:
class NNModelV1(nn.Module):
    def __init__(self, INPUT_F, DROP_P, H):
        super().__init__()
        self.model_ff =  nn.Sequential(
            nn.BatchNorm1d(INPUT_F),
            nn.Dropout(DROP_P),
            nn.Linear(INPUT_F, H),
            nn.ReLU(),
            nn.Dropout(DROP_P),
            nn.Linear(H, H),
            nn.ReLU(),
            nn.Dropout(DROP_P),
            nn.Linear(H, 1)
        )

    def forward(self, q, a):
        x = torch.cat([q, a], dim=1)
        return self.model_ff(x)

In [127]:
INPUT_F = 1536 * 4
H = 128
DROP_P = 0.1

torch.manual_seed(0)
np.random.seed(0)

model = NNModelV1(INPUT_F, DROP_P, H)

learn(trainD, valD, model, freq=40, batch_size=33,lr=1e-5, epochs=30)

train loss in 1 epoch in 40 batch: 0.68664
val loss in 1 epoch: 0.68680
val acc in 1 epoch: 0.30724
train loss in 1 epoch in 80 batch: 0.68539
val loss in 1 epoch: 0.68151
val acc in 1 epoch: 0.30471
train loss in 1 epoch in 120 batch: 0.66841
val loss in 1 epoch: 0.67489
val acc in 1 epoch: 0.31145
train loss in 2 epoch in 40 batch: 0.66671
val loss in 2 epoch: 0.66420
val acc in 2 epoch: 0.33670
train loss in 2 epoch in 80 batch: 0.61747
val loss in 2 epoch: 0.65773
val acc in 2 epoch: 0.34428
train loss in 2 epoch in 120 batch: 0.64184
val loss in 2 epoch: 0.65246
val acc in 2 epoch: 0.32155
train loss in 3 epoch in 40 batch: 0.62614
val loss in 3 epoch: 0.64878
val acc in 3 epoch: 0.32407
train loss in 3 epoch in 80 batch: 0.65755
val loss in 3 epoch: 0.64639
val acc in 3 epoch: 0.31902
train loss in 3 epoch in 120 batch: 0.65796
val loss in 3 epoch: 0.64403
val acc in 3 epoch: 0.32660
train loss in 4 epoch in 40 batch: 0.61400
val loss in 4 epoch: 0.64231
val acc in 4 epoch: 0.321

In [24]:
original_trainD = make_dataset(train_bundle, train_raw_targets, np.arange(len(train_raw_targets)))

In [25]:
full_extD = make_dataset(ext_bundle, prepared_en_ru_targets, np.arange(len(prepared_en_ru_targets)))

In [26]:
apply(original_trainD, model) # drop 0.1

(0.6405012151679477, 0.36191647149421075)

In [129]:
# torch.save(model.state_dict(), 'models/sber_gpt3_hidden_to_target_v1')

In [27]:
INPUT_F = 1536 * 4
H = 128
DROP_P = 0.1

torch.manual_seed(0)
np.random.seed(0)

model = NNModelV1(INPUT_F, DROP_P, H)

model.load_state_dict(torch.load("models/sber_gpt3_hidden_to_target_v1"))

In [28]:
def apply_and_save_predictions(data, file):
    model.train(False)
    with torch.no_grad():
        logits = model(torch.FloatTensor(data.question), torch.FloatTensor(data.answer))
    logits_prepared = logits.reshape(-1).cpu().detach().numpy()
    print(logits_prepared.shape)
    np.save(file, logits_prepared)
    return logits_prepared

In [29]:
apply_and_save_predictions(original_trainD, "data/original_train_predictions_by_model_v1")

(12183,)


array([-0.86392146, -0.87211543, -0.79791766, ..., -0.45517492,
       -0.42451477, -0.52346647], dtype=float32)

In [141]:
apply_and_save_predictions(full_extD, "data/ext_predictions_by_model_v1")

(5823,)


array([-0.8281525 , -0.8319542 , -0.5934919 , ..., -0.5707167 ,
       -0.75683475, -0.72892743], dtype=float32)

In [31]:
tr_ext_4K_dataset = np.load("data/en_ru_4k_dataset.npy", allow_pickle=True)

In [32]:
tr_ext_4K_dataset

array([[0,
        'Компания Loake Brothers, основанная в 1880 году, известна своим высоким качеством.',
        'The Lion King', 'Баптистская церковь Уэстборо', 'Обувь', 2],
       [1, '"Кто играет главную роль в фильме" Бэтмен "?',
        'Уиттингтон, Ричард', 'Мишель Баффер',
        'Ошибки, допущенные в ходе выборов, могут привести к тому, что многие избиратели не смогут проголосовать за кандидата в президенты США от Республиканской партии Дональда Трампа, который, по их мнению, является наиболее вероятным кандидатом от Республиканской партии на предстоящих президентских выборах в 2016 году.',
        1],
       [2, 'Под каким названием проходят съемки фильма "Лед и пламя"?',
        'Игра престолов', 'Жан Симмонс', 'Мэри Квант', 0],
       ...,
       [4093,
        'Что представляет собой европейская система оценки качества образования?',
        'Мидлмарш', 'Дидо', 'Альбатрос', 2],
       [4094, 'В 1999 году Том Хэнкс', 'История игрушек 2',
        'Бетьман, Джон', 'Бриннер, Ю

In [33]:
tr_ext_4K_targets = tr_ext_4K_dataset[:, -1]
tr_ext_4K_targets

array([2, 1, 0, ..., 2, 0, 0], dtype=object)

In [34]:
tr_ext_4K_bundle = np.load("data/en_ru_4k_bundle.npy", allow_pickle=True)

In [35]:
tr_ext_4KD = make_dataset(tr_ext_4K_bundle, tr_ext_4K_targets, np.arange(len(tr_ext_4K_targets)))

In [36]:
apply(tr_ext_4KD, model)

(0.6538657828885812, 0.38148916358922186)

In [153]:
apply_and_save_predictions(tr_ext_4KD, "data/tr_ext_4K_predictions_by_model_v1")

(12288,)


array([-0.47767383, -0.8918863 , -0.74630445, ..., -0.7757869 ,
       -1.3973103 , -1.2401094 ], dtype=float32)

In [38]:
rubq_qat_pairs = np.load("data/rubq_qat_pairs.npy", allow_pickle=True)
rubq_bundle = [x[-1] for x in rubq_qat_pairs]
rubq_targets = [x[-2] for x in rubq_qat_pairs]

In [43]:
rubqD = make_dataset(rubq_bundle, rubq_targets, np.arange(len(rubq_targets)))

In [44]:
apply(rubqD, model)

(0.6379615324285796, 0.3739756146129572)

In [45]:
apply_and_save_predictions(rubqD, "data/rubq_predictions_by_model_v1")

(6990,)


array([-0.7742601 , -0.83825415, -0.83260673, ..., -0.46999604,
       -0.5549119 , -0.5293012 ], dtype=float32)