## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import datetime
import os
import re
import gc
import copy

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import nltk.data

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader

from tqdm import tqdm_notebook as tqdm
from transformers import AlbertTokenizer
import transformers

from text_data import TextDataset5 as TextDataset
from albert import CustomAlbert
from create_features import get_categorical_features
from losses_metrics import spearmanr_torch, spearmanr_np, optimize_rounding_params, get_cvs, spearmanr
from inference import infer
from common import *
from utils.helpers import init_logger, init_seed
from utils.torch import *

%matplotlib inline
pd.set_option('max_colwidth',400)

## Loading and preparing data

In [2]:
# Show up to 500 rows/columns when a frame is displayed.
# The fully-qualified option names are used on purpose: abbreviated names are
# resolved by pattern matching, and pandas raises an OptionError when a pattern
# matches more than one option (on newer pandas 'max_rows' / 'max_columns' also
# match the `styler.render.*` options).
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# All competition files are read from this relative directory.
path = 'data/'
sample_submission = pd.read_csv(f'{path}sample_submission.csv')
# Missing values in train/test are replaced with a single space
# (presumably so every text column holds a string for the tokenizer - TODO confirm).
test = pd.read_csv(f'{path}test.csv').fillna(' ')
train = pd.read_csv(f'{path}train.csv').fillna(' ')

In [3]:
%%time
# Encode each (question_title, question_body) and (question_title, answer)
# pair with the ALBERT tokenizer and keep the input ids and token type ids
# as NumPy arrays, keyed by 'question' / 'answer', separately for train and test.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
max_seq_len = 512
seg_ids_train, ids_train = {}, {}
seg_ids_test, ids_test = {}, {}

# Destination dictionaries per split: mode -> (input ids, token type ids).
# Looking the pair up once replaces the duplicated train/test branches.
encoded_by_mode = {
    'train': (ids_train, seg_ids_train),
    'test': (ids_test, seg_ids_test),
}

for mode, df in [('train', train), ('test', test)]:
    ids_store, seg_ids_store = encoded_by_mode[mode]
    for text, cols in [('question', ['question_title', 'question_body']),
                       ('answer', ['question_title', 'answer'])]:
        ids, seg_ids = [], []
        for x1, x2 in tqdm(df[cols].values):
            # max_length + pad_to_max_length are passed so every row is encoded
            # against the same max_seq_len budget.
            encoded_inputs = tokenizer.encode_plus(
                x1, x2, add_special_tokens=True, max_length=max_seq_len, pad_to_max_length=True,
                return_token_type_ids=True
            )
            ids.append(encoded_inputs['input_ids'])
            seg_ids.append(encoded_inputs['token_type_ids'])
        ids_store[text] = np.array(ids)
        seg_ids_store[text] = np.array(seg_ids)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(IntProgress(value=0, max=6079), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6079), HTML(value='')))




HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




HBox(children=(IntProgress(value=0, max=476), HTML(value='')))


CPU times: user 12.1 s, sys: 164 ms, total: 12.3 s
Wall time: 12.6 s


In [4]:
# Build the 'category' features for both splits. The helper returns four values;
# by their names, the last two look like a category mapping and its inverse
# (TODO confirm against create_features.get_categorical_features).
(train_category, test_category,
 category_dict, category_dict_reverse) = get_categorical_features(train, test, 'category')

In [5]:
# One-hot encode the category feature: the encoder is fitted on the training
# column only and then applied to both splits, giving dense 2-D arrays.
ohe = OneHotEncoder()
ohe.fit(train_category.reshape(-1, 1))
cat_features_train, cat_features_test = (
    ohe.transform(category_column.reshape(-1, 1)).toarray()
    for category_column in (train_category, test_category)
)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [6]:
# Replace every target column by its average rank (computed per column), rescale
# the ranks with a MinMaxScaler, and keep the result as a float32 matrix `y`.
train[TARGETS] = MinMaxScaler().fit_transform(train[TARGETS].rank(method="average"))
y = train[TARGETS].values.astype(np.float32)

In [7]:
# DataLoader settings shared by the cells below:
# number of worker processes and the batch size.
num_workers, bs = 10, 2

In [8]:
# Loader over the test split: iterated in order (no shuffling) and without
# dropping the last, possibly smaller, batch.
bs_test = 2
test_loader = DataLoader(
    dataset=TextDataset(
        cat_features_test,
        ids_test['question'],
        ids_test['answer'],
        seg_ids_test['question'],
        seg_ids_test['answer'],
        test.index,
    ),
    batch_size=bs_test,
    shuffle=False,
    num_workers=num_workers,
    drop_last=False,
)

## Extracting ALBERT outputs from trained checkpoints

In [9]:
# Run configuration.
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook can still be executed on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_output_dir = 'albert_1_output/'   # where the extracted outputs are written
model_name = 'siamese_albert_1'         # used in checkpoint and output file names
checkpoint_dir = 'checkpoints/'         # where the per-fold checkpoints are read from
n_folds = 10
# Create the output directory up front; an existing directory is not an error.
os.makedirs(model_output_dir, exist_ok=True)

In [10]:
def get_model_outputs(model, loader, checkpoint_file, device):
    """Run the question and answer encoders of `model` over every batch of `loader`.

    Args:
        model: object exposing `q_albert` and `a_albert` callables plus the usual
            `load_state_dict`, `to` and `eval` methods.
        loader: iterable yielding `(inputs, _)` pairs. `inputs[1]` and `inputs[3]`
            are passed to `model.q_albert`, `inputs[2]` and `inputs[4]` to
            `model.a_albert` (presumably token ids and segment ids of the
            question / answer - TODO confirm against TextDataset).
        checkpoint_file: path of a checkpoint whose `'model_state_dict'` entry is
            loaded into `model`; `None` keeps the weights `model` already has.
        device: device the model and the inputs are moved to.

    Returns:
        Tuple `(q_outputs, a_outputs)`: the per-batch encoder outputs concatenated
        along the first dimension and converted with `to_numpy`.

    Raises:
        Whatever `torch.load`, `load_state_dict` or the model raise. An empty
        `loader` makes `torch.cat` raise because there is nothing to concatenate.
    """
    print(f'Get Albert outputs for model: {checkpoint_file}')
    q_outputs, a_outputs = [], []

    # Force deterministic cuDNN kernels for the extraction; the previous setting
    # is restored in `finally`, so a failure cannot leak the flag to other cells.
    currently_deterministic = torch.backends.cudnn.deterministic
    torch.backends.cudnn.deterministic = True
    try:
        if checkpoint_file is not None:
            # Load onto the CPU first: the checkpoint then also opens on machines
            # that lack the device it was saved from, and no extra copy of the
            # weights is allocated on the GPU. `model.to(device)` moves them after.
            checkpoint = torch.load(checkpoint_file, map_location='cpu')
            model.load_state_dict(checkpoint['model_state_dict'])
        model.to(device)
        model.eval()

        with torch.no_grad():
            for inputs, _ in tqdm(loader):
                inputs = to_device(inputs, device)
                batch_q_outputs = model.q_albert(inputs[1], inputs[3])
                batch_a_outputs = model.a_albert(inputs[2], inputs[4])
                # Move each batch off the device right away so only one batch of
                # outputs is kept there at a time.
                q_outputs.append(to_cpu(batch_q_outputs))
                a_outputs.append(to_cpu(batch_a_outputs))

            q_outputs = torch.cat(q_outputs)
            a_outputs = torch.cat(a_outputs)
    finally:
        torch.backends.cudnn.deterministic = currently_deterministic

    return to_numpy(q_outputs), to_numpy(a_outputs)


def store_model_outputs(model, loader, checkpoint_file, device, file_path):
    """Compute the question/answer outputs and write them to disk with `np.save`.

    The first four arguments are forwarded unchanged to `get_model_outputs`.
    `file_path` is a path prefix: the question outputs are saved under
    `<file_path>_q_outputs` and the answer outputs under `<file_path>_a_outputs`
    (`np.save` appends the `.npy` extension). Nothing is returned.
    """
    question_outputs, answer_outputs = get_model_outputs(model, loader, checkpoint_file, device)
    destinations = (
        (f'{file_path}_q_outputs', question_outputs),
        (f'{file_path}_a_outputs', answer_outputs),
    )
    for destination, array in destinations:
        np.save(destination, array)

In [11]:
# For every fold: rebuild the train/valid loaders, load that fold's best
# checkpoint and store the encoder outputs for the train, valid and test rows.
init_seed()
# Grouping by question body keeps rows that share a question in the same fold.
folds = GroupKFold(n_splits=n_folds).split(X=train['question_body'], groups=train['question_body'])


def _make_fold_loader(index):
    """Return an ordered, non-dropping DataLoader over the training rows selected by `index`."""
    return DataLoader(
        TextDataset(cat_features_train, ids_train['question'], ids_train['answer'],
                    seg_ids_train['question'], seg_ids_train['answer'], index, targets=y),
        # Use the configured batch size instead of repeating the literal here.
        batch_size=bs, shuffle=False, num_workers=num_workers, drop_last=False
    )


for fold_id, (train_index, valid_index) in enumerate(folds):
    print(f'Fold {fold_id + 1} started at {time.ctime()}')
    train_loader = _make_fold_loader(train_index)
    valid_loader = _make_fold_loader(valid_index)
    model = CustomAlbert(256, cat_features_train.shape[1])
    checkpoint_file = f'{checkpoint_dir}{model_name}_fold_{fold_id+1}_best.pth'

    def output_file_path(mode):
        """Return the output path prefix for `mode` ('train' / 'valid' / 'test') in the current fold."""
        return f'{model_output_dir}{mode}_{model_name}_fold_{fold_id+1}'

    store_model_outputs(model, train_loader, checkpoint_file, device, output_file_path('train'))
    store_model_outputs(model, valid_loader, checkpoint_file, device, output_file_path('valid'))
    store_model_outputs(model, test_loader, checkpoint_file, device, output_file_path('test'))

Fold 1 started at Thu Feb  6 20:33:30 2020


There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.
There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_1_best.pth


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=2736), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_1_best.pth


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_1_best.pth


HBox(children=(IntProgress(value=0, max=238), HTML(value='')))

There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.



Fold 2 started at Thu Feb  6 20:37:38 2020


There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_2_best.pth


HBox(children=(IntProgress(value=0, max=2736), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_2_best.pth


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_2_best.pth


HBox(children=(IntProgress(value=0, max=238), HTML(value='')))

There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.



Fold 3 started at Thu Feb  6 20:41:44 2020


There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_3_best.pth


HBox(children=(IntProgress(value=0, max=2736), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_3_best.pth


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_3_best.pth


HBox(children=(IntProgress(value=0, max=238), HTML(value='')))


Fold 4 started at Thu Feb  6 20:45:47 2020


There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.
There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_4_best.pth


HBox(children=(IntProgress(value=0, max=2736), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_4_best.pth


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_4_best.pth


HBox(children=(IntProgress(value=0, max=238), HTML(value='')))

There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.



Fold 5 started at Thu Feb  6 20:49:49 2020


There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_5_best.pth


HBox(children=(IntProgress(value=0, max=2736), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_5_best.pth


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_5_best.pth


HBox(children=(IntProgress(value=0, max=238), HTML(value='')))

There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.



Fold 6 started at Thu Feb  6 20:53:50 2020


There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_6_best.pth


HBox(children=(IntProgress(value=0, max=2736), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_6_best.pth


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_6_best.pth


HBox(children=(IntProgress(value=0, max=238), HTML(value='')))

There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.



Fold 7 started at Thu Feb  6 20:57:49 2020


There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_7_best.pth


HBox(children=(IntProgress(value=0, max=2736), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_7_best.pth


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_7_best.pth


HBox(children=(IntProgress(value=0, max=238), HTML(value='')))

There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.



Fold 8 started at Thu Feb  6 21:01:51 2020


There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_8_best.pth


HBox(children=(IntProgress(value=0, max=2736), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_8_best.pth


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_8_best.pth


HBox(children=(IntProgress(value=0, max=238), HTML(value='')))

There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.



Fold 9 started at Thu Feb  6 21:05:55 2020


There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_9_best.pth


HBox(children=(IntProgress(value=0, max=2736), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_9_best.pth


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_9_best.pth


HBox(children=(IntProgress(value=0, max=238), HTML(value='')))

There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.



Fold 10 started at Thu Feb  6 21:09:56 2020


There is currently an upstream reproducibility issue with ALBERT v2 models. Please see https://github.com/google-research/google-research/issues/119 for more information.


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_10_best.pth


HBox(children=(IntProgress(value=0, max=2736), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_10_best.pth


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


Get Albert outputs for model: checkpoints/siamese_albert_1_fold_10_best.pth


HBox(children=(IntProgress(value=0, max=238), HTML(value='')))


