## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import datetime
import re
import gc
import copy

from sklearn.model_selection import KFold, GroupKFold
from sklearn.preprocessing import OneHotEncoder
import nltk.data

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Variable
from torch.optim import lr_scheduler

from tqdm import tqdm_notebook as tqdm
from transformers import DistilBertTokenizer
import transformers

from radam import RAdam
from text_data import TextDataset3, AugTextDataset, TextDataset7
from bert import CustomBert3, CustomBert7
from learning import Learner
from lr_finder import LRFinder
from one_cycle import OneCycleLR
from text_cleaning import clean_data
from sentence_embed import get_use_embedding_features, get_distill_bert_features
from create_features import get_dist_features, get_categorical_features
from losses_metrics import spearmanr_torch, spearmanr_np, FocalLoss
from inference import infer
from eda import eda
from common import *
from utils.helpers import init_logger, init_seed

%matplotlib inline
pd.set_option('max_colwidth',400)

## Loading and preparing data

In [2]:
pd.set_option('max_rows', 500)
pd.set_option('max_columns', 500)
path = 'data/'
sample_submission = pd.read_csv(f'{path}sample_submission.csv')
test = pd.read_csv(f'{path}test.csv').fillna(' ')
train = pd.read_csv(f'{path}train.csv').fillna(' ')

In [3]:
train['question'] = train['question_title'] + ' [SEP] ' + train['question_body']
test['question'] = test['question_title'] + ' [SEP] ' + test['question_body']

In [4]:
%%time
bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
ids_train = {}
ids_test = {}
max_seq_len = 512
for mode, df in [('train', train), ('test', test)]:
    for text in ['question', 'answer']:
        ids = []
        for x in tqdm(df[text].values):
            
            sentences = sentence_tokenizer.tokenize(x)
            sentences = [' '.join(bert_tokenizer.tokenize(s)) for s in sentences]
            
            curr_seq_len = 0
            seq, seqs = '', []
            for i, s in enumerate(sentences):
                new_seq = seq + ' ' + s
                curr_seq_len = len(new_seq.split())
                
                if ((i != (len(sentences) - 1)) 
                    and (curr_seq_len < (max_seq_len - 2))): # account for [CLS] and [SEP] tokens
                    seq = new_seq
                else:
                    seq_ids = bert_tokenizer.convert_tokens_to_ids(seq.split())
                    encoded_inputs = bert_tokenizer.prepare_for_model(
                        seq_ids, add_special_tokens=True, max_length=max_seq_len, pad_to_max_length=True)
                    seqs.append(encoded_inputs['input_ids'])
                    seq = s
            ids.append(seqs)
        if mode == 'train': ids_train[text] = np.array(ids)
        else: ids_test[text] = np.array(ids)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(IntProgress(value=0, max=6079), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6079), HTML(value='')))




HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




HBox(children=(IntProgress(value=0, max=476), HTML(value='')))


CPU times: user 29.6 s, sys: 236 ms, total: 29.9 s
Wall time: 30 s


In [5]:
train_host, test_host, host_dict, host_dict_reverse = get_categorical_features(train, test, 'host')
train_category, test_category, category_dict, category_dict_reverse = \
    get_categorical_features(train, test, 'category')

In [6]:
cat_features_train = np.hstack([train_host.reshape(-1, 1), train_category.reshape(-1, 1)])
cat_features_test = np.hstack([test_host.reshape(-1, 1), test_category.reshape(-1, 1)])
ohe = OneHotEncoder()
ohe.fit(cat_features_train)
cat_features_train = ohe.transform(cat_features_train).toarray()
cat_features_test = ohe.transform(cat_features_test).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [7]:
y = train[TARGETS].values.astype(np.float32)

In [8]:
num_workers = 10
bs = 4

In [53]:
bs_test = 4
test_loader = DataLoader(
    TextDataset7(cat_features_test, ids_test['question'], ids_test['answer'], test.index),
    batch_size=bs_test, shuffle=False, num_workers=num_workers, drop_last=False
)

In [10]:
# text_data.array_astype(ids_test['question'][:5], np.long)

In [52]:
from importlib import reload
import text_data
import utils.torch
import bert
reload(utils.torch)
reload(text_data)
reload(bert)
from text_data import TextDataset3, TextDataset7
from bert import CustomBert7

## Training model

In [45]:
class MyRankingLoss(nn.MSELoss):
    def forward(self, input, target):
        input = torch.sigmoid(input)
        n = input.size(0)
        n_pairs = n // 2
        n_tot_pairs = n_pairs + (n % 2)
        loss = 0
        for i in range(n_pairs):
            dp = input[2*i] - input[(2*i)+1]
            dy = target[2*i] - target[(2*i)+1]
            loss += super().forward(dp, dy) / n_tot_pairs
            
        if n_tot_pairs > n_pairs:
            dp = input[-2] - input[-1]
            dy = target[-2] - target[-1]
            loss += super().forward(dp, dy) / n_tot_pairs
        return loss
    
class MixedLoss(nn.Module):
    def __init__(self, pos_weight=N_TARGETS*[1.0]):
        super().__init__()
        pos_weight = torch.Tensor(pos_weight).cuda()
        self.bce = nn.BCEWithLogitsLoss(reduction='mean', pos_weight=pos_weight)
        self.mrl = MyRankingLoss()

    def forward(self, input, target):
        loss = (1. * self.bce(input, target) + 1. * self.mrl(input, target))
        return loss.mean()

In [46]:
lr = 0.001
loss_fn = nn.BCEWithLogitsLoss()
device = 'cuda'
n_epochs = 4
grad_accum = 2
weight_decay = 0.01
model_name = 'double_distil_bert'
checkpoint_dir = 'checkpoints/'
early_stopping = None

In [47]:
def get_optimizer_param_groups(model, lr, weight_decay):
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 
         'weight_decay': weight_decay, 'lr': lr},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 
         'weight_decay': 0.0, 'lr': lr}
    ]
    return optimizer_grouped_parameters


def get_optimizer(model, lr, weight_decay):
    return transformers.AdamW(
        get_optimizer_param_groups(model.head, lr, weight_decay)
        + get_optimizer_param_groups(model.q_bert, lr / 100, weight_decay)
        + get_optimizer_param_groups(model.a_bert, lr / 100, weight_decay)
    )

In [54]:
init_seed()
folds = GroupKFold(n_splits=5).split(
    X=train['question_body'], groups=train['question_body'])#KFold(n_splits=5, random_state=42).split(train)
oofs = np.zeros((len(train), N_TARGETS))
preds = np.zeros((len(test), N_TARGETS))

for fold_id, (train_index, valid_index) in enumerate(folds):
    print(f'Fold {fold_id + 1} started at {time.ctime()}')
    train_loader = DataLoader(
        TextDataset7(cat_features_train, ids_train['question'], ids_train['answer'], train_index, targets=y),
        batch_size=bs, shuffle=True, num_workers=num_workers, drop_last=False
    )
    valid_loader = DataLoader(
        TextDataset7(cat_features_train, ids_train['question'], ids_train['answer'], valid_index, targets=y), 
        batch_size=bs, shuffle=False, num_workers=num_workers, drop_last=False
    )
    model = CustomBert7(256, cat_features_train.shape[1])
    
    if fold_id == 0:
        print(model)
        model = model.to(device)
        optimizer = get_optimizer(model, lr, weight_decay)
        lr_finder = LRFinder(n_iter=min(grad_accum*100, len(train_loader)), start_lr=1e-5, 
                             end_lr=1, device=device, grad_accum=grad_accum, divergence_factor=5)
        lr_finder.find_lr(model, optimizer, train_loader, loss_fn)
        plt.show()
    
    optimizer = get_optimizer(model, lr, weight_decay)
    scheduler = OneCycleLR(optimizer, n_epochs=n_epochs, n_batches=len(train_loader))

    learner = Learner(
        model, 
        optimizer, 
        train_loader, 
        valid_loader, 
        loss_fn, 
        device, 
        n_epochs, 
        f'{model_name}_fold_{fold_id + 1}', 
        checkpoint_dir, 
        scheduler=scheduler, 
        metric_fns={'spearmanr': (spearmanr_torch, 'epoch_end')}, 
        monitor_metric='spearmanr',
        minimize_score=False, 
        logger=None,
        grad_accum=grad_accum,
        early_st
        opping=early_stopping, 
        batch_step_scheduler=True
    )
    if (fold_id + 1) > 0: learner.train()
    
    oofs[valid_index] = infer(learner.model, valid_loader, learner.best_checkpoint_file, device)
    
    test_preds = infer(learner.model, test_loader, learner.best_checkpoint_file, device)
    preds += test_preds / 5
    
    del learner, model, train_loader, valid_loader
    gc.collect()
    
print(f'OOF score: {spearmanr_np(oofs, y)}')
#0.4134


Fold 1 started at Sun Jan  5 23:56:46 2020


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4415ea1f80>
Traceback (most recent call last):
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 926, in __del__
    self._shutdown_workers()
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 906, in _shutdown_workers
    w.join()
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/multiprocessing/process.py", line 140, in join
    res = self._popen.wait(timeout)
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/multiprocessing/popen_fork.py", line 48, in wait
    return self.poll(os.WNOHANG if timeout == 0.0 else 0)
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/multiprocessing/popen_fork.py", line 28, in poll
    pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt: 


CustomBert7(
  (q_bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (dropout): Dropout(p=0.1, inplace=False)
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1,

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 79, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 79, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 79, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 79, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 64, in default_collate
    return default_collate([torch.as_tensor(b) for b in batch])
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 55, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 1 and 2 in dimension 1 at /opt/conda/conda-bld/pytorch_1573049310284/work/aten/src/TH/generic/THTensor.cpp:689


In [61]:
from torch.utils.data import default_collate

ImportError: cannot import name 'default_collate' from 'torch.utils.data' (/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/__init__.py)

In [60]:
train_loader.dataset[1][0][].shape

(1, 512)

In [49]:
%debug

> [0;32m/home/robin/Projects/KaggleProjects/GoogleQuest/bert.py[0m(587)[0;36mforward[0;34m()[0m
[0;32m    585 [0;31m[0;34m[0m[0m
[0m[0;32m    586 [0;31m        [0mx_q_bert[0m[0;34m[[0m[0mone_q_idx[0m[0;34m][0m [0;34m=[0m [0mapply_bert[0m[0;34m([0m[0mq_ids[0m[0;34m[[0m[0mone_q_idx_exp[0m[0;34m][0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mq_bert[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 587 [0;31m        [0mx_a_bert[0m[0;34m[[0m[0mone_a_idx[0m[0;34m][0m [0;34m=[0m [0mapply_bert[0m[0;34m([0m[0ma_ids[0m[0;34m[[0m[0mone_a_idx_exp[0m[0;34m][0m[0;34m,[0m [0mself[0m[0;34m.[0m[0ma_bert[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    588 [0;31m[0;34m[0m[0m
[0m[0;32m    589 [0;31m        [0;32mfor[0m [0mq_idx[0m [0;32min[0m [0mq_idxs[0m[0;34m[[0m[0;34m~[0m[0mone_q_idx[0m[0;34m][0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> one_a_idx
tensor([ True,  True,  True, False], device='cuda

In [51]:
next(iter(train_loader))[0]

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 79, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 79, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 79, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 79, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 64, in default_collate
    return default_collate([torch.as_tensor(b) for b in batch])
  File "/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 55, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 3 and 2 in dimension 1 at /opt/conda/conda-bld/pytorch_1573049310284/work/aten/src/TH/generic/THTensor.cpp:689


In [37]:
t=torch.tensor([1,3,2,1])
torch.cat([torch.full((n,),i) for i, n in enumerate(t)])

tensor([0., 1., 1., 1., 2., 2., 3.])

In [26]:
torch.arange(4)

tensor([0, 1, 2, 3])

In [None]:
next(iter(train_loader))

In [16]:
%debug

> [0;32m/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/_utils.py[0m(385)[0;36mreraise[0;34m()[0m
[0;32m    381 [0;31m            [0;31m# KeyError calls repr() on its argument (usually a dict key). This[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    382 [0;31m            [0;31m# makes stack traces unreadable. It will not be changed in Python[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    383 [0;31m            [0;31m# (https://bugs.python.org/issue2651), so we work around it.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    384 [0;31m            [0mmsg[0m [0;34m=[0m [0mKeyErrorMessage[0m[0;34m([0m[0mmsg[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 385 [0;31m        [0;32mraise[0m [0mself[0m[0;34m.[0m[0mexc_type[0m[0;34m([0m[0mmsg[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> u
> [0;32m/home/robin/anaconda3/envs/GoogleQuest/lib/python3.7/site-packages/torch/utils/data/dataloader.py[0m

In [None]:
[len(train_loader.dataset[i][0][1]) for i in range(10)]

In [None]:
np.vstack(train_loader.dataset[8][0][1])

In [None]:
dl = DataLoader(
        TextDataset7(cat_features_train, ids_train['question'], ids_train['answer'], train_index, targets=y),
        batch_size=bs, shuffle=True, num_workers=num_workers, drop_last=False
    )

In [None]:
t=torch.tensor([1,3,2,1])
torch.cat([l.repeat(l) for l in t])

In [None]:
next(iter(dl))

In [None]:
def my_round(x, num, dec=2):
    return np.round(x / num, dec) * num

def round_preds(preds, thres=0.0, low_dec=1, low_num=1, high_dec=2, high_num=3):
    low_idx = preds < thres
    new_preds = np.zeros_like(preds)
    new_preds[low_idx] = my_round(preds[low_idx], low_num, low_dec)
    new_preds[~low_idx] = my_round(preds[~low_idx], high_num, high_dec)
    return new_preds

from scipy.stats import spearmanr
def spearmanr_np(preds, targets):
    score = 0
    for i in range(N_TARGETS):
        score_i = spearmanr(preds[:, i], targets[:, i]).correlation
        score += np.nan_to_num(score_i / N_TARGETS)
    return score

In [None]:
zero_idx = np.where(y.mean(axis=0).round(3)<=0.001)[0][0]

In [None]:
my_round(oofs, 3, 2), oofs

In [None]:
spearmanr_np(oofs, y)

In [None]:
spearmanr_np(np.clip(round_preds(oofs, high_num=3), 0.00001, 0.999999), y)

In [None]:
# clipping is necessary or we will get an error
sample_submission.loc[:, 'question_asker_intent_understanding':] = np.clip(preds, 0.00001, 0.999999)
sample_submission.to_csv('subs/submission.csv', index=False)

In [None]:
sample_submission.head()