## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import datetime
import re
import gc
from urllib.parse import urlparse
from scipy.stats import spearmanr

from sklearn.model_selection import KFold
from sklearn.manifold import TSNE
from sklearn.preprocessing import OneHotEncoder

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Variable
from torch.optim import lr_scheduler

from tqdm import tqdm
from transformers import DistilBertTokenizer

from radam import RAdam
from text_data import TextDataset2, TextDataset3
from bert import CustomBert, HeadNet
from learning import Learner
from lr_finder import LRFinder
from one_cycle import OneCycleLR
from text_cleaning import clean_data
from sentence_embed import get_use_embedding_features, get_distill_bert_features
from create_features import get_dist_features, get_categorical_features
from losses_metrics import spearmanr_torch, spearmanr_np
from inference import infer
from common import *
from utils.helpers import init_logger, init_seed

%matplotlib inline
pd.set_option('max_colwidth',400)

Using TensorFlow backend.


## Loading and preparing data

In [2]:
pd.set_option('max_rows', 500)
pd.set_option('max_columns', 500)
path = 'data/'
sample_submission = pd.read_csv(f'{path}sample_submission.csv')
test = pd.read_csv(f'{path}test.csv').fillna(' ')
train = pd.read_csv(f'{path}train.csv').fillna(' ')

In [3]:
train = clean_data(train, INPUTS)
test = clean_data(test, INPUTS)

In [4]:
init_seed()

In [5]:
%%time
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
ids_train = {}
ids_test = {}
max_seq_len = 512
for mode, df in [('train', train), ('test', test)]:
    for text in INPUTS:
        ids = []
        for x in df[text].values:
            x = " ".join(x.strip().split()[:300])
            tok = tokenizer.encode(x, add_special_tokens=True)
            ids.append(tok[:max_seq_len])
        ids = np.array([i + [0] * (max_seq_len - len(i)) for i in ids])
        if mode == 'train': ids_train[text] = ids
        else: ids_test[text] = ids

Token indices sequence length is longer than the specified maximum sequence length for this model (586 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (643 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (713 > 512). Running this sequence through the model will result in indexing errors


CPU times: user 20.5 s, sys: 17.3 ms, total: 20.5 s
Wall time: 20.9 s


In [6]:
%%time
distill_bert_feature_path = 'proc_data/distill_bert_features/'
bert_features_train, bert_features_test = get_distill_bert_features(
    train, test, ['question_body', 'answer'], 64, distill_bert_feature_path)

CPU times: user 1.13 s, sys: 64.2 ms, total: 1.19 s
Wall time: 1.19 s


In [7]:
%%time
use_feature_path = 'proc_data/use_embedding_features/'
embedding_train, embedding_test = get_use_embedding_features(train, test, INPUTS, use_feature_path)

CPU times: user 1.18 s, sys: 27.6 ms, total: 1.21 s
Wall time: 1.2 s


In [8]:
%%time
dist_features_train, dist_features_test  = get_dist_features(embedding_train, embedding_test)

CPU times: user 1.12 s, sys: 16.3 ms, total: 1.14 s
Wall time: 115 ms


In [9]:
train_host, test_host, host_dict, host_dict_reverse = get_categorical_features(train, test, 'host')
train_category, test_category, category_dict, category_dict_reverse = \
    get_categorical_features(train, test, 'category')

In [10]:
cat_features_train = np.hstack([train_host.reshape(-1, 1), train_category.reshape(-1, 1)])
cat_features_test = np.hstack([test_host.reshape(-1, 1), test_category.reshape(-1, 1)])
merged = np.vstack([cat_features_train, cat_features_test])
ohe = OneHotEncoder()
ohe.fit(merged)

cat_features_train = ohe.transform(cat_features_train).toarray()
cat_features_test = ohe.transform(cat_features_test).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [11]:
x_features_train = np.hstack([cat_features_train, dist_features_train])
x_features_test = np.hstack([cat_features_test, dist_features_test])

In [12]:
y = train[TARGETS].values.astype(np.float32)

In [13]:
num_workers = 10
bs = 4

In [14]:
bs_test = 4
test_loader = DataLoader(
    TextDataset2(x_features_test, embedding_test['question_body_embedding'], 
                 embedding_test['answer_embedding'], embedding_test['question_title_embedding'], 
                 ids_test['question_body'], ids_test['answer'], ids_test['question_title'], test.index),
    batch_size=bs_test, shuffle=False, num_workers=num_workers
)

## Training model

In [15]:
lr = 0.001
lr2 = 0.00001
loss_fn = nn.BCEWithLogitsLoss()
device = torch.device('cuda')
n_epochs = 6
model_name = 'distil_bert'
checkpoint_dir = 'checkpoints/'
early_stopping = None

In [16]:
def get_optimizer(model, lr):
    return optim.Adam([#params=model.parameters(), lr=lr)#[
                {'params': model.head.parameters(), 'lr': lr},
                {'params': model.q_bert.parameters(), 'lr': lr/100},
                {'params': model.a_bert.parameters(), 'lr': lr/100}
            ])

def get_optimizer2(model, lr):
    return RAdam(params=model.parameters(), lr=lr)#[
                #{'params': model.head.parameters(), 'lr': lr},
                #{'params': model.q_bert.parameters(), 'lr': lr/10},
                #{'params': model.a_bert.parameters(), 'lr': lr/10}
            #])

In [17]:
folds = KFold(n_splits=5, random_state=42)
oofs = np.zeros((len(train), N_TARGETS))
preds = np.zeros((len(test), N_TARGETS))

for fold_id, (train_index, valid_index) in enumerate(folds.split(train)):
    print(f'Fold {fold_id + 1} started at {time.ctime()}')
#     train_loader = DataLoader(
#         TextDataset3(x_features_train, embedding_train['question_body_embedding'], 
#                      embedding_train['answer_embedding'], embedding_train['question_title_embedding'], 
#                      bert_features_train['question_body'], bert_features_train['answer'],
#                      train_index, y),
#         batch_size=bs, shuffle=True, num_workers=num_workers, pin_memory=True
#     )
#     valid_loader = DataLoader(
#         TextDataset3(x_features_train, embedding_train['question_body_embedding'], 
#                      embedding_train['answer_embedding'], embedding_train['question_title_embedding'], 
#                      bert_features_train['question_body'], bert_features_train['answer'], 
#                      valid_index, y),
#         batch_size=bs, shuffle=False, num_workers=num_workers, pin_memory=True
#     )
        
#     model = HeadNet(n_h=256)
#     model.to(device)
    train_loader = DataLoader(
        TextDataset2(x_features_train, embedding_train['question_body_embedding'], 
                     embedding_train['answer_embedding'], embedding_train['question_title_embedding'], 
                     ids_train['question_body'], ids_train['answer'], ids_train['question_title'], 
                     train_index, y),
        batch_size=bs, shuffle=True, num_workers=num_workers, pin_memory=True
    )
    valid_loader = DataLoader(
        TextDataset2(x_features_train, embedding_train['question_body_embedding'], 
                     embedding_train['answer_embedding'], embedding_train['question_title_embedding'], 
                     ids_train['question_body'], ids_train['answer'], ids_train['question_title'], 
                     valid_index, y),
        batch_size=bs, shuffle=False, num_workers=num_workers, pin_memory=True
    )
        
    model = CustomBert(256)
    
#     if fold_id == 0:
#         optimizer = get_optimizer(model, lr)
#         lr_finder = LRFinder(n_iter=800, start_lr=1e-5, end_lr=1, device=device, grad_accum=8)
#         lr_finder.find_lr(model, optimizer, train_loader, loss_fn)
#         plt.show()
    
    optimizer = get_optimizer(model, lr)
    #scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.1)
    scheduler = OneCycleLR(optimizer, n_epochs=n_epochs, n_batches=len(train_loader))

    learner = Learner(
        model, 
        optimizer, 
        train_loader, 
        valid_loader, 
        loss_fn, 
        device, 
        n_epochs, 
        f'{model_name}_fold_{fold_id + 1}', 
        checkpoint_dir, 
        scheduler=scheduler, 
        metric_fns={'spearmanr': (spearmanr_torch, 'epoch_end')}, 
        monitor_metric='spearmanr',
        minimize_score=False, 
        logger=None,
        grad_accum=8,
        early_stopping=early_stopping, 
        batch_step_scheduler=True
    )
    if fold_id + 1 > 4: learner.train()
    
    # 2nd stage
    
#     train_loader = DataLoader(
#         TextDataset2(x_features_train, embedding_train['question_body_embedding'], 
#                      embedding_train['answer_embedding'], embedding_train['question_title_embedding'], 
#                      ids_train['question_body'], ids_train['answer'], ids_train['question_title'], 
#                      train_index, y),
#         batch_size=bs, shuffle=True, num_workers=num_workers, pin_memory=True
#     )
#     valid_loader = DataLoader(
#         TextDataset2(x_features_train, embedding_train['question_body_embedding'], 
#                      embedding_train['answer_embedding'], embedding_train['question_title_embedding'], 
#                      ids_train['question_body'], ids_train['answer'], ids_train['question_title'], 
#                      valid_index, y),
#         batch_size=bs, shuffle=False, num_workers=num_workers, pin_memory=True
#     )
#     learner.train_loader = train_loader
#     learner.valid_loader = valid_loader
        
#     model = CustomBert(256)
    
#     learner.load_best_model()
#     model.head = learner.model
#     learner.model = model
#     learner.optimizer = get_optimizer2(model, lr2)
#     learner.scheduler = OneCycleLR(learner.optimizer, n_epochs=n_epochs, n_batches=len(train_loader))
# #     learner.scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.1)
# #     learner.early_stopping = 5
# #     learner.n_epochs *= 2
# #     learner.batch_step_scheduler = False
#     learner.train()
    
    oofs[valid_index] = infer(learner.model, valid_loader, learner.best_checkpoint_file, device)
    
    test_preds = infer(learner.model, test_loader, learner.best_checkpoint_file, device)
    preds += test_preds / folds.n_splits
    
    del learner
    gc.collect()
    
print(f'OOF score: {spearmanr_np(oofs, y)}')
#0.3982

Fold 1 started at Fri Dec  6 21:00:41 2019
Starting inference for model: checkpoints/distil_bert_fold_1_best.pth


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


Starting inference for model: checkpoints/distil_bert_fold_1_best.pth


HBox(children=(IntProgress(value=0, max=119), HTML(value='')))


Fold 2 started at Fri Dec  6 21:01:15 2019
Starting inference for model: checkpoints/distil_bert_fold_2_best.pth


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


Starting inference for model: checkpoints/distil_bert_fold_2_best.pth


HBox(children=(IntProgress(value=0, max=119), HTML(value='')))


Fold 3 started at Fri Dec  6 21:01:52 2019
Starting inference for model: checkpoints/distil_bert_fold_3_best.pth


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


Starting inference for model: checkpoints/distil_bert_fold_3_best.pth


HBox(children=(IntProgress(value=0, max=119), HTML(value='')))


Fold 4 started at Fri Dec  6 21:02:24 2019
Starting inference for model: checkpoints/distil_bert_fold_4_best.pth


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


Starting inference for model: checkpoints/distil_bert_fold_4_best.pth


HBox(children=(IntProgress(value=0, max=119), HTML(value='')))


Fold 5 started at Fri Dec  6 21:02:56 2019
epoch 0: 	 Start training...


HBox(children=(IntProgress(value=0, max=1216), HTML(value='')))




epoch 0/6 	 train : loss 0.46603 - spearmanr 0.11774
epoch 0: 	 Start validation...


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


epoch 0/6 	 valid : loss 0.3936 - spearmanr 0.31171


  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


best model: epoch 0 - 0.31171
epoch 1: 	 Start training...


HBox(children=(IntProgress(value=0, max=1216), HTML(value='')))


epoch 1/6 	 train : loss 0.38169 - spearmanr 0.32855
epoch 1: 	 Start validation...


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


epoch 1/6 	 valid : loss 0.37211 - spearmanr 0.3863
best model: epoch 1 - 0.3863
epoch 2: 	 Start training...


HBox(children=(IntProgress(value=0, max=1216), HTML(value='')))


epoch 2/6 	 train : loss 0.36349 - spearmanr 0.4001
epoch 2: 	 Start validation...


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


epoch 2/6 	 valid : loss 0.36823 - spearmanr 0.40159
best model: epoch 2 - 0.40159
epoch 3: 	 Start training...


HBox(children=(IntProgress(value=0, max=1216), HTML(value='')))


epoch 3/6 	 train : loss 0.35291 - spearmanr 0.43939
epoch 3: 	 Start validation...


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


epoch 3/6 	 valid : loss 0.36526 - spearmanr 0.40574
best model: epoch 3 - 0.40574
epoch 4: 	 Start training...


HBox(children=(IntProgress(value=0, max=1216), HTML(value='')))


epoch 4/6 	 train : loss 0.34338 - spearmanr 0.476
epoch 4: 	 Start validation...


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


epoch 4/6 	 valid : loss 0.36615 - spearmanr 0.40749
best model: epoch 4 - 0.40749
epoch 5: 	 Start training...


HBox(children=(IntProgress(value=0, max=1216), HTML(value='')))


epoch 5/6 	 train : loss 0.33732 - spearmanr 0.49818
epoch 5: 	 Start validation...


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


epoch 5/6 	 valid : loss 0.36676 - spearmanr 0.40648
model not improved for 1 epochs
TRAINING END: Best score achieved on epoch 4 - 0.40749
Starting inference for model: checkpoints/distil_bert_fold_5_best.pth


HBox(children=(IntProgress(value=0, max=304), HTML(value='')))


Starting inference for model: checkpoints/distil_bert_fold_5_best.pth


HBox(children=(IntProgress(value=0, max=119), HTML(value='')))


OOF score: 0.40494972787028266


In [18]:
# clipping is necessary or we will get an error
sample_submission.loc[:, 'question_asker_intent_understanding':] = np.clip(preds, 0.00001, 0.999999)
sample_submission.to_csv('subs/submission.csv', index=False)

In [19]:
sample_submission.head()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,question_opinion_seeking,question_type_choice,question_type_compare,question_type_consequence,question_type_definition,question_type_entity,question_type_instructions,question_type_procedure,question_type_reason_explanation,question_type_spelling,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.930843,0.607404,0.166507,0.440581,0.669049,0.547609,0.68764,0.663405,0.558132,0.002537,0.646797,0.544287,0.012849,0.090047,0.007879,0.008882,0.102497,0.170535,0.769937,0.001328,0.898811,0.923496,0.536194,0.959436,0.968863,0.801682,0.00659,0.031672,0.862513,0.921132
1,46,0.868914,0.530974,0.002086,0.732702,0.786088,0.927408,0.568578,0.503941,0.019184,0.005083,0.430015,0.065408,0.001647,0.000403,0.000374,0.002225,0.937538,0.183981,0.076124,0.00021,0.645376,0.95615,0.650583,0.979844,0.988976,0.883237,0.951957,0.101983,0.050922,0.91561
2,70,0.902329,0.604063,0.025335,0.733137,0.777548,0.903211,0.588324,0.498472,0.067626,0.01113,0.433295,0.148209,0.013836,0.019243,0.003444,0.004389,0.308063,0.096844,0.748049,0.001992,0.828392,0.927451,0.594645,0.973653,0.973351,0.812643,0.050541,0.040061,0.915093,0.927134
3,132,0.91502,0.387382,0.003063,0.671688,0.714732,0.856546,0.589083,0.486069,0.285419,0.002294,0.610737,0.051019,0.001037,0.000585,0.000408,0.025507,0.922026,0.244529,0.331883,0.000173,0.719859,0.94727,0.69833,0.975601,0.981805,0.875762,0.696216,0.16652,0.834448,0.92547
4,200,0.962328,0.611163,0.012648,0.890432,0.850477,0.930194,0.666699,0.658756,0.174202,0.007387,0.290648,0.41794,0.00729,0.017495,0.00393,0.035297,0.19425,0.125485,0.474603,0.002255,0.812403,0.908796,0.629941,0.960132,0.956322,0.820174,0.113894,0.120473,0.562951,0.909372
