In [None]:
!nvidia-smi

In [None]:
!pip install transformers
!pip install sentencepiece

In [None]:
import numpy as np
import pandas as pd
import math
import torch
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import time
import datetime

# Prepare data

In [None]:
# Ask for each CSV location in turn and load the file right after its prompt.
train_csv_path = input('Path to train.csv: ')
df_train = pd.read_csv(train_csv_path)

test_csv_path = input('Path to test.csv: ')
df_test = pd.read_csv(test_csv_path)

submission_csv_path = input('Path to submission.csv: ')
df_submit = pd.read_csv(submission_csv_path)

In [None]:
# Preview the first five training rows (five is the default for `head`).
df_train.head()

In [None]:
# The 30 label columns to predict, in submission order.
target_cols = [
    # labels describing the question
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
    # labels describing the answer
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written',
]

In [None]:
# Text columns fed to the model; the dataset class walks them in this order.
input_cols = [
    'question_title',
    'question_body',
    'answer',
]

# Plot out the label distribution

In [None]:
# One histogram per target column. pandas builds the subplot grid itself, so
# no figure/axes are created up front: a bare `plt.figure()` would show an
# extra empty figure, and handing a single `ax` to a multi-column `hist`
# makes pandas clear that figure and emit a warning.
df_train[target_cols].hist(figsize=(20, 10));
plt.tight_layout()
plt.show()

# Preprocess the text data
 - Define the Dataset class.
 - Use the `html` standard-library module to unescape HTML entities in the raw text.

In [None]:
#@title Model Selection

import torch
import torch.functional as F
from torch import nn
from transformers import DistilBertModel
from transformers import DistilBertPreTrainedModel
from transformers.models.bert.modeling_bert import *
from transformers import get_linear_schedule_with_warmup, DistilBertTokenizer, AdamW
from transformers.modeling_utils import SequenceSummary

# Checkpoint name used for the tokenizer below and for the `from_pretrained`
# calls later in the notebook (editable through the Colab form).
pretrained_model = "distilbert-base-uncased" #@param ["distilbert-base-uncased-distilled-squad", "distilbert-base-cased-distilled-squad", "distilbert-base-uncased"]

# Mini-batch size handed to the DataLoaders built further down (Colab form field).
batch_size = 16 #@param ["16", "8", "4", "2"] {type:"raw", allow-input: true}

# Load the tokenizer for the selected checkpoint, then tokenize a sample
# sentence; as the last expression of the cell its result is displayed.
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_model)
tokenizer("Hello, my name is Cat.")

In [None]:
# Display the (rows, columns) shape of the training frame.
df_train.shape

In [None]:
import html
class QUEST_DistilBert_Dataset(Dataset):
    """Turns rows of a QUEST dataframe into DistilBERT-ready tensors.

    Each item is the concatenation of the tokenized input columns:
    ``[CLS] col_0 [SEP] col_1 [SEP] col_2 [SEP]``. Every column is truncated
    to its entry in ``max_lens`` (special tokens included).

    Args:
        df: dataframe holding the input (and, when training, target) columns.
        tokenizer: tokenizer exposing ``cls_token_id``, ``sep_token_id`` and
            ``pad_token_id`` and callable on a string.
        max_lens: per-column token budgets, aligned with ``input_cols``.
        device: device the returned tensors are moved to. Tensors are moved
            inside ``__getitem__``, so a CUDA device presumably requires a
            DataLoader without worker processes.
        total_pad: maximum batch sequence length; ``collate_fn`` only prints
            a diagnostic when a batch exceeds it.
        input_cols: text column names, in concatenation order.
        target_cols: label column names returned under ``'target'``.
        training: when True, items also carry the ``'target'`` tensor.
    """

    def __init__(self, df, tokenizer, max_lens, device='cpu', total_pad = 512, input_cols=input_cols, target_cols=target_cols, training=True):
        self.df = df
        self.input_cols = input_cols
        self.target_cols = target_cols
        self.is_training = training
        self.max_lens = max_lens
        self.tokenizer = tokenizer
        self.device = device
        self.total_pad = total_pad

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return self.df.shape[0]

    @staticmethod
    def _clean_text(value):
        """Return ``value`` as text with HTML entities unescaped.

        Missing cells (``None`` / NaN) become an empty string instead of
        crashing ``html.unescape``; other non-string values are stringified.
        """
        if isinstance(value, str):
            return html.unescape(value)
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return html.unescape(str(value))

    def __getitem__(self, idx):
        """Build one example.

        Returns a dict with:
            ``input_sequence``: LongTensor of token ids,
            ``input_mask``: FloatTensor of ones (one per real token),
            ``seg_mask``: FloatTensor holding 1, 2, 3, ... for the tokens of
                the first, second, third, ... input column,
            ``target``: float tensor of labels (only when ``training``).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        cls_id = self.tokenizer.cls_token_id
        sep_id = self.tokenizer.sep_token_id

        token_ids = []
        segment_ids = []
        for position, (input_col, max_len) in enumerate(zip(self.input_cols, self.max_lens)):
            text = self._clean_text(self.df[input_col].iloc[idx])
            piece = self.tokenizer(text, add_special_tokens=False)['input_ids']
            if position == 0:
                # The first column carries [CLS] as well as [SEP].
                piece = [cls_id] + piece[:max_len - 2] + [sep_id]
            else:
                piece = piece[:max_len - 1] + [sep_id]
            token_ids.extend(piece)
            # Segment ids start at 1; 0 is used for padding in collate_fn.
            segment_ids.extend([position + 1] * len(piece))

        rtn = {}
        if self.is_training:
            # Use the columns given to this instance (not the module-level
            # list) and force float so BCELoss accepts integer-valued labels.
            target_vals = self.df[self.target_cols].iloc[idx].tolist()
            rtn['target'] = torch.tensor(target_vals, dtype=torch.float).to(self.device)

        rtn['input_sequence'] = torch.LongTensor(token_ids).to(self.device)
        rtn['input_mask'] = torch.ones(len(token_ids), dtype=torch.float).to(self.device)
        rtn['seg_mask'] = torch.FloatTensor(segment_ids).to(self.device)
        return rtn

    def collate_fn(self, data):
        """Pad a list of items to the longest sequence in the batch.

        Token ids are padded with the tokenizer's pad id; both masks are
        padded with 0. Targets, when present, are stacked.
        """
        pad = torch.nn.utils.rnn.pad_sequence
        col_batch = {
            'input_sequence': pad([x['input_sequence'] for x in data],
                                  batch_first=True, padding_value=self.tokenizer.pad_token_id),
            'input_mask': pad([x['input_mask'] for x in data],
                              batch_first=True, padding_value=0.0),
            'seg_mask': pad([x['seg_mask'] for x in data],
                            batch_first=True, padding_value=0.0),
        }
        targets = [x['target'] for x in data if 'target' in x]
        if targets:
            col_batch['target'] = torch.stack(targets)
        # Diagnostic only: flags batches longer than the configured maximum.
        if col_batch['input_sequence'].size()[1] > self.total_pad:
            print("something wrong")
            print(col_batch)
        return col_batch

In [None]:
# `input_cols` is already defined in an earlier cell (and was captured as the
# dataset's default when the class was defined), so it is not redefined here.
# Dataset over the full training frame with token budgets 64/224/224
# (title/body/answer), which sum to the 512-token `total_pad` default.
train_dataset = QUEST_DistilBert_Dataset(df_train, tokenizer, [64, 224, 224])

# Define model

In [None]:
 from collections import namedtuple
 class DistilBertRegressionHead(nn.Module):
     """Two-layer head mapping a feature vector to ``num_labels`` values in (0, 1).

     Pipeline: dropout -> linear (hidden_size -> hidden_size) -> tanh ->
     linear (hidden_size -> num_labels) -> sigmoid.
     """

     def __init__(self, hidden_size, hidden_dropout_prob, num_labels):
         super(DistilBertRegressionHead, self).__init__()
         # Creation order is kept (dense before out_proj) so that seeded
         # weight initialisation draws the same random numbers.
         self.dense = nn.Linear(hidden_size, hidden_size)
         self.dropout = nn.Dropout(hidden_dropout_prob)
         self.out_proj = nn.Linear(hidden_size, num_labels)

     def forward(self, x, **kwargs):
         """Return per-label probabilities for ``x``; extra kwargs are ignored."""
         hidden = torch.tanh(self.dense(self.dropout(x)))
         return torch.sigmoid(self.out_proj(hidden))

class DistilBertForSequenceRegression(DistilBertPreTrainedModel):
    """DistilBERT encoder followed by a multi-label regression head.

    The head receives the concatenation of four ``config.dim``-sized vectors:
    the mean of the last-layer hidden states over segment 1, segment 2 and
    segment 3 (as labelled by ``seg_masks``), followed by the hidden state at
    the [CLS] position.
    """
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    # Result type of forward(); built once instead of on every call.
    _SequenceRegressorOutput = namedtuple('SequenceRegressorOutput',
                                          ['loss', 'probs', 'hidden_states', 'attentions'])

    def __init__(self, config, num_labels=30, cls_id=tokenizer.cls_token_id, sep_id=tokenizer.sep_token_id):
        super().__init__(config)
        self.num_labels = num_labels

        self.distilbert = DistilBertModel(config)
        # Three pooled segments + the [CLS] vector -> 4 * config.dim features.
        self.regressor = DistilBertRegressionHead(config.dim * 4,
                                               config.seq_classif_dropout,
                                               num_labels)
        self.cls_id = cls_id
        self.sep_id = sep_id
        self.d_model = config.dim
        self.init_weights()

    @staticmethod
    def _segment_mean(sequence_output, seg_masks, segment_id):
        """Mean of ``sequence_output`` over the tokens whose segment id matches.

        Divides by the number of matching tokens (not by the sum of the
        pooled features); rows with no matching token yield zeros.
        """
        mask = (seg_masks == segment_id).to(sequence_output.dtype)
        summed = (sequence_output * mask.unsqueeze(-1)).sum(dim=1)
        token_count = mask.sum(dim=1, keepdim=True).clamp(min=1.0)
        return summed / token_count

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        seg_masks=None,
        output_attentions=True,
        output_hidden_states=True,
        return_dict=True,
    ):
        r"""
        Args:
            input_ids: token ids of shape ``(batch, seq)``; each row must
                contain exactly one [CLS] token (``self.cls_id``).
            attention_mask, head_mask, inputs_embeds, output_attentions,
                output_hidden_states: forwarded to the DistilBERT encoder.
            labels: optional float tensor of shape ``(batch, num_labels)``
                with values in [0, 1]; when given, a binary cross-entropy
                loss against the predicted probabilities is returned.
            seg_masks: tensor of shape ``(batch, seq)`` holding 1/2/3 for the
                tokens of the first/second/third segment and 0 for padding.
            return_dict: when falsy a plain tuple is returned instead of the
                ``SequenceRegressorOutput`` namedtuple.

        Returns:
            ``SequenceRegressorOutput(loss, probs, hidden_states, attentions)``
            or, with ``return_dict=False``, the tuple
            ``([loss,] probs[, hidden_states][, attentions])``.
        """
        if input_ids is None:
            raise ValueError("input_ids is required: the [CLS] position is located from the token ids.")
        if seg_masks is None:
            raise ValueError("seg_masks is required for the per-segment pooling.")
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        cls_position = torch.where(input_ids==self.cls_id)
        assert cls_position[1].size()[0] == input_ids.size()[0], "Some of the instances are missing cls token!"
        assert torch.unique(cls_position[0]).size()[0] == input_ids.size()[0], "Some of the instances are missing cls token!"

        # The encoder is always asked for its structured output so the fields
        # can be read by name; the caller's `return_dict` only selects the
        # format of *this* method's return value.
        outputs = self.distilbert(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        sequence_output = outputs.last_hidden_state

        # One (row, col) pair per example -> (batch, dim).
        cls_output = sequence_output[cls_position]

        # Average pooling across title (1), question body (2) and answer (3).
        pooled = [self._segment_mean(sequence_output, seg_masks, segment_id)
                  for segment_id in (1, 2, 3)]

        reg_input = torch.cat(pooled + [cls_output], dim=1)
        probs = self.regressor(reg_input)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCELoss()
            loss = loss_fct(probs, labels.to(probs.dtype))

        if not return_dict:
            extras = tuple(t for t in (outputs.hidden_states, outputs.attentions) if t is not None)
            output = (probs,) + extras
            return ((loss,) + output) if loss is not None else output

        return self._SequenceRegressorOutput(
            loss=loss,
            probs=probs,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def freeze_decoder(self):
        """
        Freeze the DistilBERT encoder weights so they are not updated during
        training (the regression head stays trainable).
        """
        for param in self.distilbert.parameters():
            param.requires_grad = False

    def unfreeze_decoder(self):
        """
        Unfreeze the DistilBERT encoder weights so they are updated during
        training again.
        """
        for param in self.distilbert.parameters():
            param.requires_grad = True
    
 

# Define utility functions

In [None]:
from scipy.stats import spearmanr
def compute_spearman(y_true, y_pred):
    """Column-wise Spearman rank correlation between two 2-D arrays.

    Args:
        y_true: array of shape (n_samples, n_columns).
        y_pred: array of the same shape.

    Returns:
        ``(scores, mean_score)`` where ``scores`` is a NumPy array with one
        correlation per column and ``mean_score`` is their plain average.

    A correlation is undefined (NaN) when one of the two columns is
    constant. In that case the ``y_pred`` column is retried with a tiny
    random jitter to break the ties; if the result is still NaN (the
    ``y_true`` column is constant) the column scores 0.0 so that a single
    degenerate column cannot turn the mean into NaN.
    """
    n_rows, n_cols = y_true.shape[0], y_true.shape[1]
    scores = []
    for i in range(n_cols):
        rho = spearmanr(y_true[:, i], y_pred[:, i])[0]
        if np.isnan(rho):
            # The noise must match the single column, i.e. shape (n_rows,).
            jitter = np.random.normal(0, 1e-7, n_rows)
            rho = spearmanr(y_true[:, i], y_pred[:, i] + jitter)[0]
        if np.isnan(rho):
            rho = 0.0
        scores.append(rho)
    return np.array(scores), sum(scores)/len(scores)

In [None]:
import gc
def clear_mem(model_name='model'):
    """Drop the global model and global CUDA tensors, then free cached GPU memory.

    Args:
        model_name: name of the module-level variable holding the model.

    The names are removed from this module's global namespace. Looking the
    model up in ``locals()`` or deleting a local alias (``del variable``)
    would leave the global reference alive, so nothing would be released.
    """
    namespace = globals()
    if model_name in namespace:
        print('deleting model...')
        del namespace[model_name]
    # Snapshot the items: the namespace is mutated while scanning it.
    for name, value in list(namespace.items()):
        if torch.is_tensor(value) and value.is_cuda:
            print(name)
            del namespace[name]
    gc.collect()
    torch.cuda.empty_cache()
    

In [None]:
# Run the cleanup helper once before any model is built.
clear_mem()

In [None]:
def save_model(model, save_path, **metrics):
    """
    Save the model weights, plus any extra metrics, to ``save_path``.

    Args:
        model: module to checkpoint; if it exposes ``.module`` (a parallel
            wrapper) the wrapped module's weights are saved.
        save_path: destination file for ``torch.save``.
        **metrics: extra key/value pairs stored next to ``'state_dict'``.
            A ``state_dict`` keyword is ignored with a warning, because the
            weights always come from ``model``.

    Returns:
        ``(save_path, metrics)`` with ``metrics`` as actually stored.
    """
    import warnings

    if "state_dict" in metrics:
        # Warn instead of raising: raising aborted the save and made the
        # clean-up below unreachable.
        warnings.warn("We will use states from the model instead.")
        del metrics["state_dict"]
    model_to_save = model.module if hasattr(model, 'module') else model
    checkpoint = {'state_dict': model_to_save.state_dict()}
    checkpoint.update(metrics)
    torch.save(checkpoint, save_path)
    return save_path, metrics

def load_model(save_path, model=None):
    """
    Load a checkpoint written by ``save_model``.

    Args:
        save_path: checkpoint file holding a dict with a ``'state_dict'``
            entry plus optional metric entries.
        model: module to load the weights into. When omitted, a fresh
            ``DistilBertForSequenceRegression`` is created from
            ``pretrained_model``.

    Returns:
        ``(model, metrics)`` where ``metrics`` holds every checkpoint entry
        except ``'state_dict'``.
    """
    if model is None:
        model = DistilBertForSequenceRegression.from_pretrained(pretrained_model)
    # Deserialize onto the CPU: this works on hosts without the GPU the
    # checkpoint was saved from and avoids a second copy of the weights on
    # the GPU. load_state_dict copies into the model's own parameters, so the
    # model stays on whatever device it already lives on.
    checkpoint = torch.load(save_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    metrics = {key: value for key, value in checkpoint.items() if key != 'state_dict'}

    return model, metrics

# Train

In [None]:
from sklearn.model_selection import KFold
import tqdm.notebook as tqdm
from scipy.stats import spearmanr


def fold_train(model, train_data_loader, valid_data_loader, optimizer, max_epoch, lr_decay=0.1, clip=100.0, patience=1, fold_num=1):
    """Train ``model`` on one cross-validation fold, checkpointing its best epoch.

    Each epoch runs a training pass and a validation pass, prints the
    per-label Spearman scores from worst to best, and saves the model to
    ``./best_fold{fold_num}.pt`` whenever the mean validation Spearman
    improves.

    Args:
        model: model returning an object with ``.loss`` and ``.probs``.
        train_data_loader / valid_data_loader: loaders yielding dicts with
            ``input_sequence``, ``input_mask``, ``seg_mask`` and ``target``.
        optimizer: optimizer over the model parameters.
        max_epoch: maximum number of epochs.
        lr_decay: factor applied by ``ReduceLROnPlateau`` (stepped on the
            mean *training* loss, with a scheduler patience of 2).
        clip: accepted for compatibility; gradient clipping is disabled.
        patience: number of consecutive non-improving epochs tolerated
            before early stopping. The counter is reset on every
            improvement; with the default of 1 this is identical to a
            counter that is never reset.
        fold_num: fold index used in the checkpoint file name.

    Returns:
        ``(filepath, best_model_info, score_info)`` for the best epoch, or
        ``(None, None, None)`` if no epoch produced a comparable score
        (e.g. the mean Spearman was NaN every time).
    """
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,patience=2, factor=lr_decay,verbose=True)
    best_spearmanr = float('-inf')
    patience_count = patience
    filepath = None
    best_model_info = None
    # Initialised so the final return cannot hit an unbound local.
    score_info = None
    for epoch in tqdm.trange(max_epoch, desc="training", unit="epoch"):
        # ---- training pass ----
        total_loss = 0.0
        final_avg_loss = 0.0
        model.train()
        with tqdm.tqdm(train_data_loader,desc="epoch {} train".format(epoch + 1),
                  unit="batch",total=len(train_data_loader)) as train_batch_iterator:
            for i, batch_data in enumerate(train_batch_iterator, start=1):
                optimizer.zero_grad()
                loss_data = model(input_ids=batch_data['input_sequence'],
                                  attention_mask=batch_data['input_mask'],
                                  labels=batch_data['target'],
                                  seg_masks=batch_data['seg_mask'])
                loss = loss_data.loss
                total_loss += loss.item()
                loss.backward()
                optimizer.step()
                final_avg_loss = total_loss / i
                train_batch_iterator.set_postfix(mean_loss=final_avg_loss, current_loss=loss.item())
        scheduler.step(final_avg_loss)

        # ---- validation pass ----
        total_valid_loss = 0.0
        final_avg_valid_loss = 0.0
        true_labels = []
        pred_labels = []
        model.eval()
        with torch.no_grad():
            with tqdm.tqdm(valid_data_loader,desc="epoch {} valid".format(epoch + 1),
                      unit="batch",total=len(valid_data_loader),leave=False) as valid_batch_iterator:
                for i, batch_data in enumerate(valid_batch_iterator, start=1):
                    loss_data = model(input_ids=batch_data['input_sequence'],
                                      attention_mask=batch_data['input_mask'],
                                      labels=batch_data['target'],
                                      seg_masks=batch_data['seg_mask'])
                    loss = loss_data.loss
                    total_valid_loss += loss.item()
                    final_avg_valid_loss = total_valid_loss / i
                    valid_batch_iterator.set_postfix(mean_loss=final_avg_valid_loss, current_loss=loss.item())
                    pred_labels.append(loss_data.probs.to('cpu').numpy())
                    true_labels.append(batch_data['target'].to('cpu').numpy())
        true_labels = np.concatenate(true_labels, axis=0)
        pred_labels = np.concatenate(pred_labels, axis=0)
        # Ground truth first, predictions second, as the signature expects.
        sp_correlation = compute_spearman(true_labels, pred_labels)

        # No global NumPy print options are touched here: they do not affect
        # the scalar formatting below and would leak into later cells.
        sorted_ind = np.argsort(sp_correlation[0])
        sorted_labels = [target_cols[ind] for ind in sorted_ind]
        print("predicted features from worst to best:")
        for r, (lb,sc) in enumerate(zip(sorted_labels, sp_correlation[0][sorted_ind])):
            print(f"\t{r}. {lb}: {sc}")

        print(f"Validation results for epoch #{epoch + 1}: average_loss={final_avg_valid_loss}, spearman_rho={sp_correlation[-1]}")

        if sp_correlation[-1] < best_spearmanr:
            patience_count -= 1
        elif sp_correlation[-1] > best_spearmanr:
            # Improvement: restart the early-stopping countdown and checkpoint.
            patience_count = patience
            filepath = f"./best_fold{fold_num}.pt"
            score_info = {'spearmanr':sp_correlation[-1],
                          'train_loss':final_avg_loss,
                          'valid_loss':final_avg_valid_loss,
                          'epoch':epoch + 1}
            print("Saving this model...")
            filepath, best_model_info = save_model(model, filepath,
                                                   avg_valid_loss=final_avg_valid_loss,
                                                   spearmanr_corr=sp_correlation[-1])

        best_spearmanr = max(sp_correlation[-1], best_spearmanr)
        if patience_count == 0:
            print("Early Stopping: the average spearmanr did not improve.")
            break
    return filepath, best_model_info, score_info

# Per-fold results of the best epoch.
best_scores = []
train_losses = []
valid_losses = []
epoch_best = []

splits = 5
kf = KFold(n_splits=splits, shuffle=True, random_state=42)
max_epoch = 30
fold_num = 1
# (checkpoint path, best mean Spearman) per fold; consumed at inference time.
model_records = []
device_used = None
for train_index, valid_index in tqdm.tqdm(kf.split(df_train), desc="Cross Validation", unit="fold", total=splits):
    print('Fold {} starting...'.format(fold_num))
    clear_mem()
    model = DistilBertForSequenceRegression.from_pretrained(pretrained_model)
    # model.freeze_decoder()
    # Use the GPU when there is one instead of failing outright on CPU hosts.
    model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
    device_used = model.distilbert.embeddings.word_embeddings.weight.device
    train_dataset = QUEST_DistilBert_Dataset(df_train.iloc[train_index], tokenizer, [64, 224, 224],
                                          device=device_used)
    train_data_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, collate_fn=train_dataset.collate_fn, shuffle=True)
    valid_dataset = QUEST_DistilBert_Dataset(df_train.iloc[valid_index], tokenizer, [64, 224, 224],
                                          device=device_used, training=True)
    valid_data_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size, collate_fn=valid_dataset.collate_fn, shuffle=False)

    # Split the parameters in one pass: encoder weights use the base learning
    # rate, everything else (the regression head) a larger one.
    bert_params = []
    classifier_params = []
    for param_name, param in model.named_parameters():
        if 'distilbert' in param_name:
            bert_params.append(param)
        else:
            classifier_params.append(param)

    optimizer = AdamW([{'params': bert_params}, {'params': classifier_params, 'lr': 1e-4}], lr=2e-5, weight_decay=0.01, betas=(0.5, 0.999), correct_bias=True)
    filepath, best_model_info, score_info = fold_train(model, train_data_loader, valid_data_loader, optimizer, max_epoch, fold_num=fold_num)
    if filepath is None or best_model_info is None or score_info is None:
        # Nothing was checkpointed for this fold; leave it out of the summary
        # and of the inference ensemble rather than crashing on None.
        print('Fold {} produced no checkpoint; skipping it.'.format(fold_num))
    else:
        best_scores.append(score_info['spearmanr'])
        train_losses.append(score_info['train_loss'])
        valid_losses.append(score_info['valid_loss'])
        epoch_best.append(score_info['epoch'])
        model_records.append((filepath, best_model_info['spearmanr_corr']))
    clear_mem()
    fold_num += 1

if best_scores:
    print('spearmanr mean:', sum(best_scores)/len(best_scores))
    print('spearmanr max:', max(best_scores))
    print('spearmanr min:', min(best_scores))
    print('spearmanr std:', np.std(best_scores))
    print('avg train loss', sum(train_losses)/len(train_losses))
    print('avg valid loss', sum(valid_losses)/len(valid_losses))
    print('avg epoch for convergence', sum(epoch_best)/len(epoch_best))
else:
    print('No fold produced a checkpoint; nothing to summarise.')











# Run inference on the test set

In [None]:
# prepare testset
# Build the unlabeled test dataset and a loader that keeps the row order.
test_dataset = QUEST_DistilBert_Dataset(
    df_test, tokenizer, [64, 224, 224],
    device=device_used, training=False,
)
test_data_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    collate_fn=test_dataset.collate_fn,
    shuffle=False,
)

# Collect one prediction matrix per fold checkpoint, weighting each fold by
# its best validation Spearman score.
net_score = 0.0
pred_unnormalized = []
weights = []
for model_path, spearmanr_score in model_records:
    clear_mem()
    model, info = load_model(model_path)
    model.to(device_used)
    model.eval()
    pred_labels = []
    with torch.no_grad(), tqdm.tqdm(test_data_loader, unit="batch",
                                    total=len(test_data_loader), leave=False) as test_batch_iterator:
        for i, batch_data in enumerate(test_batch_iterator, start=1):
            output_data = model(
                input_ids=batch_data['input_sequence'],
                attention_mask=batch_data['input_mask'],
                seg_masks=batch_data['seg_mask'],
            )
            pred_labels.append(output_data.probs.to('cpu').numpy())
    pred_labels = np.concatenate(pred_labels, axis=0)
    weight = spearmanr_score
    net_score += spearmanr_score
    pred_unnormalized.append(pred_labels)
    weights.append(weight)
clear_mem()

# Stack to (folds, rows, labels) and normalise the weights to sum to one.
pred_unnormalized = np.array(pred_unnormalized)
weights = np.array(weights) / net_score
print(pred_unnormalized.shape)
# Weighted sum over the fold axis -> (rows, labels).
pred_final = np.einsum("ijk,i->jk", pred_unnormalized, weights)
df_submit[target_cols] = pred_final
df_submit.to_csv("submission.csv", index = False)
df_submit







