In [None]:
!nvidia-smi

In [None]:
!pip install transformers
!pip install sentencepiece

# Load modified XLNet model with mem backprop

In [None]:
#@title gist link to modified XLNet code
#@markdown - Modify the XLNetModel.cache_mem function in /transformers/models/xlnet/modeling_xlnet.py so that it does not detach the mem states.

# URL of the gist holding the modified modeling_xlnet.py; it is cloned in the next cell.
gist_path = "" #@param {type:"string"}

In [None]:
!git clone $gist_path

In [None]:
import importlib.util
import os
import shutil

mod_file_path = input("where is the modified modeling_xlnet.py saved?")

# Locate the installed transformers package instead of hard-coding a
# python3.7 dist-packages path, so the overwrite targets whichever
# installation this kernel will actually import.
_transformers_spec = importlib.util.find_spec("transformers")
if _transformers_spec is None or not _transformers_spec.submodule_search_locations:
    raise ModuleNotFoundError("transformers is not installed; run the pip install cell first")
xlnet_target_path = os.path.join(list(_transformers_spec.submodule_search_locations)[0],
                                 "models", "xlnet", "modeling_xlnet.py")

# Overwrite the stock implementation with the modified one (same effect as `cp -f`).
shutil.copyfile(os.path.expanduser(mod_file_path.strip()), xlnet_target_path)

In [None]:
import numpy as np
import pandas as pd
import math
import torch
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

# Prepare data

In [None]:
# Load the competition CSV files; each path is asked for interactively.
df_train = pd.read_csv(input('Path to train.csv: '))
df_test = pd.read_csv(input('Path to test.csv: '))
df_submit = pd.read_csv(input('Path to submission.csv: '))

In [None]:
# Preview the first five training rows.
df_train.head(5)

In [None]:
# Names of the 30 label columns (21 question_* and 9 answer_*) that the model predicts.
# The order here is the order of the model's output logits.
target_cols = ['question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfaction', 'answer_type_instructions',
       'answer_type_procedure', 'answer_type_reason_explanation',
       'answer_well_written']

In [None]:
# Text columns fed to the model, in the order title -> question body -> answer.
input_cols = ['question_title', 'question_body', 'answer']

In [None]:
# Histogram of every target column in the training data.
# A single figure is created via plt.subplots; the former extra plt.figure()
# call only produced an additional empty figure.
fig, ax = plt.subplots(figsize=(20, 10))
df_train[target_cols].hist(ax=ax)
plt.tight_layout()
plt.show()

# Preprocess the text data
 - Define Dataset class
 - Use the `html` library to undo HTML escapes in the text.

In [None]:
import torch
import torch.functional as F
from torch import nn
from transformers import XLNetModel, RobertaModel
from transformers import XLNetPreTrainedModel
from transformers.models.roberta.modeling_roberta import *
from transformers import get_linear_schedule_with_warmup, RobertaTokenizer, XLNetTokenizer, AdamW
from transformers.modeling_utils import SequenceSummary

# Tokenizer for the cased XLNet base checkpoint; lower-casing is disabled.
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased", do_lower_case=False)

In [None]:
import html

# Token counts per test row for the title (t), question body (q) and answer (a),
# measured after undoing HTML escapes.
t_len, q_len, a_len = [], [], []
for i, (t, q, a) in df_test[input_cols].iterrows():
    t, q, a = (tokenizer.tokenize(html.unescape(text)) for text in (t, q, a))
    t_len.append(len(t))
    q_len.append(len(q))
    a_len.append(len(a))

# Summary rows: mean, 90th percentile, minimum and maximum (title, body, answer).
print(*map(np.mean, (t_len, q_len, a_len)))
print(*(np.percentile(lengths, 90) for lengths in (t_len, q_len, a_len)))
print(*map(min, (t_len, q_len, a_len)))
print(*map(max, (t_len, q_len, a_len)))

In [None]:
def pad_seq(seq, pad_id, total_length):
    """Right-pad a list of token ids up to ``total_length``.

    Args:
        seq: list of token ids.
        pad_id: id appended as padding.
        total_length: target length; a sequence already at least this long
            is returned unpadded (and untruncated).

    Returns:
        dict with ``'seq'`` (the padded ids) and ``'mask'`` (1 for real
        tokens, 0 for padding).
    """
    n_tokens = len(seq)
    n_pad = total_length - n_tokens
    padded_ids = seq + [pad_id] * n_pad
    attention = [1] * n_tokens + [0] * n_pad
    return dict(seq=padded_ids, mask=attention)

def prepad_seq(seq, pad_id, total_length):
    """Left-pad a list of token ids up to ``total_length``.

    Args:
        seq: list of token ids.
        pad_id: id prepended as padding.
        total_length: target length; a sequence already at least this long
            is returned unpadded (and untruncated).

    Returns:
        dict with ``'seq'`` (padding followed by the ids) and ``'mask'``
        (0 for padding, 1 for real tokens).
    """
    n_tokens = len(seq)
    n_pad = total_length - n_tokens
    padded_ids = [pad_id] * n_pad + seq
    attention = [0] * n_pad + [1] * n_tokens
    return dict(seq=padded_ids, mask=attention)


import html
class QUEST_XLNet_Seg_Dataset(Dataset):
    """Dataset that tokenizes each input text column as its own segment.

    Every item is a dict mapping each input column to a list of token ids
    ending in ``[sep_token_id, cls_token_id]``; when ``training`` is true the
    item also carries the row's target values under ``'target'``.

    Args:
        df: DataFrame holding the input (and, for training, target) columns.
        tokenizer: tokenizer called as ``tokenizer(text, add_special_tokens=False)``
            and providing ``pad_token_id``, ``sep_token_id`` and ``cls_token_id``.
        max_lens: maximum token length per input column (special tokens
            included), aligned with ``input_cols``.
        device: device the collated tensors are moved to.
        total_pad: stored on the instance; not used by this class.
        input_cols: names of the text columns.
        target_cols: names of the label columns.
        training: whether items include ``'target'``.
        half: collate masks and targets as float16 instead of float32.
    """
    def __init__(self, df, tokenizer, max_lens, device='cpu', total_pad = 512, input_cols=input_cols, target_cols=target_cols, training=True, half=False):
        self.df = df
        self.input_cols = input_cols
        self.target_cols = target_cols
        self.is_training = training
        self.max_lens = max_lens
        self.tokenizer = tokenizer
        self.device = device
        self.total_pad = total_pad
        self.half = half

    def collate_fn(self, data):
        """Batch a list of items into left-padded tensors on ``self.device``.

        Returns:
            dict with, per input column, ``{'seq': LongTensor, 'mask': FloatTensor}``
            padded to the longest sequence of that column in the batch, plus
            ``'target'`` when the items carry targets. Items produced with
            ``training=False`` have no ``'target'`` key, in which case the
            batch simply omits it instead of raising ``KeyError``.
        """
        float_type = torch.float16 if self.half else torch.float32
        batched = {}
        for input_col in self.input_cols:
            # Pad on the left so the trailing [SEP, CLS] tokens stay at the end.
            max_len = max((len(x[input_col]) for x in data), default=0)
            padded = [prepad_seq(x[input_col], self.tokenizer.pad_token_id, max_len)
                      for x in data]
            batched[input_col] = {
                'seq': torch.tensor([p['seq'] for p in padded], dtype=torch.long).to(self.device),
                'mask': torch.tensor([p['mask'] for p in padded], dtype=float_type).to(self.device),
            }

        # Only build the target tensor when every item actually has targets.
        if all('target' in x for x in data):
            targets = [x['target'] for x in data]
            batched['target'] = torch.tensor(targets, dtype=float_type).to(self.device)

        return batched

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Tokenize row ``idx`` (positional) into per-column token id lists."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        rtn = {}
        for input_col, max_len in zip(self.input_cols, self.max_lens):
            col_val = html.unescape(self.df[input_col].iloc[idx])
            token_ids = self.tokenizer(col_val, add_special_tokens=False)['input_ids']
            # Reserve two positions for the trailing SEP and CLS tokens.
            rtn[input_col] = token_ids[:max_len - 2] + [self.tokenizer.sep_token_id, self.tokenizer.cls_token_id]
        if self.is_training:
            rtn['target'] = self.df[self.target_cols].iloc[idx].tolist()
        return rtn
        

In [None]:
# Demo dataset over the training frame; [64, 512, 512] are the maximum token
# lengths for question_title, question_body and answer respectively.
demo_dataset = QUEST_XLNet_Seg_Dataset(df_train, tokenizer, [64, 512, 512])

In [None]:
# Shuffled loader with batches of 8, collated by the dataset's own collate_fn.
demo_data_loader = DataLoader(dataset=demo_dataset, batch_size=8, shuffle=True, collate_fn=demo_dataset.collate_fn)

# Define model

A quick forward-pass experiment that chains three XLNet encoders through their memories (reference snippet only; it is not executed):

```python
xlnet_1 = XLNetModel.from_pretrained("xlnet-base-cased", use_mems_train=True).to('cuda')
xlnet_2 = XLNetModel.from_pretrained("xlnet-base-cased", use_mems_train=True).to('cuda')
xlnet_3 = XLNetModel.from_pretrained("xlnet-base-cased", use_mems_train=True).to('cuda')
logit_maker = nn.Linear(768 * 3, 30).to('cuda')

input1 = torch.randint(low=0, high=200, size=(8, 64)).to('cuda')
mask1 = torch.randint(low=0, high=2, size=(8, 64)).to('cuda')
xlnet_out1 = xlnet_1(input_ids=input1,
                        attention_mask=mask1)
xlnet_cls1 = xlnet_out1.last_hidden_state[:, -1, :]
xlnet_mem1 = xlnet_out1.mems
input2 = torch.randint(low=0, high=200, size=(8, 512)).to('cuda')
mask2 = torch.randint(low=0, high=2, size=(8, 512)).to('cuda')
xlnet_out2 = xlnet_2(input_ids=input2,
                        attention_mask=mask2,
                        mem=xlnet_mem1)
x1net_cls2 = xlnet_out2.last_hidden_state[:, -1, :]
xlnet_mem2 = xlnet_out2.mems
input3 = torch.randint(low=0, high=200, size=(8, 512)).to('cuda')
mask3 = torch.randint(low=0, high=2, size=(8, 512)).to('cuda')
xlnet_out3 = xlnet_3(input_ids=input3,
                        attention_mask=mask3,
                        mem=xlnet_mem2)
x1net_cls3 = xlnet_out3.last_hidden_state[:, -1, :]
cls_unified = torch.cat([xlnet_cls1, x1net_cls2, x1net_cls3],dim=1)
print(cls_unified.size())
logits = logit_maker(cls_unified)
print(logits.size())
```



In [None]:
 from collections import namedtuple
class XLNetForSequenceRegressionPyramid(nn.Module):
    """Three chained XLNet encoders (title -> question -> answer) with a regression head.

    Each segment is encoded by its own XLNet; the memory returned by one
    encoder is handed to the next one. For every segment the hidden state at
    the last position and a masked mean of all hidden states are extracted;
    the six vectors are concatenated and passed through
    ``dense_1 -> ELU -> dropout -> dense_2`` to produce ``num_labels`` logits.
    """
    def __init__(self, num_labels=30):
        super(XLNetForSequenceRegressionPyramid, self).__init__()
        self.num_labels = num_labels
        self.xlnet_t = XLNetModel.from_pretrained("xlnet-base-cased", use_mems_train=True)
        self.xlnet_q = XLNetModel.from_pretrained("xlnet-base-cased", use_mems_train=True)
        self.xlnet_a = XLNetModel.from_pretrained("xlnet-base-cased", use_mems_train=True)
        hidden_dim = self.xlnet_a.d_model
        self.hidden_dim = hidden_dim
        num_layers = self.xlnet_a.n_layer
        # dense_states is not used by forward(); it is kept so the parameter
        # set (and any saved state dict) stays unchanged.
        self.dense_states = nn.Linear(num_layers * hidden_dim, hidden_dim)
        # Six pooled vectors (last-position + masked mean for each of 3 segments).
        self.dense_1 = nn.Linear(hidden_dim * 6, hidden_dim)
        self.dense_2 = nn.Linear(hidden_dim, num_labels)
        self.activation = nn.ELU()
        self.dropout = nn.Dropout(0.5)
        self.seq_dropout = nn.Dropout(0.1)
        self.loss_fct = nn.BCEWithLogitsLoss()
        self.SequenceRegressorOutput = namedtuple('SequenceRegressorOutput',
                                             ['loss', 'logits', 'hidden_states', 'attentions'])

    def _encode_segment(self, encoder, seq, mask, mems=None):
        """Encode one segment and pool it.

        Args:
            encoder: the XLNet encoder to run.
            seq: token ids, indexed as (batch, position).
            mask: float attention mask with the same two dimensions as ``seq``.
            mems: memory from the previous segment's encoder, or ``None``.

        Returns:
            ``(encoder_output, last_position_vector, masked_mean_vector)``.
        """
        # The keyword must be `mems`: XLNetModel.forward has no `mem`
        # parameter, so a misspelled keyword never reaches the attention layers.
        out = encoder(input_ids=seq, attention_mask=mask, mems=mems)
        cls_vec = out.last_hidden_state[:, -1, :]
        # Dropout on the mask randomly removes positions from the average
        # during training; the rescaling it applies cancels in the division.
        dropped_mask = self.seq_dropout(mask)
        summed = torch.sum(out.last_hidden_state * dropped_mask.unsqueeze(2).expand(-1, -1, self.hidden_dim), 1)
        mask_sum = dropped_mask.sum(1)
        mask_sum[mask_sum == 0] = 1.0  # avoid division by zero for fully masked rows
        avg_vec = summed / mask_sum.unsqueeze(1).expand(-1, self.hidden_dim)
        return out, cls_vec, avg_vec

    def forward(self, seq_t, mask_t, seq_q, mask_q, seq_a, mask_a, labels=None):
        """Run title, question and answer through the encoder chain.

        Args:
            seq_t, seq_q, seq_a: token ids of the title / question body / answer.
            mask_t, mask_q, mask_a: matching float attention masks.
            labels: optional targets with the same shape as the logits; when
                given, a BCE-with-logits loss is computed.

        Returns:
            ``SequenceRegressorOutput(loss, logits, hidden_states, attentions)``
            where the last two fields come from the answer encoder's output.
        """
        xlnet_out_t, xlnet_cls_t, xlnet_avg_t = self._encode_segment(
            self.xlnet_t, seq_t, mask_t)
        xlnet_out_q, xlnet_cls_q, xlnet_avg_q = self._encode_segment(
            self.xlnet_q, seq_q, mask_q, mems=xlnet_out_t.mems)
        xlnet_out_a, xlnet_cls_a, xlnet_avg_a = self._encode_segment(
            self.xlnet_a, seq_a, mask_a, mems=xlnet_out_q.mems)

        cls_unified = torch.cat([xlnet_cls_t,
                                 xlnet_avg_t,
                                 xlnet_cls_q,
                                 xlnet_avg_q,
                                 xlnet_cls_a,
                                 xlnet_avg_a], dim=1)

        x = self.dense_1(cls_unified)
        x = self.activation(x)
        x = self.dropout(x)
        logits = self.dense_2(x)

        loss = None
        if labels is not None:
            loss = self.loss_fct(logits, labels)
        return self.SequenceRegressorOutput(
            loss=loss,
            logits=logits,
            hidden_states=xlnet_out_a.hidden_states,
            attentions=xlnet_out_a.attentions,
        )

    def get_encoder_classifier_params(self):
        """Split parameters into encoder and head groups by parameter name.

        Returns:
            dict with ``'encoder_params'`` (parameters whose name contains
            ``xlnet_t``, ``xlnet_q`` or ``xlnet_a``) and ``'classifier_params'``
            (all remaining parameters), each in ``named_parameters()`` order.
        """
        xlnet_param_names = ['xlnet_t', 'xlnet_q', 'xlnet_a']
        xlnet_params = []
        classifier_params = []
        for name, param in self.named_parameters():
            if any(key in name for key in xlnet_param_names):
                xlnet_params.append(param)
            else:
                classifier_params.append(param)
        return {'encoder_params':xlnet_params,
                "classifier_params":classifier_params}

    def get_device(self):
        """Return the device of the title encoder's word-embedding weight."""
        return self.xlnet_t.word_embedding.weight.device

    def freeze_title_encoder(self):
        """Disable gradients for every parameter of the title encoder."""
        for param in self.xlnet_t.parameters():
            param.requires_grad=False

    def freeze_question_encoder(self):
        """Disable gradients for every parameter of the question encoder."""
        for param in self.xlnet_q.parameters():
            param.requires_grad = False

# Define utility functions

In [None]:
from scipy.stats import spearmanr
def compute_spearman(y_true, y_pred):
    """Column-wise Spearman correlation between targets and predictions.

    Both inputs are 2-D arrays with one column per label. Values are rounded
    to 6 decimals and tiny Gaussian noise (std 1e-7) is added to both sides
    before ranking, so a constant column does not yield NaN. If a column
    still yields NaN, it is recomputed once with fresh noise.

    Returns:
        ``(per_column_correlations, mean_correlation)``.
    """
    n_cols = y_true.shape[1]
    y_true = np.round(y_true, 6)
    y_pred = np.round(y_pred, 6)
    n_rows = y_pred.shape[0]

    def jittered_rho(col_idx):
        # Noise for y_true is drawn first, then for y_pred.
        noisy_true = y_true[:, col_idx] + np.random.normal(0, 1e-7, n_rows)
        noisy_pred = y_pred[:, col_idx] + np.random.normal(0, 1e-7, n_rows)
        return spearmanr(noisy_true, noisy_pred).correlation

    scores = []
    for col_idx in range(n_cols):
        rho = jittered_rho(col_idx)
        if np.isnan(rho):
            rho = jittered_rho(col_idx)
        scores.append(rho)
    return np.array(scores), sum(scores) / len(scores)

In [None]:
import gc
def clear_mem(model_name='model'):
    """Drop the global model and all global CUDA tensors, then free cached GPU memory.

    The names are removed from this module's global namespace itself.
    Checking ``locals()`` inside the function or ``del``-ing a local alias
    would leave the global references alive, so nothing would be released.

    Args:
        model_name: name of the global variable holding the model to delete;
            nothing happens for it if no such global exists.
    """
    namespace = globals()
    if model_name in namespace:
        print('deleting model...')
        del namespace[model_name]
    # Snapshot the items first: entries are deleted while scanning.
    for name, value in list(namespace.items()):
        if torch.is_tensor(value) and value.is_cuda:
            print(name)
            del namespace[name]
    gc.collect()
    torch.cuda.empty_cache()
    

In [None]:
# Run the cleanup helper once before training starts.
clear_mem()

# Train

In [None]:
from sklearn.model_selection import KFold
import tqdm.notebook as tqdm
from scipy.stats import spearmanr


def _forward_batch(model, batch_data):
    """Feed one collated batch (title / body / answer segments plus targets) to ``model``."""
    return model(seq_t=batch_data['question_title']['seq'],
                 mask_t=batch_data['question_title']['mask'],
                 seq_q=batch_data['question_body']['seq'],
                 mask_q=batch_data['question_body']['mask'],
                 seq_a=batch_data['answer']['seq'],
                 mask_a=batch_data['answer']['mask'],
                 labels=batch_data['target'])


def fold_train(model, train_dataset, valid_dataset, optimizer, max_epoch,
               batch_schedule=(4, 2, 1), lr_decay=0.1, clip=50.0, patience=2):
    """Train ``model`` on one cross-validation fold with per-epoch validation.

    Each epoch trains over ``train_dataset``, steps a ReduceLROnPlateau
    scheduler on the mean training loss, then evaluates on ``valid_dataset``
    and prints per-label and mean Spearman correlations. Training stops early
    once the mean correlation has been below the best value seen so far
    ``patience`` times.

    Args:
        model: module called with ``seq_*`` / ``mask_*`` / ``labels`` keywords
            and returning an object with ``loss`` and ``logits``.
        train_dataset, valid_dataset: datasets exposing ``collate_fn``.
        optimizer: optimizer over the model's parameters.
        max_epoch: maximum number of epochs.
        batch_schedule: batch sizes; only entry 0 is currently used (for both
            the training and validation loaders).
        lr_decay: multiplicative factor for the learning-rate scheduler.
        clip: gradient-clipping norm; currently unused because clipping is
            disabled below.
        patience: number of non-improving epochs tolerated before stopping.

    NOTE(review): the patience counter is never reset after an improvement,
    so it counts non-improving epochs in total rather than consecutively —
    confirm this is intended. No checkpoint is written; saving is commented out.
    """
    bi = 0  # index into batch_schedule; never advanced in this function
    valid_data_loader = DataLoader(dataset=valid_dataset, batch_size=batch_schedule[0],
                                   shuffle=False, collate_fn=valid_dataset.collate_fn)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,patience=2, factor=lr_decay,verbose=True)
    best_avg_valid_loss = float('inf')
    best_spearmanr = float('-inf')
    patience_count = patience
    for epoch in tqdm.trange(max_epoch, desc="training", unit="epoch"):
        # ---- training pass ----
        total_loss = 0.0
        final_avg_loss = 0.0
        train_data_loader = DataLoader(dataset=train_dataset, batch_size=batch_schedule[bi],
                                       shuffle=True, collate_fn=train_dataset.collate_fn)
        with tqdm.tqdm(train_data_loader,desc="epoch {} train".format(epoch + 1),
                  unit="batch",total=len(train_data_loader)) as train_batch_iterator:
            model.train()
            for i, batch_data in enumerate(train_batch_iterator, start=1):
                optimizer.zero_grad()
                loss = _forward_batch(model, batch_data).loss
                total_loss += loss.item()
                loss.backward()
                # _ = nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()
                train_batch_iterator.set_postfix(mean_loss=total_loss / i, current_loss=loss.item())
                final_avg_loss = total_loss / i
            # The scheduler monitors the mean *training* loss of the epoch.
            scheduler.step(final_avg_loss)

        # ---- validation pass ----
        total_valid_loss = 0.0
        final_avg_valid_loss = 0.0
        true_labels = []
        pred_labels = []
        with torch.no_grad():
            model.eval()
            with tqdm.tqdm(valid_data_loader,desc="epoch {} valid".format(epoch + 1),
                      unit="batch",total=len(valid_data_loader),leave=False) as valid_batch_iterator:

                for i, batch_data in enumerate(valid_batch_iterator, start=1):
                    loss_data = _forward_batch(model, batch_data)
                    loss = loss_data.loss
                    total_valid_loss += loss.item()
                    valid_batch_iterator.set_postfix(mean_loss=total_valid_loss / i, current_loss=loss.item())
                    final_avg_valid_loss = total_valid_loss / i
                    pred_labels.append(loss_data.logits.sigmoid().to('cpu').numpy())
                    true_labels.append(batch_data['target'].to('cpu').numpy())
        true_labels = np.concatenate(true_labels, axis=0)
        pred_labels = np.concatenate(pred_labels, axis=0)
        # Arguments follow compute_spearman's (y_true, y_pred) signature.
        sp_correlation = compute_spearman(true_labels, pred_labels)

        np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
        print("spearmanr scores:", sp_correlation[0])
        sorted_ind = np.argsort(sp_correlation[0])
        sorted_labels = [target_cols[ind] for ind in sorted_ind]
        print("predicted features from worst to best:")
        for r, (lb,sc) in enumerate(zip(sorted_labels, sp_correlation[0][sorted_ind])):
            print(f"\t{r}. {lb}: {sc}")
        np.set_printoptions()  # restore numpy's default print options

        print(f"Validation results for epoch #{epoch + 1}: average_loss={final_avg_valid_loss}, spearman_rho={sp_correlation[-1]}")
        if sp_correlation[-1] < best_spearmanr:
            patience_count -= 1
        elif sp_correlation[-1] > best_spearmanr:
            # filepath = f"./best_fold{fold_num}.pt"
            print("Saving this model...")
            # filepath, best_model_info = save_model(model, filepath,
            #                                        avg_valid_loss=final_avg_valid_loss,
            #                                        spearmanr_corr=sp_correlation[-1])

        best_spearmanr = max(sp_correlation[-1], best_spearmanr)
        if patience_count == 0:
            print("Early Stopping: the average spearmanr did not improve.")
            break

# ---- Cross-validation driver ----
# Number of folds for K-fold cross validation.
n_fold=6

# Shuffled K-fold split with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=n_fold, shuffle=True, random_state=42)
max_epoch = 30
fold_num = 1
# NOTE(review): batch_size is not passed to fold_train below, which uses its
# own batch_schedule default — confirm whether it should be wired in.
batch_size = 4
for train_index, valid_index in tqdm.tqdm(kf.split(df_train), desc="Cross Validation", unit="fold", total=n_fold):
    print('Fold {} starting...'.format(fold_num))
    # The cleanup helper is called before building and after training each fold's model.
    clear_mem()
    # A fresh model is created for every fold.
    model = XLNetForSequenceRegressionPyramid()
    # model.half()
    # model.unfreeze_decoder()
    model = model.cuda()
    # model.freeze_decoder()
    # The datasets move their collated batches to the model's device.
    device_used = model.get_device()
    # model.freeze_title_encoder()
    # model.freeze_question_encoder()
    # [64, 512, 512] are the max token lengths for title, question body and answer.
    train_dataset = QUEST_XLNet_Seg_Dataset(df_train.iloc[train_index], tokenizer, [64, 512, 512],device=device_used)
    valid_dataset = QUEST_XLNet_Seg_Dataset(df_train.iloc[valid_index], tokenizer, [64, 512, 512], device=device_used)
    param_dict = model.get_encoder_classifier_params()
    xlnet_params = param_dict['encoder_params']
    classifier_params = param_dict['classifier_params']
    # Two parameter groups: encoders use the base lr (2e-5), the head uses 1e-4.
    optimizer = AdamW([{'params': xlnet_params}, {'params': classifier_params, 'lr': 1e-4}], lr=2e-5, weight_decay=0.01, betas=(0.5, 0.999), correct_bias=True)
    fold_train(model, train_dataset, valid_dataset, optimizer, max_epoch)
    clear_mem()
    fold_num += 1