In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Installs the Hugging Face `nlp` package, which the import cell below loads.
# NOTE(review): the version is not pinned, so the installed release can change between sessions.
!pip install nlp

In [None]:
# Upgrades `transformers` to the newest available release before it is imported.
# NOTE(review): unpinned upgrade - results may differ between sessions as new releases appear.
!pip install --upgrade transformers

In [None]:
# import numpy as np
from sklearn.utils import shuffle
# import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.express as px

import transformers
import tokenizers

# Hugging Face new library for datasets (https://huggingface.co/nlp/)
import nlp
import time

import torch
# import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

## 1. Loading Data

### 1. Original Dataset

In [None]:
original_train = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")

# Shuffle with a fixed seed so the train/validation split is identical on every run,
# then hold out the first fifth of the rows for validation.
original_train = shuffle(original_train, random_state=42)
n_valid = len(original_train) // 5
original_valid = original_train[:n_valid]
original_train = original_train[n_valid:]

# Cell output: (number of training rows, number of validation rows).
original_train.shape[0], original_valid.shape[0]

In [None]:
# Report the size of the training split, then show its first two rows as the cell output.
print(f"original - training: {original_train.shape[0]} examples")
original_train.iloc[:2]

### 2. MNLI Multilingual Data

In [None]:
# mnli_multi_train = pd.read_csv("../input/mnli-multilanguage-dataset/mnli_train_translated.csv")

# mnli_multi_train = mnli_multi_train.dropna(how='any')
# mnli_multi_train = mnli_multi_train[mnli_multi_train.lang.isin(['ar', 'en', 'de', 'es', 'fr', 'th', 'ru'])]
# mnli_multi_train.shape[0]

In [None]:
# rand_idx = np.random.choice(mnli_multi_train.shape[0], int(0.2*mnli_multi_train.shape[0]), replace=False)
# mnli_multi_train = mnli_multi_train.iloc[rand_idx]
# mnli_multi_train.shape[0]

## 2. Data Loader

In [None]:
def _build_dataset_from_list(ds_list):
    df = pd.concat([d[['premise', 'hypothesis', 'label']] for d in ds_list])
    df = df[df['label'].isin([0,1,2])]
#     df_valid = pd.concat([d.loc[['premise', 'hypothesis', 'label']] for d in ds_valid_list])
    return df

def _get_sentence_pairs(df):
    return df['premise'].tolist(), df['hypothesis'].tolist()

In [None]:
class MultiLangDataset(Dataset):
    """Pre-tokenized premise/hypothesis pairs with labels for NLI training.

    All examples are tokenized once in ``__init__`` and padded/truncated to
    ``max_len`` tokens, so indexing only slices tensors that already exist.

    Args:
        ds_list: Iterable of DataFrames with ``premise``, ``hypothesis`` and
            ``label`` columns.
        model_name: Not used by this class; kept so existing callers keep working.
        tokenizer: Callable accepting ``text``/``text_pair`` and returning a
            mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every sequence is padded or truncated to.
        device: Optional device the tensors are moved to right after creation.
    """
    def __init__(self, ds_list, model_name, tokenizer, max_len=300, device=None):
        df = _build_dataset_from_list(ds_list)

        self.tokenizer = tokenizer
        text, text_pair = _get_sentence_pairs(df)
        encoded = self.tokenizer(text=text, text_pair=text_pair,
                                 return_tensors='pt',
                                 max_length=max_len,
                                 padding='max_length',
                                 truncation=True,
                                 )
        self.tokenized_ids = encoded['input_ids']
        self.tokenized_att = encoded['attention_mask']
        self.targets = torch.tensor(df['label'].values)
        self.len = df.shape[0]

        if device:
            # Previously this touched a non-existent ``self.tokenized`` attribute
            # and raised AttributeError; reuse ``to`` so every tensor is moved.
            self.to(device)

    def __len__(self):
        """Number of examples in the dataset."""
        return self.len

    def __getitem__(self, idx):
        """Return ``((input_ids, attention_mask), target)`` for ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return ((self.tokenized_ids[idx], self.tokenized_att[idx]), self.targets[idx])

    def to(self, device):
        """Move all stored tensors to ``device`` and return ``self`` for chaining."""
        self.tokenized_ids = self.tokenized_ids.to(device)
        self.tokenized_att = self.tokenized_att.to(device)
        self.targets = self.targets.to(device)
        return self

## 3. Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
sns.set(style="darkgrid")
# %matplotlib inline
import time
import pylab
from IPython import display as dsp

def update_metrics_fig(epoch_df):
    """Redraw the accuracy and loss curves from the per-epoch log table.

    Args:
        epoch_df: DataFrame with ``epoch``, ``type`` and ``freezed`` columns
            plus the metric columns ``accuracy`` and ``loss``.

    The previous cell output is cleared first, so calling this once per epoch
    updates the plot in place instead of stacking figures.
    """
    # Reshape to long form: one row per (epoch, type, freezed, metric) with its value.
    epoch_long = (epoch_df.set_index(['epoch', 'type', 'freezed'])
                  .stack()
                  .reset_index()
                  .rename(columns={'level_3': 'metric', 0: 'value'}))

    # Only two metrics are plotted, so only two panels are created
    # (the old layout left a third, empty panel).
    fig, (ax_acc, ax_loss) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

    sns.lineplot(x="epoch", y="value", hue='type', markers=True,
                 data=epoch_long[epoch_long['metric'] == 'accuracy'],
                 ax=ax_acc)
    sns.lineplot(x="epoch", y="value", hue='type', markers=True,
                 data=epoch_long[epoch_long['metric'] == 'loss'],
                 ax=ax_loss)
    ax_acc.set_title('Accuracy', fontsize=18, pad=10)
    ax_loss.set_title('Average Loss', fontsize=18, pad=10)

    # Clear output and re-plot
    dsp.clear_output(wait=True)
    plt.show()
    # Release the figure so repeated per-epoch calls do not accumulate open figures.
    plt.close(fig)

## 4. Training Function

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy with logits against one-hot encoded 3-class targets.

    ``targets`` holds class indices; they are expanded to one-hot float vectors
    and compared element-wise with ``outputs`` (mean reduction).
    """
    one_hot_targets = torch.nn.functional.one_hot(targets.long(), num_classes=3)
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, one_hot_targets.float())

In [None]:
def train_model_gpu(model, train, valid, epochs, loss_fn,
                    batch_size=36, warmup_ratio=0.3, lr=1e-5,
                    train_ratio=1.0, valid_ratio=1.0, gradient_accumulation=2,
                    device='cuda', logs_df=None, num_labels=3, scheduler=False):
    """Fine-tune ``model`` with SGD, logging and plotting metrics after every epoch.

    The base model starts frozen, so only the remaining parameters receive
    gradients; it is unfrozen once 70% of the requested epochs have completed.
    After the last epoch the weights are saved to ``xlm_roberta_model.bin``.

    Args:
        model: Model exposing ``base_model``. It is called as
            ``model(input_ids, attention_mask)`` and the first element of its
            output is used as the logits.
        train: Dataset with a ``to(device)`` method whose items are
            ``((input_ids, attention_mask), target)``.
        valid: Validation dataset with the same interface as ``train``.
        epochs: Number of epochs to run in this call.
        loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar loss.
        batch_size: Batch size of both data loaders.
        warmup_ratio: Fraction of all optimizer steps used for linear warmup
            (only relevant when ``scheduler`` is truthy).
        lr: Learning rate passed to SGD.
        train_ratio: Fraction of training batches randomly used in each epoch.
        valid_ratio: Fraction of validation batches randomly used in each epoch.
        gradient_accumulation: Number of batches whose gradients are accumulated
            before each optimizer step.
        device: Device the datasets and the model are moved to.
        logs_df: Optional log table from an earlier call; epoch numbering
            continues after its largest ``epoch`` value.
        num_labels: Number of classes, used to reshape logits during evaluation.
        scheduler: Truthy to use a linear warmup/decay learning-rate schedule.

    Returns:
        DataFrame with one ``train`` and one ``valid`` row per epoch and the
        columns ``epoch``, ``type``, ``freezed``, ``accuracy`` and ``loss``
        (loss is the mean of the per-batch losses).

    Raises:
        ValueError: If ``train_ratio`` exceeds 1.0 or ``gradient_accumulation``
            is smaller than 1.
    """
    if train_ratio > 1.0:
        raise ValueError("Train ratio value cannot be larger than 1.0 .")
    if gradient_accumulation < 1:
        raise ValueError("gradient_accumulation must be a positive integer.")

    torch.manual_seed(42)
    np.random.seed(42)
    time_glob_start = time.perf_counter()

    # The datasets hold pre-tokenized tensors; move them to the device once up front.
    train.to(device)
    valid.to(device)

    train_loader = torch.utils.data.DataLoader(
        train,
        batch_size=batch_size,
        shuffle=True)

    valid_loader = torch.utils.data.DataLoader(
        valid,
        batch_size=batch_size,
        shuffle=False,
        drop_last=True)

    # Start with the base model frozen; it is unfrozen later in the epoch loop.
    for param in model.base_model.parameters():
        param.requires_grad = False
    freezed = True
    model = model.to(device)

    # Biases and LayerNorm parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.0001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    optimizer = optim.SGD(optimizer_grouped_parameters, lr=lr)

    # The schedule is defined over optimizer steps, i.e. one step per group of
    # `gradient_accumulation` batches (ceil division covers a trailing partial group).
    batches_per_epoch = int(train_ratio * len(train_loader))
    steps_per_epoch = -(-batches_per_epoch // gradient_accumulation)
    num_train_steps = int(steps_per_epoch * epochs)
    if scheduler:
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(warmup_ratio * num_train_steps),
            num_training_steps=num_train_steps)
    else:
        scheduler = None

    # Continue epoch numbering after an existing log instead of restarting at 1.
    if logs_df is None:
        logs_df = pd.DataFrame(columns=['epoch', 'type', 'freezed', 'accuracy', 'loss'])
    if logs_df.empty:
        epoch_start = 1
    else:
        epoch_start = int(logs_df['epoch'].max()) + 1
    log_records = logs_df.to_dict('records')

    def train_loop_fn(loader, model, optimizer, freezed,
                      train_ratio, gradient_accumulation,
                      scheduler=None, epoch=None):
        """Run one training epoch; return (accuracy, mean batch loss) over the used batches."""
        model.train()
        # Random subset of batch positions used this epoch (a set gives O(1) membership tests).
        selected = set(np.random.choice(len(loader), int(train_ratio * len(loader)), replace=False).tolist())
        loss_sum, n_batches, n_correct, n_seen = 0.0, 0, 0, 0
        pending = 0  # batches accumulated since the last optimizer step

        def apply_step():
            optimizer.step()
            if scheduler is not None:
                scheduler.step()
            optimizer.zero_grad()

        optimizer.zero_grad()
        for x, (inputs, targets) in enumerate(loader):
            if x not in selected:
                continue
            outputs = model(inputs[0], inputs[1])[0]
            loss = loss_fn(outputs, targets)
            # Scale so the accumulated gradient is the average over the group.
            (loss / gradient_accumulation).backward()
            pending += 1
            if pending == gradient_accumulation:
                apply_step()
                pending = 0

            loss_sum += loss.item()
            n_batches += 1
            n_correct += (outputs.argmax(-1) == targets).sum().item()
            n_seen += targets.numel()
            print("Training... Epoch: {:,}\t Is freezed: {}\tbatch {:,}/{:,}\tOptimizer LRs: {}\tBatch Loss:{}"
                  .format(epoch, freezed, x, len(loader),
                          '\t'.join([str(pg['lr']) for pg in optimizer.param_groups]), loss.item()
                          ), end='\r')

        if pending:
            # Flush the gradients of a trailing, incomplete accumulation group.
            apply_step()

        if n_batches == 0:
            return float('nan'), float('nan')
        return 1.0 * n_correct / n_seen, loss_sum / n_batches

    def eval_loop_fn(loader, model, valid_ratio=1.0):
        """Evaluate on a random subset of batches; return (accuracy, mean batch loss)."""
        selected = set(np.random.choice(len(loader), int(1.0 * valid_ratio * len(loader)), replace=False).tolist())
        total_samples, correct, loss_sum, n_batches = 0, 0, 0.0, 0
        model.eval()
        time_start = time.perf_counter()
        with torch.no_grad():
            for i, (inputs, targets) in enumerate(loader):
                if i not in selected:
                    continue
                elapsed = time.perf_counter() - time_start
                # Rough estimate: average time per finished batch times batches left.
                remaining = elapsed / n_batches * (len(selected) - n_batches) if n_batches else float('nan')
                print("Evaluation Step {:,} out of {:,} -\tTime Passed: {:.1f}s, Remaining: {:.1f}s"
                      .format(i, len(loader), elapsed, remaining),
                      end="\r")
                outputs = model(inputs[0], inputs[1])[0]
                loss_sum += loss_fn(outputs.view(-1, num_labels), targets.view(-1)).item()
                preds = outputs.argmax(1)
                correct += (targets == preds).sum().item()
                total_samples += preds.shape[0]
                n_batches += 1
        model.train()
        if n_batches == 0:
            # Nothing was evaluated (empty loader or very small valid_ratio).
            return float('nan'), float('nan')
        # loss_fn already averages within a batch, so average over batches here
        # to stay comparable with the training loss.
        return 1.0 * correct / total_samples, loss_sum / n_batches

    # Train and eval loops
    for epoch in range(epoch_start, epochs + epoch_start):
        train_acc, train_loss = train_loop_fn(train_loader, model=model,
                                              optimizer=optimizer,
                                              train_ratio=train_ratio,
                                              gradient_accumulation=gradient_accumulation,
                                              freezed=freezed, scheduler=scheduler, epoch=epoch)
        valid_acc, valid_loss = eval_loop_fn(valid_loader, model=model,
                                             valid_ratio=valid_ratio)
        # Updating metrics (DataFrame.append no longer exists in pandas 2.x,
        # so the table is rebuilt from the accumulated records).
        log_records.append(dict(
            epoch=epoch, type='train', freezed=freezed, accuracy=train_acc, loss=train_loss))
        log_records.append(dict(
            epoch=epoch, type='valid', freezed=freezed, accuracy=valid_acc, loss=valid_loss))
        logs_df = pd.DataFrame(log_records)
        # Plotting Results
        update_metrics_fig(logs_df)

        # Unfreeze the base model once 70% of this call's epochs are done.
        epochs_done = epoch - epoch_start + 1
        if freezed and (1.0 * epochs_done / epochs) >= 0.7:
            for param in model.base_model.parameters():
                param.requires_grad = True
            freezed = False

    # save our model
    torch.save(model.state_dict(), "xlm_roberta_model.bin")

    print('')
    print('Training Finished!\nTotal Time Elapsed: {}'
          .format(int(1.0*(time.perf_counter()-time_glob_start)/60)))
    display(logs_df)
    return logs_df

## 5. Main Run

### 5.1. Tokenization

In [None]:
# Tokenizer belonging to the "joeddav/xlm-roberta-large-xnli" checkpoint; the datasets
# built in the following cells use it to encode premise/hypothesis pairs.
tokenizer = transformers.AutoTokenizer.from_pretrained("joeddav/xlm-roberta-large-xnli")

In [None]:
# Tokenize the training split once; tensors stay on CPU (device=None) at this point.
# NOTE(review): `model_name` is accepted by MultiLangDataset.__init__ but never read there,
# so "xlm-roberta-base" has no effect - tokenization is determined by `tokenizer` alone.
train_ds = MultiLangDataset(ds_list=[original_train], 
                            tokenizer=tokenizer, 
                            model_name="xlm-roberta-base", 
                            device=None)

In [None]:
# The MNLI frame only exists when the optional loading cells above are enabled;
# guard the delete so a fresh "Run All" does not stop here with a NameError.
if 'mnli_multi_train' in globals():
    del mnli_multi_train

In [None]:
# Tokenize the held-out validation split the same way as the training split.
# NOTE(review): `model_name` is not read by MultiLangDataset.__init__; only `tokenizer` matters.
valid_ds = MultiLangDataset(ds_list=[original_valid], 
                            tokenizer=tokenizer, 
                            model_name="xlm-roberta-base", 
                            device=None)

### 5.2. Loading XLM-Roberta Model

In [None]:
# Sequence-classification model from the same checkpoint as the tokenizer, configured for
# 3 labels. It is not moved to a device here (the `.to(device)` call is commented out).
model = transformers.AutoModelForSequenceClassification.from_pretrained("joeddav/xlm-roberta-large-xnli", 
                                                                        output_hidden_states=False, 
                                                                        num_labels=3)#.to(device)

In [None]:
PATH = '/kaggle/input/my-dear-watson/xlm_roberta_multi_model_3.bin'
# The model has not been moved to a device yet, so map the checkpoint tensors to CPU;
# this also lets a checkpoint that was saved from a GPU load in a CPU-only session.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
model.load_state_dict(torch.load(PATH, map_location='cpu'))

### 5.3. Training

In [None]:
# Fine-tune for 2 epochs with batch size 16 and the warmup scheduler enabled
# (warmup_ratio=0.05). All training batches are used each epoch (train_ratio=1.0), while
# validation runs on a random 20% of the validation batches (valid_ratio=0.2).
logs_df = train_model_gpu(model=model, train=train_ds, valid=valid_ds, epochs=2, loss_fn=loss_fn,
                          batch_size=16, lr=1e-5, scheduler=True,
                          train_ratio=1.0, valid_ratio=0.2,
                          warmup_ratio=0.05)

## 6. Sample Submission

In [None]:
def _build_submission_dataset_from_list(ds_list):
    df = pd.concat([d[['premise', 'hypothesis']] for d in ds_list])
#     df_valid = pd.concat([d.loc[['premise', 'hypothesis', 'label']] for d in ds_valid_list])
    return df

class SubmissionDataset(Dataset):
    """Pre-tokenized, unlabeled premise/hypothesis pairs for inference.

    All examples are tokenized once in ``__init__`` and padded/truncated to
    ``max_len`` tokens; items carry no target.

    Args:
        ds_list: Iterable of DataFrames with ``premise`` and ``hypothesis`` columns.
        tokenizer: Callable accepting ``text``/``text_pair`` and returning a
            mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every sequence is padded or truncated to.
        device: Optional device the tensors are moved to right after creation.
    """
    def __init__(self, ds_list, tokenizer, max_len=300, device=None):
        df = _build_submission_dataset_from_list(ds_list)

        self.tokenizer = tokenizer
        text, text_pair = _get_sentence_pairs(df)
        encoded = self.tokenizer(text=text, text_pair=text_pair,
                                 return_tensors='pt',
                                 max_length=max_len,
                                 padding='max_length',
                                 truncation=True,
                                 )
        self.tokenized_ids = encoded['input_ids']
        self.tokenized_att = encoded['attention_mask']
        self.len = df.shape[0]
        if device:
            # Previously this touched a non-existent ``self.tokenized`` attribute
            # and raised AttributeError; reuse ``to`` so every tensor is moved.
            self.to(device)

    def __len__(self):
        """Number of examples in the dataset."""
        return self.len

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return (self.tokenized_ids[idx], self.tokenized_att[idx])

    def to(self, device):
        """Move all stored tensors to ``device`` and return ``self`` for chaining."""
        self.tokenized_ids = self.tokenized_ids.to(device)
        self.tokenized_att = self.tokenized_att.to(device)
        return self

In [None]:
def predict(ds, model, batch_size=24, device='cuda'):
    """Return the predicted class index for every item of ``ds``, in dataset order.

    Args:
        ds: Dataset with a ``to(device)`` method whose items are
            ``(input_ids, attention_mask)``.
        model: Called as ``model(input_ids, attention_mask)``; the first element
            of its output is used as the logits. It is switched to eval mode.
        batch_size: Number of items per forward pass.
        device: Device the model and the dataset are moved to.

    Returns:
        List of ints, one argmax over dimension 1 of the logits per item.
    """
    model.to(device)
    ds.to(device)
    loader = torch.utils.data.DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False)
    n_batches = len(loader)
    predictions = []
    model.eval()
    with torch.no_grad():
        time_start = time.perf_counter()
        for i, inputs in enumerate(loader):
            elapsed = time.perf_counter() - time_start
            # `i` batches are finished at this point; extrapolate their average duration
            # to the batches still to run (unknown before the first batch completes).
            remaining = elapsed / i * (n_batches - i) if i else float('nan')
            print("Evaluation Step {:,} out of {:,} -\tTime Passed: {:.1f}s, Remaining: {:.1f}s"
                  .format(i, n_batches, elapsed, remaining),
                  end="\r")
            outputs = model(inputs[0], inputs[1])[0]
            predictions.extend(outputs.argmax(1).cpu().numpy().tolist())
    return predictions

### 6.1. Loading Submission Data

In [None]:
submission_df = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
print(f"original - test: {len(submission_df)} examples")
# nunique() counts the distinct language codes; the previous len(unique().shape)
# measured the length of a 1-element shape tuple and therefore always printed 1.
print("Number of languages: {}".format(submission_df.lang_abv.nunique()))
submission_df.head(2)

### 6.2. Prediction

In [None]:
# Tokenize the test set with the same tokenizer used for training; tensors stay on CPU
# (device=None) until `predict` moves the dataset to its device.
submission_ds = SubmissionDataset(ds_list=[submission_df],
                                 tokenizer=tokenizer, 
                                 device=None)

In [None]:
# `predict` iterates the dataset without shuffling, so the returned list lines up
# row-for-row with `submission_df`.
submission_df['prediction'] = predict(submission_ds, model)

In [None]:
# Write the id/prediction pairs to the submission file, without the DataFrame index.
submission_df.loc[:, ['id', 'prediction']].to_csv('./submission.csv', index=False)