# DL Apps Class Project (Kaggle track)
### Competition: Contradictory, My Dear Watson

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import random
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pickle
import matplotlib.pyplot as plt
import os
import tqdm
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from argparse import Namespace

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Set the random seed for reproducibility.
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (in that order) with `seed`."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


set_seed(42)

## Get Dataset

In [None]:
# Read the labelled training data and the unlabelled test data.
full_dataset = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")
test_dataset = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")

# Hold out 20% of the labelled rows for validation. The sample uses a fixed
# random_state, so the split is the same on every run.
sampled_rows = full_dataset.sample(frac=0.8, random_state=200)
held_out_rows = full_dataset.drop(sampled_rows.index)

# Renumber both splits from 0 so positional and label-based indexing agree.
train_dataset = sampled_rows.reset_index(drop=True)
val_dataset = held_out_rows.reset_index(drop=True)

In [None]:
# Preview the first three rows of the training and test frames.
for frame in (train_dataset, test_dataset):
    print(frame.head(3))

In [None]:
# Report how many premise rows each split contains.
for caption, frame in (
    ('full data num:', full_dataset),
    ('train data num:', train_dataset),
    ('val data num:', val_dataset),
):
    print(caption, len(frame.premise.values))

In [None]:
# Show the language distribution of each split as a pie chart.
for split_name, split_df in (
    ('train_dataset', train_dataset),
    ('val_dataset', val_dataset),
    ('test_dataset', test_dataset),
):
    # Distinct language names and how many rows use each of them.
    labels, frequencies = np.unique(split_df.language.values, return_counts=True)

    print(split_name)
    plt.figure(figsize=(10, 10))
    plt.pie(frequencies, labels=labels, autopct='%1.1f%%')
    plt.show()

In [None]:
# Show the label distribution of the two labelled splits as bar charts.
# Tick captions for the integer labels, in the order np.unique returns them.
label_name = ['entailment', 'neutral', 'contradiction']

for split_name, split_df in (
    ('train_dataset', train_dataset),
    ('val_dataset', val_dataset),
):
    print(split_name)
    labels, frequencies = np.unique(split_df.label.values, return_counts=True)

    plt.figure(figsize=(5, 5))
    plt.bar(labels, frequencies, width=0.5)
    plt.xticks(labels, label_name)
    plt.show()

## Data preprocessing

In [None]:
class DataBert(Dataset):
    """Encode NLI sentence pairs for BERT and serve them through DataLoaders.

    Each row of the three data frames is turned into one sequence

        [CLS] hypothesis [SEP] premise [SEP]

    (note: the hypothesis comes first) together with an attention mask and
    segment (token type) ids. The encoded splits are stored as
    ``TensorDataset`` objects in ``train_data``, ``val_data`` and
    ``test_data``.

    This class does not define ``__len__`` / ``__getitem__`` itself; it is a
    container whose data is consumed via ``get_data_loaders``.

    Args:
        train_df: frame with ``hypothesis``, ``premise`` and ``label`` columns.
        val_df: frame with the same columns as ``train_df``.
        test_df: frame with ``hypothesis`` and ``premise`` columns.
        tokenizer: object providing ``encode(text, add_special_tokens=False)``
            plus ``cls_token_id`` and ``sep_token_id`` attributes.
    """

    # Longest sequence (special tokens included) produced by load_data.
    # 512 is the position limit of the standard BERT checkpoints; longer
    # inputs would fail inside the model.
    MAX_LEN = 512

    def __init__(self, train_df, val_df, test_df, tokenizer):
        self.label_dict = {'entailment': 0, 'neutral': 1, 'contradiction': 2}

        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df

        self.tokenizer = tokenizer
        self.train_data = None
        self.val_data = None
        self.test_data = None
        self.init_data()

    def init_data(self):
        """Encode all three splits; only the test split is unlabelled."""
        self.train_data = self.load_data(self.train_df)
        self.val_data = self.load_data(self.val_df)
        self.test_data = self.load_data(self.test_df, train=False)

    @staticmethod
    def _truncate_pair(first_ids, second_ids, budget):
        """Trim tokens off the end of the longer list until both fit in `budget`."""
        first_ids, second_ids = list(first_ids), list(second_ids)
        excess = len(first_ids) + len(second_ids) - budget
        for _ in range(max(excess, 0)):
            longer = first_ids if len(first_ids) > len(second_ids) else second_ids
            longer.pop()
        return first_ids, second_ids

    def _encode_pair(self, hypothesis, premise, max_len):
        """Return (token ids, attention mask, segment ids) tensors for one pair."""
        hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens=False)
        premise_id = self.tokenizer.encode(premise, add_special_tokens=False)
        # Three positions are reserved for [CLS] and the two [SEP] tokens.
        hypothesis_id, premise_id = self._truncate_pair(
            hypothesis_id, premise_id, max_len - 3)

        cls_id = self.tokenizer.cls_token_id
        sep_id = self.tokenizer.sep_token_id
        pair_token_ids = [cls_id] + hypothesis_id + [sep_id] + premise_id + [sep_id]

        # Segment 0 covers "[CLS] hypothesis [SEP]", segment 1 covers
        # "premise [SEP]" -- the first [SEP] belongs to the first sentence.
        segment_ids = [0] * (len(hypothesis_id) + 2) + [1] * (len(premise_id) + 1)
        # Every real token is attended to; padding added later gets mask 0.
        attention_mask_ids = [1] * len(pair_token_ids)

        return (torch.tensor(pair_token_ids),
                torch.tensor(attention_mask_ids),
                torch.tensor(segment_ids))

    def load_data(self, df, train=True, max_len=None):
        """Encode every row of `df` into a zero-padded ``TensorDataset``.

        Args:
            df: frame with ``hypothesis`` and ``premise`` columns, plus a
                ``label`` column when `train` is true.
            train: when true, the labels are appended as a fourth tensor.
            max_len: longest allowed sequence including the three special
                tokens; defaults to ``MAX_LEN``. Over-long pairs are
                shortened by trimming the end of the longer sentence.

        Returns:
            ``TensorDataset(token_ids, mask_ids, seg_ids[, labels])`` where
            the first three tensors are padded with 0 to a common length.

        Raises:
            ValueError: if `max_len` leaves no room for the special tokens.
        """
        if max_len is None:
            max_len = self.MAX_LEN
        if max_len < 3:
            raise ValueError("max_len must be at least 3 to fit [CLS] and two [SEP] tokens")

        token_ids = []
        mask_ids = []
        seg_ids = []

        pairs = zip(df['hypothesis'].to_list(), df['premise'].to_list())
        for hypothesis, premise in pairs:
            tokens, mask, segments = self._encode_pair(hypothesis, premise, max_len)
            token_ids.append(tokens)
            mask_ids.append(mask)
            seg_ids.append(segments)

        # pad_sequence pads with 0 by default, so padded positions get
        # mask 0 and segment 0.
        tensors = [
            pad_sequence(token_ids, batch_first=True),
            pad_sequence(mask_ids, batch_first=True),
            pad_sequence(seg_ids, batch_first=True),
        ]
        if train:
            tensors.append(torch.tensor(df['label'].to_list()))

        return TensorDataset(*tensors)

    def get_data_loaders(self, batch_size=32, shuffle=True):
        """Build DataLoaders over the three encoded splits.

        Args:
            batch_size: batch size for the train and validation loaders.
            shuffle: applied to both the train and the validation loader.

        Returns:
            ``(train_loader, val_loader, test_loader)``. The test loader is
            never shuffled and uses batch size 1.
        """
        train_loader = DataLoader(
          self.train_data,
          shuffle=shuffle,
          batch_size=batch_size
        )
        val_loader = DataLoader(
          self.val_data,
          shuffle=shuffle,
          batch_size=batch_size
        )
        test_loader = DataLoader(
          self.test_data,
          batch_size=1
        )
        return train_loader, val_loader, test_loader

In [None]:
# Pre-trained BERT tokenizer used to encode the sentences.
# NOTE(review): the data frames carry a `language` column, so the corpus is
# presumably multilingual -- confirm that an uncased English checkpoint is
# the intended choice.
bert_tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

# Constructing DataBert encodes the train, validation and test frames immediately.
nli_dataset = DataBert(
    train_df=train_dataset,
    val_df=val_dataset,
    test_df=test_dataset,
    tokenizer=bert_tokenizer,
)

## Get BERT model

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so that
# moving the model does not fail on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Pre-trained BERT encoder with a 3-way classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to(device)

## Training

In [None]:
# helper function
def get_acc(pred, gt):
    """Return the fraction of entries in `pred` that equal the labels in `gt`.

    `pred` is converted with ``torch.tensor`` and `gt` is moved to the CPU
    before the element-wise comparison. The result is a 0-dim float tensor.
    """
    targets = gt.cpu()
    n_correct = torch.tensor(pred).eq(targets).sum()
    return n_correct.float() / float(targets.size(0))

In [None]:
EPOCHS = 7


def _run_batch(model, batch, device):
    """Forward one labelled batch; return (loss tensor, #correct, #samples)."""
    pair_token_ids, mask_ids, seg_ids, labels = (t.to(device) for t in batch)
    outputs = model(pair_token_ids,
                    token_type_ids=seg_ids,
                    attention_mask=mask_ids,
                    labels=labels)
    n_correct = (outputs.logits.argmax(dim=1) == labels).sum().item()
    return outputs.loss, n_correct, labels.size(0)


def train(model, train_loader, val_loader, optimizer, scheduler=None, epochs=None):
    """Fine-tune `model`, validating after every epoch, with early stopping.

    Training stops as soon as the validation loss is higher than in the
    previous epoch. The model is left in the state of the last executed
    epoch; no checkpoint of an earlier epoch is restored.

    Args:
        model: module called as ``model(ids, token_type_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        train_loader: yields ``(token_ids, mask_ids, seg_ids, labels)`` batches.
        val_loader: same batch layout as `train_loader`.
        optimizer: stepped once per training batch.
        scheduler: optional LR scheduler, stepped once per training batch.
        epochs: maximum number of epochs; defaults to the module-level ``EPOCHS``.

    Returns:
        ``(last_epoch, train_acc_history, train_loss_history, val_acc_history,
        val_loss_history)``. ``last_epoch`` is the 0-based index of the last
        epoch that ran (-1 if none ran), so every history list has
        ``last_epoch + 1`` entries. The same 5-tuple is returned whether or
        not early stopping triggered.
    """
    if epochs is None:
        epochs = EPOCHS

    # Run on whatever device the model already lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    train_acc_history = []
    train_loss_history = []
    val_acc_history = []
    val_loss_history = []
    prev_loss = float("inf")
    last_epoch = -1

    for epoch in range(epochs):
        last_epoch = epoch
        start = time.time()

        model.train()
        loss_sum, n_correct_total, n_seen = 0.0, 0, 0
        for batch in train_loader:
            optimizer.zero_grad()
            loss, n_correct, n_samples = _run_batch(model, batch, device)
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()
            # The batch loss is weighted by batch size so that a smaller
            # final batch does not skew the epoch average.
            loss_sum += loss.item() * n_samples
            n_correct_total += n_correct
            n_seen += n_samples

        train_acc = n_correct_total / max(n_seen, 1)
        train_loss = loss_sum / max(n_seen, 1)
        train_acc_history.append(train_acc)
        train_loss_history.append(train_loss)

        model.eval()
        loss_sum, n_correct_total, n_seen = 0.0, 0, 0
        with torch.no_grad():
            for batch in val_loader:
                loss, n_correct, n_samples = _run_batch(model, batch, device)
                loss_sum += loss.item() * n_samples
                n_correct_total += n_correct
                n_seen += n_samples

        val_acc = n_correct_total / max(n_seen, 1)
        val_loss = loss_sum / max(n_seen, 1)
        val_acc_history.append(val_acc)
        val_loss_history.append(val_loss)

        end = time.time()
        hours, rem = divmod(end-start, 3600)
        minutes, seconds = divmod(rem, 60)

        print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

        # Early stopping: quit once the validation loss gets worse.
        if val_loss > prev_loss:
            break
        prev_loss = val_loss

    return last_epoch, train_acc_history, train_loss_history, val_acc_history, val_loss_history

In [None]:
# Batches of 8 for training and validation; the test loader's batch size is
# chosen inside get_data_loaders.
train_loader, val_loader, test_loader = nli_dataset.get_data_loaders(batch_size=8)

# optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# lr decay: linear schedule with no warm-up, sized for EPOCHS full passes
# over train_loader (the scheduler is stepped once per batch in train()).
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# The unpacking below expects train() to return five values.
history = train(model, train_loader, val_loader, optimizer, scheduler)
last_epoch, t_a_his, t_l_his, v_a_his, v_l_his = history

In [None]:
# Show the training history: one figure for accuracy, one for loss.
# The x-axis runs over 1-based epoch numbers up to last_epoch + 1.
epoch_axis = range(1, last_epoch + 2)

curves = (
    ('accuracy', 'Accuracy', t_a_his, v_a_his),
    ('loss', 'Loss', t_l_his, v_l_his),
)
for y_label, title, train_curve, val_curve in curves:
    plt.plot(epoch_axis, train_curve)
    plt.plot(epoch_axis, val_curve)
    plt.xlabel('epochs')
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend(['train', 'validation'])
    plt.show()

## Generating prediction

In [None]:
def eval(model, test_loader, scheduler=None):
    """Predict a class id for every example served by `test_loader`.

    NOTE: the name shadows the builtin ``eval``; it is kept because callers
    use it.

    Args:
        model: module called as ``model(ids, token_type_ids=..., attention_mask=...)``
            whose output exposes ``.logits``; it is switched to eval mode.
        test_loader: yields ``(token_ids, mask_ids, seg_ids)`` batches.
        scheduler: unused; accepted for backward compatibility.

    Returns:
        1-D ``np.int64`` array with the arg-max class of each example, in the
        order the loader produced them. Also prints the elapsed time.
    """
    model.eval()

    # Use the device the model is on rather than relying on a global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    start = time.time()
    pred = []
    with torch.no_grad():
        for pair_token_ids, mask_ids, seg_ids in test_loader:
            outputs = model(pair_token_ids.to(run_device),
                            token_type_ids=seg_ids.to(run_device),
                            attention_mask=mask_ids.to(run_device))
            # One arg-max over the class dimension for the whole batch.
            pred.extend(outputs.logits.argmax(dim=1).tolist())

    end = time.time()
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
    return np.array(pred, dtype=np.int64)

In [None]:
# Predict a class id for every test example (this `eval` is the function
# defined in this notebook, not the builtin).
preds = eval(model, test_loader)

In [None]:
# Start the submission frame from a copy of the test ids, then attach the
# predicted class for each row.
submission = test_dataset[['id']].copy()
submission['prediction'] = preds
submission.head()

In [None]:
# Write the submission to the working directory, without the frame's index column.
submission.to_csv("submission.csv", index = False)