# Project Goals
* Establish a basic pipeline to complete a modelling iteration, including dataset preparation, model training, model evaluation, and inference for submission
* Use PyTorch Lightning to organize PyTorch code and access TPU/GPU resources
* Fine-tune a pre-trained bert-base-multilingual-cased model to serve as a baseline model

# PyTorch Lightning Resources
* [PyTorch Lightning Github](https://github.com/PyTorchLightning/pytorch-lightning)
* [PyTorch Lightning Documentation](https://pytorch-lightning.readthedocs.io/en/latest/) 
* [PyTorch Lightning BERT Tutorial](https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/04-transformers-text-classification.ipynb)

# Environment Setup
* [Kaggle TPU Support Documentation](https://www.kaggle.com/docs/tpu)
* [PyTorch Lightning TPU Support Documentation](https://pytorch-lightning.readthedocs.io/en/latest/tpu.html)

In [None]:
# Check if TPU/GPU is available
# TensorFlow is imported here only to probe the hardware; the result is stored
# in DEVICE as one of "tpu", "gpu", or "cpu".
import tensorflow as tf

try:
    # Constructing the resolver raises ValueError when no TPU can be found,
    # which is what selects the GPU/CPU fallback below.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    DEVICE = "tpu"
except ValueError:
    # tf.test.is_gpu_available() is deprecated; list_physical_devices("GPU")
    # is its documented replacement and returns an empty list without a GPU.
    DEVICE = "gpu" if tf.config.list_physical_devices("GPU") else "cpu"

print("Accelerator: {}".format(DEVICE))

In [None]:
# Set up an environment for accessing TPU
# Runs only when the detection cell selected a TPU; GPU/CPU sessions skip it.
if DEVICE == "tpu":
    # Download the PyTorch/XLA setup script and run it with the extra apt packages.
    !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
    !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev
    !pip install pytorch-lightning
    # `xm` is used later in the notebook to obtain the XLA device for inference.
    import torch_xla
    import torch_xla.core.xla_model as xm

In [None]:
import os
# Sets a placeholder value for the WANDB_API_KEY environment variable
# (presumably read by Weights & Biases integrations — confirm if the warning persists).
os.environ["WANDB_API_KEY"] = "0"  # to silence warning

In [None]:
import gc
import glob
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
try:
    from pytorch_lightning import LightningDataModule, LightningModule, Trainer, loggers, seed_everything
    from pytorch_lightning.metrics import functional as FM
except OSError:  # Reloading pytorch_lightning again to resolve OSError issues with TPU
    from pytorch_lightning import LightningDataModule, LightningModule, Trainer, loggers, seed_everything
    from pytorch_lightning.metrics import functional as FM

In [None]:
# Remove temp installation files to release space
# shutil is required by rmtree below and is not imported by the import cell;
# without it the directory branch raised NameError and nothing was deleted.
import shutil

gc.collect()
paths = glob.glob("/kaggle/working/*")
for path in paths:
    try:
        if os.path.isfile(path):
            os.remove(path)
        elif os.path.isdir(path):
            shutil.rmtree(path)
    except OSError:
        # Best-effort cleanup: report the entry and keep going. Only filesystem
        # errors are caught so that programming errors are no longer hidden.
        print("Not removable: {}".format(path))

# Define Global Variables
Note: For some reason, a draft session with TPU can crash easily because of CPU memory allocation issues. In that situation, you may want to use a different configuration when editing and testing your scripts in a draft session; for example, NUM_WORKERS could be 1 instead of 4.

In [None]:
# Global Variables
SEED = 2020
PRETRAINED_MODEL = "bert-base-multilingual-cased"
MAX_EPOCHS = 2

# The batch size is the only setting that depends on the accelerator:
# 8 on TPU, 64 otherwise.
BATCH_SIZE = 8 if DEVICE == "tpu" else 64

# Settings shared by every accelerator.
MAX_TOKEN_LEN = 50
TPU_CORES = 1
GPUS = 1
NUM_WORKERS = 4

# Exploratory Data Analysis

In [None]:
# Dataset with labels
# Load the competition's train.csv into a DataFrame and preview its first rows.
dataset_df = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")
dataset_df.head()

In [None]:
# Number of rows in the labelled dataset
len(dataset_df)

In [None]:
# Row count for each value of the "language" column
dataset_df["language"].value_counts()

In [None]:
# Row count for each value of the "label" column
dataset_df["label"].value_counts()

In [None]:
# Row count for each (language, label) combination
dataset_df.groupby(["language", "label"]).size()

In [None]:
# Production dataset for inference and submission
# Load the competition's test.csv into a DataFrame and preview its first rows.
prod_df = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
prod_df.head()

In [None]:
# Number of rows in the production dataset
len(prod_df)

In [None]:
# Row count for each value of the "language" column in the production dataset
prod_df["language"].value_counts()

# Dataset Preparation
* [Transformer Tokenizer Documentation](https://huggingface.co/transformers/main_classes/tokenizer.html)
* [PyTorch DataLoader and Dataset Documentation](https://pytorch.org/docs/stable/data.html)
* [LightningDataModule Documentation](https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html)

In [None]:
# # Figure out how transformers tokenizer encodes sentences
# tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
# premise_example = dataset_df.premise.values[1]
# hypothesis_example = dataset_df.hypothesis.values[1]
# print("Original premise: {}".format(premise_example))
# print("Original hypothesis: {}".format(hypothesis_example))
# print()
# encoded_sents = tokenizer.encode_plus(premise_example, 
#                                       hypothesis_example,
#                                       add_special_tokens=True, 
#                                       pad_to_max_length=True,
#                                       max_length=MAX_TOKEN_LEN, 
#                                       truncation=True, 
#                                       return_attention_mask=True, 
#                                       return_token_type_ids=True,
#                                       return_tensors="pt")
# print(encoded_sents)
# print()
# decoded_sents = tokenizer.decode(encoded_sents["input_ids"][0])
# print(decoded_sents)

In [None]:
class NLIDataset(Dataset):
    """Map-style dataset that tokenizes premise/hypothesis pairs on access.

    Args:
        dataset: DataFrame with ``premise`` and ``hypothesis`` columns, plus
            ``label`` (when ``production`` is False) or ``id`` (when True).
        model: Name or path handed to ``BertTokenizer.from_pretrained``.
        max_token_len: Length each encoded pair is padded or truncated to.
        production: When True, items carry the row id instead of the label.
    """

    def __init__(self, 
                 dataset: pd.DataFrame, 
                 model: str = PRETRAINED_MODEL,
                 max_token_len: int = MAX_TOKEN_LEN,
                 production: bool = False
                ):
        self.dataset = dataset
        self.model = model
        self.max_token_len = max_token_len
        self.production = production
        # Loaded once here rather than in __getitem__, which runs for every
        # sample; reloading the tokenizer per item is needlessly expensive.
        self.tokenizer = BertTokenizer.from_pretrained(self.model)
    
    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataset)
    
    def __getitem__(self, index: int):
        """Encode the pair at ``index``.

        Returns:
            A tuple ``(inputs, target)`` where ``inputs`` is a dict with
            ``input_ids``, ``token_type_ids`` and ``attention_mask`` tensors
            and ``target`` is the row id (production) or the label.
        """
        premise = self.dataset.premise.values[index]
        hypothesis = self.dataset.hypothesis.values[index]
        # padding="max_length" replaces the deprecated pad_to_max_length=True.
        encoded_sents = self.tokenizer.encode_plus(premise,
                                                   hypothesis,
                                                   add_special_tokens=True,
                                                   padding="max_length",
                                                   max_length=self.max_token_len,
                                                   truncation=True,
                                                   return_attention_mask=True,
                                                   return_token_type_ids=True,
                                                   return_tensors="pt")

        # return_tensors="pt" adds a leading dimension; [0] drops it so that
        # the DataLoader can add its own batch dimension.
        inputs = {
            "input_ids": encoded_sents["input_ids"][0],
            "token_type_ids": encoded_sents["token_type_ids"][0],
            "attention_mask": encoded_sents["attention_mask"][0]
        }

        if self.production:
            return inputs, self.dataset.id.values[index]
        return inputs, self.dataset.label.values[index]

In [None]:
class NLIDataModule(LightningDataModule):
    """Splits a labelled DataFrame into train/val/test sets and serves loaders.

    Args:
        dataset: Labelled DataFrame; must contain ``label`` and ``language``
            columns, which are used for stratified splitting.
        model: Tokenizer name or path forwarded to every ``NLIDataset``.
        batch_size: Batch size used by all three dataloaders.
        random_state: Seed passed to ``train_test_split``.
        num_workers: Worker count used by all three dataloaders.
    """

    def __init__(self, 
                 dataset: pd.DataFrame, 
                 model: str = PRETRAINED_MODEL,
                 batch_size: int = BATCH_SIZE,
                 random_state: int = SEED,
                 num_workers: int = NUM_WORKERS
                ):
        super().__init__()
        self.dataset = dataset
        self.model = model
        self.batch_size = batch_size
        self.random_state = random_state
        self.num_workers = num_workers
        self.train_df = None
        self.val_df = None
        self.test_df = None
        self.train_set = None
        self.val_set = None
        self.test_set = None

    def _split(self):
        """Create the train/val/test DataFrames once; later calls are no-ops."""
        if self.train_df is not None:
            return
        # Hold out 15% for testing, then 15% of the remainder for validation,
        # stratifying both splits on label and language.
        tmp, self.test_df = train_test_split(self.dataset,
                                             test_size=0.15,
                                             random_state=self.random_state,
                                             stratify=self.dataset[['label', 'language']])
        self.train_df, self.val_df = train_test_split(tmp,
                                                      test_size=0.15,
                                                      random_state=self.random_state,
                                                      stratify=tmp[['label', 'language']])

    def prepare_data(self):
        """Split the data (kept for callers that invoke this explicitly)."""
        self._split()

    def setup(self, stage: str = None):
        """Build the datasets for ``stage`` ("fit", "test", or None for both)."""
        # Lightning documents prepare_data as running on a single process, so
        # the split is ensured here too rather than relying on state set there.
        self._split()
        if stage == "fit" or stage is None:
            self.train_set = NLIDataset(self.train_df, model=self.model)
            self.val_set = NLIDataset(self.val_df, model=self.model)
        if stage == "test" or stage is None:
            self.test_set = NLIDataset(self.test_df, model=self.model)

    def train_dataloader(self):
        """Return the shuffled training dataloader."""
        return DataLoader(self.train_set, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=True)
    
    def val_dataloader(self):
        """Return the validation dataloader (no shuffling)."""
        return DataLoader(self.val_set, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)

    def test_dataloader(self):
        """Return the test dataloader (no shuffling)."""
        return DataLoader(self.test_set, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)

In [None]:
# # Test NLIDataset & NLIDataModule
# dm = NLIDataModule(dataset_df)
# dm.prepare_data()
# dm.setup('fit')
# next(iter(dm.train_dataloader()))

# Model Training
* [PyTorch Lightning Style Guide](https://pytorch-lightning.readthedocs.io/en/latest/style_guide.html)
* [LightningModule Documentation](https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html)
* [PyTorch Lightning Trainer Documentation](https://pytorch-lightning.readthedocs.io/en/latest/trainer.html)

In [None]:
class NLIMultilingualBERT(LightningModule):
    """LightningModule wrapping a ``BertForSequenceClassification`` model.

    Args:
        model: The sequence-classification model to fine-tune.
    """

    def __init__(self, model: BertForSequenceClassification):
        super().__init__()
        self.model = model

    def forward(self, inputs):
        """Run the wrapped model on a dict of encoded inputs.

        Args:
            inputs: Dict with ``input_ids``, ``attention_mask`` and
                ``token_type_ids`` tensors.

        Returns:
            The wrapped model's raw output; element 0 holds the logits
            because no labels are passed.
        """
        predictions = self.model(input_ids=inputs["input_ids"], 
                                 attention_mask=inputs["attention_mask"],
                                 token_type_ids=inputs["token_type_ids"])
        return predictions

    def _logits(self, inputs):
        """Return only the classification logits for ``inputs``."""
        # Integer indexing avoids unpacking the output, so it does not depend
        # on the output being a plain tuple. Labels are not passed because the
        # loss is computed explicitly by the callers.
        return self(inputs)[0]

    def _evaluate(self, batch):
        """Compute (loss, accuracy) for one batch of (inputs, labels)."""
        inputs, labels = batch
        logits = self._logits(inputs)
        loss = F.cross_entropy(logits, labels)
        # Accuracy is given predicted class indices instead of raw logits,
        # which are not probabilities (assumed safer across metric versions).
        acc = FM.accuracy(torch.argmax(logits, dim=1), labels)
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Return the cross-entropy loss for one training batch and log it."""
        inputs, labels = batch
        train_loss = F.cross_entropy(self._logits(inputs), labels)
        self.log('train_loss', train_loss, prog_bar=True)
        return train_loss
    
    def validation_step(self, batch, batch_idx):
        """Log and return ``val_acc`` and ``val_loss`` for one batch."""
        val_loss, val_acc = self._evaluate(batch)
        metrics = {'val_acc': val_acc, 'val_loss': val_loss}
        self.log_dict(metrics, prog_bar=True)
        return metrics

    def test_step(self, batch, batch_idx):
        """Log ``test_acc`` and ``test_loss`` for one batch."""
        # Uses the shared evaluation helper directly so that validation
        # metrics are not logged while testing.
        test_loss, test_acc = self._evaluate(batch)
        metrics = {'test_acc': test_acc, 'test_loss': test_loss}
        self.log_dict(metrics, prog_bar=True)
    
    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=2e-5."""
        optimizer = torch.optim.Adam(self.parameters(), lr=2e-5)
        return optimizer

In [None]:
# General settings
seed_everything(SEED)
logger = loggers.TensorBoardLogger('logs')

# Prepare DataModule
data_module = NLIDataModule(dataset_df)
data_module.prepare_data()
data_module.setup('fit')

# Prepare Model
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL,
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)
print(model)
model_module = NLIMultilingualBERT(model)

# Set up Trainer
# Arguments common to every accelerator, extended per device below.
trainer_kwargs = {"logger": logger, "max_epochs": MAX_EPOCHS}
if DEVICE == "tpu":
    # NOTE(review): accelerator="dp" is kept as-is; confirm it is intended
    # together with tpu_cores for the installed Lightning version.
    trainer_kwargs.update(tpu_cores=[TPU_CORES], accelerator="dp")
elif DEVICE == "gpu":
    trainer_kwargs.update(gpus=GPUS)
trainer = Trainer(**trainer_kwargs)

# Train with the explicit train/validation dataloaders.
train_loader = data_module.train_dataloader()
val_loader = data_module.val_dataloader()
trainer.fit(model_module, train_loader, val_loader)

# Model Evaluation
* [PyTorch Lightning Trainer Documentation - Testing](https://pytorch-lightning.readthedocs.io/en/latest/trainer.html#testing)

In [None]:
# Build the held-out test set, then evaluate the trained model on it.
data_module.setup('test')
test_loader = data_module.test_dataloader()
trainer.test(model_module, test_dataloaders=test_loader)

# Inference for Submission
* [PyTorch Lightning Trainer Documentation - Deployment / Prediction](https://pytorch-lightning.readthedocs.io/en/latest/trainer.html#deployment-prediction)

In [None]:
# Resolve the target device once, instead of on every loop iteration.
if DEVICE == "tpu":
    device = xm.xla_device()
elif DEVICE == "gpu":
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

prod_set = NLIDataset(prod_df, production=True)
# Predict in batches rather than one row at a time.
prod_dataloader = DataLoader(prod_set, batch_size=BATCH_SIZE, shuffle=False)
model_module.freeze()  # eval
# Move the model explicitly so it is on the same device as the inputs,
# regardless of where the trainer left it.
model_module.to(device)
pred_lists = []

with torch.no_grad():
    for inputs, row_id in prod_dataloader:
        inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

        predictions = model_module.forward(inputs)
        # Element 0 of the model output holds the logits; argmax picks the class.
        pred_label = torch.argmax(predictions[0], dim=1).cpu().tolist()
        # Depending on the id dtype, the collated ids may be a tensor or a
        # sequence; normalise both to a plain list.
        row_ids = row_id.tolist() if torch.is_tensor(row_id) else list(row_id)
        pred_lists.extend([rid, label] for rid, label in zip(row_ids, pred_label))

pred_pd = pd.DataFrame(pred_lists, columns=["id", "prediction"])
pred_pd.to_csv('submission.csv', index=False)