In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under /kaggle/input.
# NOTE: the loop variable `dirname` is read again after this loop by the cells that load
# train.csv and test.csv, so those cells use whichever directory os.walk visited last.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Placeholder value, not a real credential: a dummy W&B API key set only to silence a warning.
os.environ["WANDB_API_KEY"] = "0"

In [None]:
import torch

In [None]:
## Pick the device tensors and the model will live on: CPU when CUDA is missing, else the GPU
if not torch.cuda.is_available():
    print("No GPU available using the CPU instead")
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    # Report how many GPUs are visible and the name of device 0.
    print("There are %d GPU(s) avaiable" % torch.cuda.device_count())
    print("We will user the GPU: ",torch.cuda.get_device_name(0))

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# `dirname` is the last directory visited by the os.walk listing loop in the first cell;
# train.csv is expected to live there.
df = pd.read_csv(os.path.join(dirname, "train.csv"))

In [None]:
# Sequential (unshuffled) hold-out split: rows from position 9697 onwards validate the model,
# everything before that trains it.
val_df = df.iloc[9697:]
train_df = df.iloc[:9697]

In [None]:
# Count how many rows each language contributes across the whole frame (train + validation).
labels, frequencies = np.unique(df['language'].values, return_counts=True)

# Draw the shares as a pie chart on an explicitly created 10x10 figure.
fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(frequencies, labels=labels, autopct='%1.1f%%')
plt.show()

In [None]:
# Tokenizer and classifier are both loaded from the same checkpoint name.
model_name = 'bert-base-multilingual-cased'

# Sequence classifier configured for three output classes.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
# Human-readable names for the three class indices.
# The config contract is id2label: Dict[int, str] and label2id: Dict[str, int]. Class ids must
# therefore be integers: with string ids, a lookup such as config.id2label[predicted_index]
# raises KeyError because predicted indices are ints.
model.config.id2label = {0: "entailment", 1: "neutral", 2: "contradiction"}
model.config.label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}

### BERT takes three inputs
- input_ids: the ids of the tokens — a tensor of integers
- attention_mask: marks which positions hold real tokens and which are padding
- token_type_ids: segment ids that tell the two sequences (premise and hypothesis) apart

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
class MNLIDataset(Dataset):
    """Map-style dataset that tokenizes one premise/hypothesis pair per access.

    Rows are read from a DataFrame that must provide ``premise`` and ``hypothesis``
    columns, plus a ``label`` column unless ``is_test_dataset`` is true.
    """

    def __init__(self, df, tokenizer, transform=None, is_test_dataset=False, max_length=512):
        """Store the data source and tokenization settings.

        Args:
            df: source DataFrame; its index is reset so rows are addressed 0..len-1.
            tokenizer: callable invoked as ``tokenizer(premise, hypothesis, ...)``.
            transform: accepted for signature compatibility but currently ignored.
            is_test_dataset: when true, items carry no ``labels`` entry.
            max_length: length every pair is padded/truncated to. Defaults to 512,
                the value that used to be hard-coded in ``__getitem__``.
        """
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.is_test_dataset = is_test_dataset
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (premise/hypothesis pairs)."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize the pair at position ``index`` and attach its label when available.

        Returns:
            Whatever the tokenizer returns for the pair, with a ``labels`` tensor added
            unless this is a test dataset.
        """
        item = self.df.iloc[index]
        # NOTE(review): with return_tensors='pt' the tokenizer presumably returns tensors with a
        # leading size-1 dimension; the consuming loops squeeze it away — confirm before
        # changing either side.
        tokenized_data = self.tokenizer(
            item.premise,
            item.hypothesis,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        if not self.is_test_dataset:
            tokenized_data['labels'] = torch.tensor(item.label)
        return tokenized_data

In [None]:
from torch.utils.data import DataLoader
# Wrap each split in the tokenizing dataset.
train_dataset = MNLIDataset(df=train_df, tokenizer=tokenizer)
val_dataset = MNLIDataset(df=val_df, tokenizer=tokenizer)

# Batches of 8; only the training split is shuffled. drop_last=True makes both loaders
# discard a final incomplete batch, so up to 7 rows per split are skipped on each pass.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True, drop_last=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=8, drop_last=True)

In [None]:
# Move the model onto the device selected earlier (GPU when available, otherwise CPU).
model.to(device)

In [None]:
from transformers import AdamW
from transformers import get_scheduler

In [None]:
# Use PyTorch's own AdamW: the AdamW class shipped with transformers is deprecated in favour
# of torch.optim.AdamW. eps and weight_decay are pinned to the defaults of the transformers
# class (1e-6 and 0.0) so the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 5
# The scheduler is stepped once per batch, so its horizon is epochs x batches-per-epoch.
num_training_steps = num_epochs * len(train_dataloader)
# Linear learning-rate schedule over the whole run, with no warm-up steps.
lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
def batch_accuracy(logits, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        logits: 2-D array of class scores, one row per example (argmax is taken over axis 1).
        labels: array-like of true class indices exposing ``.flatten()``.
    """
    predicted_classes = np.argmax(logits, axis=1).flatten()
    true_classes = labels.flatten()
    # Accuracy is symmetric in its two arguments; pass them in sklearn's (y_true, y_pred) order.
    return accuracy_score(true_classes, predicted_classes)

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step, across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Put the model in training mode before fine-tuning.
model.train()

for epoch in range(num_epochs):
    # Running sum of per-batch accuracies for this epoch.
    batches_accuracy = 0
    for batch_dict in train_dataloader:
        # NOTE(review): squeeze() presumably removes the size-1 dimension left by the dataset's
        # per-item tokenization; it would also remove a batch dimension of size 1.
        batch_dict = {
            name: tensor.squeeze().to(device)
            for name, tensor in batch_dict.items()
        }
        outputs = model(**batch_dict)
        loss = outputs.loss

        # Score the batch on the CPU, outside the autograd graph.
        logits = outputs.logits.detach().cpu().numpy()
        labels = batch_dict['labels'].detach().cpu().squeeze()
        batches_accuracy += batch_accuracy(logits, labels)

        # Backpropagate, update the weights, advance the LR schedule, then clear the gradients.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Mean of the per-batch accuracies seen in this epoch.
    num_batches = len(train_dataloader)
    epoch_avg_acc = batches_accuracy / num_batches
    print(f"Average train accuracy for epoch {epoch}: {epoch_avg_acc}")

In [None]:
## Release cached GPU memory that PyTorch is holding but no longer using, before evaluation
torch.cuda.empty_cache()

In [None]:
# Switch the model to evaluation mode before scoring the held-out split.
model.eval()

# Running sum of per-batch accuracies; averaged after the loop.
batch_val_acc = 0

# Evaluation never calls backward(), so run it under no_grad(): no autograd graph is
# recorded, which lowers memory use and speeds up the forward passes.
with torch.no_grad():
    for batch_dict in tqdm(val_dataloader):
        # NOTE(review): squeeze() presumably removes the size-1 dimension left by the dataset's
        # per-item tokenization; it would also remove a batch dimension of size 1.
        batch_dict = {k: v.squeeze().to(device) for k, v in batch_dict.items()}
        outputs = model(**batch_dict)

        loss = outputs.loss
        logits = outputs.logits.cpu().detach().numpy()

        labels = batch_dict['labels'].cpu().detach().squeeze()
        batch_val_acc += batch_accuracy(logits, labels)

# Mean of the per-batch accuracies over the validation loader.
num_batches = len(val_dataloader)
val_accuracy = batch_val_acc / num_batches

In [None]:
# val_accuracy is the mean of the per-batch accuracies accumulated over val_dataloader.
print(f"Validation accuracy: {val_accuracy}")

### Submitting Results

In [None]:
# `dirname` is the last directory visited by the os.walk listing loop in the first cell;
# test.csv is expected to live there.
test_df = pd.read_csv(os.path.join(dirname, "test.csv"))

In [None]:
# The test split has no labels, so items are built without a `labels` entry.
test_dataset = MNLIDataset(df=test_df, tokenizer=tokenizer, is_test_dataset=True)
# One row per batch, in file order, so predictions line up with test_df rows.
test_dataloader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)

In [None]:
# Switch the model to evaluation mode before predicting.
model.eval()
predictions = []

# Inference never calls backward(), so run it under no_grad(): no autograd graph is
# recorded, which lowers memory use and speeds up the forward passes.
with torch.no_grad():
    for batch_dict in tqdm(test_dataloader):
        # NOTE(review): squeeze(axis=1) presumably removes the size-1 dimension left by the
        # dataset's per-item tokenization while keeping the batch dimension.
        batch_dict = {k: v.squeeze(axis=1).to(device) for k, v in batch_dict.items()}
        outputs = model(**batch_dict)

        # No labels are passed for the test split, so outputs.loss is not read here.
        logits = outputs.logits.cpu().detach().numpy()
        # One class index per row of the batch. Taking argmax along axis 1 stays correct for any
        # batch size, whereas argmax over the flattened array only worked for batch_size=1.
        predictions.extend(np.argmax(logits, axis=1).tolist())

In [None]:
# Two-column submission frame: each test id next to the class index predicted for that row.
submission = test_df[["id"]].assign(prediction=predictions)

In [None]:
# Preview the first rows to sanity-check the id / prediction columns.
submission.head()

In [None]:
# Write the submission to the current working directory; index=False keeps the DataFrame
# index out of the CSV.
submission.to_csv('submission.csv', index=False)