In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers

In [None]:
!pip install googletrans==3.1.0a0

In [None]:
# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [None]:
import copy
import json
import math
import os
import pickle
import re
import unicodedata
import warnings

import googletrans
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.utils import shuffle
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW
# import torch_xla
# import torch_xla.core.xla_model as xm

In [None]:
# Run on the first CUDA device when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Tokenizer for the 'bert-base-multilingual-cased' checkpoint, the same
# checkpoint name that ClsBert loads for its encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [None]:
# Single googletrans client; it is referenced as a global by
# dataset.__getitem__ and by the test-set inference loop.
translator = googletrans.Translator()

In [None]:
# Number of training epochs and number of learning-rate warm-up epochs.
EPOCHS = 5
WARMUP_EPOCHS = 5
# Directory that receives model checkpoints and history.csv.
CKPT_PATH = "ckpt/"

# Only an already-existing directory is reported as "path exist"; any other
# failure (e.g. permission denied) now surfaces instead of being swallowed
# by a bare `except:`.
if os.path.isdir(CKPT_PATH):
    print("path exist")
else:
    os.makedirs(CKPT_PATH, exist_ok=True)

In [None]:
# Load the competition's train and test CSVs.
# Columns read later in this notebook: premise, hypothesis, language (from both
# frames) and label (only from rows derived from `data`).
data = pd.read_csv("/kaggle/input/contradictory-my-dear-watson/train.csv")
test = pd.read_csv("/kaggle/input/contradictory-my-dear-watson/test.csv")

In [None]:
# Fixed seed so the train/validation split is identical on every fresh run.
# Without it, a checkpoint resumed after a kernel restart would be validated
# on rows it had already been trained on.
SPLIT_SEED = 42
data = shuffle(data, random_state=SPLIT_SEED)

# One frame per label so both splits keep the same class proportions.
l_0 = data[(data.label == 0)]
l_1 = data[(data.label == 1)]
l_2 = data[(data.label == 2)]

# Per class: the first len//10*8 rows (about 80%) go to train, the rest to val.
train = pd.concat([part[:len(part)//10*8] for part in (l_0, l_1, l_2)])
val = pd.concat([part[len(part)//10*8:] for part in (l_0, l_1, l_2)])
print(len(data), len(l_0), len(l_1), len(l_2), len(train), len(val))

In [None]:
def text_clean(text):
    """Normalise a string before tokenisation.

    Steps, applied per character:
      1. Full-width forms are converted to half-width: the ideographic space
         (U+3000) becomes an ASCII space and U+FF01..U+FF5E are shifted down
         by 0xFEE0 onto their ASCII counterparts.
      2. Upper-case characters are lower-cased.
      3. Everything that is not a word character, whitespace or a Unicode
         combining mark is dropped (punctuation, symbols, ...).

    Combining marks (Unicode categories Mn/Mc/Me) are kept on purpose: the
    previous ``re.sub(r'[^\\w\\s]', '', ...)`` removed them because ``\\w`` only
    matches alphanumerics and ``_``, which stripped vowel signs and tone marks
    from scripts such as Devanagari and Thai and corrupted the words.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    kept = []
    for char in text:
        code_point = ord(char)
        if code_point == 0x3000:  # full-width (ideographic) space
            char = ' '
        elif 0xFF01 <= code_point <= 0xFF5E:  # other full-width characters
            char = chr(code_point - 0xFEE0)

        # Lower-case per character, exactly as before.
        if char.isupper():
            char = char.lower()

        # lower() can expand one character into several code points.
        for piece in char:
            is_word = piece.isalnum() or piece == '_'
            is_mark = unicodedata.category(piece).startswith('M')
            if is_word or piece.isspace() or is_mark:
                kept.append(piece)

    return ''.join(kept)

In [None]:
class dataset(Dataset):
    """Torch dataset that turns premise/hypothesis rows into BERT inputs.

    Each item reads the `premise`, `hypothesis`, `language` and `label`
    columns of one row. Premise and hypothesis are cleaned with `text_clean`,
    translated to English when the row's language is Thai or Vietnamese, and
    encoded as one sentence pair padded/truncated to `maxLengh` tokens.

    Args:
        data: DataFrame holding the columns listed above.
        tokenizer: tokenizer exposing `encode_plus`.
        maxLengh: padded/truncated sequence length (default 512).
    """

    # Values of the `language` column that get translated, mapped to the
    # source-language code handed to googletrans.
    _TRANSLATE_SRC = {"Thai": "th", "Vietnamese": "vi"}

    def __init__(
            self,
            data,
            tokenizer,
            maxLengh=512
    ):
        self.data = data
        self.maxLengh = maxLengh
        self.tokenizer = tokenizer

    @staticmethod
    def _to_english(text, src):
        """Translate `text` from `src` to English; return `text` unchanged on failure."""
        try:
            return translator.translate(text, dest='en', src=src).text
        except Exception:
            # Translation is a best-effort network call. One failed request
            # must not kill a DataLoader worker, so fall back to the
            # untranslated text (the encoder is multilingual).
            warnings.warn("translation failed; using the untranslated text",
                          RuntimeWarning)
            return text

    def __getitem__(self, i):
        """Return a dict with `token`, `attn_mask` and one-hot `label` tensors for row i."""
        row = self.data.iloc[i]
        premise = text_clean(row['premise'])
        hypothesis = text_clean(row['hypothesis'])
        # The language name is compared verbatim. Passing it through
        # text_clean lower-cased it, so "Thai"/"Vietnamese" never matched and
        # the translation branch was dead code.
        language = str(row['language']).strip()
        label = int(row['label'])

        src = self._TRANSLATE_SRC.get(language)
        if src is not None:
            premise = self._to_english(premise, src)
            hypothesis = self._to_english(hypothesis, src)

        # One-hot target; any label other than 0 or 1 maps to the third slot.
        lb = [0, 0, 0]
        lb[label if label in (0, 1) else 2] = 1

        # Honour the configured length instead of a hard-coded 512.
        encoded = self.tokenizer.encode_plus(premise, hypothesis, max_length=self.maxLengh,
                                                truncation=True, add_special_tokens=True,
                                                padding='max_length', return_tensors='pt')
        # return_tensors='pt' yields a leading batch axis; [0] drops it.
        token = encoded['input_ids'][0]
        attn_mask = encoded['attention_mask'][0]

        label = torch.from_numpy(np.array(lb, dtype='float32'))

        return {
            'token':token,
            'attn_mask':attn_mask,
            'label':label,
        }

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

In [None]:
# Smoke test: pull a single sample through the dataset/DataLoader pipeline and
# print the shape of every tensor in the batch.
train_dataset = dataset(train, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
batch = next(iter(train_dataloader))
for field in ('token', 'attn_mask', 'label'):
    print(batch[field].shape)

In [None]:
class ClsBert(nn.Module):
    """Multilingual BERT encoder followed by a 3-way classification head.

    The head is dropout (p=0.1) plus a 768 -> 3 linear layer and returns raw
    logits; no softmax is applied inside the model.
    """

    def __init__(self):
        super().__init__()
        self.encoder = BertModel.from_pretrained('bert-base-multilingual-cased')
        dropout = nn.Dropout(0.1)
        projection = nn.Linear(in_features=768, out_features=3)
        self.cls_head = nn.Sequential(dropout, projection)

    def forward(self, x, attn_mask):
        """Encode token ids `x` under `attn_mask` and return the class logits."""
        encoder_out = self.encoder(x, attention_mask=attn_mask)
        # Element 1 of the encoder output feeds the head
        # (presumably BERT's pooled output; confirm against the transformers docs).
        pooled = encoder_out[1]
        return self.cls_head(pooled)

In [None]:
class AccMetric(nn.Module):
    """Batch accuracy for logits scored against one-hot targets.

    A sample counts as correct when the arg-max of its logits equals the
    arg-max of its one-hot target. The previous version thresholded the
    softmax at 0.5, so a sample whose highest class probability was <= 0.5
    was always counted as wrong even when the arg-max was right, which
    under-reported accuracy for a 3-class problem.
    """

    def __init__(self):
        super(AccMetric, self).__init__()

    def forward(self, y_pred, y_true):
        """Return the fraction of correctly classified samples as a Python float.

        Args:
            y_pred: logits, classes on the last axis.
            y_true: one-hot targets with the same layout as `y_pred`.

        Returns:
            Accuracy in [0, 1]; 0.0 for an empty batch.
        """
        if y_true.size()[0] == 0:
            return 0.0
        predicted = torch.argmax(y_pred, dim=-1)
        expected = torch.argmax(y_true, dim=-1)
        return (predicted == expected).float().mean().item()

In [None]:
# move optimizer to gpu
def optimizer_to(optim, device):
    """Move every tensor held in an optimizer's state to `device`, in place.

    Handles both tensors stored directly in `optim.state` and tensors stored
    one level down inside per-parameter state dicts. Gradients attached to
    those tensors are moved as well.
    """
    def _relocate(tensor):
        tensor.data = tensor.data.to(device)
        if tensor._grad is not None:
            tensor._grad.data = tensor._grad.data.to(device)

    for entry in optim.state.values():
        if isinstance(entry, torch.Tensor):
            _relocate(entry)
        elif isinstance(entry, dict):
            for value in entry.values():
                if isinstance(value, torch.Tensor):
                    _relocate(value)

In [None]:
# ----- Data loaders -----
train_dataset = dataset(train, tokenizer)
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            batch_size=8, # Trains with this batch size.
            shuffle=True,
            num_workers=2
        )
val_dataset = dataset(val, tokenizer)
val_dataloader = DataLoader(
            val_dataset,  # The validation samples.
            batch_size=8, # Evaluates with this batch size.
            shuffle=False,
            num_workers=2
        )
dataloaders = {'train':train_dataloader,
               'val':val_dataloader}

# ----- Model, optimiser, schedule, loss -----
net = ClsBert()
optimizer = AdamW(net.parameters(), lr=5e-5)


def warm_up_lr(epoch):
    """Learning-rate multiplier for `epoch` (0-based): linear warm-up, then linear decay.

    Two defects of the previous lambda are fixed here:
      * the warm-up factor was epoch / WARMUP_EPOCHS, i.e. 0 for the first
        epoch, so that whole epoch trained with lr = 0;
      * the decay branch divided by EPOCHS - WARMUP_EPOCHS, which is 0 when
        both constants are equal and raised ZeroDivisionError on the
        scheduler step after the last training epoch.
    """
    if epoch < WARMUP_EPOCHS:
        return (epoch + 1) / WARMUP_EPOCHS
    decay_epochs = EPOCHS - WARMUP_EPOCHS
    if decay_epochs <= 0:
        # No decay phase configured: stay at the peak learning rate.
        return 1.0
    return max(EPOCHS - epoch, 0) / decay_epochs


scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, warm_up_lr)
scaler = torch.cuda.amp.GradScaler()
cate_ce = nn.CrossEntropyLoss()
acc_metric = AccMetric()
# Number of batches whose gradients are accumulated per optimiser step.
accumulated_step = 4

# ----- Resume from checkpoint, or start fresh -----
# Resume only when both files exist. A checkpoint that exists but fails to
# load now raises, instead of being silently reported as "did not find
# trained model" with a half-restored model.
state_path = os.path.join(CKPT_PATH, 'train_state.pt')
history_path = os.path.join(CKPT_PATH, "history.csv")
if os.path.isfile(state_path) and os.path.isfile(history_path):
    state = torch.load(state_path, map_location='cpu')
    net.load_state_dict(state['model_state_dict'])
    print("net state loaded")
    optimizer.load_state_dict(state['optimizer_state_dict'])
    optimizer_to(optimizer, device)
    print("optimizer state loaded")
    print(f"current learning rate {optimizer.param_groups[0]['lr']}")
    scheduler.load_state_dict(state['scheduler_state_dict'])
    print("scheduler state loaded")
    scaler.load_state_dict(state['scaler_state_dict'])
    print("scaler state loaded")
    history = pd.read_csv(history_path)
    print(f"history loaded")
    print(f"best train loss {history['loss'].min():.10f}")
    print(f"best train acc {history['acc'].max():.10f}")
    print(f"best val acc {history['val_acc'].max():.10f}")
    best_train_acc = history['acc'].max()
    last_epoch = state['epoch']
    best_val_acc = max(history['val_acc'].to_list())

    del state
else:
    print("did not find trained model")
    best_train_acc = 0.
    best_val_acc = 0.
    last_epoch = 0
    # Sentinel first row. A dedicated name is used so the `data` DataFrame
    # loaded from train.csv is no longer overwritten by this dict.
    initial_history = {
        'loss':[np.inf],
        'acc':[0],
        'val_loss':[np.inf],
        'val_acc':[0],
        'lr':[np.nan],
    }
    history = pd.DataFrame(initial_history)
    os.makedirs(CKPT_PATH, exist_ok=True)
    history.to_csv(history_path, index=False)


if torch.cuda.is_available():
    net.to(device)
    cate_ce.to(device)
    acc_metric.to(device)

In [None]:
def _run_epoch(mode):
    """Run one pass over dataloaders[mode] and return (mean loss, mean accuracy).

    mode == 'train': the network is put in train mode and the optimiser steps
    once every `accumulated_step` batches. Gradients left over from a final
    partial window are applied at the end of the pass rather than leaking
    into the next epoch.
    Any other mode: the network is put in eval mode and no gradients are built.

    Batches with a NaN loss are left out of the loss average.
    """
    training = mode == 'train'
    loader = dataloaders[mode]
    if training:
        net.train()
        optimizer.zero_grad()  # start the pass without stale gradients
    else:
        net.eval()

    batch_losses = []
    batch_accs = []
    pending = 0  # batches accumulated since the last optimiser step
    for i, batch in enumerate(loader):
        # .to(device) is a no-op when device is the CPU.
        x = batch['token'].to(device)
        y_true = batch['label'].to(device)
        attn_mask = batch['attn_mask'].to(device)

        with torch.set_grad_enabled(training):
            with torch.cuda.amp.autocast():
                y_pred = net(x, attn_mask)
                # Targets are one-hot; CrossEntropyLoss wants class indices.
                loss = cate_ce(y_pred, torch.argmax(y_true, dim=-1))

        if training:
            # Scale so the accumulated gradient is an average over the window.
            scaler.scale(loss / accumulated_step).backward()
            pending += 1
            if pending == accumulated_step:
                scaler.step(optimizer)
                optimizer.zero_grad()
                scaler.update()
                pending = 0

        running_loss = loss.item()
        if not math.isnan(running_loss):
            batch_losses.append(running_loss)

        running_acc = acc_metric(y_pred, y_true)
        batch_accs.append(running_acc)

        if (i+1)%100 == 0:
            print(f"data {i+1} / {len(loader)}")
            print(f"running loss {running_loss:.10f}")
            print(f"running acc : {running_acc:.10f}")

    if training and pending:
        # Apply the gradients of the trailing, incomplete accumulation window.
        scaler.step(optimizer)
        optimizer.zero_grad()
        scaler.update()

    return np.mean(batch_losses), np.mean(batch_accs)


# NOTE(review): the loop always starts at epoch 0, even when a checkpoint was
# restored and `last_epoch` is non-zero. Resuming at `last_epoch` would also
# need the scheduler state, which is saved before scheduler.step(), to be
# advanced by one. Confirm the intended resume behaviour.
for epoch in range(0, EPOCHS):
    print()
    print(f"Epoch {epoch+1} start")

    # ----- Training -----
    print("===== Training =====")
    train_loss, acc = _run_epoch('train')

    print()
    print(f"epoch loss {train_loss:.10f}")
    print(f"epoch avg acc {acc:.10f}")
    now_lr = optimizer.param_groups[0]['lr']
    print(f"now lr {now_lr:.10f}")

    # Save the full training state after every training pass.
    torch.save({
          'epoch': epoch+1,
          'lr':now_lr,
          'model_state_dict': net.state_dict(),
          'optimizer_state_dict': optimizer.state_dict(),
          'scheduler_state_dict':scheduler.state_dict(),
          'scaler_state_dict':scaler.state_dict()
          }, os.path.join(CKPT_PATH, "train_state.pt"))
    print("saving training weights to path", os.path.join(CKPT_PATH, f"train_state.pt"))

    scheduler.step()

    # ----- Validation -----
    print("===== Validating =====")
    val_loss, val_acc = _run_epoch('val')

    print()
    print(f"epoch loss {val_loss:.10f}")
    print(f"epoch avg acc {val_acc:.10f}")

    # Keep a separate copy of the weights with the best validation accuracy.
    if (val_acc > best_val_acc):
        print(f"validating acc improved from {best_val_acc:.10f} to {val_acc:.10f}")
        print("saving weights to path", os.path.join(CKPT_PATH, f"val_acc.pt"))
        torch.save(net.state_dict(), os.path.join(CKPT_PATH, f"val_acc.pt"))
        best_val_acc = val_acc

    append_data = {
          'loss':train_loss,
          'acc':acc,
          'val_loss':val_loss,
          'val_acc':val_acc,
          'lr':now_lr
      }

    # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
    history = pd.concat([history, pd.DataFrame([append_data])], ignore_index=True)

    history.to_csv(os.path.join(CKPT_PATH, "history.csv"), index=False)
    print("Saving history to", os.path.join(CKPT_PATH, "history.csv"))
    print(f"Epoch {epoch+1} end")

In [None]:
# Reload the saved training history and plot loss, accuracy and learning rate
# per epoch on a 2x2 grid (the bottom-right panel is left empty).
history = pd.read_csv(os.path.join(CKPT_PATH, "history.csv"))

train_loss, train_acc = history['loss'], history['acc']
val_loss, val_acc = history['val_loss'], history['val_acc']
lr = history['lr']

fig, axs = plt.subplots(2, 2, figsize=(20, 20))
loss_ax = axs[0, 0]
acc_ax = axs[0, 1]
lr_ax = axs[1, 0]

# Loss curves; each legend entry carries the minimum of its series.
loss_ax.plot(train_loss, label=f'train loss {train_loss.min():.3f}')
loss_ax.plot(val_loss, label=f'val loss {val_loss.min():.3f}')
loss_ax.legend(loc="upper right", fontsize=20)
loss_ax.set_title('Loss', fontsize=20)
loss_ax.set_xlabel('epoch', fontsize=20)
loss_ax.set_ylabel('loss', fontsize=20)

# Accuracy curves; each legend entry carries the maximum of its series.
acc_ax.plot(train_acc, label=f'train acc {train_acc.max():.3f}')
acc_ax.plot(val_acc, label=f'val acc {val_acc.max():.3f}')
acc_ax.legend(loc="best", fontsize=20)
acc_ax.set_title('Acc', fontsize=20)
acc_ax.set_xlabel('epoch', fontsize=20)
acc_ax.set_ylabel('acc', fontsize=20)

# Mark and annotate the row with the highest validation accuracy.
x, y = val_acc.argmax(), val_acc.max()
print(x, y)
acc_ax.plot(x, y, 'r*')
acc_ax.annotate(f"{y:.3f}", (x, y), fontsize=15)

# Learning rate recorded for each epoch.
lr_ax.plot(lr)
lr_ax.set_title('lr', fontsize=20)
lr_ax.set_xlabel('epoch', fontsize=20)
lr_ax.set_ylabel('lr', fontsize=20)

In [None]:
# Rebuild the model and restore the weights stored in the training checkpoint.
# NOTE(review): this loads train_state.pt (weights after the last completed
# training pass), not the best-validation weights that the training loop
# writes to val_acc.pt. Confirm this is intended.
checkpoint_file = os.path.join(CKPT_PATH, 'train_state.pt')
state = torch.load(checkpoint_file, map_location='cpu')
net = ClsBert()
net.load_state_dict(state['model_state_dict'])
net.eval().to(device)

In [None]:
# Values of the `language` column that get translated to English, mapped to
# the source-language code handed to googletrans.
translate_src = {"Thai": "th", "Vietnamese": "vi"}


def _to_english(text, src):
    """Translate `text` from `src` to English; return `text` unchanged on failure."""
    try:
        return translator.translate(text, dest='en', src=src).text
    except Exception:
        # Best-effort network call: keep predicting with the untranslated
        # text rather than aborting the whole inference run.
        warnings.warn("translation failed; using the untranslated text",
                      RuntimeWarning)
        return text


# Predict one class index per test row, in test-set order.
preds = []
for i in range(len(test)):
    row = test.iloc[i]
    premise = text_clean(row['premise'])
    hypothesis = text_clean(row['hypothesis'])
    # The language name is compared verbatim. Passing it through text_clean
    # lower-cased it, so "Thai"/"Vietnamese" never matched and no row was
    # ever translated.
    language = str(row['language']).strip()

    src = translate_src.get(language)
    if src is not None:
        premise = _to_english(premise, src)
        hypothesis = _to_english(hypothesis, src)

    encoded = tokenizer.encode_plus(premise, hypothesis, max_length=512,
                                                truncation=True, add_special_tokens=True,
                                                padding='max_length', return_tensors='pt')
    # return_tensors='pt' already adds the batch axis the model expects.
    token = encoded['input_ids'].to(device)
    attn_mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        with torch.cuda.amp.autocast():
            y_pred = net(token, attn_mask)

    # Softmax is monotonic, so the arg-max of the logits is the predicted class.
    preds.append(y_pred.argmax(dim=-1).item())

In [None]:
# Load the sample submission as a template; its `prediction` column is
# overwritten with the model's predictions in the next cell.
submission = pd.read_csv("/kaggle/input/contradictory-my-dear-watson/sample_submission.csv")
submission

In [None]:
# Replace the template's prediction column with the predicted class indices.
submission['prediction'] = preds
submission

In [None]:
# Write the predictions to submission.csv in the current working directory,
# without the DataFrame index column.
submission.to_csv("submission.csv", index=False)