In [None]:
#  XLM-Roberta model

In [None]:
# To use TPU
# Run this cell before anything else: per the log below, the setup script
# uninstalls the preinstalled torch and updates the TPU runtime.
# NOTE(review): the script is fetched from the `master` branch, so the exact
# version that runs is not pinned.

!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  5116  100  5116    0     0  33657      0 --:--:-- --:--:-- --:--:-- 33657
Updating... This may take around 2 minutes.
Updating TPU runtime to pytorch-dev20200515 ...
Found existing installation: torch 1.10.0+cu111
Collecting cloud-tpu-client
  Downloading cloud_tpu_client-0.10-py3-none-any.whl (7.4 kB)
Collecting google-api-python-client==1.8.0
  Downloading google_api_python_client-1.8.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 2.5 MB/s 
Uninstalling torch-1.10.0+cu111:
Installing collected packages: google-api-python-client, cloud-tpu-client
  Attempting uninstall: google-api-python-client
    Found existing installation: google-api-python-client 1.12.8
    Uninstalling google-api-python-client-1.12.8:
      Succ

In [None]:
# Imports required to use TPUs with Pytorch.
# https://pytorch.org/xla/release/1.5/index.html

import torch_xla
import torch_xla.core.xla_model as xm

In [None]:
!pip install transformers[sentencepiece] --quiet

[K     |████████████████████████████████| 3.1 MB 5.3 MB/s 
[K     |████████████████████████████████| 3.3 MB 63.9 MB/s 
[K     |████████████████████████████████| 895 kB 75.8 MB/s 
[K     |████████████████████████████████| 59 kB 7.2 MB/s 
[K     |████████████████████████████████| 596 kB 73.4 MB/s 
[K     |████████████████████████████████| 1.2 MB 70.0 MB/s 
[?25h

In [None]:
import pandas as pd
import numpy as np
import os
import gc

import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# set a seed value
torch.manual_seed(555)

from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, accuracy_score

import transformers
from transformers import BertTokenizer, BertForSequenceClassification 
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import AdamW

import warnings
warnings.filterwarnings("ignore")


print(torch.__version__)

1.6.0a0+bf2bbd9


XLM means Cross-lingual Language Model. XLM-RoBERTa (XLM-R) is a pre-trained multilingual model that outperforms multilingual BERT. One reason for this is that XLM-R was trained on a lot more data. XLM-R was also trained on 100 languages.

Several versions of xlm roberta are available in the Transformers library. Here are two:

- xlm-roberta-base
- xlm-roberta-large

In [None]:
# Load the xlm-roberta tokenizer (and with it the vocabulary).
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

# Name of the pretrained checkpoint handed to from_pretrained().
MODEL_TYPE = 'xlm-roberta-base'

tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_TYPE)

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

In [None]:
# Check the vocab size

tokenizer.vocab_size

250002

In [None]:
# What are the special tokens

tokenizer.special_tokens_map

{'bos_token': '<s>',
 'cls_token': '<s>',
 'eos_token': '</s>',
 'mask_token': '<mask>',
 'pad_token': '<pad>',
 'sep_token': '</s>',
 'unk_token': '<unk>'}

### For one input sentence

In [None]:
MAX_LEN = 10 # This value could be set as 256, 512 etc.

sentence1 = 'Hello there.'

# Encode one sentence into a fixed-length sequence of MAX_LEN token ids.
# padding='max_length' replaces the deprecated pad_to_max_length=True, and
# truncation=True states the truncation to max_length explicitly, which is
# what the tokenizer otherwise warns about.
encoded_dict = tokenizer.encode_plus(
            sentence1,
            add_special_tokens = True,
            max_length = MAX_LEN,
            padding = 'max_length',
            truncation = True,
            return_attention_mask = True,
            return_tensors = 'pt' # return pytorch tensors
       )


encoded_dict

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': tensor([[    0, 35378,  2685,     5,     2,     1,     1,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}

In [None]:
# These have already been converted to torch tensors.
input_ids = encoded_dict['input_ids'][0]
att_mask = encoded_dict['attention_mask'][0]

print(input_ids)
print(att_mask)

tensor([    0, 35378,  2685,     5,     2,     1,     1,     1,     1,     1])
tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])


### For two input sentences

In [None]:
MAX_LEN = 15

sentence1 = 'Hello there.'
sentence2 = 'How are you?'

# Encode the two sentences as one sequence of MAX_LEN token ids.
# padding='max_length' replaces the deprecated pad_to_max_length=True, and
# truncation=True states the truncation to max_length explicitly.
encoded_dict = tokenizer.encode_plus(
            sentence1, sentence2,
            add_special_tokens = True,
            max_length = MAX_LEN,
            padding = 'max_length',
            truncation = True,
            return_attention_mask = True,
            return_tensors = 'pt' # return pytorch tensors
       )


encoded_dict

{'input_ids': tensor([[    0, 35378,  2685,     5,     2,     2, 11249,   621,   398,    32,
             2,     1,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}

In [None]:
input_ids = encoded_dict['input_ids'][0]
att_mask = encoded_dict['attention_mask'][0]

# These are torch tensors.
print(input_ids)
print(att_mask)

tensor([    0, 35378,  2685,     5,     2,     2, 11249,   621,   398,    32,
            2,     1,     1,     1,     1])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0])


## Decoding a sequence of tokens

In [None]:
# input_ids from above

input_ids = encoded_dict['input_ids'][0]

print(input_ids)

tensor([    0, 35378,  2685,     5,     2,     2, 11249,   621,   398,    32,
            2,     1,     1,     1,     1])


In [None]:
# https://huggingface.co/transformers/main_classes/tokenizer.html
# skip_special_tokens – if set to True, the special tokens are removed from the decoded text.

# Decode with the special tokens (<s>, </s>, <pad>) kept ...
a = tokenizer.decode(input_ids,
                skip_special_tokens=False)

# ... and with the special tokens left out.
b = tokenizer.decode(input_ids,
                skip_special_tokens=True)



print(a)
print(b)

<s> Hello there.</s></s> How are you?</s><pad><pad><pad><pad>
Hello there. How are you?


| <a id='Manual_formatting_of_model_input_data'></a>

In [None]:
MAX_LEN = 15 # This value could be set as 256, 512 etc.

sentence1 = 'Hello there. How are you? Have a nice day. This is a test?'


# The sentence is longer than MAX_LEN tokens: the tokens that do not fit are
# returned under 'overflowing_tokens'. With stride=0 there is no overlap
# between the kept and the overflowing tokens.
# padding='max_length' replaces the deprecated pad_to_max_length=True, and
# truncation=True states the truncation to max_length explicitly.
encoded_dict = tokenizer.encode_plus(
            sentence1,
            max_length = MAX_LEN,
            stride=0,
            padding = 'max_length',
            truncation = True,
            return_overflowing_tokens=True,
       )


encoded_dict

{'overflowing_tokens': [83, 10, 3034, 32], 'num_truncated_tokens': 4, 'input_ids': [0, 35378, 2685, 5, 11249, 621, 398, 32, 31901, 10, 26267, 5155, 5, 3293, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
MAX_LEN = 15 # This value could be set as 256, 512 etc.

sentence1 = 'Hello there. How are you? Have a nice day. This is a test?'


# Same as the previous example, but with stride=3 the overflowing tokens
# start with the last 3 tokens that were kept, i.e. the two parts overlap.
# padding='max_length' replaces the deprecated pad_to_max_length=True, and
# truncation=True states the truncation to max_length explicitly.
encoded_dict = tokenizer.encode_plus(
            sentence1,
            max_length = MAX_LEN,
            stride=3,
            padding = 'max_length',
            truncation = True,
            return_overflowing_tokens=True,
       )


encoded_dict

{'overflowing_tokens': [5155, 5, 3293, 83, 10, 3034, 32], 'num_truncated_tokens': 4, 'input_ids': [0, 35378, 2685, 5, 11249, 621, 398, 32, 31901, 10, 26267, 5155, 5, 3293, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Here you can see the overlap.

print(encoded_dict['input_ids'])
print(encoded_dict['overflowing_tokens'])

[0, 35378, 2685, 5, 11249, 621, 398, 32, 31901, 10, 26267, 5155, 5, 3293, 2]
[5155, 5, 3293, 83, 10, 3034, 32]


In [None]:
from google.colab import files
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json



In [None]:
! kaggle competitions download contradictory-my-dear-watson

Downloading test.csv.zip to /content
  0% 0.00/536k [00:00<?, ?B/s]
100% 536k/536k [00:00<00:00, 78.6MB/s]
Downloading train.csv.zip to /content
  0% 0.00/1.23M [00:00<?, ?B/s]
100% 1.23M/1.23M [00:00<00:00, 84.6MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/66.0k [00:00<?, ?B/s]
100% 66.0k/66.0k [00:00<00:00, 60.2MB/s]


In [None]:
!unzip train.csv.zip
!unzip test.csv.zip

Archive:  train.csv.zip
  inflating: train.csv               
Archive:  test.csv.zip
  inflating: test.csv                


In [None]:

df_train = pd.read_csv('./train.csv')

print(df_train.shape)

df_train.head()

(12120, 6)


Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


In [None]:

df_test = pd.read_csv('./test.csv')

print(df_test.shape)

df_test.head()

(5195, 5)


Unnamed: 0,id,premise,hypothesis,lang_abv,language
0,c6d58c3f69,بکس، کیسی، راہیل، یسعیاہ، کیلی، کیلی، اور کولم...,"کیسی کے لئے کوئی یادگار نہیں ہوگا, کولمین ہائی...",ur,Urdu
1,cefcc82292,هذا هو ما تم نصحنا به.,عندما يتم إخبارهم بما يجب عليهم فعله ، فشلت ال...,ar,Arabic
2,e98005252c,et cela est en grande partie dû au fait que le...,Les mères se droguent.,fr,French
3,58518c10ba,与城市及其他公民及社区组织代表就IMA的艺术发展进行对话&amp,IMA与其他组织合作，因为它们都依靠共享资金。,zh,Chinese
4,c32b0d16df,Она все еще была там.,"Мы думали, что она ушла, однако, она осталась.",ru,Russian


| <a id='Create_5_Folds'></a>

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold

# shuffle (seeded, so that re-running the notebook yields the same folds)
df = shuffle(df_train, random_state=1024)

# initialize kfold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1024)

# for stratification
y = df['label']

# Put the folds into a list. This is a list of tuples:
# each fold is ([train_row_positions], [val_row_positions]).
fold_list = list(kf.split(df, y))

train_df_list = []
val_df_list = []

for train_positions, val_positions in fold_list:

    # kf.split() yields row *positions*, not index labels. After the shuffle
    # the index labels no longer match the positions, so the rows have to be
    # selected with .iloc; matching the positions against df.index would pick
    # different rows and lose the stratification.
    # The fold frames are not assigned to df_train/df_val here, so the full
    # training frame stays intact and this cell can be re-run safely.
    train_df_list.append(df.iloc[train_positions])
    val_df_list.append(df.iloc[val_positions])


print(len(train_df_list))
print(len(val_df_list))

5
5


In [None]:
# Display one train fold

df_train = train_df_list[0]

df_train.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
8670,89b8ed82f3,There's nothing like the trendy resort cloth...,There is no trendy resort clothing available h...,en,English,2
3520,458324e69c,"So far, however, the number of mail pieces los...",The amount of mail lost is too smal to leave ...,en,English,0
8547,6d101fabf4,Um-hum vizuri na ningekuwa nasema kuwa kuna ma...,Mimi nilikuwa naenda kupendekeza kupunguzwa kw...,sw,Swahili,0
2647,e7cc1fd1d7,"In addition, Saracens invaded the Provencal co...",The Magyar armies did not attack anyone.,en,English,2
6225,8917f23f75,what was the problem,i know exactly what the problem is.,en,English,2


In [None]:
# Display one val fold

df_val = val_df_list[0]

df_val.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
7026,d8aac27733,The chain wielder smiled at her.,The chain wielder frowned at her.,en,English,2
3982,1bcdb7611d,"Oh! I exclaimed, much relieved.",He was going to get the operation to remove hi...,en,English,1
4065,e42294e557,Là tổ chức chủ nhà địa phương cho Hội đồng quố...,Trung tâm nói không có du khách nào có thể đến.,vi,Vietnamese,2
4272,88675218ef,یہودی مقصود کے مقابلے میں عام قسمت کا احساس کہ...,یہودی لوگوں کو اپنے عقائد کو سکھانے کے لئے چاہ...,ur,Urdu,1
5477,5f47d9eebe,Mfululizo wa mashindano ya uvuvi hufanya msimu...,Watu hupenda kunywa pombe nyingi kwenye baa wa...,sw,Swahili,1


In [None]:
# Train Roberta model
# Checkpoint name handed to from_pretrained() for both the tokenizer and the model.
MODEL_TYPE = 'xlm-roberta-base'


L_RATE = 1e-5  # learning rate given to the optimizer
MAX_LEN = 256  # every premise/hypothesis pair is padded or truncated to this many tokens

NUM_EPOCHS = 3  # number of passes over the training fold
BATCH_SIZE = 32  # batch size of the train, val and test dataloaders
# Passed to the dataloaders as num_workers.
NUM_CORES = os.cpu_count()

NUM_CORES

40

In [None]:
df_train = train_df_list[0]

df_train.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
8670,89b8ed82f3,There's nothing like the trendy resort cloth...,There is no trendy resort clothing available h...,en,English,2
3520,458324e69c,"So far, however, the number of mail pieces los...",The amount of mail lost is too smal to leave ...,en,English,0
8547,6d101fabf4,Um-hum vizuri na ningekuwa nasema kuwa kuna ma...,Mimi nilikuwa naenda kupendekeza kupunguzwa kw...,sw,Swahili,0
2647,e7cc1fd1d7,"In addition, Saracens invaded the Provencal co...",The Magyar armies did not attack anyone.,en,English,2
6225,8917f23f75,what was the problem,i know exactly what the problem is.,en,English,2


In [None]:
df_val = val_df_list[0]

df_val.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
7026,d8aac27733,The chain wielder smiled at her.,The chain wielder frowned at her.,en,English,2
3982,1bcdb7611d,"Oh! I exclaimed, much relieved.",He was going to get the operation to remove hi...,en,English,1
4065,e42294e557,Là tổ chức chủ nhà địa phương cho Hội đồng quố...,Trung tâm nói không có du khách nào có thể đến.,vi,Vietnamese,2
4272,88675218ef,یہودی مقصود کے مقابلے میں عام قسمت کا احساس کہ...,یہودی لوگوں کو اپنے عقائد کو سکھانے کے لئے چاہ...,ur,Urdu,1
5477,5f47d9eebe,Mfululizo wa mashindano ya uvuvi hufanya msimu...,Watu hupenda kunywa pombe nyingi kwenye baa wa...,sw,Swahili,1


In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

# Load the tokenizer that belongs to MODEL_TYPE (set in the configuration cell).
# To try the larger checkpoint, set MODEL_TYPE to 'xlm-roberta-large' there.
print('Loading XLMRoberta tokenizer...')
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_TYPE)

Loading XLMRoberta tokenizer...


In [None]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [None]:
class CompDataset(Dataset):
    """Labelled premise/hypothesis pairs, tokenized on the fly.

    Each item is a tuple ``(input_ids, attention_mask, label)`` in which
    ``input_ids`` and ``attention_mask`` are the first row of the tensors
    returned by the tokenizer and ``label`` is a torch tensor built from the
    'label' column.

    Args:
        df: DataFrame with 'premise', 'hypothesis' and 'label' columns.
        tokenizer: Optional tokenizer exposing ``encode_plus``. When omitted,
            the notebook-level ``tokenizer`` is looked up each time an item
            is fetched (the original behavior).
        max_len: Optional length to pad/truncate every pair to. When omitted,
            the notebook-level ``MAX_LEN`` is looked up each time an item is
            fetched (the original behavior).
    """

    def __init__(self, df, tokenizer=None, max_len=None):
        self.df_data = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # `index` is a position in range(len(self)), so rows are looked up by
        # position (.iloc). A label-based .loc lookup only works when the
        # frame happens to carry a fresh 0..n-1 index.
        sentence1 = self.df_data['premise'].iloc[index]
        sentence2 = self.df_data['hypothesis'].iloc[index]

        # Fall back to the notebook globals when nothing was injected.
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer
        active_max_len = self.max_len if self.max_len is not None else MAX_LEN

        encoded_dict = active_tokenizer.encode_plus(
            sentence1, sentence2,          # Sentences to encode.
            add_special_tokens=True,       # Add the special tokens.
            max_length=active_max_len,     # Pad & truncate all pairs to this length.
            padding='max_length',          # Replaces the deprecated pad_to_max_length=True.
            truncation=True,
            return_attention_mask=True,    # Construct attn. masks.
            return_tensors='pt',           # Return pytorch tensors.
        )

        # The tokenizer returns tensors with a leading batch dimension of 1.
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]

        # Convert the target to a torch tensor
        target = torch.tensor(self.df_data['label'].iloc[index])

        return (padded_token_list, att_mask, target)

    def __len__(self):
        return len(self.df_data)
    
    
    
    
    

class TestDataset(Dataset):
    """Unlabelled premise/hypothesis pairs, tokenized on the fly.

    Each item is a tuple ``(input_ids, attention_mask)`` holding the first row
    of the tensors returned by the tokenizer.

    Args:
        df: DataFrame with 'premise' and 'hypothesis' columns.
        tokenizer: Optional tokenizer exposing ``encode_plus``. When omitted,
            the notebook-level ``tokenizer`` is looked up each time an item
            is fetched (the original behavior).
        max_len: Optional length to pad/truncate every pair to. When omitted,
            the notebook-level ``MAX_LEN`` is looked up each time an item is
            fetched (the original behavior).
    """

    def __init__(self, df, tokenizer=None, max_len=None):
        self.df_data = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # `index` is a position in range(len(self)), so rows are looked up by
        # position (.iloc). A label-based .loc lookup only works when the
        # frame happens to carry a fresh 0..n-1 index.
        sentence1 = self.df_data['premise'].iloc[index]
        sentence2 = self.df_data['hypothesis'].iloc[index]

        # Fall back to the notebook globals when nothing was injected.
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer
        active_max_len = self.max_len if self.max_len is not None else MAX_LEN

        encoded_dict = active_tokenizer.encode_plus(
            sentence1, sentence2,          # Sentences to encode.
            add_special_tokens=True,       # Add the special tokens.
            max_length=active_max_len,     # Pad & truncate all pairs to this length.
            padding='max_length',          # Replaces the deprecated pad_to_max_length=True.
            truncation=True,
            return_attention_mask=True,    # Construct attn. masks.
            return_tensors='pt',           # Return pytorch tensors.
        )

        # The tokenizer returns tensors with a leading batch dimension of 1.
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]

        return (padded_token_list, att_mask)

    def __len__(self):
        return len(self.df_data)

In [None]:
# Wrap the train/val folds and the test frame in their Dataset classes.
train_data = CompDataset(df_train)
val_data = CompDataset(df_val)
test_data = TestDataset(df_test)

# Settings that are the same for all three loaders.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_CORES)

# Train and val batches are drawn in random order; the test order is kept.
train_dataloader = DataLoader(train_data, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_data, shuffle=True, **loader_kwargs)
test_dataloader = DataLoader(test_data, shuffle=False, **loader_kwargs)

# Number of batches per loader.
for loader in (train_dataloader, val_dataloader, test_dataloader):
    print(len(loader))

303
76
163


In [None]:
# Get one train batch

padded_token_list, att_mask, target = next(iter(train_dataloader))

print(padded_token_list.shape)
print(att_mask.shape)
print(target.shape)

torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])


In [None]:
# Get one val batch

padded_token_list, att_mask, target = next(iter(val_dataloader))

print(padded_token_list.shape)
print(att_mask.shape)
print(target.shape)

torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])


In [None]:
# Get one test batch

padded_token_list, att_mask = next(iter(test_dataloader))

print(padded_token_list.shape)
print(att_mask.shape)

torch.Size([32, 256])
torch.Size([32, 256])


In [None]:
from transformers import XLMRobertaForSequenceClassification

# Acquire the XLA (TPU) device. `device` is used by this and every later cell
# that moves the model or a batch, but no earlier cell of the notebook defines it.
device = xm.xla_device()

model = XLMRobertaForSequenceClassification.from_pretrained(
    MODEL_TYPE,
    num_labels = 3, # The number of output labels (the label column holds 0, 1 and 2). 2 for binary classification.
)

# Send the model to the device.
model.to(device)

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

In [None]:
# Create a batch of train samples
# NOTE: this rebinds train_dataloader to a loader with batch_size=8 (instead of
# BATCH_SIZE); the full-size loaders are built again before training starts.


train_dataloader = torch.utils.data.DataLoader(train_data,
                                        batch_size=8,
                                        shuffle=True,
                                       num_workers=NUM_CORES)

# Unpack one batch: token ids, attention masks and labels.
b_input_ids, b_input_mask, b_labels = next(iter(train_dataloader))

print(b_input_ids.shape)
print(b_input_mask.shape)
print(b_labels.shape)

torch.Size([8, 256])
torch.Size([8, 256])
torch.Size([8])


In [None]:
# Pass a batch of train samples to the model.

batch = next(iter(train_dataloader))

# Send the data to the device
b_input_ids = batch[0].to(device)
b_input_mask = batch[1].to(device)
b_labels = batch[2].to(device)

# Run the model
# Because labels are passed, the output contains a loss as well as the logits.
outputs = model(b_input_ids, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)

# The output is a SequenceClassifierOutput holding the loss and the logits;
# it can be indexed like a tuple: outputs[0] is the loss, outputs[1] the logits.
outputs

Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/util.py", line 300, in _run_finalizers
    finalizer()
  File "/usr/lib/python3.7/multiprocessing/util.py", line 224, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/multiprocessing/util.py", line 133, in _remove_temp_dir
    rmtree(tempdir)
  File "/usr/lib/python3.7/shutil.py", line 498, in rmtree
    onerror(os.rmdir, path, sys.exc_info())
  File "/usr/lib/python3.7/shutil.py", line 496, in rmtree
    os.rmdir(path)
OSError: [Errno 39] Directory not empty: '/tmp/pymp-ec9_6_l0'


SequenceClassifierOutput([('loss', tensor(1.1962, device='xla:1')),
                          ('logits', tensor([[ 0.3614, -0.0760,  0.3286],
                                   [ 0.3700, -0.0999,  0.3229],
                                   [ 0.3561, -0.0863,  0.3226],
                                   [ 0.3579, -0.1016,  0.3209],
                                   [ 0.3281, -0.0359,  0.2813],
                                   [ 0.3232, -0.0537,  0.3000],
                                   [ 0.3711, -0.0893,  0.3195],
                                   [ 0.3499, -0.0808,  0.3103]], device='xla:1'))])

## Inspect the model's output

In [None]:
outputs

SequenceClassifierOutput([('loss', tensor(1.1962, device='xla:1')),
                          ('logits', tensor([[ 0.3614, -0.0760,  0.3286],
                                   [ 0.3700, -0.0999,  0.3229],
                                   [ 0.3561, -0.0863,  0.3226],
                                   [ 0.3579, -0.1016,  0.3209],
                                   [ 0.3281, -0.0359,  0.2813],
                                   [ 0.3232, -0.0537,  0.3000],
                                   [ 0.3711, -0.0893,  0.3195],
                                   [ 0.3499, -0.0808,  0.3103]], device='xla:1'))])

In [None]:
# The output is a tuple: (loss, preds)

len(outputs)

2

In [None]:
# This is the loss.

outputs[0]

tensor(1.1962, device='xla:1')

In [None]:
# These are the predictions.

outputs[1]

tensor([[ 0.3614, -0.0760,  0.3286],
        [ 0.3700, -0.0999,  0.3229],
        [ 0.3561, -0.0863,  0.3226],
        [ 0.3579, -0.1016,  0.3209],
        [ 0.3281, -0.0359,  0.2813],
        [ 0.3232, -0.0537,  0.3000],
        [ 0.3711, -0.0893,  0.3195],
        [ 0.3499, -0.0808,  0.3103]], device='xla:1')

In [None]:
preds = outputs[1].detach().cpu().numpy()

y_true = b_labels.detach().cpu().numpy()
y_pred = np.argmax(preds, axis=1)

y_pred

array([0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
# This is the accuracy without fine tuning.
# It is computed on the single batch of 8 training samples that was passed
# through the model above, so it is only a rough sanity check. Despite the
# name val_acc, no validation data is involved here.

val_acc = accuracy_score(y_true, y_pred)

val_acc

0.125

In [None]:
# The loss and preds are Torch tensors

print(type(outputs[0]))
print(type(outputs[1]))

<class 'torch.Tensor'>
<class 'torch.Tensor'>


## Define the Optimizer

In [None]:
# Define the optimizer
# AdamW is the class imported from transformers at the top of the notebook.
# It updates all model parameters with learning rate L_RATE; eps is passed
# explicitly as 1e-8.
optimizer = AdamW(model.parameters(),
              lr = L_RATE, 
              eps = 1e-8 
            )

## Train the Model

In [None]:
# Create the dataloaders.
# They are built again here because train_dataloader was rebound to a
# batch_size=8 loader for the single-batch demonstration above.

train_data = CompDataset(df_train)
val_data = CompDataset(df_val)
test_data = TestDataset(df_test)

# Settings that are the same for all three loaders.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_CORES)

# Train and val batches are drawn in random order; the test order is kept.
train_dataloader = DataLoader(train_data, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_data, shuffle=True, **loader_kwargs)
test_dataloader = DataLoader(test_data, shuffle=False, **loader_kwargs)

# Number of batches per loader.
for loader in (train_dataloader, val_dataloader, test_dataloader):
    print(len(loader))

303
76
163


In [None]:
%%time


# Set the seed.
seed_val = 101

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []


# For each epoch...
for epoch in range(0, NUM_EPOCHS):
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, NUM_EPOCHS))
    

    stacked_val_labels = []
    targets_list = []

    # ========================================
    #               Training
    # ========================================
    
    print('Training...')
    
    # put the model into train mode
    model.train()
    
    # This turns gradient calculations on and off.
    torch.set_grad_enabled(True)


    # Reset the total loss for this epoch.
    total_train_loss = 0

    for i, batch in enumerate(train_dataloader):
        
        train_status = 'Batch ' + str(i) + ' of ' + str(len(train_dataloader))
        
        print(train_status, end='\r')


        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        


        outputs = model(b_input_ids, 
                    attention_mask=b_input_mask,
                    labels=b_labels)
        
        # Get the loss from the outputs tuple: (loss, logits)
        loss = outputs[0]
        
        # Convert the loss from a torch tensor to a number.
        # Calculate the total loss.
        total_train_loss = total_train_loss + loss.item()
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        
        
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        
        
        # Use the optimizer to update the weights.
        
        # Optimizer for GPU
        # optimizer.step() 
        
        # Optimizer for TPU
        # https://pytorch.org/xla/
        xm.optimizer_step(optimizer, barrier=True)

    
    print('Train loss:' ,total_train_loss)


    # ========================================
    #               Validation
    # ========================================
    
    print('\nValidation...')

    # Put the model in evaluation mode.
    model.eval()

    # Turn off the gradient calculations.
    # This tells the model not to compute or store gradients.
    # This step saves memory and speeds up validation.
    torch.set_grad_enabled(False)
    
    
    # Reset the total loss for this epoch.
    total_val_loss = 0
    

    for j, batch in enumerate(val_dataloader):
        
        val_status = 'Batch ' + str(j) + ' of ' + str(len(val_dataloader))
        
        print(val_status, end='\r')

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)      


        outputs = model(b_input_ids, 
                attention_mask=b_input_mask, 
                labels=b_labels)
        
        # Get the loss from the outputs tuple: (loss, logits)
        loss = outputs[0]
        
        # Convert the loss from a torch tensor to a number.
        # Calculate the total loss.
        total_val_loss = total_val_loss + loss.item()
        

        # Get the preds
        preds = outputs[1]


        # Move preds to the CPU
        val_preds = preds.detach().cpu().numpy()
        
        # Move the labels to the cpu
        targets_np = b_labels.to('cpu').numpy()

        # Append the labels to a numpy list
        targets_list.extend(targets_np)

        if j == 0:  # first batch
            stacked_val_preds = val_preds

        else:
            stacked_val_preds = np.vstack((stacked_val_preds, val_preds))

    
    # Calculate the validation accuracy
    y_true = targets_list
    y_pred = np.argmax(stacked_val_preds, axis=1)
    
    val_acc = accuracy_score(y_true, y_pred)
    
    
    print('Val loss:' ,total_val_loss)
    print('Val acc: ', val_acc)


    # Save the Model
    torch.save(model.state_dict(), 'model.pt')
    
    # Use the garbage collector to save memory.
    gc.collect()


Training...
Train loss: 327.86276692152023

Validation...
Val loss: 75.82385063171387
Val acc:  0.5371287128712872

Training...
Train loss: 268.60993725061417

Validation...
Val loss: 56.32665252685547
Val acc:  0.6951320132013201

Training...
Train loss: 218.642987459898

Validation...
Val loss: 53.42127624154091
Val acc:  0.7157590759075908
CPU times: user 7min 27s, sys: 56.8 s, total: 8min 24s
Wall time: 16min 34s


In [None]:
# Run the fine-tuned model over the test set and collect the logits.
# Evaluation mode and disabled gradients are set here explicitly rather than
# relying on the state left behind by the training cell.
model.eval()

# One array of logits per test batch.
test_preds_list = []

with torch.no_grad():

    for j, batch in enumerate(test_dataloader):

        inference_status = 'Batch ' + str(j+1) + ' of ' + str(len(test_dataloader))

        print(inference_status, end='\r')

        # Test batches hold only token ids and attention masks. There are no
        # labels, so nothing is appended to targets_list here.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        outputs = model(b_input_ids,
                attention_mask=b_input_mask)

        # Get the preds. No labels were passed, so the first element of the
        # output is the logits rather than a loss.
        preds = outputs[0]

        # Move preds to the CPU
        test_preds_list.append(preds.detach().cpu().numpy())

# Stack the predictions: one row of logits per test sample, in test order
# (the test dataloader does not shuffle).
stacked_preds = np.vstack(test_preds_list)



In [None]:
stacked_preds

array([[-0.591747  , -0.46209022,  0.47971505],
       [-1.4384645 ,  0.59447134,  0.41534314],
       [ 1.6042563 , -0.89423877, -1.1976119 ],
       ...,
       [ 0.8453864 , -0.5440602 , -0.9371083 ],
       [ 2.2397861 , -1.3524817 , -1.075075  ],
       [-0.91773313, -1.6126078 ,  2.4981465 ]], dtype=float32)

## Process the Predictions

In [None]:
# Take the argmax. This returns the column index of the max value in each row.

preds = np.argmax(stacked_preds, axis=1)

preds

array([2, 1, 0, ..., 0, 0, 2])

## Create a submission csv file

In [None]:

df_sample = pd.read_csv('./sample_submission.csv')

print(df_sample.shape)

df_sample.head()

(5195, 2)


Unnamed: 0,id,prediction
0,c6d58c3f69,1
1,cefcc82292,1
2,e98005252c,1
3,58518c10ba,1
4,c32b0d16df,1


In [None]:
# Assign the preds to the prediction column

df_sample['prediction'] = preds

df_sample.head()

Unnamed: 0,id,prediction
0,c6d58c3f69,2
1,cefcc82292,1
2,e98005252c,0
3,58518c10ba,1
4,c32b0d16df,2


In [None]:

df_sample.to_csv('xlmroberta_submission.csv', index=False)

In [None]:
# Check the distribution of the predicted classes.

df_sample['prediction'].value_counts()

1    1766
2    1734
0    1695
Name: prediction, dtype: int64