In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install --upgrade pip
!pip install --upgrade allennlp
!pip install transformers==4.0.1

In [None]:
# for TPU
# Download the PyTorch/XLA environment setup script and run it, asking it to
# install the libomp5 and libopenblas-dev apt packages as well.
# Presumably this is what provides the torch_xla package imported further down.
# NOTE(review): the script is fetched from the `master` branch, so its contents
# are not pinned and may change between runs.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [None]:
import transformers
import pandas as pd
import torch

In [None]:
# for TPU
import torch_xla
import torch_xla.core.xla_model as xm

In [None]:
# Report the library versions in use so a run can be reproduced later.
for label, library in (('Transformers version: ', transformers),
                       ('Pytorch version: ', torch)):
    print(label, library.__version__)

# Import Data

In [None]:
data_dir = '/kaggle/input/contradictory-my-dear-watson/'

# The test set is read as-is.
test_df = pd.read_csv(f'{data_dir}test.csv')

# The training set is shuffled once with a fixed seed; the train/validation
# split further down is a positional slice of this shuffled frame.
train_df = pd.read_csv(f'{data_dir}train.csv')
train_df = train_df.sample(frac=1, random_state=100)

# Show the label distribution, then preview the first rows.
print(train_df['label'].value_counts())
train_df.head(5)

# Tokenization & Input Construction

In [None]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
# Alternative checkpoint, currently disabled:
#PRE_TRAINED_MODEL = 'bert-base-multilingual-cased'
PRE_TRAINED_MODEL = 'xlm-roberta-large'
# Load the tokenizer that matches the selected checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL)

In [None]:
# Every premise/hypothesis pair is padded or truncated to this many tokens.
MAX_LEN = 84
# Row position separating the training part (first 80%) from the validation part.
split_idx = int(len(train_df) * 0.8)

In [None]:
def get_encode (data):
    """Tokenize premise/hypothesis pairs into fixed-length PyTorch tensors.

    Args:
        data: object supporting ``data['premise']`` and ``data['hypothesis']``
            (here a pandas DataFrame); both columns are converted to lists
            and encoded as sentence pairs.

    Returns:
        The tokenizer output for the batch, requested as PyTorch tensors and
        including ``input_ids``, ``token_type_ids`` and ``attention_mask``.
        Every sequence is padded or truncated to ``MAX_LEN`` tokens.
    """
    tokenized_data = tokenizer(
        text=list(data['premise']),
        text_pair=list(data['hypothesis']),
        max_length=MAX_LEN,
        # `pad_to_max_length=True` is deprecated; `padding='max_length'` is the
        # equivalent supported spelling.
        padding='max_length',
        add_special_tokens=True,
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='pt')
    return tokenized_data
    

In [None]:
#Data Preprocessing and tensor generation
# NOTE(review): `seed` is not referenced by any visible cell - confirm it is needed.
seed=2

# Training part: rows before split_idx of the already-shuffled frame.
tokenized_train = get_encode(train_df.iloc[:split_idx])
labels_train = torch.tensor(train_df['label'].values[:split_idx])

# Validation part: the remaining rows.
tokenized_valid = get_encode(train_df.iloc[split_idx:])
labels_valid = torch.tensor(train_df['label'].values[split_idx:])

# The test set has no labels, so only the encodings are produced.
tokenized_test = get_encode(test_df)

In [None]:
# Sanity check: show each encoded field for the first training example.
for field in ('input_ids', 'token_type_ids', 'attention_mask'):
    print (tokenized_train[field][0])

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size=64

# The tokenizer was called with return_tensors='pt', so every field is already
# a torch.Tensor. Wrapping them in torch.tensor(...) again only makes a
# redundant copy and triggers PyTorch's copy-construct UserWarning, so the
# tensors are passed to TensorDataset directly.

# Training set: shuffled each epoch through RandomSampler.
train_data=TensorDataset(tokenized_train['input_ids'],
                         tokenized_train['token_type_ids'],
                         tokenized_train['attention_mask'],
                         labels_train)
train_sampler=RandomSampler(train_data)
train_dataloader=DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation set: iterated in a fixed order.
valid_data=TensorDataset(tokenized_valid['input_ids'],
                         tokenized_valid['token_type_ids'],
                         tokenized_valid['attention_mask'],
                         labels_valid)
valid_sampler=SequentialSampler(valid_data)
valid_dataloader=DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

# Test set: no labels; the default (non-shuffling) order keeps predictions
# aligned with the rows of test_df.
test_data=TensorDataset(tokenized_test['input_ids'],
                        tokenized_test['token_type_ids'],
                        tokenized_test['attention_mask'])
test_dataloader=DataLoader(test_data, batch_size=batch_size)

# Model Fine Tuning

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head that has
# three output classes; attention maps and hidden states are not returned.
model = AutoModelForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL,
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)


In [None]:
# for GPU / CPU (disabled; kept for reference when no TPU is attached)
# if torch.cuda.is_available():
#     print(model.cuda())
# else :
#     print(model.cpu())
#
# if torch.cuda.is_available():
#     device = torch.device("cuda")
#     print ('%d GPU(s) available' % torch.cuda.device_count())
# else:
#     device = torch.device("cpu")
#     print ('No GPU avaailable, using CPU.')

# for TPU
# Acquire the XLA device and move the model onto it.
device = xm.xla_device()
torch.set_default_tensor_type('torch.FloatTensor')
model.to(device)
print(model)
print ('TPU available')


In [None]:
def accuracy(predictions, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        predictions: 2-D numpy array of per-class scores, one row per example.
        labels: numpy array of integer class ids; flattened before comparison.

    Returns:
        Number of correct predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(predictions, axis=1).ravel()
    true_classes = labels.ravel()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [None]:
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

# NOTE(review): the schedule below is sized for 20 epochs, but the training
# loop breaks out after its 5th epoch (`if i == 4: break`), so the learning
# rate never decays to zero - confirm this is intentional.
epochs = 20

# AdamW over all model parameters with a fixed base learning rate.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Linear decay (no warm-up) across one scheduler step per training batch.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
import datetime
import random

# Seed the Python, NumPy and PyTorch random number generators.
random.seed(10)
np.random.seed(10)
torch.manual_seed(10)
torch.cuda.manual_seed_all(10)

# Average training loss of each completed epoch.
losses = []

for i in range(0, epochs):
    print ('Epoch {:} of {:} Training...'.format(i+1, epochs))

    # ---- Training pass ----
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Progress line every 10 batches.
        if step % 10 == 0 and step != 0:
            print ('[{}] Batch {:>5,} of {:>5,}'
               .format(datetime.datetime.now().strftime('%H:%M:%S'), step, len(train_dataloader)))
        train_batch_input = batch[0].to(device)
        train_batch_input_types = batch[1].to(device)
        train_batch_mask = batch[2].to(device)
        train_batch_label = batch[3].to(device)

        model.zero_grad()
        # Passing `labels` makes the model return the loss as its first output.
        outputs = model(train_batch_input, token_type_ids = train_batch_input_types,
                        attention_mask = train_batch_mask, labels = train_batch_label)
        loss = outputs[0]

        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # for GPU / CPU
        #optimizer.step()
        # for TPU
        xm.optimizer_step(optimizer, barrier=True)
        scheduler.step()

    average_train_loss = total_loss / len(train_dataloader)
    losses.append(average_train_loss)
    print ('Training loss={:.2f}'.format(average_train_loss))

    # ---- Validation pass ----
    model.eval()
    eval_accuracy = 0   # sum of per-batch accuracy weighted by batch size
    eval_count = 0      # number of validation examples seen

    for batch in valid_dataloader:
        valid_batch_input = batch[0].to(device)
        valid_batch_input_types = batch[1].to(device)
        valid_batch_mask = batch[2].to(device)
        # Labels are only compared on the host, so they are not sent to the device.
        label_ids = batch[3].numpy()

        with torch.no_grad():
            outputs = model(valid_batch_input, token_type_ids = valid_batch_input_types,
                           attention_mask = valid_batch_mask)

        # Without `labels`, the first output is the logits.
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        batch_accuracy = accuracy(logits, label_ids)
        # Weight each batch by its size: the final batch may be smaller, and a
        # plain mean of per-batch accuracies would over-weight it.
        eval_accuracy += batch_accuracy * len(label_ids)
        eval_count += len(label_ids)

    print ('Validation accuracy={:.2f}'.format(eval_accuracy / eval_count) )
    print ('')

    # Stop after the 5th epoch even though `epochs` may be larger.
    if i == 4:
        break

print ("Training done")
    

# Make submission

In [None]:
# Run the fine-tuned model over the test set and collect the predicted class
# id of every example, in dataloader order.
model.eval()
submissions = []
for batch in test_dataloader:
    # Each test batch holds (input_ids, token_type_ids, attention_mask).
    input_ids, type_ids, mask = (tensor.to(device) for tensor in batch)

    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=type_ids, attention_mask=mask)

    # First output is the logits; take the arg-max class per row.
    batch_predictions = np.argmax(outputs[0].detach().cpu().numpy(), axis=1).flatten()
    submissions.extend(batch_predictions)


In [None]:
# Pair every test id with its predicted label and write the submission file.
output = pd.DataFrame({'id': test_df['id'], 'prediction': submissions})
output.to_csv('submission.csv', index=False)