In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

from transformers import XLMRobertaTokenizer, XLMRobertaModel, XLMRobertaConfig
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file available under the read-only Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition CSVs: `train` is used later for its premise/hypothesis/label
# columns, `test` for its premise/hypothesis columns.
train = pd.read_csv(
    '/kaggle/input/contradictory-my-dear-watson/train.csv'
)
test = pd.read_csv(
    '/kaggle/input/contradictory-my-dear-watson/test.csv'
)

In [None]:
# Print column names, dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Print column names, dtypes and non-null counts for the test frame.
test.info()

In [None]:
# Show how the training examples are distributed over the values of the `label` column.
sns.displot(train['label'])
plt.show()

In [None]:
# Compare how many examples each language contributes to the train and test sets.
fig, axes = plt.subplots(1, 2)
# Pass the series through the `x` keyword: a positional series is deprecated in
# older seaborn releases and is interpreted as `data` (not `x`) in newer ones.
sns.countplot(x=train.lang_abv, ax=axes[0])
sns.countplot(x=test.lang_abv, ax=axes[1])
# Title each panel so the two otherwise similar charts can be told apart.
axes[0].set_title('train')
axes[1].set_title('test')
plt.show()

In [None]:
# Load the pretrained XLM-RoBERTa tokenizer and a bare encoder from the same checkpoint.
# NOTE(review): `do_lower_case` is a BERT-style option; confirm that the
# XLM-RoBERTa tokenizer actually honours it.
tokenizer = XLMRobertaTokenizer.from_pretrained(
    'xlm-roberta-base',
    do_lower_case=True,
)
# NOTE(review): RobertaClassifier loads its own encoder; this standalone `model`
# does not appear to be used elsewhere in the notebook - confirm before removing.
model = XLMRobertaModel.from_pretrained(
    'xlm-roberta-base'
)

In [None]:
# Join premise and hypothesis into one string per row, wrapped in hand-written
# `<s>` / `</s>` markers, and store it in a new `sentence` column.
# NOTE(review): tokenizers usually add their own special tokens when called -
# confirm these literal markers do not end up duplicated in the encoded ids.
train['sentence'] = (
    '<s>' + train['premise'] + '</s></s>' + train['hypothesis'] + '</s>'
)
test['sentence'] = (
    '<s>' + test['premise'] + '</s></s>' + test['hypothesis'] + '</s>'
)

In [None]:
# Start every buffer as one all-zero int64 row of width 300. The row is only a
# placeholder that gives later row-wise appends a 2-D array to extend.
train_input_ids = np.zeros((1, 300), dtype='int64')
train_attention_masks = np.zeros((1, 300), dtype='int64')
test_input_ids = np.zeros((1, 300), dtype='int64')
test_attention_masks = np.zeros((1, 300), dtype='int64')

In [None]:
# Encode every training premise/hypothesis pair in a single batched call.
# The two texts are passed separately (text, text_pair) so the tokenizer inserts
# the model's own special tokens exactly once, instead of adding them on top of
# literal `<s>` / `</s>` markers embedded in a pre-joined string.
train_encoding = tokenizer(
    train['premise'].tolist(),
    train['hypothesis'].tolist(),
    truncation=True,
    padding='max_length',
    max_length=300,
)

# Fields are looked up by key rather than unpacked from `.values()`, which
# depends on how many fields the tokenizer returns and in what order.
# The new rows are appended below the existing placeholder row in one
# concatenate; calling np.append once per row copies the whole array each time.
train_input_ids = np.concatenate(
    [
        train_input_ids,
        np.asarray(train_encoding['input_ids'], dtype='int64').reshape(-1, train_input_ids.shape[1]),
    ],
    axis=0,
)
train_attention_masks = np.concatenate(
    [
        train_attention_masks,
        np.asarray(train_encoding['attention_mask'], dtype='int64').reshape(-1, train_attention_masks.shape[1]),
    ],
    axis=0,
)

In [None]:
# Encode every test premise/hypothesis pair in a single batched call.
# The two texts are passed separately (text, text_pair) so the tokenizer inserts
# the model's own special tokens exactly once, instead of adding them on top of
# literal `<s>` / `</s>` markers embedded in a pre-joined string.
test_encoding = tokenizer(
    test['premise'].tolist(),
    test['hypothesis'].tolist(),
    truncation=True,
    padding='max_length',
    max_length=300,
)

# Fields are looked up by key rather than unpacked from `.values()`, which
# depends on how many fields the tokenizer returns and in what order.
# The new rows are appended below the existing placeholder row in one
# concatenate; calling np.append once per row copies the whole array each time.
test_input_ids = np.concatenate(
    [
        test_input_ids,
        np.asarray(test_encoding['input_ids'], dtype='int64').reshape(-1, test_input_ids.shape[1]),
    ],
    axis=0,
)
test_attention_masks = np.concatenate(
    [
        test_attention_masks,
        np.asarray(test_encoding['attention_mask'], dtype='int64').reshape(-1, test_attention_masks.shape[1]),
    ],
    axis=0,
)

In [None]:
# Drop the first row of each buffer (the all-zero placeholder the buffers were
# created with), keeping only the rows that were appended afterwards.
train_input_ids = train_input_ids[1:]
train_attention_masks = train_attention_masks[1:]
test_input_ids = test_input_ids[1:]
test_attention_masks = test_attention_masks[1:]

In [None]:
# Hold out 20% of the encoded training rows for validation. Ids, masks and
# labels are split together so the rows stay aligned; the seed is fixed at 42.
# Note that this rebinds `train_input_ids` / `train_attention_masks` to the
# training portion only.
(
    train_input_ids,
    valid_input_ids,
    train_attention_masks,
    valid_attention_masks,
    train_labels,
    valid_labels,
) = train_test_split(
    train_input_ids,
    train_attention_masks,
    train['label'].values.tolist(),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Convert every split into torch tensors, rebinding each name to its tensor version.
# Training split.
train_input_ids = torch.tensor(train_input_ids)
train_attention_masks = torch.tensor(train_attention_masks)
train_labels = torch.tensor(train_labels)
# Validation split.
valid_input_ids = torch.tensor(valid_input_ids)
valid_attention_masks = torch.tensor(valid_attention_masks)
valid_labels = torch.tensor(valid_labels)
# Test split (no labels).
test_input_ids = torch.tensor(test_input_ids)
test_attention_masks = torch.tensor(test_attention_masks)

In [None]:
# Wrap the tensors into datasets: (ids, mask, label) for train/validation,
# (ids, mask) for test, which carries no labels.
train_data = torch.utils.data.TensorDataset(
    train_input_ids, train_attention_masks, train_labels
)
valid_data = torch.utils.data.TensorDataset(
    valid_input_ids, valid_attention_masks, valid_labels
)
test_data = torch.utils.data.TensorDataset(
    test_input_ids, test_attention_masks
)

# Batches of 8 throughout. Train and validation are drawn in random order; test
# is read sequentially so predictions come out in dataset order.
train_dataloader = torch.utils.data.DataLoader(
    train_data,
    sampler=torch.utils.data.RandomSampler(train_data),
    batch_size=8,
)
valid_dataloader = torch.utils.data.DataLoader(
    valid_data,
    sampler=torch.utils.data.RandomSampler(valid_data),
    batch_size=8,
)
test_dataloader = torch.utils.data.DataLoader(
    test_data,
    sampler=torch.utils.data.SequentialSampler(test_data),
    batch_size=8,
)

In [None]:
class RobertaClassifier(torch.nn.Module):
    """XLM-RoBERTa encoder followed by a small feed-forward head with 3 outputs.

    The encoder's per-token outputs are averaged over the real (non-padding)
    tokens of each sequence and passed through three linear layers
    (hidden_size -> hidden_neurons -> hidden_neurons // 2 -> 3).

    Args:
        hidden_neurons: Width of the first head layer; the second layer is
            half as wide.
    """

    def __init__(self, hidden_neurons):
        super(RobertaClassifier, self).__init__()
        config = XLMRobertaConfig.from_pretrained('xlm-roberta-base', num_labels=3)
        self.model = XLMRobertaModel.from_pretrained('xlm-roberta-base', config=config)
        self.hidden_neurons = hidden_neurons
        self.linear_1 = torch.nn.Linear(config.hidden_size, self.hidden_neurons)
        self.linear_2 = torch.nn.Linear(self.hidden_neurons, int(self.hidden_neurons // 2))
        self.linear_3 = torch.nn.Linear(int(self.hidden_neurons // 2), 3)
        self.dropout = torch.nn.Dropout()

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores, one row of 3 values per sequence.

        Args:
            input_ids: Token ids passed straight to the encoder.
            attention_mask: Same leading shape as ``input_ids``; non-zero for
                real tokens and zero for padding.

        Returns:
            Tensor of raw scores (no softmax applied) with 3 values per input
            sequence, suitable for ``torch.nn.CrossEntropyLoss``.
        """
        encoder_output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output: one vector per token
        # (assumed layout: batch, tokens, hidden).
        token_states = encoder_output[0]

        # Average over real tokens only. A plain mean over the token axis would
        # also average in padding positions, which dominate when short inputs
        # are padded to a long fixed length.
        mask = attention_mask.unsqueeze(-1).to(token_states.dtype)
        token_count = mask.sum(dim=1).clamp(min=1.0)  # guard against all-padding rows
        pooled_output = (token_states * mask).sum(dim=1) / token_count

        # Non-linearities between the layers keep the head from collapsing into
        # a single linear map.
        hidden = torch.relu(self.linear_1(pooled_output))
        hidden = torch.relu(self.linear_2(hidden))
        # Regularise the hidden features, not the final scores: dropping a class
        # score would randomly zero it and rescale the others during training.
        hidden = self.dropout(hidden)
        return self.linear_3(hidden)
    
# Build the classifier (first head layer 384 wide) and place it on the first GPU
# when CUDA is available, otherwise leave it on the CPU. An unconditional
# `.cuda()` would raise on machines without a GPU.
clf_model = RobertaClassifier(384).to(
    torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
)

In [None]:
# Target device for batches: the first CUDA GPU if available, else the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Cross-entropy criterion applied to the classifier's raw scores.
loss = torch.nn.CrossEntropyLoss()
# Adam over all classifier parameters (encoder and head) at learning rate 1e-5.
optimizer = torch.optim.Adam(
    clf_model.parameters(),
    lr=1e-5,
)

In [None]:
epochs = 7
# Per-batch losses across all epochs (plotted later) and one accuracy per epoch.
train_losses, valid_losses = [], []
train_accuracies, valid_accuracies = [], []

for epoch in range(epochs):
    # Fresh buffers every epoch: accumulating labels across epochs would make
    # the reported accuracy a running average over all epochs so far.
    real_train_labels, predicted_train_labels = [], []
    real_valid_labels, predicted_valid_labels = [], []
    epoch_train_losses, epoch_valid_losses = [], []

    # ---- training pass ----
    clf_model.train()
    for batch in train_dataloader:
        # Batch-local names, so the full train/validation tensors defined
        # earlier in the notebook are not overwritten.
        batch_input_ids, batch_attention_masks, batch_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        # Call the module itself (not `.forward`) so registered hooks run.
        logits_train = clf_model(batch_input_ids, batch_attention_masks)
        loss_train = loss(logits_train, batch_labels)
        loss_train.backward()
        optimizer.step()

        train_losses.append(loss_train.item())
        epoch_train_losses.append(loss_train.item())
        predicted_train_labels.extend(logits_train.argmax(dim=1).tolist())
        real_train_labels.extend(batch_labels.tolist())

    # ---- validation pass (no gradient tracking, dropout disabled) ----
    clf_model.eval()
    with torch.no_grad():
        for batch in valid_dataloader:
            batch_input_ids, batch_attention_masks, batch_labels = (t.to(device) for t in batch)

            logits_valid = clf_model(batch_input_ids, batch_attention_masks)
            loss_valid = loss(logits_valid, batch_labels)

            valid_losses.append(loss_valid.item())
            epoch_valid_losses.append(loss_valid.item())
            predicted_valid_labels.extend(logits_valid.argmax(dim=1).tolist())
            real_valid_labels.extend(batch_labels.tolist())

    # accuracy_score expects (y_true, y_pred).
    train_accuracy = accuracy_score(real_train_labels, predicted_train_labels)
    valid_accuracy = accuracy_score(real_valid_labels, predicted_valid_labels)

    train_accuracies.append(train_accuracy)
    valid_accuracies.append(valid_accuracy)

    # Report the mean loss over the epoch; the last batch alone is too noisy
    # (and undefined if a loader yields no batches).
    mean_train_loss = float(np.mean(epoch_train_losses)) if epoch_train_losses else float('nan')
    mean_valid_loss = float(np.mean(epoch_valid_losses)) if epoch_valid_losses else float('nan')

    print(f'Epoch {epoch}, train accuracy = {train_accuracy:.2f}, valid accuracy = {valid_accuracy:.2f}, train loss = {mean_train_loss:.2f}, valid loss = {mean_valid_loss:.2f}')

In [None]:
# Line plot of every recorded training loss, in the order it was recorded.
plt.gca().plot(train_losses)
plt.show()

In [None]:
# Line plot of every recorded validation loss, in the order it was recorded.
plt.gca().plot(valid_losses)
plt.show()

In [None]:
# Per-epoch accuracy for both splits on one chart. The curves are labelled and
# a legend is drawn so train and validation can be told apart.
plt.plot(train_accuracies, label='train')
plt.plot(valid_accuracies, label='valid')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
# Predict a label for every test row and write the competition submission file.
# The absolute input path matches how train.csv / test.csv are loaded, so this
# cell does not depend on the current working directory.
submission = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/sample_submission.csv')
predicted_test_labels = []

# Put the model in inference mode here instead of relying on whatever mode the
# last executed cell left it in.
clf_model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        # Batch-local names, so the full test tensors are not overwritten.
        batch_input_ids, batch_attention_masks = (t.to(device) for t in batch)
        logits_test = clf_model(batch_input_ids, batch_attention_masks)
        predicted_test_labels.extend(logits_test.argmax(dim=1).tolist())

# The test loader is sequential, so predictions are in dataset order.
# NOTE(review): this assumes sample_submission.csv lists ids in the same order
# as test.csv - confirm.
submission['prediction'] = predicted_test_labels
# Index by `id` so the CSV has exactly the `id` and `prediction` columns.
submission = submission.set_index('id')
submission.to_csv('submission.csv')