In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
all_input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in all_input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install sentencepiece

In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer, AutoConfig, TFAutoModel    
from transformers import XLMRobertaConfig, XLMRobertaTokenizer, TFXLMRobertaModel         

import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras.backend as K

from sklearn.metrics import classification_report, f1_score, accuracy_score

os.environ["WANDB_API_KEY"] = "0" # to silence warning

np.random.seed(0)

In [None]:
# Use a TPU when one is attached; otherwise fall back to the default
# (CPU / single-GPU) distribution strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # No TPU available. Define `tpu` anyway so later cells can test it
    # instead of failing with NameError.
    tpu = None
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count in both cases, not only in the fallback branch.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Global batch size: 16 examples for each replica reported by the strategy.
BATCH_SIZE= 16 * strategy.num_replicas_in_sync
# Sentinel passed to Dataset.prefetch() so tf.data picks the buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE

# Data Loading

In [None]:
# Training and test sets of the "Contradictory, My Dear Watson" competition.
train = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")
test = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")

Load backtranslated data.

In [None]:
# Back-translated copies of the training set; going by the file names, one was
# translated three times and the other twice.
train_aug = pd.read_csv("../input/contradictorywatsontwicetranslatedaug/thrice_translation_aug_train.csv")
train_aug2 = pd.read_csv("../input/contradictorywatsontwicetranslatedaug/twice_translated_aug_train.csv")

In [None]:
train.head()

In [None]:
train.info()

In [None]:
# Keep only the non-English rows of the first augmented set and discard the
# leftover CSV index column.
non_english_mask = train_aug['language'].ne('English')
df2 = train_aug.loc[non_english_mask].drop(columns=['Unnamed: 0'])
df2.head()

Remove the English rows from the augmented data, as we only want to upsample the other languages.

In [None]:
# Keep only the non-English rows of the second augmented set. The mask must be
# built from train_aug2 itself: a mask taken from train_aug is aligned by index
# and either fails or filters on the wrong frame's languages.
df3 = train_aug2[train_aug2['language'] != 'English']
# Mirror the df2 clean-up. If a leftover 'Unnamed: 0' index column survived, it
# would be NaN for every other row after the concat and the later dropna()
# would discard those rows. errors='ignore' makes this a no-op when the column
# is absent.
df3 = df3.drop(columns='Unnamed: 0', errors='ignore')
df3.head()

Combine the data with augmented data of other languages to increase samples. Drop duplicate rows using pandas drop_duplicates(). Remove any NA values and shuffle the new dataframe.

In [None]:
# Stack the original rows with both non-English augmented sets;
# ignore_index=True gives the result a fresh 0..n-1 index.
new_train = pd.concat([train,df2,df3],ignore_index = True)

In [None]:
# De-duplicate, drop incomplete rows, shuffle every row (frac=1) and renumber
# the index from zero.
new_train = (
    new_train
    .drop_duplicates()
    .dropna()
    .sample(frac=1)
    .reset_index(drop=True)
)
new_train

Original language ratios

In [None]:
# Share of each language in the original training set.
labels, frequencies = np.unique(train['language'].to_numpy(), return_counts=True)

fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(frequencies, labels=labels, autopct='%1.1f%%')
plt.show()

Original dataset Label distribution

In [None]:
# Count how often each label occurs in the original training set and show the
# numeric labels under readable names.
label_names = {0: 'Entailment', 1: 'Neutral', 2: 'Contradiction'}
label_counts = train.label.value_counts()
Accuracy = pd.DataFrame({
    'Type': label_counts.index,
    'Count': label_counts.values,
})
Accuracy['Type'] = Accuracy['Type'].replace(label_names)
Accuracy

New training set distributions

In [None]:
# Share of each language in the augmented training set.
labels, frequencies = np.unique(new_train['language'].to_numpy(), return_counts=True)

fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(frequencies, labels=labels, autopct='%1.1f%%')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the augmented training set with a fixed seed.
# NOTE(review): in the visible notebook X_train / X_val are only displayed; the
# cross-validation cells below index new_train directly.
X_train, X_val = train_test_split(new_train, test_size=0.20, random_state=42 )#stratify=train['lang_abv']

In [None]:
X_train

Split the data into folds for k-fold cross-validation.

In [None]:
from sklearn.model_selection import StratifiedKFold
# 5-fold stratified splitter (scikit-learn's default shuffle=False, so the fold
# membership is deterministic for a given row order).
skf = StratifiedKFold(n_splits=5)
# Filled by the training loop: one validation score and one Keras History per fold.
val_score=[]
history=[]

In [None]:
# Label column used to stratify the folds.
target = new_train['label']

In [None]:
fold_no = 1
# skf.split yields positional row numbers, so select with .iloc; .loc only
# gave the same rows because new_train carries a fresh 0..n-1 index.
# Every pass overwrites ktrain/kval, so after the loop they hold the last fold.
for train_index, test_index in skf.split(new_train, target):
    ktrain = new_train.iloc[train_index, :]
    kval = new_train.iloc[test_index, :]

In [None]:
kval.info()

In [None]:
# Share of each language in the training part of the fold held in ktrain.
labels, frequencies = np.unique(ktrain['language'].to_numpy(), return_counts=True)

fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(frequencies, labels=labels, autopct='%1.1f%%')
plt.show()

We can see that the folds have the same language ratios as the new augmented dataset.

# Input Encoding

In [None]:
# Pretrained checkpoint name; used for the tokenizer here and again by
# build_model() when it loads the encoder weights.
model_name = 'jplu/tf-xlm-roberta-large'
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

In [None]:
def encoder(df, tokenizer, max_len=100):
    """Tokenise premise/hypothesis pairs into a fixed-width array of input ids.

    Args:
        df: DataFrame with 'premise' and 'hypothesis' columns.
        tokenizer: Object exposing ``batch_encode_plus`` (the Hugging Face
            tokenizer in this notebook).
        max_len: Exact length of every returned row; longer pairs are
            truncated and shorter ones are padded.

    Returns:
        ``np.ndarray`` built from the tokenizer's ``input_ids``, one row per
        pair and ``max_len`` columns. The attention mask the tokenizer also
        produces is not returned.
    """
    sentence_pairs = df[['premise', 'hypothesis']].values.tolist()
    # padding='max_length' (instead of True, which pads only to the longest
    # pair in this call) makes the width always equal max_len, so it matches
    # the model's Input(shape=(max_len,)) and is the same for train and test.
    encoded = tokenizer.batch_encode_plus(
        sentence_pairs,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        add_special_tokens=True,
        return_attention_mask=True,
    )
    return np.array(encoded['input_ids'])

In [None]:
# Tokenise the augmented training pairs and the test pairs into input-id arrays.
X_train_input = encoder(new_train, tokenizer)
X_test_input = encoder(test,tokenizer)

In [None]:
# Labels as a NumPy array, row-aligned with X_train_input.
y_train = new_train['label'].to_numpy()
y_train

In [None]:
X_train_input

# Creating & Training Model

In [None]:
def build_model(max_len=100):
    """Build and compile the 3-class XLM-RoBERTa classifier.

    Args:
        max_len: Length of the input-id sequences the model accepts.

    Returns:
        A compiled ``tf.keras.Model`` that maps int32 input ids of shape
        ``(batch, max_len)`` to softmax probabilities over 3 classes.
    """
    # Fixed seed so the weights of the new Dense head start the same on every build.
    tf.random.set_seed(12345)

    robertaModel = TFXLMRobertaModel.from_pretrained(model_name)
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")

    # First model output = per-token hidden states (per the Transformers docs);
    # position 0 of every sequence is used as the sentence-pair summary.
    token_states = robertaModel([input_word_ids])[0]
    first_token_state = token_states[:, 0, :]
    output = tf.keras.layers.Dense(3, activation="softmax")(first_token_state)

    model = tf.keras.Model(inputs=[input_word_ids], outputs=output)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

Function to make a tensor dataset with encoded inputs

In [None]:
def create_dist_dataset(X, y,val,batch_size= BATCH_SIZE):
    """Wrap encoded inputs and labels in a batched, prefetched ``tf.data.Dataset``.

    Args:
        X: Array of encoded inputs, one row per example.
        y: Labels aligned with ``X``.
        val: If true, build a single-pass validation dataset; otherwise build
            an endlessly repeating, shuffled training dataset (the caller must
            bound it with ``steps_per_epoch``).
        batch_size: Examples per batch. The default is the value BATCH_SIZE
            had when this function was defined.

    Returns:
        The configured ``tf.data.Dataset`` of ``(X, y)`` batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((X, y))

    if val:
        # Validation metrics do not depend on example order, so skip the
        # full-size shuffle buffer that used to be filled here for no benefit.
        return dataset.batch(batch_size).prefetch(AUTO)

    return dataset.shuffle(len(X)).repeat().batch(batch_size).prefetch(AUTO)



# Batched test inputs, left unshuffled so predictions come out in the row order
# of X_test_input.
test_inputs = tf.data.Dataset.from_tensor_slices(X_test_input)
test_dataset = test_inputs.batch(BATCH_SIZE)

In [None]:
# Accumulator for the fold-averaged test probabilities; every trained fold adds
# its share in the training loop.
pred_test = 0

In [None]:
# Train one model per fold and average the folds' test-set probabilities.
N_FOLDS_USED = 4                     # only the first 4 of the 5 stratified folds are trained
CHECKPOINT_PATH = "roberta_base.h5"  # best weights of the fold currently training

# Start from zero every time this cell runs, so re-running it does not add new
# predictions on top of the previous run's.
pred_test = 0

for fold, (train_ind, valid_ind) in enumerate(skf.split(X_train_input, y_train)):

    if fold >= N_FOLDS_USED:
        break

    print("fold", fold+1)

    # Re-initialise the TPU before each fold. The resolver exists only when a
    # TPU was detected, so look it up defensively instead of raising NameError
    # on CPU/GPU runs.
    tpu_resolver = globals().get("tpu")
    if tpu_resolver is not None:
        tf.tpu.experimental.initialize_tpu_system(tpu_resolver)

    train_data = create_dist_dataset(X_train_input[train_ind], y_train[train_ind], val=False)
    valid_data = create_dist_dataset(X_train_input[valid_ind], y_train[valid_ind], val=True)

    # Keep only the weights of the epoch with the lowest validation loss.
    Checkpoint = tf.keras.callbacks.ModelCheckpoint(
        CHECKPOINT_PATH, monitor='val_loss', verbose=0,
        save_best_only=True, save_weights_only=True, mode='min')

    with strategy.scope():
        model = build_model()

    # The training dataset repeats forever, so one epoch must be bounded explicitly.
    n_steps = len(train_ind)//BATCH_SIZE
    print("training model {} ".format(fold+1))

    train_history = model.fit(
        train_data,
        steps_per_epoch=n_steps,
        validation_data=valid_data,
        epochs=10, callbacks=[Checkpoint], verbose=1)

    print("Loading model...")
    model.load_weights(CHECKPOINT_PATH)

    # Report the metrics of the epoch whose weights were just restored (lowest
    # val_loss), not the mean over all epochs, which describes no actual model.
    fold_val_loss = train_history.history['val_loss']
    best_epoch = int(np.argmin(fold_val_loss))
    best_val_acc = train_history.history['val_accuracy'][best_epoch]

    print("fold {} val acc {}".format(fold+1, best_val_acc))
    print("fold {} val loss {}".format(fold+1, fold_val_loss[best_epoch]))

    history.append(train_history)
    val_score.append(best_val_acc)

    print('predict on test....')
    preds = model.predict(test_dataset, verbose=1)

    # Equal-weight average over the folds that are trained.
    pred_test += preds/N_FOLDS_USED

In [None]:
pred_test

In [None]:
# Pick the class with the highest averaged probability for every test row.
test_prediction = np.argmax(pred_test, axis=1)
test_prediction

In [None]:
import matplotlib.pyplot as plt

In [None]:
# `model.history` is a Keras History callback object and cannot be indexed.
# The per-epoch losses live in the History objects collected in `history`, one
# per trained fold, so average them across folds to match the plot title.
mean_train_loss = np.mean([h.history['loss'] for h in history], axis=0)
mean_val_loss = np.mean([h.history['val_loss'] for h in history], axis=0)

plt.plot(mean_train_loss, label='train loss')
plt.plot(mean_val_loss, label='validation loss')
plt.title('Average Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Submission frame: the test ids plus the predicted class for each row.
submission = test[['id']].assign(prediction=test_prediction)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv("submission.csv", index = False)