Let's import each library where it is actually needed, rather than importing all of them at the top of the notebook.

In [None]:
import pandas as pd

### Our data frames

In [None]:
# Load the three competition CSV files from the read-only input directory:
# the training set, the test set, and the sample submission file.
train_df = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")
test_df  = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
sample_df = pd.read_csv('../input/contradictory-my-dear-watson/sample_submission.csv')

### Initial look at our data

In [None]:
# Display the training DataFrame as this cell's output.
train_df

In [None]:
# Count the missing values in each column of the training set.
train_df.isna().sum()

In [None]:
# Number of training rows for each distinct value of the 'language' column.
train_df['language'].value_counts()

In [None]:
# Number of training rows for each distinct value of the 'label' column
# (i.e. the class balance of the target).
train_df['label'].value_counts()

In [None]:
# Display the test DataFrame as this cell's output.
test_df

In [None]:
# Number of test rows for each distinct value of the 'language' column.
test_df['language'].value_counts()

In [None]:
# Display the sample submission DataFrame as this cell's output.
sample_df

### Modeling

In [None]:
import tensorflow as tf

In [None]:
# Pick the tf.distribute strategy used for building and training the model.
# Try to resolve, connect to and initialise a TPU; if any of those steps
# raises ValueError (presumably because no TPU is attached -- confirm against
# the TensorFlow docs), fall back to the current default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): the `experimental` TPUStrategy alias looks deprecated in
    # newer TensorFlow in favour of `tf.distribute.TPUStrategy` -- confirm
    # against the installed version before switching.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy() 

In [None]:
from transformers import TFAutoModel, AutoTokenizer

In [None]:
from tensorflow.keras.layers import Dense, Input

In [None]:
from tensorflow.keras.models import Model

In [None]:
from tensorflow.keras.optimizers import Adam

In [None]:
def model_watson(strategy, transformer, max_len=100, learning_rate=1e-5):
    """Build and compile a 3-class classifier on top of a pretrained encoder.

    The model takes a fixed-length sequence of token ids, runs it through the
    pretrained transformer, takes the hidden state at the first sequence
    position and feeds it to a 3-unit softmax layer.

    Args:
        strategy: A ``tf.distribute`` strategy; the encoder, the head and the
            optimizer are all created inside ``strategy.scope()``.
        transformer: Model name or path passed to
            ``TFAutoModel.from_pretrained``.
        max_len: Length of the token-id sequences the model accepts. Must
            match the length the inputs are padded/truncated to.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled ``tf.keras`` ``Model`` trained with sparse categorical
        cross-entropy (integer labels) and reporting accuracy.
    """
    with strategy.scope():
        transformer_encoder = TFAutoModel.from_pretrained(transformer)

        # NOTE(review): only token ids are fed to the encoder (no attention
        # mask), so padded positions are not masked out -- confirm this is
        # intended.
        input_layer = Input(shape=(max_len,), dtype=tf.int32, name="input_layer")

        # The first element of the encoder output is the per-token hidden
        # state sequence.
        sequence_output = transformer_encoder(input_layer)[0]

        # Use the hidden state at position 0 as the summary of the whole input.
        cls_token = sequence_output[:, 0, :]

        output_layer = Dense(3, activation='softmax')(cls_token)

        model = Model(inputs=input_layer, outputs=output_layer)
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        model.compile(
            Adam(learning_rate=learning_rate),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy'],
        )

        return model

In [None]:
# Build and compile the classifier under the selected distribution strategy,
# using the "jplu/tf-xlm-r-ner-40-lang" checkpoint as the encoder.
model = model_watson(strategy,"jplu/tf-xlm-r-ner-40-lang")

In [None]:
# Load the tokenizer from the same checkpoint name that is passed to
# model_watson, so the token ids it produces belong to that encoder's
# vocabulary.
tokenizer = AutoTokenizer.from_pretrained("jplu/tf-xlm-r-ner-40-lang")

In [None]:
# Turn each row into a two-item [premise, hypothesis] list so the tokenizer
# can encode the two texts together as a pair.
pair_columns = ['premise', 'hypothesis']
train_data = train_df[pair_columns].values.tolist()
test_data = test_df[pair_columns].values.tolist()

In [None]:
# Encode every (premise, hypothesis) pair to exactly 100 token ids: shorter
# pairs are padded and longer ones are truncated, so all rows have the same
# length (the model's input layer expects sequences of length 100).
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` makes the truncation explicit instead of relying on the
# tokenizer's fallback behaviour.
train_encoded = tokenizer.batch_encode_plus(
    train_data,
    padding='max_length',
    truncation=True,
    max_length=100,
)
test_encoded = tokenizer.batch_encode_plus(
    test_data,
    padding='max_length',
    truncation=True,
    max_length=100,
)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the encoded training examples (and their labels) for
# validation; the split is random and no seed is fixed here.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_encoded['input_ids'],
    train_df['label'].values,
    test_size=0.2,
)

# The test inputs are used as-is: token ids only.
x_test = test_encoded['input_ids']

In [None]:
# Global batch size: 20 examples for every replica of the strategy.
global_batch_size = 20 * strategy.num_replicas_in_sync

# Training stream: repeated indefinitely (so `fit` needs `steps_per_epoch`),
# shuffled through a 2048-element buffer, batched and prefetched.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(global_batch_size)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Validation stream: a single ordered pass, cached after batching.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(global_batch_size).cache()
valid_dataset = valid_dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Test stream: inputs only (no labels), batched in the original row order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test)
test_dataset = test_dataset.batch(global_batch_size)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Train for 5 epochs. The training dataset repeats forever, so the length of
# an epoch has to be given explicitly: one pass over the training split is
# len(x_train) examples divided by the global batch size
# (20 per replica * number of replicas).
# The parentheses matter: `n // 20 * replicas` would evaluate as
# `(n // 20) * replicas` and multiply the step count by the replica count
# instead of dividing by it. `max(1, ...)` guards against a zero-step epoch
# on very small inputs.
history = model.fit(
    train_dataset,
    steps_per_epoch=max(1, len(x_train) // (20 * strategy.num_replicas_in_sync)),
    validation_data=valid_dataset,
    epochs=5,
)

### Our prediction output 

In [None]:
# Run the model over the batched test inputs; the model's final layer is a
# 3-unit softmax, so each row of `predictions` holds one score per class.
predictions = model.predict(test_dataset, verbose=1)
# Store the index of the highest-scoring class for each test row in the
# 'prediction' column of the submission frame.
sample_df['prediction'] = predictions.argmax(axis=1)

In [None]:
import os
# Switch the process working directory to /kaggle/working so that files
# written afterwards with a relative path (the submission CSV) end up there.
os.chdir(r'/kaggle/working')

In [None]:
# Write the submission frame to 'submission.csv' in the current working
# directory, without the DataFrame index column.
sample_df.to_csv(r'submission.csv',index= False)

In [None]:
# Show the first 10 rows of the submission frame as a final sanity check.
sample_df.head(10)