In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training split and preview its first rows.
train_csv_path = '/kaggle/input/contradictory-my-dear-watson/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Class-balance check: the labels are approximately uniformly distributed.
label_column = df['label']
label_column.hist()

In [None]:
!pip install googletrans

In [None]:


import tensorflow_hub as hub

embedding_url = 'https://tfhub.dev/google/universal-sentence-encoder-large/5'

# A single trainable sentence-encoder layer is shared by both inputs, so the
# premise and the hypothesis are embedded with the same weights.
embedding_layer = hub.KerasLayer(embedding_url, trainable=True)
premises = tf.keras.layers.Input(shape=(), name="Input1", dtype=tf.string)
conclusion = tf.keras.layers.Input(shape=(), name="Input2", dtype=tf.string)
x1 = embedding_layer(premises)
x2 = embedding_layer(conclusion)

# Merge the two sentence embeddings element-wise, then regularise.
# NOTE(review): this variable used to be called `subtracted` although the layer
# is `Add`; if a difference of embeddings was intended, swap in
# tf.keras.layers.Subtract() — confirm with the author.
merged = tf.keras.layers.Add()([x1, x2])
merged = tf.keras.layers.BatchNormalization()(merged)
merged = tf.keras.layers.Dropout(0.5)(merged)

# Classification head: Dense(512) -> BN -> Dense(128) -> BN -> Dense(3) logits.
# Each layer consumes the previous layer's output. Previously the Dense(128)
# layer was fed from the merged embedding again, which silently dropped the
# Dense(512) + BatchNormalization pair from the graph.
output = tf.keras.layers.Dense(512, activation='relu')(merged)
output = tf.keras.layers.BatchNormalization()(output)
output = tf.keras.layers.Dense(128, activation='relu')(output)
output = tf.keras.layers.BatchNormalization()(output)
output = tf.keras.layers.Dense(3)(output)  # raw logits; no softmax here

model = tf.keras.Model(inputs=[premises, conclusion], outputs=output)
optimizer = tf.keras.optimizers.Adamax()
# from_logits=True matches the activation-free final Dense layer above.
# No metrics are compiled in, so model.evaluate() returns a single scalar loss.
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

In [None]:
# Render the layer graph, annotated with tensor shapes.
tf.keras.utils.plot_model(model=model, show_shapes=True)

In [None]:
from random import randint
from googletrans import Translator
translator = Translator()

def generate_batch(batch_size=32):
    """Build a one-batch tf.data.Dataset of randomly drawn, translated training pairs.

    Rows of the global `df` are drawn independently (so repeats are possible).
    The premise and hypothesis of each drawn row are passed through the global
    `translator`, and the `.text` of each result is kept.

    Args:
        batch_size: number of rows to draw; also used as the dataset batch size.

    Returns:
        A tf.data.Dataset of ({"Input1": premises, "Input2": hypotheses}, labels)
        batched with `batch_size`.
    """
    premises_out, hypotheses_out, labels_out = [], [], []

    for _ in range(batch_size):
        # Look the sampled row up once and read all three fields from it.
        row = df.iloc[randint(0, len(df) - 1), :]
        premises_out.append(translator.translate(row.premise).text)
        hypotheses_out.append(translator.translate(row.hypothesis).text)
        labels_out.append(row.label)

    features = {"Input1": premises_out, "Input2": hypotheses_out}
    dataset = tf.data.Dataset.from_tensor_slices((features, labels_out))
    return dataset.batch(batch_size)

In [None]:
# Load the test split and preview its first rows.
test_csv_path = '/kaggle/input/contradictory-my-dear-watson/test.csv'
test = pd.read_csv(test_csv_path)
test.head()

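Note: the Keras `ModelCheckpoint` callback cannot be used here, because training is driven batch by batch in a manual loop (originally by calling `fit` repeatedly) rather than by a single `fit` call. The best model is therefore saved by hand inside the training loop below.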

In [None]:
# Training schedule. Every step trains on one freshly sampled batch.
epochs = 5
batch_size = 32
steps_per_epoch = len(df) // batch_size
# NOTE(review): the `// 6` shortens the run to about a sixth of `epochs` full
# passes over the data; the reason for this factor is not recorded here.
steps = int(steps_per_epoch * epochs // 6)

# Lowest evaluation loss seen so far; starts high so the first check always saves.
best_loss = 1000

for step in range(steps):
    # generate_batch returns a dataset holding exactly one batch.
    for features, labels in generate_batch(batch_size).take(1):
        model.train_on_batch(features, labels)

    # Only every fifth step is followed by an evaluation.
    if step % 5 != 0:
        continue

    # The evaluation batch also comes from generate_batch, so it is drawn from
    # the same dataframe as the training batches.
    for val_features, val_labels in generate_batch(batch_size).take(1):
        val_loss = model.evaluate(val_features, val_labels)
        if val_loss < best_loss:
            # Manual "checkpoint": keep the model with the lowest loss so far.
            model.save('./model_weights')
            best_loss = val_loss

In [None]:
# Preview the first five rows of the training frame.
df.head(n=5)

In [None]:
# Replace the in-memory model with the best one saved during training.
saved_model_path = './model_weights'
model = tf.keras.models.load_model(saved_model_path)

In [None]:
def translate_texts(texts):
    """Translate each string with the notebook's `translator`, as generate_batch does.

    Args:
        texts: iterable of strings.

    Returns:
        list of str: the `.text` of each translation result, in input order.
    """
    return [translator.translate(text).text for text in texts]


test_ids = test.loc[:, 'id']

# Training batches are built from translated premise/hypothesis text, so the
# test inputs go through the same translation step before reaching the model.
# Otherwise the model would be scored on raw text it was never trained on.
# The results are turned into explicit string tensors for the two named inputs.
test_premises = tf.constant(translate_texts(test.loc[:, 'premise']))
test_conclusion = tf.constant(translate_texts(test.loc[:, 'hypothesis']))

# The model emits one logit per class; the predicted label is the arg-max.
logits = model.predict({"Input1": test_premises, "Input2": test_conclusion})
preds = np.argmax(logits, axis=-1)
submission_df = pd.DataFrame({'id': test_ids, 'prediction': preds})
submission_df.head()

In [None]:
# Write the submission file without the dataframe index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)