In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in files]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the competition CSV files.
# NOTE: despite the *_DIR names, both constants hold paths to files, not
# directories; they are passed straight to pd.read_csv further down.
ROOT_DIR = "../input/contradictory-my-dear-watson/train.csv"
TEST_DIR = "../input/contradictory-my-dear-watson/test.csv"

import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the training and test CSV files into DataFrames in one pass.
data, test_data = (pd.read_csv(csv_path) for csv_path in (ROOT_DIR, TEST_DIR))

print("total data: ", len(data))

# Last expression of the cell: show the first training rows.
data.head()

In [None]:
from tensorflow.keras import layers

In [None]:
def encode_sent(sentences, tokenizer):
    """Encode text with the tokenizer and return the ids of the first entry.

    Args:
        sentences: Sequence of strings handed to
            ``tokenizer.texts_to_sequences``; callers in this notebook pass a
            one-element list.
        tokenizer: Object exposing ``texts_to_sequences``.

    Returns:
        The first sequence produced by the tokenizer. Any further sequences
        are discarded.

    Raises:
        IndexError: If the tokenizer returns no sequences (empty input).
    """
    return tokenizer.texts_to_sequences(sentences)[0]

In [None]:
def plot(lss, acc, lss_label="loss", acc_label="acc", w_h=(10, 7)):
    """Draw two curves side by side (loss on the left, accuracy on the right).

    Args:
        lss: Sequence of values plotted in the left panel.
        acc: Sequence of values plotted in the right panel.
        lss_label: Title and legend label of the left panel.
        acc_label: Title and legend label of the right panel.
        w_h: ``(width, height)`` passed to ``plt.figure`` as ``figsize``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=w_h)

    panels = ((lss, lss_label), (acc, acc_label))
    for position, (values, label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        # plt.legend() only lists artists that carry a label; without one it
        # emits a warning and draws an empty legend, so the curve is labelled.
        plt.plot(values, label=label)
        plt.title(label)
        # Legend is added per panel so both subplots get one.
        plt.legend()

    plt.show()

In [None]:
def create_dataset(premise, hypothesis, tokenizer, MAXLEN, fit_text=True):
    """Encode premise/hypothesis pairs as fixed-length integer sequences.

    Every pair is laid out as
    ``start + premise_ids[:MAXLEN] + sep + hypothesis_ids + end`` and then
    padded (at the end) or truncated to ``MAXLEN`` by ``pad_sequences``.

    Args:
        premise: Iterable of premise strings.
        hypothesis: Iterable of hypothesis strings, paired by position with
            ``premise``.
        tokenizer: Keras-style tokenizer exposing ``fit_on_texts``,
            ``texts_to_sequences`` and ``word_index``.
        MAXLEN: Length of every returned sequence; also the cap applied to
            the premise ids before the pair is assembled.
        fit_text: When True the tokenizer vocabulary is fitted on both text
            collections and on the marker words. Pass False when encoding
            data for a model trained with an already-fitted tokenizer, so the
            word ids stay the same.

    Returns:
        ``np.ndarray`` of dtype int32 with shape ``(number_of_pairs, MAXLEN)``.
    """
    # NOTE: the markers are ordinary words, so they share an id with any
    # occurrence of "start", "sep" or "end" in the text itself.
    special_tokens = ('start', 'sep', 'end')

    if fit_text:
        tokenizer.fit_on_texts(premise)
        tokenizer.fit_on_texts(hypothesis)
        tokenizer.fit_on_texts([' '.join(special_tokens)])
    else:
        # Every fit_on_texts call rebuilds word_index from the word counts,
        # which can shift existing ids. With fit_text=False the vocabulary is
        # left untouched unless a marker is genuinely missing.
        missing = [tok for tok in special_tokens if tok not in tokenizer.word_index]
        if missing:
            tokenizer.fit_on_texts([' '.join(missing)])

    start_ids = [tokenizer.word_index['start']]
    sep_ids = [tokenizer.word_index['sep']]
    end_ids = [tokenizer.word_index['end']]

    encoded_pairs = [
        start_ids + encode_sent([sent1], tokenizer)[:MAXLEN] + sep_ids
        + encode_sent([sent2], tokenizer) + end_ids
        for sent1, sent2 in zip(premise, hypothesis)
    ]

    # One batched call replaces per-sequence padding followed by a squeeze and
    # yields the same (n, MAXLEN) array; it also copes with an empty input.
    # NOTE: pad_sequences truncates from the front by default, so pairs longer
    # than MAXLEN lose the start marker and the leading premise ids.
    padded = pad_sequences(encoded_pairs, maxlen=MAXLEN, padding='post')

    return np.asarray(padded, dtype=np.int32)

In [None]:
# Tokenizer configuration: vocabulary cap and the fixed sequence length.
MAX_WORDS = 55000
MAXLEN = 100
tokenizer = Tokenizer(oov_token='oov', num_words=MAX_WORDS)

# Fit the tokenizer on the training text and encode every premise/hypothesis pair.
dataset = create_dataset(
    data['premise'].tolist(),
    data['hypothesis'].tolist(),
    tokenizer,
    MAXLEN,
)

print("Dataset: ", dataset.shape)

In [None]:
# Integer class labels as a column vector of shape (num_examples, 1).
labels = data['label'].to_numpy(dtype=np.int32).reshape(-1, 1)

print("labels shape: ", labels.shape)

In [None]:
def create_model2(maxlen, inp_dim, out_dim, lstm_unit, ffn_dim, n_classes, rate=0.5):
    """Build the classifier: Embedding -> BiLSTM -> Dense -> Dropout -> LayerNorm -> softmax.

    Args:
        maxlen: Length of the integer input sequences.
        inp_dim: Embedding ``input_dim`` (size of the id space).
        out_dim: Embedding vector size.
        lstm_unit: Units of the LSTM wrapped by the Bidirectional layer.
        ffn_dim: Units of the intermediate Dense layer.
        n_classes: Units of the final softmax layer.
        rate: Dropout rate applied after the intermediate Dense layer.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping ``(batch, maxlen)`` ids to
        ``(batch, n_classes)`` probabilities.
    """
    weight_decay = 0.001  # L2 factor used on the LSTM and Dense kernels

    token_ids = layers.Input(shape=(maxlen, ))
    # mask_zero=True makes downstream layers skip the padding id 0.
    features = layers.Embedding(inp_dim, out_dim, mask_zero=True)(token_ids)

    recurrent = layers.LSTM(
        lstm_unit,
        return_sequences=False,
        kernel_regularizer=tf.keras.regularizers.L2(weight_decay),
    )
    features = layers.Bidirectional(recurrent)(features)

    features = layers.Dense(
        ffn_dim,
        kernel_regularizer=tf.keras.regularizers.L2(weight_decay),
    )(features)
    features = layers.Dropout(rate)(features)
    features = layers.LayerNormalization(epsilon=1e-6)(features)

    class_probs = layers.Dense(n_classes, activation='softmax')(features)

    return tf.keras.Model(inputs=token_ids, outputs=class_probs)

In [None]:
# create_model2 arguments: maxlen, inp_dim, out_dim, lstm_unit, ffn_dim, n_classes, rate=0.5

embed_dim = 16   # embedding vector size (out_dim)
ff_dim = 16      # units of the intermediate Dense layer (ffn_dim)
LSTM_UNIT = 16   # units of the LSTM inside the Bidirectional wrapper
CLASSES = 3      # units of the softmax output layer (n_classes)
# Tokenizer ids start at 1 (0 is kept for padding), so the id space has
# len(word_index) + 1 entries.
vocab_size = len(tokenizer.word_index) + 1

print("Vocab_size: ", vocab_size)

In [None]:
# Build the classifier; arguments are spelled out by name to match the
# create_model2 signature.
model = create_model2(
    maxlen=MAXLEN,
    inp_dim=vocab_size + 1,
    out_dim=embed_dim,
    lstm_unit=LSTM_UNIT,
    ffn_dim=ff_dim,
    n_classes=CLASSES,
)
adam = tf.keras.optimizers.Adam(learning_rate=0.00005)

# Labels are integer class ids, hence the sparse variant of the loss.
model.compile(
    optimizer=adam,
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Train with 20% of the data held out for validation.
history = model.fit(
    x=dataset,
    y=labels,
    batch_size=32,
    epochs=30,
    validation_split=0.2,
    shuffle=True,
)

In [None]:
# Training curves first, validation curves second.
curves = history.history

plot(curves['loss'], curves['accuracy'])

plot(
    curves['val_loss'],
    curves['val_accuracy'],
    lss_label="val_loss",
    acc_label="val_accuracy",
)

In [None]:
# Encode the test pairs with the vocabulary learned from the training data.
# fit_text=False matters: fitting the tokenizer again on the test text would
# rebuild word_index, so the ids fed to the model would no longer match the
# ids it was trained on (and could exceed the embedding size).
testing_data = create_dataset(
    list(test_data['premise']),
    list(test_data['hypothesis']),
    tokenizer,
    MAXLEN,
    fit_text=False,
)
print(testing_data.shape)

In [None]:
# Pick the class with the highest predicted probability for every test row.
class_probs = model.predict(testing_data)
y_pred = np.argmax(class_probs, axis=-1)
print(y_pred.shape)

In [None]:
# Show the test-set column names (the following cell drops several of them).
test_data.columns

In [None]:
# Attach the predictions and drop the text/language columns.
# assign() returns a new frame and errors='ignore' tolerates columns that are
# already gone, so re-running this cell no longer raises a KeyError.
test_data = (
    test_data
    .assign(prediction=y_pred)
    .drop(columns=['premise', 'hypothesis', 'lang_abv', 'language'], errors='ignore')
)

In [None]:
# Preview the first rows of the frame that will be written out.
test_data.head()

In [None]:
# Write the frame to the working directory; index=False keeps the row index
# out of the CSV.
test_data.to_csv('./submission.csv', index=False)

In [None]:
# Read the written file back and display its first rows as a sanity check.
x = pd.read_csv('./submission.csv')
x.head()