In [None]:
import tensorflow as tf
from tensorflow import keras
from csv import reader
import os
import numpy as np
import pandas as pd
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import concatenate, Dense, Input, Dropout, TimeDistributed, Embedding, BatchNormalization, Bidirectional, LSTM, GRU
import string
import tempfile

In [None]:
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
#pip install -U deep_translator
#from deep_translator import GoogleTranslator
#translator = Translator()
#def translate_sentence(x):
#  return GoogleTranslator('auto', 'en').translate(x)
#train.premise[train.lang_abv!= 'en']=train.premise[train.lang_abv!= 'en'].apply(lambda x: translate_sentence(x))
#train.hypothesis[train.lang_abv!= 'en']=train.hypothesis[train.lang_abv!= 'en'].apply(lambda x: translate_sentence(x))
#train.to_csv("/content/drive/MyDrive/Contradictory, My Dear Watson/train_translated.csv")

## Translation to English

In [None]:
# !pip install deep_translator

In [None]:
# from deep_translator import GoogleTranslator

# def translate_sentence(x):
#   return GoogleTranslator('auto', 'en').translate(x)

# translate_sentence('hola mundo')

In [None]:
# train = pd.read_csv("drive/MyDrive/Contradictory, My Dear Watson/train.csv")
# test = pd.read_csv("drive/MyDrive/Contradictory, My Dear Watson/test.csv")

# train.premise[train.lang_abv!= 'en']=train.premise[train.lang_abv!= 'en'].apply(lambda x: translate_sentence(x))
# train.hypothesis[train.lang_abv!= 'en']=train.hypothesis[train.lang_abv!= 'en'].apply(lambda x: translate_sentence(x))
# train.to_csv("/content/drive/MyDrive/Contradictory, My Dear Watson/train_translated.csv")

# test.premise[test.lang_abv!= 'en']=test.premise[test.lang_abv!= 'en'].apply(lambda x: translate_sentence(x))
# test.hypothesis[test.lang_abv!= 'en']=test.hypothesis[test.lang_abv!= 'en'].apply(lambda x: translate_sentence(x))
# test.to_csv("/content/drive/MyDrive/Contradictory, My Dear Watson/test_translated.csv")

In [None]:
# Read the translated training CSV into a list of rows (each row is a list of strings).
# newline='' is what the csv module expects so that it can handle line endings
# (including newlines embedded in quoted fields) itself; the encoding is pinned so
# the multilingual text is decoded the same way regardless of the platform default.
# NOTE(review): assumes the file is UTF-8 (pandas' default when writing CSV) - confirm.
with open('../input/train-translated/train_translated.csv', 'r', newline='', encoding='utf-8') as read_obj:
    # pass the file object to reader() to get the reader object
    csv_reader = reader(read_obj)
    # Skip the header row; the default avoids StopIteration on an empty file.
    next(csv_reader, None)
    # Pass reader object to list() to get a list of lists
    train_translated = list(csv_reader)

# Sanity check: show the last column (the label) of the second data row, if there is one.
if len(train_translated) > 1:
    print(train_translated[1][-1])

## Get the Data in the Appropriate Format for the Model

In [None]:
def get_data(raw_data, limit=None, test=False):
    """Turn raw CSV rows into word-tokenised premises and hypotheses (plus labels).

    Both sentences are stripped of ASCII punctuation and split on whitespace.
    Previously the premise was passed through ``' '.join(...)`` while still a
    string, which inserted a space between every character and produced
    character-level tokens for premises but word-level tokens for hypotheses.

    Args:
        raw_data: Sequence of rows. Training rows must have exactly seven fields
            ``(n, id, premise, hypothesis, lang_abv, language, label)``; test rows
            have the same fields without the trailing label.
        limit: Optional maximum number of leading rows to use. ``None`` (the
            default) uses every row.
        test: When true, rows carry no label and no label array is returned.

    Returns:
        ``(premises, hypotheses, Y)`` for training data or
        ``(premises, hypotheses)`` for test data. ``premises`` and
        ``hypotheses`` are lists of token lists; ``Y`` is the one-hot encoded
        label array with 3 classes.

    Raises:
        ValueError: If a row does not have the expected number of fields, or a
            label cannot be converted to ``int``.
    """
    rows = raw_data if limit is None else raw_data[:limit]
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def tokenize(text):
        # Accept an already-tokenised sentence as well as a plain string.
        if not isinstance(text, str):
            text = ' '.join(text)
        return text.translate(strip_punctuation).split()

    premises, hypotheses, labels = [], [], []
    for row in rows:
        # Tuple unpacking doubles as a check on the number of columns.
        if test:
            _n, _id, premise, hypothesis, _lang_abv, _language = row
        else:
            _n, _id, premise, hypothesis, _lang_abv, _language, label = row
            labels.append(int(label))
        premises.append(tokenize(premise))
        hypotheses.append(tokenize(hypothesis))

    if test:
        return (premises, hypotheses)

    Y = to_categorical(np.array(labels), 3)
    return (premises, hypotheses, Y)

In [None]:
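# Uncomment to delete the cached GloVe embedding matrix so that it is rebuilt on the next run.
# Note: GLOVE_STORE is defined in the model cell below, so that cell must have been run first.
# os.remove(GLOVE_STORE + '.npy')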

## The Model

In [None]:
# Tokenise the training rows: (premise token lists, hypothesis token lists, one-hot labels).
training = get_data(train_translated)

# Case-sensitive tokenizer with no character filtering, fitted on both sentence sets.
tokenizer = Tokenizer(filters='', lower=False)
tokenizer.fit_on_texts(training[0] + training[1])

# Tokenizer indices start at 1, so one extra slot is reserved for index 0.
VOCAB = 1 + len(tokenizer.word_counts)
LABELS = {'contradiction': 0, 'neutral': 1, 'entailment': 2}

# Sentence encoder. Alternatives that can be swapped in:
#   RNN = LSTM
#   RNN = lambda *args, **kwargs: Bidirectional(LSTM(*args, **kwargs))
#   RNN = GRU
#   RNN = None   (falls back to a summation of word embeddings)
RNN = lambda *args, **kwargs: Bidirectional(GRU(*args, **kwargs))
LAYERS = 1

# Embedding configuration.
USE_GLOVE = True
TRAIN_EMBED = False
EMBED_HIDDEN_SIZE = 300
SENT_HIDDEN_SIZE = 300
MAX_LEN = 42

# Optimisation and regularisation settings.
BATCH_SIZE = 512
PATIENCE = 4 # 8
MAX_EPOCHS = 42
DP = 0.4
L2 = 4e-6
ACTIVATION = 'relu'
OPTIMIZER = 'adam'

print('RNN / Embed / Sent = {}, {}, {}'.format(RNN, EMBED_HIDDEN_SIZE, SENT_HIDDEN_SIZE))
print('GloVe / Trainable Word Embeddings = {}, {}'.format(USE_GLOVE, TRAIN_EMBED))


def to_seq(X):
    """Map token lists to integer id sequences padded/truncated to MAX_LEN."""
    id_sequences = tokenizer.texts_to_sequences(X)
    return pad_sequences(id_sequences, maxlen=MAX_LEN)


def prepare_data(data):
    """Convert (premises, hypotheses, labels) into (premise ids, hypothesis ids, labels)."""
    return (to_seq(data[0]), to_seq(data[1]), data[2])


training = prepare_data(training)

print('Build model...')
print('Vocab size =', VOCAB)

GLOVE_STORE = 'precomputed_glove.weights'
if USE_GLOVE:
  # np.save appends '.npy' to the name it is given, so the cache lives at this path.
  glove_cache_path = GLOVE_STORE + '.npy'

  # Only reuse the cached matrix when it matches the current vocabulary size; a
  # cache built for a different tokenizer cannot be used as Embedding weights.
  cache_is_valid = (
    os.path.exists(glove_cache_path)
    and np.load(glove_cache_path).shape == (VOCAB, EMBED_HIDDEN_SIZE)
  )

  if not cache_is_valid:
    print('Computing GloVe')

    # prepare embedding matrix; words not found in GloVe stay all-zeros.
    embedding_matrix = np.zeros((VOCAB, EMBED_HIDDEN_SIZE))
    word_index = tokenizer.word_index
    found_words = set()

    # Stream the GloVe file and keep only vectors for words in our vocabulary,
    # instead of materialising the entire GloVe vocabulary in memory.
    with open('../input/glove840b300dtxt/glove.840B.300d.txt', encoding='utf-8') as f:
      for line in f:
        # Split from the right: a few tokens in glove.840B contain spaces, so
        # splitting from the left would put text into the float columns.
        values = line.rstrip().rsplit(' ', EMBED_HIDDEN_SIZE)
        if len(values) != EMBED_HIDDEN_SIZE + 1:
          # Malformed or truncated line - skip it.
          continue
        word = values[0]
        i = word_index.get(word)
        if i is None:
          continue
        embedding_matrix[i] = np.asarray(values[1:], dtype='float32')
        found_words.add(word)

    for word in word_index:
      if word not in found_words:
        print('Missing from GloVe: {}'.format(word))

    np.save(GLOVE_STORE, embedding_matrix)

  print('Loading GloVe')
  embedding_matrix = np.load(glove_cache_path)

  # Counts rows whose components sum to zero (this includes the unused index-0 row).
  print('Total number of null word embeddings:')
  print(np.sum(np.sum(embedding_matrix, axis=1) == 0))

  embed = Embedding(VOCAB, EMBED_HIDDEN_SIZE, weights=[embedding_matrix], input_length=MAX_LEN, trainable=TRAIN_EMBED)
else:
  # No pretrained vectors: use an Embedding layer with default initialisation.
  embed = Embedding(VOCAB, EMBED_HIDDEN_SIZE, input_length=MAX_LEN)

# Keyword arguments shared by every recurrent layer created below.
rnn_kwargs = dict(units=SENT_HIDDEN_SIZE, dropout=DP, recurrent_dropout=DP)
# Fallback sentence encoder used when RNN is falsy: sum over axis 1 of its input.
SumEmbeddings = keras.layers.Lambda(lambda x: K.sum(x, axis=1), output_shape=(SENT_HIDDEN_SIZE, ))

# Dense projection applied at every time step of the embedded sequence.
translate = TimeDistributed(Dense(SENT_HIDDEN_SIZE, activation=ACTIVATION))

# Two integer-id inputs of length MAX_LEN: one for the premise, one for the hypothesis.
premise = Input(shape=(MAX_LEN,), dtype='int32')
hypothesis = Input(shape=(MAX_LEN,), dtype='int32')

# The same embed layer object is applied to both inputs.
prem = embed(premise)
hypo = embed(hypothesis)

# Likewise the same translate layer object is applied to both branches.
prem = translate(prem)
hypo = translate(hypo)

# Optional stack of sequence-returning recurrent layers (skipped when LAYERS == 1).
# Each rnn object is applied to both branches; each branch gets its own BatchNormalization.
if RNN and LAYERS > 1:
  for l in range(LAYERS - 1):
    rnn = RNN(return_sequences=True, **rnn_kwargs)
    prem = rnn(prem)
    hypo = rnn(hypo)
    prem = BatchNormalization()(prem)
    hypo = BatchNormalization()(hypo)
# Final sentence encoder: a non-sequence-returning RNN, or SumEmbeddings if RNN is falsy.
rnn = SumEmbeddings if not RNN else RNN(return_sequences=False, **rnn_kwargs)
prem = rnn(prem)
hypo = rnn(hypo)
prem = BatchNormalization()(prem)
hypo = BatchNormalization()(hypo)


# Classifier head: concatenate both sentence encodings, then three
# Dense -> Dropout -> BatchNormalization blocks (L2 kernel penalty only if L2 is non-zero).
joint = concatenate([prem, hypo])
joint = Dropout(DP)(joint)
for i in range(3):
  joint = Dense(2 * SENT_HIDDEN_SIZE, activation=ACTIVATION, kernel_regularizer=l2(L2) if L2 else None)(joint)
  joint = Dropout(DP)(joint)
  joint = BatchNormalization()(joint)

# Softmax output with one unit per entry in LABELS.
pred = Dense(len(LABELS), activation='softmax')(joint)

model = Model(inputs=[premise, hypothesis], outputs=pred)
# categorical_crossentropy matches the one-hot labels produced by get_data.
model.compile(optimizer=OPTIMIZER, loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

print('_embeding')
checkpoint_filepath = './checkpoint'
# Save the best model during validation and bail out of training early if we're not improving.
# EarlyStopping is left on its default monitor. The checkpoint tracks *validation*
# accuracy: monitoring plain 'accuracy' would select the epoch that best fits the
# training split, not the one that generalises best to the held-out 20%.
callbacks = [
  EarlyStopping(patience=PATIENCE),
  ModelCheckpoint(checkpoint_filepath, save_best_only=True, save_weights_only=True, monitor='val_accuracy', mode='max'),
]
# validation_split=0.2 holds out part of the training arrays, which is what
# produces the val_* metrics used by both callbacks.
model.fit([training[0], training[1]], training[2], batch_size=BATCH_SIZE, epochs=MAX_EPOCHS, callbacks=callbacks, validation_split=0.2)

# Restore the best found model during validation
model.load_weights(checkpoint_filepath)

In [None]:
# Read the translated test CSV into a list of rows (each row is a list of strings).
# newline='' is what the csv module expects so that it can handle line endings
# (including newlines embedded in quoted fields) itself; the encoding is pinned so
# the multilingual text is decoded the same way regardless of the platform default.
# NOTE(review): assumes the file is UTF-8 (pandas' default when writing CSV) - confirm.
with open('../input/test-translated/test_translated.csv', 'r', newline='', encoding='utf-8') as read_obj:
    # pass the file object to reader() to get the reader object
    csv_reader = reader(read_obj)
    # Skip the header row; the default avoids StopIteration on an empty file.
    next(csv_reader, None)
    # Pass reader object to list() to get a list of lists
    test_translated = list(csv_reader)

In [None]:
def to_seq(X):
    """Map token lists to integer id sequences padded/truncated to MAX_LEN."""
    id_sequences = tokenizer.texts_to_sequences(X)
    return pad_sequences(id_sequences, maxlen=MAX_LEN)


def prepare_data(data):
    """Convert (premises, hypotheses) into a pair of padded id arrays (no labels for test data)."""
    return (to_seq(data[0]), to_seq(data[1]))


# Tokenise the unlabeled test rows, then convert them to padded id sequences.
testing = get_data(test_translated, test=True)
testing = prepare_data(testing)

# The same CSV is read again with pandas to obtain the id column for the submission.
test_df = pd.read_csv('../input/test-translated/test_translated.csv')

# Predicted class = index of the largest softmax output for each example.
class_probabilities = model.predict([testing[0], testing[1]])
predictions = list(np.argmax(class_probabilities, axis=1))

submission = test_df[['id']].copy()
submission['prediction'] = predictions

In [None]:
# Write the id/prediction table to submission.csv, omitting the DataFrame index column.
submission.to_csv("submission.csv", index = False)