In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
input_tree = os.walk('/kaggle/input')
for dirname, _, filenames in input_tree:
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

In [None]:
# Placeholder Weights & Biases key, set only to silence the warning.
os.environ.update({"WANDB_API_KEY": "0"})

In [None]:
!pip install transformers

In [None]:
from transformers import BertTokenizer, TFBertModel
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Choose a distribution strategy: use a TPU when one can be resolved, otherwise
# fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # ValueError is treated here as "no TPU available".
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count for BOTH branches (previously this line was nested in
# the `except` block, so nothing was printed when a TPU was actually in use).
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [None]:
# Load the competition's train and test splits (train first, then test).
train, test = map(pd.read_csv, (
    "/kaggle/input/contradictory-my-dear-watson/train.csv",
    "/kaggle/input/contradictory-my-dear-watson/test.csv",
))

In [None]:
# Share of each language in the training set, drawn as a pie chart.
labels, frequencies = np.unique(train.language.values, return_counts=True)

fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(frequencies, labels=labels, autopct='%1.1f%%')
plt.show()

### **BERT**




In [None]:
# Single source of truth for the checkpoint: the tokenizer here and the encoder
# loaded in build_model() must come from the same pretrained model.
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
# def encode_sentence(s):
#    tokens = list(tokenizer.tokenize(s))
#    tokens.append('[SEP]')
   
#    return tokenizer.convert_tokens_to_ids(tokens)

In [None]:
# encode_sentence("I love machine learning")

In [None]:
# Filled as a side effect of encode_sentence(): one entry per encoded sentence.
lengths = []

def encode_sentence(s, max_len=150):
  """Tokenize `s` and return exactly `max_len` token ids.

  The sentence is split with the module-level `tokenizer`, truncated to
  `max_len` tokens and right-padded with '[PAD]' so every call returns a list
  of the same length. The length of the padded/truncated token list is also
  appended to the module-level `lengths` list.

  Args:
    s: the sentence to encode.
    max_len: fixed output length (defaults to 150, the value previously
      hard-coded in this function).

  Returns:
    A list of `max_len` token ids.
  """
  tokens = list(tokenizer.tokenize(s))[:max_len]
  # A non-positive multiplier yields an empty list, so full-length inputs get no padding.
  tokens = tokens + ['[PAD]'] * (max_len - len(tokens))
  lengths.append(len(tokens))
  return tokenizer.convert_tokens_to_ids(tokens)

In [None]:
def bert_encode(hypotheses, premises, tokenizer):
  """Build the three BERT input tensors for hypothesis/premise pairs.

  Each row is laid out as
      [CLS] <hypothesis ids> [SEP] <premise ids> [SEP]
  where the sentence ids come from encode_sentence(), which pads every
  sentence to a fixed length with '[PAD]'.

  Args:
    hypotheses: sequence of hypothesis strings.
    premises: sequence of premise strings, aligned with `hypotheses`.
    tokenizer: tokenizer used to look up the '[CLS]', '[SEP]' and '[PAD]' ids.
      Note that encode_sentence() itself uses the module-level `tokenizer`.

  Returns:
    dict with dense tensors:
      'input_word_ids': token ids,
      'input_mask': 1 for real tokens, 0 for '[PAD]' positions,
      'input_type_ids': 0 for [CLS] + hypothesis segment, 1 for premise segment.
  """
  num_examples = len(hypotheses)
  cls_id, sep_id, pad_id = tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]', '[PAD]'])

  # One single-token row per example, so they can be concatenated along the last axis.
  cls = [[cls_id]] * num_examples
  sep = [[sep_id]] * num_examples

  sentence1 = tf.ragged.constant([
      encode_sentence(s)
      for s in np.array(hypotheses)])

  sentence2 = tf.ragged.constant([
      encode_sentence(s)
      for s in np.array(premises)])

  sentence1 = tf.concat([sentence1, sep], axis=-1)
  sentence2 = tf.concat([sentence2, sep], axis=-1)

  input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1).to_tensor()

  # The mask must be 0 on padding. It used to be all ones, which made the
  # encoder attend to the '[PAD]' tokens inserted by encode_sentence().
  input_mask = tf.cast(tf.not_equal(input_word_ids, pad_id), input_word_ids.dtype)

  type_cls = tf.zeros_like(cls)
  type_s1 = tf.zeros_like(sentence1)
  type_s2 = tf.ones_like(sentence2)

  input_type_ids = tf.concat(
      [type_cls, type_s1, type_s2], axis=-1).to_tensor()

  inputs = {
      'input_word_ids': input_word_ids,
      'input_mask': input_mask,
      'input_type_ids': input_type_ids}

  return inputs

In [None]:
# Start from an empty length log, then encode the train split followed by the test split.
lengths = []
train_input, test_input = (
    bert_encode(frame.hypothesis.values, frame.premise.values, tokenizer)
    for frame in (train, test)
)

In [None]:
# Line plot of the lengths recorded by encode_sentence() during encoding.
plt.gca().plot(lengths)

In [None]:
# Row length produced by bert_encode(): [CLS] + (150 ids + [SEP]) * 2 = 303.
max_len = 303

def build_model():
    """Build and compile the BERT-based 3-way classifier.

    Loads the pretrained encoder named by the module-level `model_name`, feeds
    it the three `max_len`-long int32 inputs produced by bert_encode(), takes
    the hidden state at position 0 (the [CLS] slot) and passes it through two
    Dense layers and a 3-unit softmax.

    Returns:
        A compiled tf.keras.Model trained with sparse categorical cross-entropy
        (integer class labels) and reporting accuracy.
    """
    bert_encoder = TFBertModel.from_pretrained(model_name)
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_type_ids")

    # Element 0 of the encoder output is indexed per token below.
    sequence_output = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]
    cls_vector = sequence_output[:, 0, :]

    # Adding more FC layers
    # NOTE(review): FC1/FC2 have no activation, so together they are still a
    # single linear map -- confirm whether a non-linearity was intended.
    FC1 = tf.keras.layers.Dense(2048)(cls_vector)
    FC2 = tf.keras.layers.Dense(512)(FC1)
    output = tf.keras.layers.Dense(3, activation='softmax')(FC2)

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Build the model inside the scope of the selected distribution strategy and print its layout.
distribution_scope = strategy.scope()
with distribution_scope:
    model = build_model()
    model.summary()

In [None]:
# Number of training labels.
train["label"].values.size

In [None]:
# Fine-tune for 5 epochs, holding out 20% of the training rows for validation.
model.fit(
    x=train_input,
    y=train.label.values,
    batch_size=64,
    epochs=5,
    verbose=1,
    validation_split=0.2,
)

In [None]:
# Predicted class = index of the largest softmax output in each row.
class_probabilities = model.predict(test_input)
predictions = list(np.argmax(class_probabilities, axis=1))

# Submission frame: the test ids plus one `prediction` column.
submission = test.id.copy().to_frame()
submission['prediction'] = predictions

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf="submission.csv", index=False)

In [None]:
!pip install transformers

In [None]:
#pip install -U deep_translator
#from deep_translator import GoogleTranslator
#translator = Translator()
#def translate_sentence(x):
#  return GoogleTranslator('auto', 'en').translate(x)
#train.premise[train.lang_abv!= 'en']=train.premise[train.lang_abv!= 'en'].apply(lambda x: translate_sentence(x))
#train.hypothesis[train.lang_abv!= 'en']=train.hypothesis[train.lang_abv!= 'en'].apply(lambda x: translate_sentence(x))
#train.to_csv("/content/drive/MyDrive/Contradictory, My Dear Watson/train_translated.csv")

In [None]:
# Training set read from the separately uploaded translated CSV.
translated_csv_path = "../input/train-translated/train_translated.csv"
train_translated = pd.read_csv(translated_csv_path)

In [None]:
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.regularizers import l2

In [None]:
def extract_tokens_from_binary_parse(parse):
    """Return the word tokens of a binary-parse string.

    Structural parentheses are turned into spaces first; the '-LRB-' and
    '-RRB-' placeholders are then restored to literal '(' and ')' before the
    string is split on whitespace.
    """
    without_brackets = parse.translate(str.maketrans('()', '  '))
    restored = without_brackets.replace('-LRB-', '(').replace('-RRB-', ')')
    return restored.split()
def yield_examples(fn, skip_no_majority=True, limit=None):
  """Yield (label, sentence1, sentence2) tuples from a JSON-lines file.

  Each non-blank line of `fn` is parsed as JSON and must provide the keys
  'gold_label', 'sentence1_binary_parse' and 'sentence2_binary_parse'. The two
  parses are flattened to space-joined token strings.

  Args:
    fn: path of the JSON-lines file.
    skip_no_majority: when true, examples whose gold label is '-' are skipped.
    limit: maximum number of lines to read; a falsy value (None or 0) means
      no limit. Skipped lines still count towards the limit.

  Yields:
    (label, s1, s2) tuples of strings.
  """
  import json  # local import: `json` is not imported anywhere else in this notebook

  # The context manager closes the file even if the consumer stops iterating early.
  with open(fn, encoding='utf-8') as handle:
    for i, line in enumerate(handle):
      # `>=` so that exactly `limit` lines are read (the former `>` read limit + 1).
      if limit and i >= limit:
        break
      if not line.strip():
        continue  # tolerate blank lines instead of failing in json.loads
      data = json.loads(line)
      label = data['gold_label']
      if skip_no_majority and label == '-':
        continue
      s1 = ' '.join(extract_tokens_from_binary_parse(data['sentence1_binary_parse']))
      s2 = ' '.join(extract_tokens_from_binary_parse(data['sentence2_binary_parse']))
      yield (label, s1, s2)

def get_data(fn, limit=None):
  """Load sentence pairs and one-hot labels from a JSON-lines file.

  Reads examples with yield_examples(), prints the maximum whitespace-token
  count of the left and right sentences (0 when there are no examples), and
  one-hot encodes the labels.

  Args:
    fn: path of the JSON-lines file.
    limit: forwarded to yield_examples().

  Returns:
    (left, right, Y): two lists of sentence strings and a float32 array of
    shape (n_examples, 3) with columns ordered contradiction, neutral,
    entailment.

  Raises:
    KeyError: if an example carries a label outside those three classes.
  """
  raw_data = list(yield_examples(fn=fn, limit=limit))
  left = [s1 for _, s1, _ in raw_data]
  right = [s2 for _, _, s2 in raw_data]
  # `default=0` keeps empty input from raising ValueError in max().
  print(max((len(x.split()) for x in left), default=0))
  print(max((len(x.split()) for x in right), default=0))
  LABELS = {'contradiction': 0, 'neutral': 1, 'entailment': 2}
  # Explicit integer dtype so an empty label list is still a valid index array.
  Y = np.array([LABELS[l] for l, _, _ in raw_data], dtype=int)
  # One-hot via identity-matrix row lookup; replaces the undefined `np_utils.to_categorical`.
  Y = np.eye(len(LABELS), dtype='float32')[Y]

  return left, right, Y

In [None]:
# Show the first premise of the translated training set.
first_premise = train_translated["premise"][0]
print(first_premise)

In [None]:
from gensim.models import word2vec
# Tiny toy corpus used to try out gensim's Word2Vec.
corpus = [
    'Text of the first document.',
    'Text of the second document made longer.',
    'Number three.',
    'This is number four.',
]
# The model is given each sentence already split into a list of words.
tokenized_sentences = list(map(str.split, corpus))
model1 = word2vec.Word2Vec(tokenized_sentences, min_count=1)

In [None]:
# Turn the first rows of the translated training set into word-index sequences.
training = train_translated
LABELS = {'contradiction': 0, 'neutral': 1, 'entailment': 2}
PREVIEW_ROWS = 20  # only this many rows are encoded, as before

# One vocabulary for premises AND hypotheses: the models below look both
# sentences up in the same embedding, so their indices must agree.
# fit_on_texts() accumulates counts across calls.
# NOTE: from here on `tokenizer` is a Keras Tokenizer and no longer the BERT tokenizer.
tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(training.premise)
tokenizer.fit_on_texts(training.hypothesis)
tokenizer2 = tokenizer  # kept as an alias for the former hypothesis-only tokenizer

# Lowest index from the tokenizer is 1, so 0 must be included in the vocab count.
# This has to run after fitting (it used to read `tokenizer` before it was created).
VOCAB = len(tokenizer.word_counts) + 1

# texts_to_sequences() expects a collection of texts; passing one string at a
# time made it treat every character as a separate text.
training_premise_seq_vec = tokenizer.texts_to_sequences(training.premise[:PREVIEW_ROWS])
print(len(training_premise_seq_vec))

training_hypothesis_seq_vec = tokenizer2.texts_to_sequences(training.hypothesis[:PREVIEW_ROWS])
print(len(training_hypothesis_seq_vec))

training_label = training.label
print(training_label)

# Use the label column directly instead of tokenizing the printed form of the Series.
# assumes `label` holds integer class ids as in train.csv -- TODO confirm for the translated file
labels1 = training.label[:PREVIEW_ROWS].astype('int32').tolist()
print(labels1)

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Input

max_tokens = 1000
MAX_LEN = 1000

# Hyper-parameters used below. They were only assigned in a later cell, so this
# cell failed on a fresh kernel; the values are the ones that later cell uses.
EMBED_HIDDEN_SIZE = 300
SENT_HIDDEN_SIZE = 300
DP = 0.2
ACTIVATION = 'relu'

# NOTE(review): every index in both inputs must be < VOCAB -- confirm the
# premise and hypothesis sequences come from one shared vocabulary.
embed = Embedding(VOCAB, EMBED_HIDDEN_SIZE, input_length=MAX_LEN)

# The Input layers need fixed-length rows, so pad/truncate every sequence to MAX_LEN.
prem1 = pad_sequences(training_premise_seq_vec, maxlen=MAX_LEN)
hypo1 = pad_sequences(training_hypothesis_seq_vec, maxlen=MAX_LEN)

premise = Input(shape=(MAX_LEN,),dtype='int32')
hypothesis = Input(shape=(MAX_LEN,),dtype='int32')

# Collapse the token axis to one vector per sentence; without this the model
# emits one prediction per token and cannot be fit against one label per pair.
pool = GlobalAveragePooling1D()
prem = pool(embed(premise))
hypo = pool(embed(hypothesis))

joint = keras.layers.concatenate([prem, hypo],dtype='float32')
joint = Dropout(DP, dtype='float32')(joint)
for i in range(3):
  joint = Dense(2 * SENT_HIDDEN_SIZE, activation=ACTIVATION)(joint)
  joint = Dropout(DP)(joint)
  joint = BatchNormalization()(joint)
pred = Dense(3, activation='softmax')(joint)

model = keras.Model([premise, hypothesis], pred)

for model_input in model.inputs:
  print(model_input.shape)
for l in model.layers:
  print(l.name, l.input_shape, l.dtype)

# Integer class ids + 3-way softmax -> sparse categorical cross-entropy
# (binary cross-entropy does not match this output layer).
model.compile(
  optimizer='adam',
  loss='sparse_categorical_crossentropy',
  metrics=['accuracy'],
)
model.summary()
model.fit([prem1, hypo1], np.array(labels1))

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
max_tokens = 1000

# Pad each side to its longest sequence so both become rectangular arrays.
prem = pad_sequences(training_premise_seq_vec)
hypo = pad_sequences(training_hypothesis_seq_vec)

# A Sequential model takes ONE input: feed premise and hypothesis as a single
# concatenated token sequence per example.
pair_ids = np.concatenate([prem, hypo], axis=1)
# The embedding only has rows 0..max_tokens; map larger ids to the padding id 0.
pair_ids[pair_ids > max_tokens] = 0
pair_labels = np.asarray(labels1, dtype='int32')

model = keras.Sequential()
model.add(Embedding(max_tokens + 1, 128))
model.add(LSTM(64))
model.add(Dense(32, activation="relu"))
# Three classes -> 3-way softmax (was a single sigmoid unit).
model.add(Dense(3, activation="softmax"))
model.compile(
  optimizer='adam',
  loss='sparse_categorical_crossentropy',
  metrics=['accuracy'],
)
# fit() previously received no targets at all.
model.fit(pair_ids, pair_labels, epochs=2, validation_split=0.2, batch_size=512)
model.summary()

In [None]:

# Build, train and evaluate a sentence-pair classifier.
# NOTE(review): as written this cell relies on several names that are not
# defined or imported anywhere in the visible notebook (`Input`, `rnn_kwargs`,
# `SumEmbeddings`, `merge`, `OPTIMIZER`, `tempfile`, `validation`) -- it will
# not run on a fresh kernel until those are supplied.

# Lowest index from the tokenizer is 1 - we need to include 0 in our vocab count

print('Build model...')
print('Vocab size =', VOCAB)
# Hyper-parameters / switches for this cell.
RNN = None  # falsy, so the `SumEmbeddings` branch below is selected
LAYERS = 1
USE_GLOVE = True  # not read anywhere in this cell
TRAIN_EMBED = False  # not read anywhere in this cell
EMBED_HIDDEN_SIZE = 300
SENT_HIDDEN_SIZE = 300
BATCH_SIZE = 512
PATIENCE = 4 # 8
MAX_EPOCHS = 42
MAX_LEN = 42
DP = 0.2
L2 = 4e-6
ACTIVATION = 'relu'

# NOTE(review): `Input` is not imported in the visible cells.
premise = Input(shape=(MAX_LEN,), dtype='int32')
hypothesis = Input(shape=(MAX_LEN,), dtype='int32')

# NOTE(review): `prem` / `hypo` are not assigned in this cell before use, so
# they are whatever earlier cells left behind; `premise` / `hypothesis` above
# are never connected to them here -- presumably an embedding step is missing.
if RNN and LAYERS > 1:
  for l in range(LAYERS - 1):
    rnn = RNN(return_sequences=True, **rnn_kwargs)
    prem = BatchNormalization()(rnn(prem))
    hypo = BatchNormalization()(rnn(hypo))
rnn = SumEmbeddings if not RNN else RNN(return_sequences=False, **rnn_kwargs)
prem = rnn(prem)
hypo = rnn(hypo)
prem = BatchNormalization()(prem)
hypo = BatchNormalization()(hypo)

# NOTE(review): `merge(..., mode='concat')` and `W_regularizer=` look like old
# Keras 1.x API -- confirm against the installed Keras version.
joint = merge([prem, hypo], mode='concat')
joint = Dropout(DP)(joint)
# Three identical Dense -> Dropout -> BatchNormalization stages.
for i in range(3):
  joint = Dense(2 * SENT_HIDDEN_SIZE, activation=ACTIVATION, W_regularizer=l2(L2) if L2 else None)(joint)
  joint = Dropout(DP)(joint)
  joint = BatchNormalization()(joint)

pred = Dense(len(LABELS), activation='softmax')(joint)

# NOTE(review): `word2vec` is the gensim module imported earlier; the
# compile()/fit() calls below suggest a Keras `Model` was intended -- confirm.
model = word2vec.Word2Vec(input=[premise, hypothesis], output=pred)
model.compile(optimizer=OPTIMIZER, loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

print('Training')
_, tmpfn = tempfile.mkstemp()
# Save the best model during validation and bail out of training early if we're not improving
callbacks = [EarlyStopping(patience=PATIENCE), ModelCheckpoint(tmpfn, save_best_only=True, save_weights_only=True)]
# NOTE(review): `training` and `test` are pandas DataFrames in the earlier
# cells, so `[0]`, `[1]`, `[2]` would be column lookups; presumably
# (left, right, Y) tuples such as get_data() returns are expected -- confirm.
# `nb_epoch` also looks like a Keras 1.x keyword.
model.fit([training[0], training[1]], training[2], batch_size=BATCH_SIZE, nb_epoch=MAX_EPOCHS, validation_data=([validation[0], validation[1]], validation[2]), callbacks=callbacks)

# Restore the best found model during validation
model.load_weights(tmpfn)

loss, acc = model.evaluate([test[0], test[1]], test[2], batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))