In [None]:
import pandas as pd

# Location of the competition's training split inside the Kaggle input mount.
TRAIN_CSV_PATH = "/kaggle/input/contradictory-my-dear-watson/train.csv"

# Read the labelled examples into a DataFrame and preview the first rows.
train_data = pd.read_csv(TRAIN_CSV_PATH)
train_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Prepare training and validation sets.
# Features are every column except the last one; the last column is the target.
SEED = 42  # fixed seed so the split (and everything fitted on it) is reproducible

X = train_data.iloc[:, :-1]
y = train_data.iloc[:, -1]

# Hold out 30% of the rows for validation. Stratifying on the target keeps the
# class proportions the same in both splits.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=SEED, stratify=y
)

# Report split sizes instead of dumping the full frames.
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val:   {X_val.shape}, y_val:   {y_val.shape}")

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Share of training examples per language.
# Count rows per language with .size(): summing the numeric `label` column would
# weight each row by its class id (rows labelled 0 would contribute nothing),
# so the slices would not reflect how many examples each language has.
train_data.groupby(["language"]).size().plot(
    kind="pie",
    figsize=(14, 14),
    autopct='%1.1f%%',
    title="Training examples per language",
)

In [None]:
# Fit a word-level tokenizer on the training premises and hypotheses (all rows, regardless of language)
from keras.preprocessing.text import Tokenizer

# Stack both text columns of the training split into a single Series so the
# vocabulary is learned from premises and hypotheses together.
corpus_data = pd.concat([X_train[column] for column in ("premise", "hypothesis")])

# Lower-case the text and map words not seen during fitting to "<OOV>".
tokenizer = Tokenizer(oov_token="<OOV>", lower=True)
tokenizer.fit_on_texts(corpus_data)

# word -> integer index mapping learned from the corpus; print its size.
word_index = tokenizer.word_index
print(len(word_index))

In [None]:
# Model parameters
# Fixed number of tokens per premise/hypothesis fed to the model.
input_size = 1000
# Size of the embedding table. The Keras Tokenizer assigns indices starting at 1
# (0 is reserved, and is what padding uses), so the largest index is
# len(word_index) and the table needs one extra row to cover it.
num_words = len(word_index) + 1

In [None]:
from keras.preprocessing.sequence import pad_sequences
# Length of the longest training text, measured in characters (not tokens).
# Not used for padding below; the model's fixed input length is `input_size`.
max_train_seq_len = corpus_data.apply(lambda x: len(x)).max()


def encode_and_pad(texts, maxlen=input_size):
    """Convert raw texts to fixed-length integer sequences.

    Args:
        texts: iterable of strings (e.g. a DataFrame column).
        maxlen: target sequence length; defaults to the model's `input_size`
            so the arrays always match the Input layer shape.

    Returns:
        2-D integer array of shape (len(texts), maxlen), zero-padded at the end.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, padding="post", maxlen=maxlen)


train_premises = encode_and_pad(X_train["premise"])
train_hypothesis = encode_and_pad(X_train["hypothesis"])

val_premises = encode_and_pad(X_val["premise"])
val_hypothesis = encode_and_pad(X_val["hypothesis"])

In [None]:
# Convert y values to one-hot vectors for categorical cross-entropy.
# Use the public `keras.utils.to_categorical`; the private `np_utils` module is
# not available in recent Keras releases.
from keras.utils import to_categorical

# Fix the width explicitly so both splits always produce 3 columns (matching the
# 3-unit softmax output), even if a split happens to lack the highest class id.
NUM_CLASSES = 3

cat_y_train = to_categorical(y_train, num_classes=NUM_CLASSES)
cat_y_val = to_categorical(y_val, num_classes=NUM_CLASSES)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Two-input network: premise and hypothesis are encoded by the same embedding
# and the same LSTM (shared weights), then their encodings are concatenated and
# classified into 3 classes.
premise_input = keras.Input(
    shape=(input_size,), name="premise"
)  # Handle premises in a head
hypothesis_input = keras.Input(
    shape=(input_size,), name="hypothesis"
)  # Handle hypotheses in a different head

# mask_zero=True marks index 0 (the padding value) as "no token" so the LSTM
# skips padded timesteps; without it the final LSTM state is computed mostly
# from trailing padding, since inputs are post-padded to `input_size`.
embedding_layer = layers.Embedding(num_words, 64, mask_zero=True)
premise_features = embedding_layer(premise_input)
hypothesis_features = embedding_layer(hypothesis_input)

# A single LSTM instance applied to both inputs, so both share one encoder.
lstm_layer = layers.LSTM(128)
premise_features = lstm_layer(premise_features)
hypothesis_features = lstm_layer(hypothesis_features)

# Joint representation of the (premise, hypothesis) pair.
x = layers.concatenate([premise_features, hypothesis_features])

hidden_layer = layers.Dense(64, activation="relu")(x)
hidden_layer = layers.Dropout(0.2)(hidden_layer)

# Probability distribution over the 3 classes.
statement_class = layers.Dense(3, activation="softmax")(hidden_layer)

model = keras.Model(
    inputs=[premise_input, hypothesis_input],
    outputs=[statement_class],
)

# Render the architecture diagram as the cell output.
keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Configure training. `metrics` is passed as a list: newer Keras versions reject
# a bare string here.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Train on the training split and evaluate on the held-out validation split
# after every epoch, so over-fitting is visible in `history`.
# Input dict keys must match the names given to the model's Input layers.
history = model.fit(
    {"premise": train_premises, "hypothesis": train_hypothesis},
    cat_y_train,
    validation_data=(
        {"premise": val_premises, "hypothesis": val_hypothesis},
        cat_y_val,
    ),
    epochs=10,
    batch_size=101,
)