In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import LSTM, GRU ,Dense , Embedding,Dropout ,Input,concatenate
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D,SpatialDropout1D,Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from keras.utils.np_utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

import re
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation/FutureWarning messages from pandas and Keras.
warnings.simplefilter("ignore")

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(123)
# Let pandas show up to 800 characters per column so long sentences are not cut off.
pd.set_option('max_colwidth', 800)

In [None]:
# Load the competition's training and test CSV files from the Kaggle input directory.
df_train = pd.read_csv("/kaggle/input/contradictory-my-dear-watson/train.csv")
df_test = pd.read_csv("/kaggle/input/contradictory-my-dear-watson/test.csv")

In [None]:
# EDA: preview the first rows of the training set.
df_train.head()

In [None]:
# Preview the first rows of the test set.
df_test.head()

In [None]:
# Show every field of the first training row.
df_train.iloc[0]

In [None]:
# Report the (rows, columns) shape of both frames.
print(f"Train: {df_train.shape}, Test: {df_test.shape}")

In [None]:
# Column dtypes and non-null counts of the training set.
df_train.info()

In [None]:
# Count missing values per column in both frames.
print(f"Train:\n{df_train.isnull().sum()} \n\nTest:\n{df_test.isnull().sum()}")

In [None]:
# Share of each language in the training set, drawn as a pie chart.
# np.unique returns the distinct language names together with how often each occurs.
language_values = df_train['language'].values
labels, freqs = np.unique(language_values, return_counts=True)

fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(freqs, labels=labels, autopct="%1.2f%%")
plt.show()

In [None]:
# Number of training rows per language, split by label.
# The columns are passed by keyword (data= / x= / hue=): recent seaborn releases treat
# the first positional argument as `data` and accept `x` only as a keyword, so the
# previous positional call no longer plotted the language column.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=df_train, x='language', hue='label', ax=ax)
ax.set_title("Label distribution per language")
ax.set_xlabel("language")
ax.set_ylabel("count")
fig.tight_layout()
plt.show();

In [None]:
# Keep only the English rows of each frame; all later steps work on these subsets.
train_is_english = df_train['language'].eq("English")
test_is_english = df_test["language"].eq("English")

train_eng = df_train.loc[train_is_english]
test_eng = df_test.loc[test_is_english]

In [None]:
def premise_hypothesis_join(x_list):
    """Join each group of strings with single spaces.

    Parameters
    ----------
    x_list : iterable of iterables of str
        One inner iterable per row, e.g. ``[premise, hypothesis]``.

    Returns
    -------
    pandas.DataFrame
        A frame with a single column ``"res"`` holding one joined string per row,
        in the same order as ``x_list``.
    """
    joined_rows = [" ".join(parts) for parts in x_list]
    return pd.DataFrame({"res": joined_rows})

In [None]:
# Keep premise and hypothesis as two separate text columns (they are tokenized and
# padded per column later on) instead of joining them into a single string.
train_join_eng = train_eng.loc[:, ['premise', 'hypothesis']]
test_join_eng = test_eng.loc[:, ['premise', 'hypothesis']]

In [None]:
# Preview the two selected text columns.
train_join_eng.head(2)

In [None]:
# Convert every word to lowercase. Splitting and re-joining on whitespace also
# collapses runs of whitespace into single spaces.
col_name = ['premise', 'hypothesis']
lower_dataset_train = train_join_eng.copy()
lower_dataset_test = test_join_eng.copy()

# Apply the same transformation to the training frame and then to the test frame.
for frame in (lower_dataset_train, lower_dataset_test):
    for col in col_name:
        frame[col] = frame[col].apply(
            lambda text: " ".join(word.lower() for word in text.split())
        )

lower_dataset_train.head(2)

In [None]:
# Remove punctuation: delete every character that is neither a word character nor whitespace.
# regex=True is passed explicitly because pandas 2.0 changed the default of
# Series.str.replace to a literal (non-regex) match, under which this pattern would
# silently remove nothing. The pattern is a raw string so "\w" / "\s" are not treated
# as (invalid) Python string escapes.
point_dataset_train = lower_dataset_train.copy()
point_dataset_test = lower_dataset_test.copy()
for col in col_name:
    point_dataset_train[col] = point_dataset_train[col].str.replace(r"[^\w\s]", "", regex=True)
    # Same cleaning for the test frame.
    point_dataset_test[col] = point_dataset_test[col].str.replace(r"[^\w\s]", "", regex=True)

point_dataset_train.head(2)

In [None]:
# Remove English stop words from both text columns.
stopWord_dataset_train = point_dataset_train.copy()
stopWord_dataset_test = point_dataset_test.copy()
sw = stopwords.words('english')
# Membership is tested once per word of every sentence, so look words up in a
# frozenset instead of scanning the list each time. `sw` itself stays a list.
sw_lookup = frozenset(sw)
# NOTE(review): punctuation was stripped in the previous step, so stop words that
# contain an apostrophe presumably no longer match their cleaned form - confirm
# whether that matters.
for col in col_name:
    stopWord_dataset_train[col] = stopWord_dataset_train[col].apply(
        lambda x: " ".join(word for word in x.split() if word not in sw_lookup)
    )
    # Same filtering for the test frame.
    stopWord_dataset_test[col] = stopWord_dataset_test[col].apply(
        lambda x: " ".join(word for word in x.split() if word not in sw_lookup)
    )

# data_train is the same object as stopWord_dataset_train (no copy), so the
# "target" column added below appears on both names.
data_train = stopWord_dataset_train
target = train_eng["label"].values
data_train["target"] = target

data_test = stopWord_dataset_test

data_train.sample(3)

In [None]:
# Count missing values per column after the text cleaning steps.
data_train.isnull().sum()

In [None]:
# Separate the text columns (x) from the labels (y).
x = data_train.drop(columns="target")
y = data_train["target"].values.tolist()

# Hold out 20% of the labelled rows. Despite the "_test" suffix, this hold-out part
# is what model.fit later receives as validation data.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, shuffle=True
)

# Print the size of each part, one number per line.
for part in (x_train, x_test, y_train, y_test):
    print(len(part))

# The label lists become NumPy arrays for Keras.
y_train, y_test = np.array(y_train), np.array(y_test)

In [None]:
# Tokenization: build one shared vocabulary from premise + hypothesis text.
# `x` holds all labelled English rows, so the vocabulary covers both the training
# and the held-out part.
joined_pairs = premise_hypothesis_join(x[['premise', 'hypothesis']].values.tolist())
token_x = joined_pairs["res"].values.tolist()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(token_x)

# Vocabulary size plus one extra slot: word ids start at 1, id 0 is left for padding.
num_words = 1 + len(tokenizer.word_index)

In [None]:
# Turn each text column into lists of word ids, keyed by column name.
x_train_token = {
    col: tokenizer.texts_to_sequences(x_train[col].values.tolist())
    for col in col_name
}
x_test_token = {
    col: tokenizer.texts_to_sequences(x_test[col].values.tolist())
    for col in col_name
}

In [None]:
# Reverse lookup table: word id -> word.
idx = tokenizer.word_index
inversed_idx = dict(zip(idx.values(), idx.keys()))
def token_to_string(liste):
    """Convert a sequence of word ids back into a space-separated string.

    Id 0 is skipped: it is the value pad_sequences fills with and has no entry in
    the vocabulary, so padded sequences can be decoded without a KeyError.

    Parameters
    ----------
    liste : iterable of int
        Word ids as produced by ``tokenizer.texts_to_sequences`` (optionally padded).

    Returns
    -------
    str
        The decoded words joined by single spaces.

    Raises
    ------
    KeyError
        If a non-zero id is not present in the tokenizer vocabulary.
    """
    text = [inversed_idx[i] for i in liste if i != 0]
    return " ".join(text)

In [None]:
# Word ids of one sample training premise (position 20).
x_train_token['premise'][20]

In [None]:
# Decode the same ids back to words as a sanity check of the tokenizer.
token_to_string( x_train_token['premise'][20])

In [None]:
# The cleaned premise text at the same position, for comparison with the decoded ids.
x_train['premise'].iloc[20]

In [None]:
# Choose one fixed sequence length per column so every model input has the same size.
max_token = {}

for col in col_name:
    # Token count of every sequence in the column (training and held-out rows together).
    num_tokens = np.array(list(map(len, x_train_token[col] + x_test_token[col])))

    # Length = mean + 2 * std of the token counts; the original author's note says
    # this covers about 95% of the sequences.
    max_token[col] = int(num_tokens.mean() + 2 * num_tokens.std())
    print(max_token[col])

In [None]:
# Pad (with trailing zeros) or truncate every sequence to its column's fixed length.
x_train_pad = {
    col: pad_sequences(x_train_token[col], maxlen=max_token[col], padding="post")
    for col in col_name
}
x_test_pad = {
    col: pad_sequences(x_test_token[col], maxlen=max_token[col], padding="post")
    for col in col_name
}

x_train_pad['premise'][20]

In [None]:
# Read the GloVe vectors into a dict: word -> float32 vector.
# Each line of the file is the word followed by its vector components, whitespace separated.
word2vec = {}
with open("/kaggle/input/glove6b50dtxt/glove.6B.50d.txt" , encoding="UTF-8") as glove_file:
    for row in glove_file:
        fields = row.split()
        word2vec[fields[0]] = np.asarray(fields[1:], dtype = "float32")


In [None]:
# Width of one embedding vector; the GloVe file loaded above is the "50d" variant.
embedding_size = 50

In [None]:
# Start from uniform random values in [-1, 1); rows without a GloVe vector keep them.
embedding_matrix = np.random.uniform(-1, 1, (num_words, embedding_size))

# Overwrite the row of every vocabulary word that has a pretrained GloVe vector.
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    embedding_vektor = word2vec.get(word)
    if embedding_vektor is None:
        continue
    embedding_matrix[i] = embedding_vektor

In [None]:
# Expected shape: (num_words, embedding_size).
embedding_matrix.shape

In [None]:
# Model: premise and hypothesis go through two separate embedding + bidirectional GRU
# stacks, are max-pooled over time, concatenated and classified into 3 classes.
#
# The intermediate tensors use descriptive names instead of `x` / `y`. Those two names
# already hold the feature frame and the label list from the split cell, and the
# tokenizer cell reads `x`; rebinding them to Keras tensors here broke any re-run of
# those cells after the model was built. Layer creation order is unchanged.
premise_input = Input(shape=(max_token["premise"],) ,name ="premise")
hypot_input = Input(shape=(max_token["hypothesis"], ), name = "hypothesis")

# Two independent, trainable embedding layers, both initialised from embedding_matrix.
premise_branch = Embedding(input_dim = num_words,
                           output_dim = embedding_size,
                           input_length = max_token["premise"],
                           weights = [embedding_matrix],
                           trainable = True,
                           name = "premise_embedding")(premise_input)

hypothesis_branch = Embedding(input_dim = num_words,
                              output_dim = embedding_size,
                              input_length = max_token["hypothesis"] ,
                              weights = [embedding_matrix],
                              trainable = True,
                              name = "hypothesis_embedding")(hypot_input)

# First bidirectional GRU layer of each branch (sequence output kept for the next layer).
premise_branch = Bidirectional(GRU(64, return_sequences = True, dropout = 0.1,
                                   recurrent_dropout = 0.1))(premise_branch)

hypothesis_branch = Bidirectional(GRU(64, return_sequences = True, dropout = 0.1,
                                      recurrent_dropout = 0.1))(hypothesis_branch)

# Second bidirectional GRU layer of each branch.
premise_branch = Bidirectional(GRU(64, return_sequences = True, dropout = 0.1,
                                   recurrent_dropout = 0.1))(premise_branch)

hypothesis_branch = Bidirectional(GRU(64, return_sequences = True, dropout = 0.1,
                                      recurrent_dropout = 0.1))(hypothesis_branch)

# Collapse the time axis by taking the maximum of each feature.
premise_branch = GlobalMaxPooling1D()(premise_branch)
hypothesis_branch = GlobalMaxPooling1D()(hypothesis_branch)

# Merge both sentence representations into one vector.
merged = concatenate([premise_branch, hypothesis_branch])

merged = Dense(512, activation = "relu")(merged)
merged = Dropout(0.1)(merged)
# NOTE(review): this Dense layer has no activation (it is linear) - confirm this is intended.
merged = Dense(256)(merged)
merged = Dropout(0.3)(merged)

# Probability of each of the 3 label classes.
out = Dense(3 , activation = "softmax", name = "output")(merged)

model = Model(inputs = [premise_input, hypot_input],
             outputs = [out])

In [None]:
# Render the model graph, including tensor shapes, to model.png.
plot_model(model, "model.png", show_shapes = True)

In [None]:

# The sparse loss variant takes the integer class ids in y_train / y_test directly,
# so no one-hot encoding is needed.
optimizer = Adam(learning_rate=1e-3)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
#train_y = to_categorical(y_train, 3)
#test_y = to_categorical(y_test, 3)

In [None]:
# Inputs are passed as dicts keyed by the names of the model's Input layers.
train_inputs = {
    'premise': x_train_pad['premise'],
    'hypothesis': x_train_pad['hypothesis'],
}
validation_inputs = {
    'premise': x_test_pad['premise'],
    'hypothesis': x_test_pad['hypothesis'],
}

# Train for 20 epochs; the held-out 20% split is evaluated after each epoch.
hist = model.fit(
    x=train_inputs,
    y={"output": y_train},
    validation_data=(validation_inputs, {"output": y_test}),
    epochs=20,
    batch_size=32,
)

In [None]:
# Persist only the trained weights (saving the full model is left disabled below).
#model.save("model.h5")
model.save_weights("model_weights.h5")

In [None]:
# Training vs. validation curves: accuracy on the left panel, loss on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,6))

# (axes, train key, val key, train label, val label, panel title)
panels = (
    (axes[0], 'accuracy', 'val_accuracy', 'train accuracy', 'val accuracy', "Model Accuracy"),
    (axes[1], 'loss', 'val_loss', 'train loss', 'val loss', "Model Loss"),
)

for ax, train_key, val_key, train_label, val_label, title in panels:
    ax.plot(hist.history[train_key], label=train_label, color='g', axes=ax)
    ax.plot(hist.history[val_key], label=val_label, color='r', axes=ax)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel("Epoch")
    ax.legend()

plt.show();

In [None]:
# Submission: encode the cleaned test text with the fitted tokenizer, then pad each
# column to the same fixed length used for training.
test_token = {
    col: tokenizer.texts_to_sequences(data_test[col].values.tolist())
    for col in col_name
}
test_pad = {
    col: pad_sequences(test_token[col], maxlen=max_token[col], padding="post")
    for col in col_name
}

In [None]:
# Map the padded test arrays to the names of the model's two Input layers.
test_input = dict(
    premise=test_pad['premise'],
    hypothesis=test_pad['hypothesis'],
)

In [None]:

# Class probabilities for every test row, one column per class.
y_predict = model.predict(test_input)

# Predicted label = index of the most probable class.
y_pred = np.argmax(y_predict, axis=-1)

In [None]:
# Pair every English test id with its predicted label.
# NOTE(review): only the English rows of df_test are included here - confirm whether
# the submission is expected to contain a prediction for every test id.
submission = test_eng[['id']].copy()
submission['prediction'] = y_pred
submission.sample(5)

In [None]:
# Write the submission file to the working directory without the index column.
submission.to_csv("submission.csv", index = False) 