In [1]:
import csv
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tqdm
import transformers
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder

In [2]:
!wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip
!unzip snli_1.0.zip

In [3]:
# physical_devices = tf.config.list_physical_devices('GPU')
# tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [22]:
# Load only the sentence pair and the first annotator label from each split.
# The files are parsed as plain tab-separated text with CSV quoting disabled:
# with pandas' default quoting, a double quote inside a sentence is treated as a
# field delimiter, which can strip the character or merge neighbouring fields/rows.
col_list = ['sentence1', 'sentence2', 'label1']
snli_read_options = dict(sep="\t", usecols=col_list, quoting=csv.QUOTE_NONE)
train_data = pd.read_csv("snli_1.0/snli_1.0_train.txt", **snli_read_options)
val_data = pd.read_csv("snli_1.0/snli_1.0_dev.txt", **snli_read_options)
test_data = pd.read_csv("snli_1.0/snli_1.0_test.txt", **snli_read_options)

In [23]:
# Discard every row that has a missing value in any of the loaded columns,
# applying the same cleaning to all three splits.
train_data, val_data, test_data = (
    split.dropna() for split in (train_data, val_data, test_data)
)

In [24]:
# Map the textual annotator label onto an integer class id, stored in a new
# 'label' column of each split.
label_dic = {'contradiction' : 0, 'neutral' : 1, 'entailment' : 2}
for split_name, split_frame in (('train', train_data), ('val', val_data), ('test', test_data)):
    # Vectorised lookup instead of a Python-level row-wise apply.
    encoded_labels = split_frame['label1'].map(label_dic)
    unmapped = encoded_labels.isna()
    if unmapped.any():
        # Keep the fail-fast behaviour of a plain dict lookup: unknown labels
        # must not slip through as NaN.
        unknown_labels = split_frame.loc[unmapped, 'label1'].unique().tolist()
        raise KeyError(f"Unknown label(s) in {split_name} split: {unknown_labels}")
    split_frame['label'] = encoded_labels.astype('int64')

In [25]:
# Length of the longest training sentence, counted in whitespace-separated words,
# taken over both sentence columns.
# NOTE(review): this value is later passed as max_seq_length to
# create_bert_input_features, which truncates *tokenizer* output to it (minus two
# special tokens). Tokenized sentences may be longer than their word count, so
# some inputs can be truncated. Confirm this is intended.
MAX_SEQ_LEN = max(
    train_data[column].str.split().str.len().max()
    for column in ('sentence1', 'sentence2')
)
MAX_SEQ_LEN

In [26]:
# Display the training split: sentence pair, annotator label and the integer 'label' column.
train_data

In [27]:
# Display the validation (dev) split with the same columns as the training split.
val_data

In [28]:
# Display the test split with the same columns as the training split.
test_data

In [None]:
# for i in range(train_data.shape[0]):
#     train_data.iloc[i, 0]=' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", train_data.iloc[i, 0]).split())
#     train_data.iloc[i, 1]=' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", train_data.iloc[i, 1]).split())

# for i in range(val_data.shape[0]):
#     val_data.iloc[i, 0]=' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", val_data.iloc[i, 0]).split())
#     val_data.iloc[i, 1]=' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", val_data.iloc[i, 1]).split())

# for i in range(test_data.shape[0]):
#     test_data.iloc[i, 0]=' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", test_data.iloc[i, 0]).split())
#     test_data.iloc[i, 1]=' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", test_data.iloc[i, 1]).split())

In [29]:
# Load the pretrained tokenizer that belongs to the 'distilbert-base-cased' checkpoint;
# the same checkpoint name is used for the model further below.
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-cased')

In [30]:
def create_bert_input_features(tokenizer, docs, max_seq_length):
    """Convert raw sentences into fixed-length input ids and attention masks.

    Each document is tokenized, truncated so that two special tokens still fit,
    wrapped as ``<cls> tokens <sep>``, converted to vocabulary ids and
    right-padded to ``max_seq_length``.

    Args:
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``. If it also defines ``cls_token``,
            ``sep_token`` or ``pad_token_id`` those are used; otherwise the
            function falls back to ``'[CLS]'``, ``'[SEP]'`` and ``0``.
        docs: Iterable of strings to encode.
        max_seq_length: Length of every encoded row, special tokens included.
            Must be at least 2.

    Returns:
        ``np.ndarray`` of shape ``(2, n_docs, max_seq_length)``. Index 0 holds
        the token ids, index 1 the attention masks (1 for real tokens, 0 for
        padding), so the result can be unpacked as ``ids, masks = ...``.

    Raises:
        ValueError: If ``max_seq_length`` is smaller than 2, because the two
            special tokens would not fit and rows would end up with uneven length.
    """
    if max_seq_length < 2:
        raise ValueError(
            "max_seq_length must be at least 2 to hold the two special tokens, "
            f"got {max_seq_length}"
        )

    # Prefer the tokenizer's own special tokens; fall back to the BERT defaults.
    cls_token = getattr(tokenizer, 'cls_token', None) or '[CLS]'
    sep_token = getattr(tokenizer, 'sep_token', None) or '[SEP]'
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0

    max_content_tokens = max_seq_length - 2
    all_ids, all_masks = [], []
    for doc in tqdm.tqdm(docs, desc="Converting docs to features"):
        # Truncate first so that the special tokens always survive.
        tokens = tokenizer.tokenize(doc)[:max_content_tokens]
        ids = list(tokenizer.convert_tokens_to_ids([cls_token] + tokens + [sep_token]))
        n_pad = max_seq_length - len(ids)
        all_masks.append([1] * len(ids) + [0] * n_pad)
        all_ids.append(ids + [pad_id] * n_pad)

    if not all_ids:
        # Keep the documented 3-D shape even when there is nothing to encode.
        return np.empty((2, 0, max_seq_length), dtype=np.int64)
    return np.array([all_ids, all_masks])

In [31]:
# Pull the two sentence columns and the integer label column out of every split
# as plain arrays (via .values), in the order: first sentence, second sentence, label.
_pair_columns = ('sentence1', 'sentence2', 'label')

train_data_1, train_data_2, train_labels = (
    train_data[column].values for column in _pair_columns
)
val_data_1, val_data_2, val_labels = (
    val_data[column].values for column in _pair_columns
)
test_data_1, test_data_2, test_labels = (
    test_data[column].values for column in _pair_columns
)

In [32]:
def _encode_sentences(sentences):
    """Encode one array of sentences with the shared tokenizer and MAX_SEQ_LEN."""
    return create_bert_input_features(tokenizer, sentences, max_seq_length=MAX_SEQ_LEN)


# Every call yields a pair (token ids, attention masks) for one sentence column.
train_data_1_features_ids, train_data_1_features_masks = _encode_sentences(train_data_1)
train_data_2_features_ids, train_data_2_features_masks = _encode_sentences(train_data_2)

val_data_1_features_ids, val_data_1_features_masks = _encode_sentences(val_data_1)
val_data_2_features_ids, val_data_2_features_masks = _encode_sentences(val_data_2)

test_data_1_features_ids, test_data_1_features_masks = _encode_sentences(test_data_1)
test_data_2_features_ids, test_data_2_features_masks = _encode_sentences(test_data_2)

In [33]:
# --- Inputs: one (token ids, attention mask) pair per sentence -----------------
inp_1_id, inp_1_mask = (
    tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype='int32', name=input_name)
    for input_name in ("bert_input_1_ids", "bert_input_1_masks")
)
inputs_1 = [inp_1_id, inp_1_mask]

inp_2_id, inp_2_mask = (
    tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype='int32', name=input_name)
    for input_name in ("bert_input_2_ids", "bert_input_2_masks")
)
inputs_2 = [inp_2_id, inp_2_mask]

# --- Shared encoder: a single pretrained DistilBERT applied to both sentences --
bert_layer = transformers.TFDistilBertModel.from_pretrained('distilbert-base-cased')
# Mark every sub-layer as non-trainable so only the classifier head is fitted.
for sublayer in bert_layer.layers:
    sublayer.trainable = False

# Element 0 of the encoder output is used as the per-token hidden states.
hidden_state_1, hidden_state_2 = (
    bert_layer(sentence_inputs)[0] for sentence_inputs in (inputs_1, inputs_2)
)

# Keep only the vector at sequence position 0 of each sentence.
pooled_output_1 = hidden_state_1[:, 0]
pooled_output_2 = hidden_state_2[:, 0]

# --- Classifier head: concatenate both vectors, three Dense+Dropout stages -----
concatenated = tf.keras.layers.Concatenate(axis=1)([pooled_output_1, pooled_output_2])
dense1 = tf.keras.layers.Dense(units=256, activation='relu')(concatenated)
drop1 = tf.keras.layers.Dropout(rate=0.1)(dense1)
dense2 = tf.keras.layers.Dense(units=256, activation='relu')(drop1)
drop2 = tf.keras.layers.Dropout(rate=0.1)(dense2)
dense3 = tf.keras.layers.Dense(units=256, activation='relu')(drop2)
drop3 = tf.keras.layers.Dropout(rate=0.1)(dense3)
# Three output units with softmax: one probability per label class.
output = tf.keras.layers.Dense(units=3, activation='softmax')(drop3)

model = tf.keras.Model(inputs=inputs_1 + inputs_2, outputs=output)
model.summary()

In [34]:
# Compile with the default-configured Adam optimizer, categorical cross-entropy
# (targets are supplied one-hot encoded) and accuracy + AUC as tracked metrics.
# Explicit optimizer kept for reference: tf.optimizers.Adam(learning_rate=1e-3, epsilon=1e-08)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()],
)

In [35]:
def _as_column(labels):
    """Return the labels as a fresh 2-D array with a single column."""
    return np.array(labels).reshape(-1, 1)


# Fit the encoder on the training labels only, then reuse it for the other splits
# so that all three share the same column order.
enc = OneHotEncoder()
train_labels_one_hot = enc.fit_transform(_as_column(train_labels)).toarray()
val_labels_one_hot = enc.transform(_as_column(val_labels)).toarray()
test_labels_one_hot = enc.transform(_as_column(test_labels)).toarray()

In [36]:
# Sanity check: show the shapes of the one-hot label matrices for all three splits.
train_labels_one_hot.shape, val_labels_one_hot.shape, test_labels_one_hot.shape

In [None]:
# Train on the encoded training pairs and evaluate on the validation pairs after
# every epoch. The input lists follow the order of the model's inputs:
# sentence-1 ids, sentence-1 masks, sentence-2 ids, sentence-2 masks.
history = model.fit(
    x=[
        train_data_1_features_ids, train_data_1_features_masks,
        train_data_2_features_ids, train_data_2_features_masks,
    ],
    y=train_labels_one_hot,
    validation_data=(
        [
            val_data_1_features_ids, val_data_1_features_masks,
            val_data_2_features_ids, val_data_2_features_masks,
        ],
        val_labels_one_hot,
    ),
    epochs=10,
    batch_size=64,
    shuffle=True,
)

In [None]:
# Accuracy per epoch: training curve first, validation curve second
# (the legend entries rely on this order).
for _curve_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[_curve_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Loss per epoch: training curve first, validation curve second
# (the legend entries rely on this order).
for _curve_key in ('loss', 'val_loss'):
    plt.plot(history.history[_curve_key])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Predict class probabilities for the test pairs and reduce each row to the index
# of its most probable class; the integer test labels serve as ground truth.
Y_pred = np.argmax(
    model.predict([
        test_data_1_features_ids, test_data_1_features_masks,
        test_data_2_features_ids, test_data_2_features_masks,
    ]),
    axis=1,
)
Y_test = test_labels

In [None]:
# Sanity check: show the shapes of the predicted and true label arrays side by side.
Y_pred.shape, Y_test.shape

In [None]:
# Confusion matrix of the test predictions, drawn as an annotated heatmap.
# The tick labels are listed in the order of the integer class ids 0, 1, 2.
ticks = ['Contradiction', 'Neutral', 'Entailment']
conf_mat = confusion_matrix(Y_test, Y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt='g', xticklabels=ticks, yticklabels=ticks)
plt.xlabel("Predicted labels")
plt.ylabel("True labels")

In [None]:
# Classification report for the test predictions, drawn as a heatmap.
# The report dict is turned into a frame, its last row is dropped and only the
# first three columns are kept before transposing.
# NOTE(review): presumably this removes the 'support' row and keeps the three
# class columns. This depends on scikit-learn's report layout, so confirm.
report = classification_report(Y_test, Y_pred, output_dict=True, target_names=ticks)
per_class_scores = pd.DataFrame(report).iloc[:-1, :3].T

plt.figure(figsize=(8, 6))
sns.heatmap(per_class_scores, annot=True)