In [None]:
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Input, Dense, Embedding, GlobalMaxPooling1D, Conv1D, LSTM, Bidirectional, Dropout
from tensorflow.keras.metrics import CategoricalAccuracy, AUC, Precision, Recall, BinaryAccuracy

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Apply seaborn's default theme to the figures drawn below.
sns.set_theme()

In [None]:
# Feature and label arrays used by every model below.
X = np.load("./dataset/FYP_train_X_SEQ.npy")
Y = np.load("./dataset/FYP_train_Y.npy")

# NOTE(review): pickle can execute arbitrary code while loading; only load
# these files if they come from a trusted source.
# The context managers close the file handles once loading is done.
with open("./models/message_tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

with open("./dataset/glove_embeddings.pkl", "rb") as embeddings_file:
    embeddings = pickle.load(embeddings_file)

In [None]:
# Display the dimensions of the feature array.
X.shape

In [None]:
# Display the dimensions of the label array.
Y.shape

In [None]:
# Length of every word vector; must match the vectors stored in `embeddings`.
EMBEDDING_DIM = 50

# One row per tokenizer index, plus one extra row so that the largest index in
# `word_index` is addressable (presumably index 0 is reserved for padding --
# confirm against the tokenizer).
# Rows start out random, so words that have no entry in `embeddings` keep a
# random vector.
embeddingMatrix = np.random.random((len(tokenizer.word_index) + 1, EMBEDDING_DIM))

for word, i in tokenizer.word_index.items():
    embeddingVector = embeddings.get(word)
    if embeddingVector is None:
        # No pre-trained vector for this word: keep the random row.
        continue

    if len(embeddingVector) != EMBEDDING_DIM:
        # Raise instead of calling exit(1): an exception stops the cell with a
        # traceback, whereas exit() is not a reliable way to abort a notebook.
        raise ValueError(
            f"Embedding vector for {word!r} has length {len(embeddingVector)}, "
            f"but the embedding matrix rows have length {EMBEDDING_DIM}. "
            "Please make sure EMBEDDING_DIM matches the loaded embeddings."
        )

    embeddingMatrix[i] = embeddingVector

In [None]:
# Dense model: Embedding -> GlobalMaxPooling1D -> Dense(200/100/50) -> sigmoid.
dnn_input_layer = Input(shape=(X.shape[1],))
dnn_embedding_layer = Embedding(
    len(tokenizer.word_index) + 1,
    embeddingMatrix.shape[1],
    # Initialise from the prepared embedding matrix. Without this argument the
    # layer starts from Keras' default random initialiser and the matrix built
    # earlier in the notebook is never used.
    embeddings_initializer=Constant(embeddingMatrix),
    trainable=True,
)(dnn_input_layer)
dnn_maxpool1d_layer = GlobalMaxPooling1D()(dnn_embedding_layer)
dnn_hidden_layer_1 = Dense(200, activation='relu')(dnn_maxpool1d_layer)
dnn_hidden_layer_2 = Dense(100, activation='relu')(dnn_hidden_layer_1)
dnn_hidden_layer_3 = Dense(50, activation='relu')(dnn_hidden_layer_2)
# Single sigmoid unit, paired with the binary cross-entropy loss below.
dnn_output_layer = Dense(1, activation='sigmoid')(dnn_hidden_layer_3)

model_dnn = Model(inputs=dnn_input_layer, outputs=dnn_output_layer)

# Metric order matters: evaluate() returns [loss, binary_accuracy, auc].
model_dnn.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[BinaryAccuracy(name="binary_accuracy"), AUC(name="auc")]
)

In [None]:
# Print the layer-by-layer summary of the dense model.
model_dnn.summary()

In [None]:
# Draw the dense model's layer graph.
plot_model(model_dnn)

In [None]:
# Convolutional model: Embedding -> Conv1D(400 filters, width 3) ->
# GlobalMaxPooling1D -> Dense(200/100/50) -> sigmoid.
cnn_input_layer = Input(shape=(X.shape[1],))
cnn_embedding_layer = Embedding(
    len(tokenizer.word_index) + 1,
    embeddingMatrix.shape[1],
    # Initialise from the prepared embedding matrix. Without this argument the
    # layer starts from Keras' default random initialiser and the matrix built
    # earlier in the notebook is never used.
    embeddings_initializer=Constant(embeddingMatrix),
    trainable=True,
)(cnn_input_layer)
cnn_conv1d_layer = Conv1D(400, 3, padding='valid', activation='relu', strides=1)(cnn_embedding_layer)
cnn_maxpool1d_layer = GlobalMaxPooling1D()(cnn_conv1d_layer)
cnn_hidden_layer_1 = Dense(200, activation='relu')(cnn_maxpool1d_layer)
cnn_hidden_layer_2 = Dense(100, activation='relu')(cnn_hidden_layer_1)
cnn_hidden_layer_3 = Dense(50, activation='relu')(cnn_hidden_layer_2)
# Single sigmoid unit, paired with the binary cross-entropy loss below.
cnn_output_layer = Dense(1, activation='sigmoid')(cnn_hidden_layer_3)

model_cnn = Model(inputs=cnn_input_layer, outputs=cnn_output_layer)

# Metric order matters: evaluate() returns [loss, binary_accuracy, auc].
model_cnn.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[BinaryAccuracy(name="binary_accuracy"), AUC(name="auc")]
)

In [None]:
# Print the layer-by-layer summary of the convolutional model.
model_cnn.summary()

In [None]:
# Draw the convolutional model's layer graph.
plot_model(model_cnn)

In [None]:
# Recurrent model: Embedding -> three stacked Bidirectional(LSTM(32)) layers,
# each followed by Dropout(0.2) -> Dense(200/100/50) -> sigmoid.
bilstm_input_layer = Input(shape=(X.shape[1],))
bilstm_embedding_layer = Embedding(
    len(tokenizer.word_index) + 1,
    embeddingMatrix.shape[1],
    # Initialise from the prepared embedding matrix. Without this argument the
    # layer starts from Keras' default random initialiser and the matrix built
    # earlier in the notebook is never used.
    embeddings_initializer=Constant(embeddingMatrix),
    trainable=True,
)(bilstm_input_layer)
# The first two recurrent layers return full sequences so the next LSTM can
# consume them; the last one returns only its final output.
bilstm_layer_1 = Bidirectional(LSTM(32, return_sequences=True, recurrent_dropout=0.2))(bilstm_embedding_layer)
dropout_layer_1 = Dropout(0.2)(bilstm_layer_1)
bilstm_layer_2 = Bidirectional(LSTM(32, return_sequences=True, recurrent_dropout=0.2))(dropout_layer_1)
dropout_layer_2 = Dropout(0.2)(bilstm_layer_2)
bilstm_layer_3 = Bidirectional(LSTM(32, recurrent_dropout=0.2))(dropout_layer_2)
dropout_layer_3 = Dropout(0.2)(bilstm_layer_3)
bilstm_hidden_layer_1 = Dense(200, activation='relu')(dropout_layer_3)
bilstm_hidden_layer_2 = Dense(100, activation='relu')(bilstm_hidden_layer_1)
bilstm_hidden_layer_3 = Dense(50, activation='relu')(bilstm_hidden_layer_2)
# Single sigmoid unit, paired with the binary cross-entropy loss below.
bilstm_output_layer = Dense(1, activation='sigmoid')(bilstm_hidden_layer_3)

model_bilstm = Model(inputs=bilstm_input_layer, outputs=bilstm_output_layer)

# Metric order matters: evaluate() returns [loss, binary_accuracy, auc].
model_bilstm.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[BinaryAccuracy(name="binary_accuracy"), AUC(name="auc")]
)

In [None]:
# Print the layer-by-layer summary of the recurrent model.
model_bilstm.summary()

In [None]:
# Draw the recurrent model's layer graph.
plot_model(model_bilstm)

In [None]:
# Fractions of the data held out for testing, one experiment per fraction.
test_split_sizes = [0.1, 0.2, 0.3, 0.4, 0.5]


def _evaluate(model, X, Y):
    """Evaluate `model` on (X, Y) and label the first three returned values.

    Relies on the model having been compiled with accuracy and AUC as its first
    two metrics, so that evaluate() yields [loss, accuracy, auc].
    """
    loss, accuracy, auc = model.evaluate(X, Y, verbose=0)[:3]
    return {"Loss": loss, "Accuracy": accuracy, "AUC": auc}


def train(X, Y, model, name, split_sizes=None, epochs=3, batch_size=32):
    """Train `model` once per test-split size and collect train/test scores.

    Before each run the model's weights are reset to the values they had when
    this function was called, so every split starts from the same point.

    Parameters
    ----------
    X, Y : array-like
        Features and labels, split with `train_test_split(random_state=1)`.
    model : compiled Keras model
        Must report [loss, accuracy, auc] from `evaluate()`.
    name : str
        Label printed in the progress output.
    split_sizes : iterable of float, optional
        Test fractions to iterate over; defaults to `test_split_sizes`.
    epochs : int, optional
        Epochs per fit call (default 3).
    batch_size : int, optional
        Batch size per fit call (default 32).

    Returns
    -------
    dict
        Maps "<percent>%" (the test share) to
        {"Train": {...}, "Test": {...}}, each with "Loss", "Accuracy", "AUC".
    """
    if split_sizes is None:
        split_sizes = test_split_sizes

    scores = {}

    print("Training", name, "\n")

    # Snapshot taken once, restored at the start of every split.
    # NOTE(review): set_weights() restores the model weights only; whether the
    # optimizer's internal state should also be reset between splits is not
    # handled here -- confirm.
    initial_weights = model.get_weights()

    for size in split_sizes:
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=size, random_state=1)

        model.set_weights(initial_weights)

        # fit() holds out a further 20% of the training portion for validation.
        model.fit(
            X_train,
            Y_train,
            batch_size=batch_size,
            epochs=epochs,
            shuffle=True,
            validation_split=0.2,
            verbose=1,
        )

        # Key is computed once and reused for storing and printing.
        idx = "{:.0f}%".format(size * 100)
        scores[idx] = {
            "Train": _evaluate(model, X_train, Y_train),
            "Test": _evaluate(model, X_test, Y_test),
        }

        print("Test Size={:.4f}".format(size))
        print("Train Loss={:.4f} Train Accuracy={:.4f} Train AUC={:.4f}".format(scores[idx]["Train"]["Loss"], scores[idx]["Train"]["Accuracy"], scores[idx]["Train"]["AUC"]))
        print("Test Loss={:.4f} Test Accuracy={:.4f} Test AUC={:.4f}".format(scores[idx]["Test"]["Loss"], scores[idx]["Test"]["Accuracy"], scores[idx]["Test"]["AUC"]))
        print("\n")

    return scores

In [None]:
# Run the split-size experiment for the dense model; returns a nested score dict.
dnn_result = train(X, Y, model_dnn, "DNN")

In [None]:
# Run the split-size experiment for the convolutional model; returns a nested score dict.
cnn_result = train(X, Y, model_cnn, "CNN")

In [None]:
# Run the split-size experiment for the recurrent model; returns a nested score dict.
bilstm_result = train(X, Y, model_bilstm, "BiLSTM")

In [None]:
def export_result(result, name):
    """Flatten a score dict into a table, save it as CSV and return it.

    Parameters
    ----------
    result : dict or pandas.DataFrame
        Either the nested dict {"<size>": {"Train": {...}, "Test": {...}}}
        with "Loss", "Accuracy" and "AUC" entries, or a table previously
        returned by this function. Accepting the table makes a cell such as
        `x = export_result(x, ...)` safe to re-run.
    name : str
        File stem; the table is written to ./logs/<name>.csv.

    Returns
    -------
    pandas.DataFrame
        One row per test size, metrics rounded to 3 decimals.
    """
    columns = ["Test Size", "Train Loss", "Train Accuracy", "Train AUC", "Test Loss", "Test Accuracy", "Test AUC"]

    if isinstance(result, pd.DataFrame):
        # Already exported once: reuse the table instead of re-flattening it.
        table = result
    else:
        # Build every row first and create the frame once, rather than
        # enlarging the frame one row at a time.
        rows = [
            [test_size]
            + [
                round(split_scores[part][metric], 3)
                for part in ("Train", "Test")
                for metric in ("Loss", "Accuracy", "AUC")
            ]
            for test_size, split_scores in result.items()
        ]
        table = pd.DataFrame(rows, columns=columns)

    # Create the target directory on first use so the write cannot fail on it.
    os.makedirs("./logs", exist_ok=True)
    table.to_csv(f"./logs/{name}.csv")

    return table

In [None]:
# Save the dense model's scores to CSV; rebinds dnn_result from the score dict to the returned table.
dnn_result = export_result(dnn_result, "DNN")

In [None]:
# Save the convolutional model's scores to CSV; rebinds cnn_result from the score dict to the returned table.
cnn_result = export_result(cnn_result, "CNN")

In [None]:
# Save the recurrent model's scores to CSV; rebinds bilstm_result from the score dict to the returned table.
bilstm_result = export_result(bilstm_result, "BiLSTM")

In [None]:
# Element-wise complement of the test fractions, i.e. the training fraction of each split.
train_split_sizes = 1 - np.array(test_split_sizes)

In [None]:
# Train accuracy against training-set share, one line per model.
plt.figure(figsize=(8, 6), dpi=80)

train_pct = train_split_sizes * 100
for model_label, score_table in (("DNN", dnn_result), ("CNN", cnn_result), ("BiLSTM", bilstm_result)):
    plt.plot(train_pct, score_table["Train Accuracy"], label=model_label)

plt.xlabel("Training Set Size (%)")
plt.ylabel("Train Accuracy")
plt.legend()
plt.show()

In [None]:
# Test accuracy against training-set share, one line per model.
plt.figure(figsize=(8, 6), dpi=80)

train_pct = train_split_sizes * 100
for model_label, score_table in (("DNN", dnn_result), ("CNN", cnn_result), ("BiLSTM", bilstm_result)):
    plt.plot(train_pct, score_table["Test Accuracy"], label=model_label)

plt.xlabel("Training Set Size (%)")
plt.ylabel("Test Accuracy")
plt.legend()
plt.show()

In [None]:
# Train AUC against training-set share, one line per model.
plt.figure(figsize=(8, 6), dpi=80)

train_pct = train_split_sizes * 100
for model_label, score_table in (("DNN", dnn_result), ("CNN", cnn_result), ("BiLSTM", bilstm_result)):
    plt.plot(train_pct, score_table["Train AUC"], label=model_label)

plt.xlabel("Training Set Size (%)")
plt.ylabel("Train AUC")
plt.legend()
plt.show()

In [None]:
# Test AUC against training-set share, one line per model.
plt.figure(figsize=(8, 6), dpi=80)

train_pct = train_split_sizes * 100
for model_label, score_table in (("DNN", dnn_result), ("CNN", cnn_result), ("BiLSTM", bilstm_result)):
    plt.plot(train_pct, score_table["Test AUC"], label=model_label)

plt.xlabel("Training Set Size (%)")
plt.ylabel("Test AUC")
plt.legend()
plt.show()

In [None]:
# Train loss against training-set share, one line per model.
plt.figure(figsize=(8, 6), dpi=80)

train_pct = train_split_sizes * 100
for model_label, score_table in (("DNN", dnn_result), ("CNN", cnn_result), ("BiLSTM", bilstm_result)):
    plt.plot(train_pct, score_table["Train Loss"], label=model_label)

plt.xlabel("Training Set Size (%)")
plt.ylabel("Train Loss")
plt.legend()
plt.show()

In [None]:
# Test loss against training-set share, one line per model.
plt.figure(figsize=(8, 6), dpi=80)

train_pct = train_split_sizes * 100
for model_label, score_table in (("DNN", dnn_result), ("CNN", cnn_result), ("BiLSTM", bilstm_result)):
    plt.plot(train_pct, score_table["Test Loss"], label=model_label)

plt.xlabel("Training Set Size (%)")
plt.ylabel("Test Loss")
plt.legend()
plt.show()