# ***Import Statements for libraries***

In [None]:
import numpy as np
import pandas as pd
import wordcloud
import seaborn as sns
import matplotlib.pyplot as plt


from keras import backend as K

from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model
from keras.models import Sequential
from keras.layers import Lambda, GlobalAveragePooling1D, Dense, Embedding,concatenate, Bidirectional
from keras.layers import Dropout, Input, LeakyReLU, Conv1D, GlobalMaxPooling1D,InputLayer, ReLU, LSTM


#Utils
from keras.utils.vis_utils import plot_model
from IPython.display import SVG
from keras.utils import vis_utils
from sklearn.manifold import TSNE

#!pip3 install keras_metrics
import tensorflow as tf
from tensorflow.python.keras.metrics import Metric
from tensorflow.keras.optimizers import Adam

import io
from google.colab import files


In [None]:
#metrics
from sklearn.metrics import f1_score , recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, classification_report

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

#Classifiers 
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Time and counters
from time import perf_counter

#grid search of params
from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings('ignore')

# NLP lib
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk import flatten , PorterStemmer, WordNetLemmatizer, FreqDist
from collections import Counter

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

#Mark Down print
from IPython.display import Markdown, display
def printmd(string):
    """Render ``string`` as Markdown in the notebook output."""
    rendered = Markdown(string)
    display(rendered)


# **Data Test Train Split for hyper-parameter tuning of : KNN and Gradient boosting**



In [None]:
# Load the e-mail dataset, clean it, split it 80:20 and build the TF-IDF
# features used by the classical models (KNN and gradient boosting).

# uploaded = files.upload()
# for fn in uploaded.keys():
#   print('User uploaded file "{name}" with length {length} bytes'.format(
#       name=fn, length=len(uploaded[fn])))

data = pd.read_csv('drive/MyDrive/SPAM classification deep learning/Dataset/Augmented_emails.csv')

# Remove rows without a label BEFORE binarising the label column: once the
# labels are mapped to 0/1 a missing label is indistinguishable from ham and
# dropna(subset=["spam"]) can no longer find anything to drop.
print("Number of null features in the dataset :")
print(f"{data.isnull().sum()}\n")

data = data.dropna(subset=["spam"])

print("Number of null features in the dataset :")
print(f"{data.isnull().sum()}")
print()

#Label : 1 for spam, 0 for everything else
data['spam'] = [1 if x == 1.0 else 0 for x in data['spam']]

#remove duplicated data
print(f"duplicated entries {data.duplicated().sum()}\n")
data = data.drop_duplicates()
print(f"duplicated entries {data.duplicated().sum()}\n")

print(f"shape of the dataset : {data.shape}, Number of rows and columns : {data.shape[0]} , {data.shape[1]}\n")

#Make a copy of the data set (re-used later for the neural network models).
data_ANN_RNN_CNN = data.copy()

#Splitting the data - 80:20 ratio
X_train, X_test, y_train, y_test = train_test_split(data["X"] , data["spam"], test_size = 0.2, random_state = 99)
print(f"Training split input: {X_train.shape}")
print(f"Testing split input : {X_test.shape}")
print(f"Training split class: {y_train.shape}")
print(f"Testing split class : {y_test.shape}")

print(X_train.dtypes)
print(X_test.dtypes)
print(y_train.dtypes)
print(y_test.dtypes)

#TF IDF - vectorisation of data , feature extraction
# The vocabulary is fitted on the training split only and re-used for the test split.
tfidf = TfidfVectorizer()
X_train_vect = tfidf.fit_transform(X_train.values.astype('U'))
X_test_vect = tfidf.transform(X_test.values.astype('U'))

# Get feature names in the vector
#tfidf.get_feature_names()

# The matrices are kept sparse: converting them with .toarray() only to throw
# the result away allocated a dense (samples x vocabulary) array for nothing.
print(f"Training data shape : {X_train_vect.shape}")
print(f"Testing data shape : {X_test_vect.shape}")

# ***Helper functions to show the confusion matrix and the ROC-AUC curve***

### ROC-AUC: computes the Area Under the Curve (AUC) using the trapezoidal rule. `auc` is a general function that works on the points of any curve; here it is used to compute the area under the ROC curve.

### The AUC-ROC curve is a performance measure for classification problems at various threshold settings. ROC is a probability curve, and AUC represents the degree of separability: it tells how well the model can distinguish between classes. The higher the AUC, the better the model is at predicting class 0 as 0 and class 1 as 1. By analogy, the higher the AUC, the better a model is at distinguishing between patients with and without a disease.

In [None]:
def get_confusion_matrix_heatmap(y_test,y_pred,fName):
    """Plot, save and print the 2x2 confusion matrix of a binary classifier.

    Arguments:
    y_test -- true labels (0 = ham, 1 = spam)
    y_pred -- predictions; hard labels, booleans or probabilities, of shape (n,) or (n, 1)
    fName -- file name of the image written to the heatmaps folder
    """
    # Round to hard labels and flatten, so boolean / (n, 1) network outputs are
    # treated exactly like the 1-D label arrays returned by sklearn models
    y_true = np.asarray(y_test).astype(int).ravel()
    y_hat = np.rint(np.asarray(y_pred, dtype=float)).astype(int).ravel()
    # Fix the label set so the matrix is always 2x2, even when a class is
    # missing from this particular split (rows = actual, columns = predicted)
    CF = confusion_matrix(y_true, y_hat, labels=[0, 1])
    # tick labels in label order: 0 -> Ham, 1 -> Spam
    categories = ['Ham', 'Spam']
    # names of the four cells in row-major order
    group_names = ['True Neg','False Pos','False Neg','True Pos']
    # absolute count of each cell
    group_counts = ["{0:0.0f}".format(value) for value in CF.flatten()]
    # share of each cell in the total number of samples
    group_percentages = ["{0:.2%}".format(value) for value in CF.flatten()/np.sum(CF)]
    # one "name / count / percentage" annotation per cell
    labels = [f"{v1}\n{v2}\n{v3}"for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    # Draw on a dedicated figure so repeated calls never paint over an
    # earlier plot that is still the current figure
    fig, ax = plt.subplots()
    sns.heatmap(CF, annot=labels, fmt='', cmap='Blues',
                xticklabels=categories, yticklabels=categories, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    # save confusion matrix as image in results folder
    fig.savefig('drive/MyDrive/SPAM classification deep learning/heatmaps/'+fName)
    # display confusion matrix as numeric values
    print(CF)

In [None]:
def ROC_AUC(y_test, y_pred, fname):
    """Plot the ROC curve, save it to the Visuals folder and return its AUC.

    Arguments:
    y_test -- true binary labels
    y_pred -- scores or predictions handed to sklearn's roc_curve
    fname -- file name of the saved figure

    Returns:
    the area under the ROC curve
    """
    fpr, tpr, _thresholds = roc_curve(y_test, y_pred)
    area = auc(fpr, tpr)
    print("AUC:", area)

    # dashed diagonal first, then the model's curve on top of it
    plt.plot([0, 1], [0, 1], 'k--')
    curve_label = 'lr (auc = %0.3f)' % area
    plt.plot(fpr, tpr, marker='.', label=curve_label)

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('Receiver Operating Characterisics')
    plt.legend(loc='lower right')

    target = f"drive/MyDrive/SPAM classification deep learning/Visuals/{fname}"
    plt.savefig(target)
    return area

# KNN - K nearest neighbours

# **Hyperparameter tuning:**
1.   leaf_size
2.   n_neighbors
3.   p (power parameter of the Minkowski distance)









In [None]:
"""
KNN()
"""
# Find the best hyperparameter with GridSearchCV
# Exhaustive search over specified parameter values for an estimator.

#List Hyperparameters that we want to tune.
leaf_size = [5,10,15,20,25,30]
n_neighbors = [5,10,15,20,25,30]
p=[1,2]

# #Convert to dictionary
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

# #Use GridSearch
grid = GridSearchCV(KNeighborsClassifier(), param_grid=hyperparameters, cv=10)
# #Fit the model
grid.fit(X_train_vect,y_train)

# # Create a DataFrame with the best Hyperparameters
KNN_hyperparameter_tuned = pd.DataFrame(grid.cv_results_)[['params','mean_test_score']]\
                               .sort_values(by="mean_test_score", ascending=False)

KNN_hyperparameter_tuned.to_csv("drive/MyDrive/SPAM classification deep learning/Visuals/KNN_hyperparameters.csv")

In [None]:
# Plot grid Seach results
# scores = [x[1] for x in grid.grid_scores_]
# scores = np.array(scores).reshape(len(leaf_size), len(n_neighbors))

# for ind, i in enumerate(leaf_size):
#     plt.plot(n_neighbors, scores[ind], label='C: ' + str(i))
# plt.legend()
# plt.xlabel('n neighbors')
# plt.ylabel('Mean score')
# plt.show()
# plt.savefig("drive/MyDrive/SPAM classification deep learning/Visuals/KNN_Grid_search.jpeg")

In [None]:

# Display the best hyperparameters
grid.best_params_

In [None]:
leaf_size, n_neighbors, p = grid.best_params_['leaf_size'], grid.best_params_['n_neighbors'], grid.best_params_['p']
model = KNeighborsClassifier(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

model.fit(X_train_vect,y_train)
y_pred = model.predict(X_test_vect)

printmd(f'## Accuracy: {round(accuracy_score(y_test,y_pred),3)*100}%\n')

categories = ['Ham', 'Spam']
KNN_report = classification_report(y_test,y_pred,target_names=categories)
print(KNN_report)

#confusion matrix
get_confusion_matrix_heatmap(y_test, y_pred, "KNN.jpeg")

In [None]:
#AUC
# Score with the predicted spam probability (column of class 1) so the ROC
# curve is traced over every threshold; hard 0/1 predictions collapse it to a
# single operating point.
lr_auc_knn_tuned = ROC_AUC(y_test, model.predict_proba(X_test_vect)[:, 1], "AUC_KNN.jpeg")

# ***Gradient Boosting***
### Hyper parameters : 


1.   Learning rate
2.   n_estimators
3.   max depth



In [None]:
"""
GradientBoostingClassifier()
"""
# Find the best hyperparameter with GridSearchCV
# Exhaustive search over specified parameter values for an estimator.

# List Hyperparameters that we want to tune.

hyperparameters = {
    "n_estimators":[5,50,100,150],
    "max_depth":[1,3,5,7],
    "learning_rate":[0.01,0.1,1]
}
start  = perf_counter()
#Use GridSearch
grid = GridSearchCV(GradientBoostingClassifier(), param_grid=hyperparameters, cv=5)
#Fit the model
grid.fit(X_train_vect,y_train)

end = perf_counter() - start

print(f"Time taken : {end} sec's")

# Create a DataFrame with the best Hyperparameters
GB_hyperparameter_tuned = pd.DataFrame(grid.cv_results_)[['params','mean_test_score']]\
                               .sort_values(by="mean_test_score", ascending=False)
GB_hyperparameter_tuned.to_csv("drive/MyDrive/SPAM classification deep learning/Visuals/Gradient_boosting_hyperparameters.csv")


In [None]:
#Plot grid Seach results
# scores = [x[1] for x in grid.grid_scores_]
# scores = np.array(scores).reshape(len(leaf_size), len(n_neighbors))

# for ind, i in enumerate(leaf_size):
#     plt.plot(n_neighbors, scores[ind], label='C: ' + str(i))
# plt.legend()
# plt.xlabel('n neighbors')
# plt.ylabel('Mean score')
# plt.show()
# plt.savefig("drive/MyDrive/SPAM classification deep learning/Visuals/GradientBoosting_Grid_search.jpeg")

In [None]:
grid.best_params_

In [None]:
n_estimators, max_depth, learning_rate = grid.best_params_['n_estimators'], grid.best_params_['max_depth'], grid.best_params_['learning_rate']
model = GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate)

model.fit(X_train_vect,y_train)
y_pred = model.predict(X_test_vect)

printmd(f'## Accuracy: {round(accuracy_score(y_test,y_pred),3)*100}%\n')

categories = ['Ham', 'Spam']
GB_report = classification_report(y_test,y_pred,target_names=categories)
print(GB_report)

#confusion matrix
get_confusion_matrix_heatmap(y_test, y_pred, "Gradient_boosting.jpeg")

In [None]:
#AUC
# Score with the predicted spam probability (column of class 1) so the ROC
# curve is traced over every threshold; hard 0/1 predictions collapse it to a
# single operating point.
lr_auc_GradientBoosting_tuned = ROC_AUC(y_test, model.predict_proba(X_test_vect)[:, 1], "AUC_Gradient_Boosting.jpeg")

# Data Test Train Split for DEEP neural Networks : CNN , ANN, RNN

In [None]:
# ANN RNN-LSTM CNN 
#Splitting the data - 80:20 ratio
X_train, X_test, y_train, y_test = train_test_split(data_ANN_RNN_CNN["X"] , data_ANN_RNN_CNN["spam"], test_size = 0.2, random_state = 99)
print(f"Training split input: {X_train.shape}")
print(f"Testing split input : {X_test.shape}")
print(f"Training split class: {y_train.shape}")
print(f"Testing split class : {y_test.shape}")

# Largest number of whitespace-separated words in any training sample.
# (Picking the sample with the most *characters* and counting its words can
# under-report the length: a long-character text may contain fewer words.)
maxLen = max(len(text.split()) for text in X_train.astype("U"))
print(f"Max length of the sentence in corpus : {maxLen}")

In [None]:
print(f"Type of X_train : {type(X_train)}")
print(f"Training sample shape: \n{X_train.shape}\n")
x_train = np.array(X_train.copy())
x_train = x_train.astype(str)
print(f"Type of X_train : {type(x_train)} with dtypes : {x_train.dtype}")
print(f"First training sample: \n{x_train[0]}\n")
print(f"Training sample shape: \n{x_train.shape}\n\n")

print(f"Type of X_test : {type(X_test)}")
print(f"First testing sample shape: \n{X_test.shape}\n\n")
x_test = np.array(X_test.copy())
x_test= x_test.astype(str)
print(f"Type of X_test : {type(x_test)} with dtypes : {x_test.dtype}")
print(f"First testing sample: \n{x_test[0]}\n\n")
print(f"Testing sample shape: \n{x_test.shape}\n\n")

print(f"Type of y_train : {type(y_train)}")
print(f"First training label shape: \n{y_train.shape}\n\n")
Y_train = np.array(y_train.copy())
print(f"Type of y_train : {type(Y_train)}")
print(f"First training label : \n{Y_train[0]}\n\n")
print(f"Training label shape: \n{Y_train.shape}\n\n")

print(f"Type of y_test : {type(y_test)}")
print(f"First testing label shape: \n{y_test.shape}\n\n")
Y_test = np.array(y_test.copy())
print(f"Type of y_test : {type(Y_test)}")
print(f"First testing label : \n{Y_test[0]}\n\n")
print(f"Testing label shape: \n{Y_test.shape}\n\n")


# ***Data set Processing***
1. Vocabulary creation 
2. Tokenization
3. Text to sequence / word to vector
4. Padding to max length
5. Implement sequence/index to word

In [None]:
"""
Input text Processing
steps : 

1. Vocabulary creation 
2. Tokenization
3. Text to sequence / word to vector
4. Padding to max length
5. Implement sequence/index to word
"""

# reference : https://www.kaggle.com/anirudhchandnani/ann-vs-lstm-vs-bi-lstm-on-nlp
#ANN
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary Size :", vocab_size)
tokenizer.word_index.keys()

In [None]:
# Index to word Function dict
idx_to_word = dict(map(reversed, tokenizer.word_index.items()))
print(f"length of word to index : {len(word_index)}")
print(f"length of index to word : {len(idx_to_word)}")

In [None]:
# Turn every text into a sequence of word indices and pad (at the end) to a
# fixed length of 2000 tokens.
x_train = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen = 2000, padding="post")
x_test = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen = 2000, padding="post")

# Report the padded arrays (x_train / x_test), not the raw text Series
# (X_train / X_test) they were derived from.
print("Training X Shape:",x_train.shape)
print("Testing X Shape:",x_test.shape)

print(f"Training : {type(x_train)}")
print(f"Testing: {type(x_test)}")

In [None]:
#The first 5 training samples
for i in range(5):
  print(x_train[i],"\n")

# **ANN : Artificial Neural Network : A shallow NN with hidden layer**

In [None]:
"""
ANN : Artificial Neural Network : A shallow NN with hidden layer.
"""
# Fixed input length; matches the maxlen used when padding x_train / x_test.
maxlen = 2000
ann = Sequential()
# Trainable embedding: one 100-dimensional vector per vocabulary index.
ann.add(Embedding(input_dim=vocab_size, 
                           output_dim=100, 
                           input_length=maxlen))
# Collapse the sequence axis by keeping the maximum of every embedding dimension.
ann.add(GlobalMaxPooling1D())
# Single small hidden layer.
ann.add(Dense(10, activation='relu'))
# Adding dropout to prevent overfitting
ann.add(Dropout(0.1))
# One sigmoid unit as the output (trained with binary cross-entropy below).
ann.add(Dense(1, activation='sigmoid'))


In [None]:
ann.summary()
# Visualise the model another way
plot_model(
    ann,
    to_file="drive/MyDrive/SPAM classification deep learning/Visuals/ann_1.jpeg",
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=False,
    dpi=96,
    layer_range=None,
)

In [None]:
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Let Keras derive the number of steps from the array size and batch size.
# steps_per_epoch=len(x_train) counted samples instead of batches, so the
# data ran out part-way through the first "epoch" and training stopped early.
history = ann.fit(x_train, Y_train, epochs=3, validation_data=(x_test,Y_test))

In [None]:
def plot_graphs(history, metric):
  """Plot the training and validation curves of ``metric`` on the current axes."""
  val_key = 'val_'+metric
  curves = history.history
  plt.plot(curves[metric])
  plt.plot(curves[val_key], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_key])

plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)
plt.savefig("drive/MyDrive/SPAM classification deep learning/Visuals/ann_accuracy_loss.jpeg")


# Predicting the Test set results
y_pred = ann.predict(x_test)
y_pred = (y_pred > 0.5)
y_pred_ann = np.array(y_pred)

test_loss_ann, test_acc_ann = ann.evaluate(x_test, Y_test)
test_err_ann = 100 - test_acc_ann*100

print(f"Test Loss:     {test_loss_ann*100} %")
print(f"Test Accuracy: {test_acc_ann*100}  %")
print(f"Test error: {test_err_ann}  %")



In [None]:
#AUC
# Use the sigmoid outputs themselves: thresholded booleans reduce the ROC
# curve to a single operating point.
lr_auc_ann = ROC_AUC(Y_test, ann.predict(x_test).ravel(), "AUC_ANN.jpeg")

In [None]:
#confusion matrix
get_confusion_matrix_heatmap(Y_test, y_pred_ann, "ANN.jpeg")

In [None]:
# ANN - TSNE - word Embedding visualisations
def Word_Embeddings_visualise_TSNE(model, idx_to_word, fname, lname, lim1, lim2, lim3, lim4, Flag=False):
  """
  Project the first word embeddings of a model to 2-D with t-SNE and save the plot.

  Arguments:
  model -- Keras model containing the embedding layer
  idx_to_word -- dictionary mapping a tokenizer index (starting at 1) to its word
  fname -- file name (without extension) of the saved figure
  lname -- name of the embedding layer inside ``model``
  lim1, lim2 -- y-axis limits, applied only when ``Flag`` is True
  lim3, lim4 -- x-axis limits, applied only when ``Flag`` is True
  Flag -- whether to apply the axis limits

  Note: reads the module-level ``x_train`` to print the first training sample.
  """
  np.random.seed(1)
  print(f"First sample of the training data set vectorised \n{x_train[0]}\n")
  text = [idx_to_word[idx] if idx != 0 else "<UNK>" for idx in x_train[0]]
  print(f"First sample of the training data set using index to word : \n{' '.join(text)}\n")

  ## Extraction of word Embeddings
  word_embeddings = model.get_layer(lname).get_weights()[0]
  print('Shape of word_embeddings:', word_embeddings.shape)
  
  # # Visualizing the word Embeddings
  # idx_to_word[0] = "<PAD>"
  # index = idx_to_word.values()
  # Embeddings_ann = pd.DataFrame(word_embeddings, index=index)
  # Embeddings_ann.to_csv(f"drive/MyDrive/SPAM classification deep learning/Visuals/{fname}.csv")
  
  # Ploting the word embeddings using TSNE
  tsne = TSNE(perplexity=3, n_components=2, init='pca', n_iter=500, method='exact')
  np.set_printoptions(suppress=True)
  # Row 0 of the matrix belongs to the padding index and has no word, so the
  # plotted rows are 1..plot_only; never ask for more rows than the matrix has.
  plot_only = min(60, word_embeddings.shape[0] - 1)

  # Rows and labels must cover the same indices (1..plot_only); slicing from
  # row 0 while labelling from word 1 attached every word to its neighbour's vector.
  T = tsne.fit_transform(word_embeddings[1:plot_only + 1, :])
  labels = [idx_to_word[i] for i in range(1, plot_only + 1)]
  plt.figure(figsize=(14, 8))
  if(Flag):
    plt.ylim(lim1, lim2)
    plt.xlim(lim3, lim4)
  plt.scatter(T[:, 0], T[:, 1])
  for label, x, y in zip(labels, T[:, 0], T[:, 1]):
      plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points', ha='right',
  va='bottom')
  plt.savefig(f"drive/MyDrive/SPAM classification deep learning/Visuals/{fname}.jpeg")
  # plt.close()
  # Embeddings_ann.head(10)

In [None]:
# idx_to_word[0]
# ANN - TSNE - word Embedding visualisations
Word_Embeddings_visualise_TSNE(ann, idx_to_word, "ann_Embeddings_1", "embedding", -200, 200, -200, 200, False)

# **RNN - Recurrent Neural Network**
**Variant used: LSTM (long short-term memory)**

In [None]:
"""
RNN - Recurrent Neural Network 
Variant used : LSTM : Long short term memory
"""

# Bidirectional LSTM classifier over trainable 100-dimensional embeddings.
rnn = Sequential()
rnn.add(Embedding(input_dim=vocab_size, 
                           output_dim=100, 
                           input_length=2000,
                           # Use masking to handle the variable sequence lengths
                           # (index 0, the padding value, is masked out)
                           mask_zero=True))
rnn.add(Dropout(0.2))
# rnn.add(Bidirectional(LSTM(128, activation='relu', input_dim=50)))
# The sequence is read in both directions by an LSTM with 100 units each.
rnn.add(Bidirectional(LSTM(100)))
rnn.add(Dense(64, activation='relu'))
# Adding dropout to prevent overfitting
rnn.add(Dropout(0.2))
# One sigmoid unit as the output (trained with binary cross-entropy below).
rnn.add(Dense(1, activation='sigmoid'))



In [None]:
rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',])

In [None]:
# Let Keras derive the number of steps from the array size and batch size.
# steps_per_epoch=len(x_train) counted samples instead of batches, so the
# data ran out part-way through the first "epoch" and training stopped early.
history = rnn.fit(x_train, Y_train, epochs=3, validation_data=(x_test, Y_test))

Epoch 1/3
 420/4396 [=>............................] - ETA: 2:16:09 - loss: 0.4842 - accuracy: 0.8381

In [None]:
rnn.summary()
plot_model(
    rnn,
    to_file="drive/MyDrive/SPAM classification deep learning/Visuals/rnn_1.jpeg",
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=False,
    dpi=96,
    layer_range=None,
)

In [None]:
# Predicting the Test set results
y_pred = rnn.predict(x_test)
y_pred = (y_pred > 0.5)

y_pred_rnn = np.array(y_pred)

#confusion matrix
get_confusion_matrix_heatmap(Y_test, y_pred_rnn, "RNN.jpeg")

In [None]:
#AUC
# Use the sigmoid outputs themselves: thresholded booleans reduce the ROC
# curve to a single operating point.
lr_auc_rnn = ROC_AUC(Y_test, rnn.predict(x_test).ravel(), "AUC_RNN.jpeg")

In [None]:
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)
plt.savefig("drive/MyDrive/SPAM classification deep learning/Visuals/rnn_accuracy_loss.jpeg")

test_loss_rnn, test_acc_rnn = rnn.evaluate(x_test, Y_test)
test_err_rnn = 100 - test_acc_rnn*100

print(f"Test Loss:     {test_loss_rnn*100} %")
print(f"Test Accuracy: {test_acc_rnn*100}  %")
print(f"Test error: {test_err_rnn}  %")

In [None]:
# ANN - TSNE - word Embedding visualisations
Word_Embeddings_visualise_TSNE(rnn, idx_to_word, "rnn_Embeddings_1", "embedding_1", -500, 500, -300, 200, True)

# ***CNN without Pretrained Word Embeddings***
## Input : Embedding vector dim : (Embed Size : 100, max Length of sentences)

In [None]:
"""
CNN without Pretrained Word Embeddings
Input : Embedding vector dim : (Embed Size : 100, max Length of sentences)
"""
# total vocabualry size of dataset
VOCAB_SIZE=vocab_size

# maximum length of each sentence/instance in the dataset
maxLen=2000

# declare keras sequential model
cnn = Sequential()

# initialize first model layer as embedding layer, here we are not using any pretrained word embedding
cnn.add(Embedding(VOCAB_SIZE, 100, input_length=maxLen))

''' --------------------
- initialize second hidden layer as convolutional layer
- number of filters that cnn will return is set to 20
- kernel size is set to 3
- padding is et to valid, which means no padding will be applied
- number of strides is set to default that is 1
-------------------- '''
cnn.add(Conv1D(activation="relu", filters=20, kernel_size=3, padding="valid"))

# we use global max pooling to downsample the features that will simply
# select the maximum values from features representation
# this layer have no parameters because it just have to select max values no
# backpropagation is required
cnn.add(GlobalMaxPooling1D())

# fully connected layer, usually used to change the dimension of the output
# maps all the feature units to mentioned dimension
cnn.add(Dense(units=16))

# output layer, last layer of model must be fully connected layer that will
# map the outputs of previous layer to required output dimension
# as we required only 1 unit for output that will return 0 or 1
# sigmoid activation function will return value between 0 or 1
cnn.add(Dense(1,activation='sigmoid'))

# initialize the optimizer with learning rate 0.001
opt = Adam(learning_rate=0.001)

# model compilation where we defince the loss function and optimizer
cnn.compile( optimizer=opt, loss='binary_crossentropy',metrics=['accuracy'])

# display model layers
print(cnn.summary())
plot_model(
    cnn,
    to_file="drive/MyDrive/SPAM classification deep learning/Visuals/cnn_1.jpeg",
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=False,
    dpi=96,
    layer_range=None,
)


In [None]:
# start model training
# Let Keras derive the number of steps from the array size and batch size.
# steps_per_epoch=len(x_train) counted samples instead of batches, so the
# data ran out part-way through the first "epoch" and training stopped early.
history = cnn.fit(x_train, Y_train, epochs=5, validation_data=(x_test, Y_test), verbose=1)


In [None]:
# evaluate trained model performance on test data
cnn.summary()
plot_model(
    cnn,
    to_file="drive/MyDrive/SPAM classification deep learning/Visuals/cnn_1.jpeg",
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=False,
    dpi=96,
    layer_range=None,
)

In [None]:
# Predicting the Test set results
y_pred = cnn.predict(x_test)
y_pred = (y_pred > 0.5)
y_pred_cnn = np.array(y_pred)

#confusion matrix
get_confusion_matrix_heatmap(Y_test, y_pred_cnn, "CNN.jpeg")

In [None]:
#AUC
# Save under the CNN's own file name (the plot used to overwrite AUC_RNN.jpeg)
# and score with the sigmoid outputs rather than thresholded booleans, which
# reduce the ROC curve to a single operating point.
lr_auc_cnn = ROC_AUC(Y_test, cnn.predict(x_test).ravel(), "AUC_CNN.jpeg")

In [None]:
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)
plt.savefig("drive/MyDrive/SPAM classification deep learning/Visuals/cnn_accuracy_loss.jpeg")


test_loss_cnn, test_acc_cnn = cnn.evaluate(x_test, Y_test)
test_err_cnn = 100 - test_acc_cnn*100

print(f"Test Loss:     {test_loss_cnn*100} %")
print(f"Test Accuracy: {test_acc_cnn*100}  %")
print(f"Test error: {test_err_cnn}  %")

In [None]:
# ANN - TSNE - word Embedding visualisations
Word_Embeddings_visualise_TSNE(cnn, idx_to_word, "cnn_Embeddings_1", "embedding_2", 1,1,1,1, False)

# **GLOVE : Global vectors for word representation**

In [None]:
"""
GLOVE : Global vectors for word representation

Definition : 
GloVe is an unsupervised learning algorithm for obtaining vector representations 
for words. Training is performed on aggregated global word-word co-occurrence statistics from a 
corpus, and the resulting representations showcase interesting linear substructures of the word 
vector space.

Reference : Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: 
            Global Vectors for Word Representation.

Usage : Using Glove Embedding matrix to convert input data/sample to embedding vectors
"""
def read_glove_vecs(glove_file):
    """
    Read a GloVe text file (one "word v1 v2 ... vn" entry per line).

    Arguments:
    glove_file -- path of the GloVe vectors file

    Returns:
    words_to_index -- dictionary mapping each word to its 1-based rank in sorted order
    index_to_words -- inverse mapping of words_to_index
    word_to_vec_map -- dictionary mapping each word to its vector (float64 numpy array)
    """
    word_to_vec_map = {}
    # Be explicit about the encoding so that reading does not depend on the
    # platform default (the vocabulary contains non-ASCII words).
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            # Skip blank lines (e.g. a trailing newline) instead of failing on parts[0]
            if not parts:
                continue
            word_to_vec_map[parts[0]] = np.array(parts[1:], dtype=np.float64)

    # Index 0 is left free so it can be used for padding / unknown words.
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map
    

In [None]:
"""
Data Preprocessing to be used with glove

Tasks : Convert input data/Train data to glove word index
"""
from nltk.tokenize import word_tokenize
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('drive/MyDrive/SPAM classification deep learning/Glove/glove.6B.100d.txt')
max_length = 2400
Embed_size = 100

print(len(word_to_index))
print(len(index_to_word))

# Data Pre-Processing 
1. Helper functions like sentence to index


In [None]:
def X_to_index(text, word_to_index, max_length):
  """
  Convert one text into a fixed-length vector of word indices.

  Arguments:
  text -- the text to convert (tokenised with nltk's word_tokenize)
  word_to_index -- dictionary mapping each known word to its index
  max_length -- length of the returned vector; longer texts are truncated

  Returns:
  vectorised_index -- numpy array of shape (max_length,); unknown words and
                      unused trailing positions are 0
  """
  vectorised_index = np.zeros((max_length))
  # Truncate to max_length (not a hard-coded 2400) so that a shorter vector
  # can never be indexed out of bounds.
  tokens = word_tokenize(text)[:max_length]
  for i, w in enumerate(tokens):
    vectorised_index[i] = word_to_index.get(w, 0)
  return vectorised_index

def sentences_to_indices(X, word_to_index, max_len):
  """
  Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
  The output shape is such that it can be given to `Embedding()`.
  
  Arguments:
  X -- sentences (strings): numpy array, list or pandas Series with m entries
  word_to_index -- a dictionary containing the each word mapped to its index
  max_len -- maximum number of words kept per sentence; longer sentences are truncated
  
  Returns:
  X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len);
               unknown words and unused trailing positions are 0
  """
  # Work on a flat positional array: indexing a pandas Series with X[i] is
  # label-based and breaks once rows were dropped and the index has gaps.
  sentences = np.asarray(X).ravel()
  m = sentences.shape[0]                           # number of examples
  X_indices = np.zeros((m, max_len))

  for i, sentence in enumerate(sentences):
      # Lower-case, split into words and truncate to max_len (not a
      # hard-coded 2400) so a row can never be indexed out of bounds.
      sentence_words = sentence.lower().split()[:max_len]
      for j, w in enumerate(sentence_words):
          X_indices[i, j] = word_to_index.get(w, 0)
  return X_indices

data_ANN_RNN_CNN = data_ANN_RNN_CNN.filter(['X','spam'], axis=1)
data_ANN_RNN_CNN_index = data_ANN_RNN_CNN["X"].copy()
data_ANN_RNN_CNN_index = data_ANN_RNN_CNN_index.astype(str)
# data_ANN_RNN_CNN_index = data_ANN_RNN_CNN_index.apply(lambda x: X_to_index(x, word_to_index, max_length))
X = sentences_to_indices(data_ANN_RNN_CNN_index, word_to_index, max_length)


#Label
Y = data_ANN_RNN_CNN['spam'].copy()
Y = [1 if x == 1.0 else 0.0 for x in Y]
Y = np.asarray(Y).astype('float32') 

# X.shape
print(X.shape)
print(type(X))

In [None]:
print(type(X))

In [None]:
print(X.dtype)
X = X.astype("int32")
print(X.dtype)

In [None]:
# Show the first samples next to their index representation.
# .iloc is positional, like the rows of X; label-based ["X"][i] fails (or picks
# another row) once duplicates were dropped and the index has gaps.
for i in range(min(10, len(X))):
  txt = data_ANN_RNN_CNN["X"].iloc[i]
  idx = X[i]
  print(f"{i}th training sample text : \n{txt}\n")
  print(f"{i}th training sample indexed : \n{idx}\n")

# ***Data splitting for ANN , RNN , CNN with glove pretrained embeddings model***

In [None]:
# type(Y)
# #Splitting the data - 80:20 ratio
X_train, X_test, y_train, y_test = train_test_split(X , Y, test_size = 0.2, random_state = 99)
print(f"Training split input: {X_train.shape}")
print(f"Testing split input : {X_test.shape}")
print(f"Training split class: {y_train.shape}")
print(f"Testing split class : {y_test.shape}\n")

print("dtypes :")
print(f"Train : {X_train.dtype}")
print(f"Test : {X_test.dtype}")
print(f"Train label : {y_train.dtype}")
print(f"Test label : {y_test.dtype}\n")

# Container types of the splits (type(arr.dtype) only printed the dtype class).
print("Types :")
print(f"Train : {type(X_train)}")
print(f"Test : {type(X_test)}")
print(f"Train label : {type(y_train)}")
print(f"Test label : {type(y_test)}\n")

In [None]:
for i in range(3):
  print(X_train[i])

In [None]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a non-trainable Keras Embedding() layer initialised with pre-trained GloVe vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (index 0 is unused by the vocabulary)
    # Read the dimensionality from any stored vector instead of relying on a
    # particular word ("cucumber") being part of the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]
    
    # Initialize the embedding matrix as a numpy array of zeros.
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    # Set each row "idx" of the embedding matrix to be 
    # the word vector representation of the idx'th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct input and output sizes
    # Make it non-trainable.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # Build the embedding layer, it is required before setting the weights of the embedding layer. 
    embedding_layer.build((None,)) 
    
    # Set the weights of the embedding layer to the embedding matrix. The layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

# **Helper functions to build the ANN on GloVe embeddings**
# Complete ANN model Implementation

In [None]:
def ANN_with_glove(input_shape, word_to_vec_map, word_to_index):
  """
  Build the graph of the dense (ANN) classifier on top of frozen pre-trained embeddings.

  Arguments:
  input_shape -- shape of the input, usually (max_len,)
  word_to_vec_map -- dictionary mapping every word in a vocabulary into its vector representation
  word_to_index -- dictionary mapping from words to their indices in the vocabulary

  Returns:
  model -- an uncompiled Keras Model mapping word-index sequences to a single sigmoid output
  """
  # Graph input: integer word indices of shape input_shape.
  sentence_indices = Input(input_shape, dtype='int32')

  # Embedding lookup initialised from the pre-trained vectors.
  glove_lookup = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  activations = glove_lookup(sentence_indices)

  # Classifier head, applied in order to the embedded sequence.
  head = (
      GlobalMaxPooling1D(),             # max over the sequence axis
      Dense(20, activation='relu'),
      Dropout(0.5),
      Dense(10, activation='relu'),
      Dropout(0.2),
      Dense(1, activation='sigmoid'),   # y_pred
  )
  for layer in head:
    activations = layer(activations)

  return Model(inputs=sentence_indices, outputs=activations)

# Helper Functions to make RNN with GloVe Embeddings data: Complete RNN model Implementation

In [None]:
def RNN_with_glove(input_shape, word_to_vec_map, word_to_index):
  """
  Build the graph of the stacked-LSTM (RNN) classifier on top of frozen pre-trained embeddings.

  Arguments:
  input_shape -- shape of the input, usually (max_len,)
  word_to_vec_map -- dictionary mapping every word in a vocabulary into its vector representation
  word_to_index -- dictionary mapping from words to their indices in the vocabulary

  Returns:
  model -- an uncompiled Keras Model mapping word-index sequences to a single sigmoid output
  """
  # Graph input: integer word indices of shape input_shape.
  sentence_indices = Input(input_shape, dtype='int32')

  # Embedding lookup initialised from the pre-trained vectors.
  glove_lookup = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  activations = glove_lookup(sentence_indices)

  # Recurrent stack followed by a small dense head, applied in order.
  head = (
      LSTM(128, return_sequences=True),    # emits the full sequence for the next LSTM
      Dropout(0.5),
      LSTM(128, return_sequences=False),   # emits only the final state
      Dropout(0.5),
      Dense(5, activation='relu'),
      Dense(1, activation='sigmoid'),      # y_pred
  )
  for layer in head:
    activations = layer(activations)

  return Model(inputs=sentence_indices, outputs=activations)

# Helper Functions to make CNN with GloVe Embeddings data: Complete CNN model Implementation

In [None]:
def CNN_with_glove(input_shape, word_to_vec_map, word_to_index):
  """
  Build the graph of the 1-D convolutional (CNN) classifier on top of frozen pre-trained embeddings.

  Arguments:
  input_shape -- shape of the input, usually (max_len,)
  word_to_vec_map -- dictionary mapping every word in a vocabulary into its vector representation
  word_to_index -- dictionary mapping from words to their indices in the vocabulary

  Returns:
  model -- an uncompiled Keras Model mapping word-index sequences to a single sigmoid output
  """
  # Graph input: integer word indices of shape input_shape.
  sentence_indices = Input(input_shape, dtype='int32')

  # Embedding lookup initialised from the pre-trained vectors.
  glove_lookup = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  activations = glove_lookup(sentence_indices)

  head = (
      # 20 filters, kernel size 3, "valid" = no padding, default stride.
      Conv1D(filters=20, kernel_size=3, padding="valid", activation="relu"),
      GlobalMaxPooling1D(),             # max over the sequence axis
      # NOTE(review): no activation is given, so this layer is linear, unlike the
      # relu used at the same position in ANN_with_glove -- confirm that is intended.
      Dense(units=20),
      Dropout(0.5),
      Dense(10, activation='relu'),
      Dropout(0.2),
      Dense(1, activation='sigmoid'),   # y_pred
  )
  for layer in head:
    activations = layer(activations)

  return Model(inputs=sentence_indices, outputs=activations)

# ANN with glove

In [None]:
# ANN with glove: build the model for max_length-token inputs and configure training.
ann_glove = ANN_with_glove(
    input_shape=(max_length,),
    word_to_vec_map=word_to_vec_map,
    word_to_index=word_to_index,
)
ann_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
# Display the model layers. summary() prints the table itself and returns None,
# so wrapping it in print() would only add a stray "None" line.
ann_glove.summary()
# Render the layer graph to an image file in the project's Visuals folder.
plot_model(
    ann_glove,
    to_file="drive/MyDrive/SPAM classification deep learning/Visuals/ann_glove_1.jpeg",
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=False,
    dpi=96,
    layer_range=None,
)

In [None]:
# Shapes and dtypes of the model's input tensors, output tensors and layers.
# Plain loops are used because the prints are side effects; a list comprehension
# would also echo a list of None values as the cell result.
for tensor in ann_glove.inputs:
  print(tensor.shape, tensor.dtype)
print("\n\n")
for tensor in ann_glove.outputs:
  print(tensor.shape, tensor.dtype)
print("\n\n")
for layer in ann_glove.layers:
  print(layer.name, layer.input_shape, layer.dtype)

In [None]:
# Train on the in-memory arrays. steps_per_epoch / validation_steps are left unset:
# they count batches (not samples) and are not meant for array inputs. Passing the
# sample counts makes the first "epoch" ask for more batches than exist, so Keras
# runs out of data and stops training early.
history = ann_glove.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), verbose=1)

In [None]:
# Sigmoid outputs for the test set, thresholded at 0.5 into boolean predictions.
y_pred = ann_glove.predict(X_test) > 0.5
y_pred_ann_glove = np.array(y_pred)

# Confusion matrix heat-map drawn by the notebook helper (third argument: image name).
get_confusion_matrix_heatmap(y_test, y_pred_ann_glove, "ANN_glove.jpeg")

In [None]:
#AUC
# NOTE(review): ROC_AUC is a helper defined elsewhere in the notebook. It is given the
# 0.5-thresholded predictions here rather than the raw sigmoid scores -- confirm intended.
# NOTE(review): this call uses Y_test while the neighbouring cells use y_test --
# verify both names hold the same test labels.
lr_auc_ann_glove = ROC_AUC(Y_test, y_pred_ann_glove, "AUC_ANN_GLOVE.jpeg")

In [None]:
# Accuracy (left) and loss (right) training curves, saved side by side in one figure.
plt.figure(figsize=(16, 8))
for panel_no, (curve_name, y_limits) in enumerate((('accuracy', (None, 1)), ('loss', (0, None))), start=1):
  plt.subplot(1, 2, panel_no)
  plot_graphs(history, curve_name)
  plt.ylim(*y_limits)
plt.savefig("drive/MyDrive/SPAM classification deep learning/Visuals/ann_glove_accuracy_loss.jpeg")

# Final loss / accuracy on the test split; the error is the complement of accuracy, in percent.
test_loss_ann_glove, test_acc_ann_glove = ann_glove.evaluate(X_test, y_test)
test_err_ann_glove = 100 - 100*test_acc_ann_glove

print(f"Test Loss:     {test_loss_ann_glove*100} %")
print(f"Test Accuracy: {test_acc_ann_glove*100}  %")
print(f"Test error: {test_err_ann_glove}  %")

In [None]:
# TSNE - word Embedding visualisations
def Word_Embeddings_visualise_TSNE_glove(model, idx_to_word, fname, lname, lim1, lim2, lim3, lim4, Flag=False):
  """
  Project the first word vectors of a model's embedding layer to 2-D with t-SNE
  and save an annotated scatter plot.

  Arguments:
  model -- Keras model that contains the embedding layer
  idx_to_word -- dictionary mapping vocabulary indices to words
  fname -- base name (without extension) of the .jpeg written to the Visuals folder
  lname -- name of the embedding layer inside `model`
  lim1, lim2 -- y-axis limits, applied only when Flag is True
  lim3, lim4 -- x-axis limits, applied only when Flag is True
  Flag -- when True, clamp the axes to the limits above

  Note: also prints the first sample of the notebook-level X_train as a sanity check.
  """
  np.random.seed(1)
  print(f"First sample of the training data set vectorised \n{X_train[0]}\n")
  text = [idx_to_word[idx] if idx!=0 else "<UNK>" for idx in X_train[0]]
  print(f"First sample of the training data set using index to word : \n{' '.join(text)}\n")

  ## Extraction of word Embeddings
  word_embeddings = model.get_layer(lname).get_weights()[0]
  print('Shape of word_embeddings:', word_embeddings.shape)

  # Ploting the word embeddings using TSNE
  tsne = TSNE(perplexity=3, n_components=2, init='pca', n_iter=500, method='exact')
  np.set_printoptions(suppress=True)

  # Row k of the embedding matrix belongs to vocabulary index k, and index 0 is not a
  # word (it is shown as "<UNK>" above). Plot rows 1..plot_only so every point is
  # annotated with its own word; never ask for more rows than the vocabulary has.
  plot_only = min(60, word_embeddings.shape[0] - 1)
  word_indices = range(1, plot_only + 1)

  T = tsne.fit_transform(word_embeddings[1:plot_only + 1, :])
  labels = [idx_to_word[i] for i in word_indices]
  plt.figure(figsize=(14, 8))
  if Flag:
    plt.ylim(lim1, lim2)
    plt.xlim(lim3, lim4)
  plt.scatter(T[:, 0], T[:, 1])
  for label, x, y in zip(labels, T[:, 0], T[:, 1]):
      plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points', ha='right',
  va='bottom')
  plt.savefig(f"drive/MyDrive/SPAM classification deep learning/Visuals/{fname}.jpeg")
# ANN - TSNE - word Embedding visualisations
# Look the embedding layer's name up on the model: Keras auto-generates it
# ("embedding_<n>") from a counter that grows each time a model is rebuilt,
# so a hard-coded name breaks when cells are re-run.
ann_glove_embedding_name = next(layer.name for layer in ann_glove.layers if isinstance(layer, Embedding))
Word_Embeddings_visualise_TSNE_glove(ann_glove, index_to_word, "ann_glove_Embeddings_1", ann_glove_embedding_name, -200,200,-200,200, True)

# RNN with glove

In [None]:
# RNN with glove: build the model for max_length-token inputs and configure training.
rnn_glove = RNN_with_glove(
    input_shape=(max_length,),
    word_to_vec_map=word_to_vec_map,
    word_to_index=word_to_index,
)
rnn_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Display the model layers. summary() prints the table itself and returns None,
# so wrapping it in print() would only add a stray "None" line.
rnn_glove.summary()
# Render the layer graph to an image file in the project's Visuals folder.
plot_model(
    rnn_glove,
    to_file="drive/MyDrive/SPAM classification deep learning/Visuals/rnn_glove_1.jpeg",
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=False,
    dpi=96,
    layer_range=None,
)

In [None]:
# Shapes and dtypes of the model's input tensors, output tensors and layers.
# Plain loops are used because the prints are side effects; a list comprehension
# would also echo a list of None values as the cell result.
for tensor in rnn_glove.inputs:
  print(tensor.shape, tensor.dtype)
print("\n\n")
for tensor in rnn_glove.outputs:
  print(tensor.shape, tensor.dtype)
print("\n\n")
for layer in rnn_glove.layers:
  print(layer.name, layer.input_shape, layer.dtype)

In [None]:
# Train on the in-memory arrays. steps_per_epoch / validation_steps are left unset:
# they count batches (not samples) and are not meant for array inputs. Passing the
# sample counts makes the first "epoch" ask for more batches than exist, so Keras
# runs out of data and stops training early.
history = rnn_glove.fit(X_train, y_train, epochs=3, validation_data=(X_test, y_test), verbose=1)

In [None]:
# Sigmoid outputs for the test set, thresholded at 0.5 into boolean predictions.
y_pred = rnn_glove.predict(X_test) > 0.5
y_pred_rnn_glove = np.array(y_pred)

# Confusion matrix heat-map drawn by the notebook helper (third argument: image name).
get_confusion_matrix_heatmap(y_test, y_pred_rnn_glove, "RNN_glove.jpeg")

In [None]:
#AUC
# NOTE(review): ROC_AUC is a helper defined elsewhere in the notebook. It is given the
# 0.5-thresholded predictions here rather than the raw sigmoid scores -- confirm intended.
# NOTE(review): this call uses Y_test while the neighbouring cells use y_test --
# verify both names hold the same test labels.
lr_auc_rnn_glove = ROC_AUC(Y_test, y_pred_rnn_glove, "AUC_RNN_GLOVE.jpeg")

In [None]:
# Accuracy (left) and loss (right) training curves, saved side by side in one figure.
plt.figure(figsize=(16, 8))
for panel_no, (curve_name, y_limits) in enumerate((('accuracy', (None, 1)), ('loss', (0, None))), start=1):
  plt.subplot(1, 2, panel_no)
  plot_graphs(history, curve_name)
  plt.ylim(*y_limits)
plt.savefig("drive/MyDrive/SPAM classification deep learning/Visuals/rnn_glove_accuracy_loss.jpeg")

# Final loss / accuracy on the test split; the error is the complement of accuracy, in percent.
test_loss_rnn_glove, test_acc_rnn_glove = rnn_glove.evaluate(X_test, y_test)
test_err_rnn_glove = 100 - 100*test_acc_rnn_glove

print(f"Test Loss:     {test_loss_rnn_glove*100} %")
print(f"Test Accuracy: {test_acc_rnn_glove*100}  %")
print(f"Test error: {test_err_rnn_glove}  %")

In [None]:
# Look the embedding layer's name up on the model: Keras auto-generates it
# ("embedding_<n>") from a counter that grows each time a model is rebuilt,
# so a hard-coded name breaks when cells are re-run.
rnn_glove_embedding_name = next(layer.name for layer in rnn_glove.layers if isinstance(layer, Embedding))
Word_Embeddings_visualise_TSNE_glove(rnn_glove, index_to_word, "rnn_glove_Embeddings_1", rnn_glove_embedding_name, -200,200,-200,200, True)

# CNN with glove

In [None]:
# CNN with glove: build the model for max_length-token inputs and configure training.
cnn_glove = CNN_with_glove(
    input_shape=(max_length,),
    word_to_vec_map=word_to_vec_map,
    word_to_index=word_to_index,
)
cnn_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Display the model layers. summary() prints the table itself and returns None,
# so wrapping it in print() would only add a stray "None" line.
cnn_glove.summary()
# Render the layer graph to an image file in the project's Visuals folder.
plot_model(
    cnn_glove,
    to_file="drive/MyDrive/SPAM classification deep learning/Visuals/cnn_glove_1.jpeg",
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=False,
    dpi=96,
    layer_range=None,
)

In [None]:
# Shapes and dtypes of the model's input tensors, output tensors and layers.
# Plain loops are used because the prints are side effects; a list comprehension
# would also echo a list of None values as the cell result.
for tensor in cnn_glove.inputs:
  print(tensor.shape, tensor.dtype)
print("\n\n")
for tensor in cnn_glove.outputs:
  print(tensor.shape, tensor.dtype)
print("\n\n")
for layer in cnn_glove.layers:
  print(layer.name, layer.input_shape, layer.dtype)

In [None]:
# Train on the in-memory arrays. steps_per_epoch / validation_steps are left unset:
# they count batches (not samples) and are not meant for array inputs. Passing the
# sample counts makes the first "epoch" ask for more batches than exist, so Keras
# runs out of data and stops training early.
history = cnn_glove.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test), verbose=1)

In [None]:
# Sigmoid outputs for the test set, thresholded at 0.5 into boolean predictions.
y_pred = cnn_glove.predict(X_test) > 0.5
y_pred_cnn_glove = np.array(y_pred)

# Confusion matrix heat-map drawn by the notebook helper (third argument: image name).
get_confusion_matrix_heatmap(y_test, y_pred_cnn_glove, "CNN_glove.jpeg")

In [None]:
#AUC
# NOTE(review): ROC_AUC is a helper defined elsewhere in the notebook. It is given the
# 0.5-thresholded predictions here rather than the raw sigmoid scores -- confirm intended.
# NOTE(review): this call uses Y_test while the neighbouring cells use y_test --
# verify both names hold the same test labels.
lr_auc_cnn_glove = ROC_AUC(Y_test, y_pred_cnn_glove, "AUC_CNN_GLOVE.jpeg")

In [None]:
# Accuracy (left) and loss (right) training curves, saved side by side in one figure.
plt.figure(figsize=(16, 8))
for panel_no, (curve_name, y_limits) in enumerate((('accuracy', (None, 1)), ('loss', (0, None))), start=1):
  plt.subplot(1, 2, panel_no)
  plot_graphs(history, curve_name)
  plt.ylim(*y_limits)
plt.savefig("drive/MyDrive/SPAM classification deep learning/Visuals/cnn_glove_accuracy_loss.jpeg")

# Final loss / accuracy on the test split; the error is the complement of accuracy, in percent.
test_loss_cnn_glove, test_acc_cnn_glove = cnn_glove.evaluate(X_test, y_test)
test_err_cnn_glove = 100 - 100*test_acc_cnn_glove

print(f"Test Loss:     {test_loss_cnn_glove*100} %")
print(f"Test Accuracy: {test_acc_cnn_glove*100}  %")
print(f"Test error: {test_err_cnn_glove}  %")

In [None]:
# Look the embedding layer's name up on the model: Keras auto-generates it
# ("embedding_<n>") from a counter that grows each time a model is rebuilt,
# so a hard-coded name breaks when cells are re-run.
cnn_glove_embedding_name = next(layer.name for layer in cnn_glove.layers if isinstance(layer, Embedding))
Word_Embeddings_visualise_TSNE_glove(cnn_glove, index_to_word, "cnn_glove_Embeddings_1", cnn_glove_embedding_name, -200,200,-200,200, True)

# Model comparison table with metrics : 
1. Accuracy
2. Loss
3. Error
4. Precision, Recall, F1 score
5. ROC AUC

In [None]:
"""
Model comparison table with metrics : 
1. Accuracy
2. Loss
3. Error
4. Precision, Recall, F1 score
5. ROC AUC
"""
def get_Metrics(y_test, y_pred, average="macro"):
    """
    Score a set of predictions against the true labels.

    Arguments:
    y_test -- true labels
    y_pred -- predictions, passed unchanged to every sklearn metric
    average -- averaging mode forwarded to precision_score, recall_score and f1_score

    Returns:
    (precision, recall, f1 score, accuracy, ROC AUC) tuple
    """
    # Area under the ROC curve built from the predictions.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, y_pred)
    roc_area = auc(false_pos_rate, true_pos_rate)

    # The three averaged scores share one call signature.
    averaged_scores = [
        scorer(y_test, y_pred, average=average)
        for scorer in (precision_score, recall_score, f1_score)
    ]
    return (*averaged_scores, accuracy_score(y_test, y_pred), roc_area)

# One record per trained model: the model, its thresholded test predictions and the
# evaluation numbers computed in the earlier cells. precision / recall / f1_score start
# at 0 and are filled in by the loop below.
Model_comparison = {
    "Ann without pretrained embeddings": {
        "model": ann, "acc": test_acc_ann, "loss": test_loss_ann, "err": test_err_ann,
        "lr_auc": lr_auc_ann, "f1_score": 0, "precision": 0, "recall": 0, "y_pred": y_pred_ann,
    },
    "Rnn without pretrained embeddings": {
        "model": rnn, "acc": test_acc_rnn, "loss": test_loss_rnn, "err": test_err_rnn,
        "lr_auc": lr_auc_rnn, "f1_score": 0, "precision": 0, "recall": 0, "y_pred": y_pred_rnn,
    },
    "Cnn without pretrained embeddings": {
        "model": cnn, "acc": test_acc_cnn, "loss": test_loss_cnn, "err": test_err_cnn,
        "lr_auc": lr_auc_cnn, "f1_score": 0, "precision": 0, "recall": 0, "y_pred": y_pred_cnn,
    },
    "Ann with GLOVE embeddings": {
        "model": ann_glove, "acc": test_acc_ann_glove, "loss": test_loss_ann_glove, "err": test_err_ann_glove,
        "lr_auc": lr_auc_ann_glove, "f1_score": 0, "precision": 0, "recall": 0, "y_pred": y_pred_ann_glove,
    },
    "Rnn with GLOVE embeddings": {
        "model": rnn_glove, "acc": test_acc_rnn_glove, "loss": test_loss_rnn_glove, "err": test_err_rnn_glove,
        "lr_auc": lr_auc_rnn_glove, "f1_score": 0, "precision": 0, "recall": 0, "y_pred": y_pred_rnn_glove,
    },
    "Cnn with GLOVE embeddings": {
        "model": cnn_glove, "acc": test_acc_cnn_glove, "loss": test_loss_cnn_glove, "err": test_err_cnn_glove,
        "lr_auc": lr_auc_cnn_glove, "f1_score": 0, "precision": 0, "recall": 0, "y_pred": y_pred_cnn_glove,
    },
}

# Sanity check on the stored predictions of the first model.
print(type(Model_comparison["Ann without pretrained embeddings"]["y_pred"]))
print(Model_comparison["Ann without pretrained embeddings"]["y_pred"].shape)

# Compute precision / recall / F1 once per model and store each under its own key.
# Accuracy and ROC AUC keep the values recorded in the evaluation cells.
for name, model in Model_comparison.items():
    y_pred = np.asarray(model["y_pred"])
    precision, recall, f1_score_, accuracy, lr_auc = get_Metrics(Y_test, y_pred)
    model["precision"] = precision
    model["recall"] = recall
    model["f1_score"] = f1_score_

# Flatten the records into table rows, in the column order used below.
models_metrics = [
    [name, model["precision"], model["recall"], model["f1_score"],
     model["acc"], model["err"], model["loss"], model["lr_auc"]]
    for name, model in Model_comparison.items()
]

df_metrics = pd.DataFrame(
    models_metrics,
    columns=['Model', 'Precision', 'Recall', 'f1 score', 'Accuracy', 'Error', 'Loss', 'ROC-AUC'],
)
df_metrics.sort_values(by = 'Accuracy', ascending = False, inplace=True)
df_metrics.reset_index(drop = True, inplace=True)
df_metrics
    