In [None]:
# Load (or reload) the TensorBoard notebook extension that provides the
# %tensorboard magic used after each training run.
%reload_ext tensorboard

In [None]:
import os
import re
import pickle
import numpy as np
import random as rn
import pandas as pd
from tqdm import tqdm
import nltk
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, auc, roc_curve
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Conv1D, Concatenate, MaxPool1D, Flatten, Dropout, Dense, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, TerminateOnNaN, Callback
from tensorflow.keras.optimizers import RMSprop, SGD, Adam
from tensorflow.keras.utils import to_categorical

In [None]:
# Build the raw corpus table once and cache it as text_label.csv; later runs
# reload the cache instead of re-reading every file under ./documents.
if not os.path.isfile("text_label.csv"):
    files = os.listdir("./documents")
    label = []
    text = []
    num = []

    for file in tqdm(files):
        # `with` guarantees the handle is closed even if read() raises.
        with open(os.path.join("./documents", file), "r") as f:
            text.append(f.read())
        # File names are split as "<label>_<number>.<extension>".
        name_parts = file.split("_")
        label.append(name_parts[0])
        num.append(name_parts[1].split(".")[0])

    d = {"Text": text, "D_Number": num, "Label": label}
    data = pd.DataFrame(d)
    data.to_csv("text_label.csv", index = False, header = True)

else:
    # keep_default_na = False stops read_csv from turning empty documents (or
    # literal strings such as "NA"/"null") into NaN, so the cached table holds
    # the same string values as the freshly built one and Preprocess() always
    # receives str objects.
    data = pd.read_csv("text_label.csv", index_col = False, keep_default_na = False)

In [None]:
# Preview the first rows of the raw corpus table.
data.head()

In [None]:
# Count the documents per label once and reuse the result for both reports.
class_counts = data.Label.value_counts()
print("Number of classes: ", len(class_counts))
print("Number of documents of each class\n", class_counts)

In [None]:
def _extract_email_tokens(document):
    """Return the informative parts of every e-mail domain found in ``document``.

    Each "@domain" match is split on "."; parts longer than two characters
    other than "com"/"COM" are kept and joined with single spaces.
    """
    tokens = []
    for domain in re.findall(r"@[\w.]+", document):
        parts = domain.replace("@", "").split(".")
        # dict.fromkeys de-duplicates within one domain like set() did, but
        # keeps first-seen order so the output is identical from run to run.
        for part in dict.fromkeys(parts):
            if len(part) > 2 and part not in ("com", "COM"):
                tokens.append(part)
    return " ".join(tokens)


def _extract_subject(document):
    """Return the cleaned, lower-cased subject of ``document`` ("" if it has none).

    If several lines start with "Subject:", the last one wins.
    """
    subject = ""
    for line in document.split("\n"):
        if line.startswith("Subject:"):
            # Words ending in ":" ("Subject:", "Re:", ...) are dropped.
            kept = " ".join(word for word in line.split() if not word.endswith(":"))
            kept = re.sub(r"[^0-9a-zA-Z\s]", " ", kept)
            subject = " ".join(kept.split())
    return subject.lower()


def _clean_body(document):
    """Return the cleaned, lower-cased body text of ``document``."""
    #https://towardsdatascience.com/how-i-preprocessed-text-data-using-regular-expressions-for-my-text-classification-task-cnn-cb206e7274ed
    text = re.sub(r"(.*)Subject:(.*?)(.*)\n", " ", document)   #remove subject line
    text = re.sub(r"(.*)From:(.*?)(.*)\n", " ", text)          #remove from line
    text = re.sub(r"(.*)Write to:(.*?)(.*)\n", " ", text)      #remove write to line
    # Drops everything on each line up to and including its last colon.
    text = re.sub(r"(.*):(.*?)", " ", text)

    #decontract (order matters: the specific forms must run before "n't" / "'t")
    contractions = (
        ("won't", "will not"),
        ("can't", "can not"),
        ("n't", " not"),
        ("'re", " are"),
        ("'s", " is"),
        ("'d", " would"),
        ("'ll", " will"),
        ("'t", " not"),
        ("'ve", " have"),
        ("'m", " am"),
    )
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    text = re.sub(r"[\w\-\.]+@[\w\.-]+\b", " ", text)          #remove all emails
    text = re.sub(r"[\n\t]", " ", text)                        #remove line feeds and tabs
    # The text is a single line from here on, so a greedy ".*" would delete
    # everything between the first opening and the last closing bracket of
    # the whole document. Match one bracket pair at a time instead.
    text = re.sub(r"<[^<>]*>", " ", text)                      #remove text within angular brackets
    text = re.sub(r"\([^()]*\)", " ", text)                    #remove text within parentheses
    text = re.sub(r"\[[^\[\]]*\]", " ", text)                  #remove text within square brackets
    text = re.sub(r"\{[^{}]*\}", " ", text)                    #remove text within curly braces
    text = re.sub(r"[0-9]+", " ", text)                        #remove all digits
    text = re.sub(r"[^A-Za-z\s]", " ", text)                   #remove all characters except alphabets and spaces

    #https://towardsdatascience.com/how-i-preprocessed-text-data-using-regular-expressions-for-my-text-classification-task-cnn-cb206e7274ed
    for chunk in ne_chunk(pos_tag(word_tokenize(text))):
        if not isinstance(chunk, Tree):
            continue
        terms = [term for term, _pos in chunk.leaves()]
        if chunk.label() == "GPE":
            if len(terms) > 1:   #if a place name has two or more words we combine it with underscore
                # Replace the whole phrase in one step. Substituting the
                # individual words globally also rewrote the joined name
                # itself and clipped unrelated words containing them.
                phrase = r"\b" + r"\s+".join(re.escape(term) for term in terms) + r"\b"
                joined = "_".join(terms)
                text = re.sub(phrase, lambda _match: joined, text)
        elif chunk.label() == "PERSON":
            for term in terms:
                # Whole-word match so a name is not cut out of longer words.
                text = re.sub(r"\b" + re.escape(term) + r"\b", "", text)

    #https://stackoverflow.com/questions/20802056/python-regular-expression-1
    text = re.sub(r"\b_([a-zA-Z]+)_\b", r"\1", text) #replace _word_ to word
    text = re.sub(r"\b_([a-zA-Z]+)\b", r"\1", text)  #replace _word to word
    text = re.sub(r"\b([a-zA-Z]+)_\b", r"\1", text)  #replace word_ to word

    # Drop a one- or two-letter prefix in front of an underscore.
    text = re.sub(r"\b[a-zA-Z]{1}_([a-zA-Z]+)", r"\1", text)
    text = re.sub(r"\b[a-zA-Z]{2}_([a-zA-Z]+)", r"\1", text)

    # Keep only tokens of 3 to 14 characters.
    return " ".join(word for word in text.lower().split() if 2 < len(word) < 15)


def Preprocess(text_data):
    """
    Preprocess raw documents into e-mail domain tokens, subjects and body text.

    Parameters
    ----------
    text_data : iterable of str
        Raw document contents, one string per document.

    Returns
    -------
    tuple of (list of str, list of str, list of str)
        ``(preprocessed_email, preprocessed_subject, preprocessed_text)``; the
        three lists are aligned with ``text_data``. A document without a
        "Subject:" line gets an empty subject.
    """

    preprocessed_email = []
    preprocessed_subject = []
    preprocessed_text = []

    for sentence in tqdm(text_data):
        preprocessed_email.append(_extract_email_tokens(sentence))
        preprocessed_subject.append(_extract_subject(sentence))
        preprocessed_text.append(_clean_body(sentence))

    return (preprocessed_email, preprocessed_subject, preprocessed_text)

In [None]:
# Run the (slow) preprocessing once and cache the result; later runs reload it.
if not os.path.isfile("preprocessed_final.csv"):
    processed_email, processed_subject, processed_text = Preprocess(data.Text.values)
    data["Email"] = processed_email
    data["Subject"] = processed_subject
    data["Processed_text"] = processed_text

    data.to_csv("preprocessed_final.csv", index = False, header = True)

else:
    # keep_default_na = False: without it, empty Email/Subject/Processed_text
    # fields come back as NaN and turn into the literal token "nan" when the
    # columns are cast to str, so cached and fresh runs would disagree.
    data = pd.read_csv("preprocessed_final.csv", index_col = False, keep_default_na = False)

In [None]:
# Preview the table again after the preprocessing step.
data.head()

In [None]:
#checking if preprocessing is correct
#document 49960 is in index 0

# Print each field of row 0 followed by a 25-dash separator line.
separator = "-"*25
first_row_report = (
    ("Label: ", data.Label[0]),
    ("Email: ", data.Email[0]),
    ("Subject: ", data.Subject[0]),
    ("Processed Text\n", data.Processed_text[0]),
)
for heading, value in first_row_report:
    print(heading, value)
    print(separator)

In [None]:
# Concatenate the three text fields (space separated) into one model input
# column, then drop the individual fields.
text_columns = ["Email", "Subject", "Processed_text"]
email_text, subject_text, body_text = (data[column].astype(str) for column in text_columns)
data["combined_data"] = email_text.str.cat([subject_text, body_text], sep = " ")
data = data.drop(columns = text_columns)
data.head()

In [None]:
# Features are the combined text; targets are the one-hot encoded labels.
# (The former `y = data["Label"]` assignment was overwritten on the very next
# line and has been removed.)
X = data["combined_data"]
y = pd.get_dummies(data["Label"].values)
# 75/25 split, stratified on the labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify = y, random_state = 42)

In [None]:
# Sanity check: print the feature and target shapes of each partition.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
class F1_Score(Callback):
    """Keras callback that prints and records the micro-averaged F1 score per epoch.

    Parameters
    ----------
    validation_data : tuple, optional
        ``(features, one_hot_targets)`` to evaluate on. When omitted, the
        notebook globals ``padded_test`` and ``y_test`` are looked up at the end
        of each epoch, so ``F1_Score()`` keeps working as before.

    Attributes
    ----------
    f1 : list of float
        One score per completed epoch; reset whenever training starts.
    """

    def __init__(self, validation_data = None):
        super().__init__()
        # Stored under a private name to avoid clashing with attributes the
        # Keras base class may define itself.
        self._eval_data = validation_data
        self.f1 = []

    def on_train_begin(self, logs = None):
        # `logs = None` instead of a shared mutable `{}` default.
        self.f1 = []

    def on_epoch_end(self, epoch, logs = None):
        if self._eval_data is None:
            features, targets = padded_test, y_test
        else:
            features, targets = self._eval_data

        # Row-wise argmax turns probabilities / one-hot rows into class ids.
        y_pred = np.argmax(np.asarray(self.model.predict(features)), axis = 1)
        y_true = np.argmax(np.asarray(targets), axis = 1)

        val_f1 = f1_score(y_true, y_pred, average = "micro")
        self.f1.append(val_f1)
        print("\t F1 Score: ", val_f1)

## Model-1 Using CNN with word embeddings

In [None]:
# Remove any existing ./logs/ directory (and every TensorBoard run in it)
# before training Model-1.
!rm -rf ./logs/

In [None]:
# Reset Keras state and seed every random number generator in play.
# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so setting
# it here presumably only affects processes launched afterwards.
os.environ["PYTHONHASHSEED"] = "0"
tf.keras.backend.clear_session()
np.random.seed(42)
rn.seed(42)
# TensorFlow has its own generator (used for weight initialisation, dropout
# and shuffling); without this the NumPy/random seeds alone do not make
# training repeatable.
tf.random.set_seed(42)

In [None]:
# Word-level tokenizer fitted on the training texts only; "_" is the only
# character it filters out.
t = Tokenizer(filters = "_")
t.fit_on_texts(X_train)
vocab_size = len(t.word_index) + 1   # +1 for index 0, which pad_sequences uses as padding

encoded_train = t.texts_to_sequences(X_train)
encoded_test = t.texts_to_sequences(X_test)

# Pad or truncate every document at the end to exactly max_length tokens.
max_length = 100
pad_options = dict(maxlen = max_length, padding = "post", truncating = "post")
padded_train = pad_sequences(encoded_train, **pad_options)
padded_test = pad_sequences(encoded_test, **pad_options)

# NOTE(security): pickle.load can execute arbitrary code contained in the
# file; only load ./glove_vectors from a trusted source.
with open("./glove_vectors", "rb") as f:
    glove = pickle.load(f)
    glove_words = set(glove.keys())

# Row i holds the GloVe vector of the word with index i; words that have no
# GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 300))
for word in glove_words.intersection(t.word_index):
    embedding_matrix[t.word_index[word]] = glove[word]

In [None]:
def _he_uniform_conv(filters, layer_name):
    """Build a width-3, valid-padded ReLU Conv1D layer with He-uniform initial weights."""
    return Conv1D(filters, kernel_size = 3, padding = "valid", activation = "relu",
                  kernel_initializer = tf.keras.initializers.HeUniform(),
                  name = layer_name)


input_layer = Input(shape = (max_length, ), name = "Input_Layer")

# Frozen embedding lookup initialised from embedding_matrix.
embed_layer = Embedding(input_dim = vocab_size, output_dim = 300,
                        weights = [embedding_matrix],
                        input_length = max_length, trainable = False,
                        name = "Embedding_Layer")(input_layer)

# Stage 1: three parallel convolutions over the embeddings, concatenated.
conv_1 = _he_uniform_conv(128, "Convolution_layer_1")(embed_layer)
conv_2 = _he_uniform_conv(126, "Convolution_layer_2")(embed_layer)
conv_3 = _he_uniform_conv(124, "Convolution_layer_3")(embed_layer)
concat_layer_1 = Concatenate()([conv_1, conv_2, conv_3])

max_pool_layer_1 = MaxPool1D(pool_size = 2, padding = "valid",
                             name = "MaxPooling_layer_1")(concat_layer_1)

# Stage 2: three parallel convolutions over the pooled features, concatenated.
conv_4 = _he_uniform_conv(68, "Convolution_layer_4")(max_pool_layer_1)
conv_5 = _he_uniform_conv(64, "Convolution_layer_5")(max_pool_layer_1)
conv_6 = _he_uniform_conv(62, "Convolution_layer_6")(max_pool_layer_1)
concat_layer_2 = Concatenate()([conv_4, conv_5, conv_6])

max_pool_layer_2 = MaxPool1D(pool_size = 2, padding = "valid",
                             name = "MaxPooling_layer_2")(concat_layer_2)

drop_1 = Dropout(0.8, name = "Dropout_layer_1")(max_pool_layer_2)

# Stage 3: a single convolution, then the dense classification head.
conv_7 = _he_uniform_conv(32, "Convolution_layer_7")(drop_1)

flatten = Flatten(name = "Flatten_layer")(conv_7)

drop_2 = Dropout(0.1, name = "Dropout_layer_2")(flatten)

dense_1 = Dense(100, activation = "relu",
                kernel_initializer = tf.keras.initializers.HeUniform(),
                name = "Dense_Layer_1")(drop_2)

drop_3 = Dropout(0.05, name = "Dropout_layer_3")(dense_1)

# Softmax over 20 output units.
output = Dense(20, activation = "softmax",
               name = "Output_Layer")(drop_3)

model1 = Model(inputs = input_layer, outputs = output,
               name = "Model-1")

model1.summary()

In [None]:
# Draw the Model-1 architecture, including layer shapes, and save it to model1.png.
tf.keras.utils.plot_model(model1, show_shapes = True, to_file = "model1.png")

In [None]:
log_dir = "logs/model1"

optimizer = Adam(learning_rate = 0.001)
model1.compile(loss = "categorical_crossentropy", optimizer = optimizer, metrics = ["accuracy"])

# Save a checkpoint whenever validation accuracy improves; the score is part
# of the file name.
checkpoint = ModelCheckpoint(
    filepath = "best_model_1_{val_accuracy:.4f}.hdf5",
    monitor = "val_accuracy",
    verbose = 1,
    save_best_only = True,
    mode = "auto",
)

# Stop once validation accuracy has failed to improve for one epoch.
earlystop = EarlyStopping(monitor = "val_accuracy", patience = 1, verbose = 1, mode = "auto")

tensorboard_callback = TensorBoard(log_dir = log_dir, histogram_freq = 1, write_graph = True)

f1 = F1_Score()

callback_list = [checkpoint, earlystop, tensorboard_callback, f1]

# The held-out split doubles as the validation set for the callbacks above.
model1.fit(
    padded_train,
    y_train,
    validation_data = (padded_test, y_test),
    batch_size = 512,
    epochs = 100,
    callbacks = callback_list,
)

In [None]:
# Open TensorBoard inline, pointed at the Model-1 log directory.
%tensorboard --logdir logs/model1

## Model-2 Using CNN with character embeddings

In [None]:
!rm -rf ./logs/

In [None]:
# Reset Keras state and re-seed every random number generator before Model-2.
# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so setting
# it here presumably only affects processes launched afterwards.
os.environ["PYTHONHASHSEED"] = "0"
tf.keras.backend.clear_session()
np.random.seed(42)
rn.seed(42)
# TensorFlow has its own generator (used for weight initialisation, dropout
# and shuffling); without this the NumPy/random seeds alone do not make
# training repeatable.
tf.random.set_seed(42)

In [None]:
# Character-level tokenizer fitted on the training texts only; unseen
# characters map to the "UNK" token.
t = Tokenizer(filters = "_", char_level = True, oov_token = "UNK")
t.fit_on_texts(X_train)
vocab_size = len(t.word_index) + 1   # +1 for index 0, which pad_sequences uses as padding
encoded_train = t.texts_to_sequences(X_train)
encoded_test = t.texts_to_sequences(X_test)
max_length = 3000
padded_train = pad_sequences(encoded_train, maxlen = max_length, padding = "post", truncating = "post")
padded_test = pad_sequences(encoded_test, maxlen = max_length, padding = "post", truncating = "post")

# Character vectors from the GloVe char file, keyed by character.
# The file is read as text and the coefficients parsed as floats; reading it
# in binary mode left both keys and vectors as byte strings.
# NOTE(review): embedding_index is not used by the one-hot matrix below.
embedding_index = dict()
with open("./glove.840B.300d-char.txt", "r", encoding = "utf-8") as f:
    for line in f:
        line_split = line.strip().split()
        if not line_split:
            continue   # tolerate blank lines instead of raising IndexError
        char = line_split[0]
        coefs = np.asarray(line_split[1:], dtype = "float32")
        embedding_index[char] = coefs
# No explicit f.close(): the with-block has already closed the file.

#https://towardsdatascience.com/character-level-cnn-with-keras-50391c3adf33
# One-hot lookup table of shape (vocab_size, vocab_size): row 0 (padding) is
# all zeros and row i has a single 1 in column i - 1. np.eye with k = -1 puts
# the ones on exactly that sub-diagonal.
embedding_matrix = np.eye(vocab_size, k = -1)

In [None]:
input_layer = Input(shape = (max_length, ), name = "Input_Layer")

# Frozen one-hot character lookup. output_dim must equal the width of
# embedding_matrix; taking it from the matrix (instead of a hard-coded 40)
# keeps the layer valid whatever the size of the fitted character vocabulary.
embed_layer = Embedding(input_dim = vocab_size, output_dim = embedding_matrix.shape[1],
                        weights = [embedding_matrix],
                        input_length = max_length, trainable = False,
                        name = "Embedding_Layer")(input_layer)

# Block 1: two stacked convolutions, max-pooling, dropout.
conv_1 = Conv1D(256, kernel_size = 3, padding = "valid", activation = "relu",
                kernel_initializer = tf.keras.initializers.HeNormal(),
                name = "Convolution_layer_1")(embed_layer)

conv_2 = Conv1D(128, kernel_size = 3, padding = "valid", activation = "relu",
                kernel_initializer = tf.keras.initializers.HeNormal(),
                name = "Convolution_layer_2")(conv_1)

max_pool_layer_1 = MaxPool1D(pool_size = 4, padding = "valid",
                             name = "MaxPooling_layer_1")(conv_2)

drop_1 = Dropout(0.4, name = "Dropout_layer_1")(max_pool_layer_1)

# Block 2: two stacked convolutions, max-pooling, dropout.
conv_3 = Conv1D(64, kernel_size = 3, padding = "valid", activation = "relu",
                kernel_initializer = tf.keras.initializers.HeNormal(),
                name = "Convolution_layer_3")(drop_1)

conv_4 = Conv1D(32, kernel_size = 3, padding = "valid", activation = "relu",
                kernel_initializer = tf.keras.initializers.HeNormal(),
                name = "Convolution_layer_4")(conv_3)

max_pool_layer_2 = MaxPool1D(pool_size = 4, padding = "valid",
                             name = "MaxPooling_layer_2")(conv_4)

drop_2 = Dropout(0.4, name = "Dropout_layer_2")(max_pool_layer_2)

# Dense classification head.
flatten = Flatten(name = "Flatten_layer")(drop_2)

dense_1 = Dense(100, activation = "relu",
                kernel_initializer = tf.keras.initializers.HeNormal(),
                name = "Dense_Layer_1")(flatten)

drop_3 = Dropout(0.1, name = "Dropout_layer_3")(dense_1)

# Softmax over 20 output units.
output = Dense(20, activation = "softmax",
               name = "Output_Layer")(drop_3)

model2 = Model(inputs = input_layer, outputs = output,
               name = "Model-2")

model2.summary()

In [None]:
# Draw the Model-2 architecture, including layer shapes, and save it to model2.png.
tf.keras.utils.plot_model(model2, show_shapes = True, to_file = "model2.png")

In [None]:
log_dir = "logs/model2"

optimizer = Adam(learning_rate = 0.01)
model2.compile(loss = "categorical_crossentropy", optimizer = optimizer, metrics = ["accuracy"])

# Save a checkpoint whenever validation accuracy improves; the score is part
# of the file name.
checkpoint = ModelCheckpoint(
    filepath = "best_model_2_{val_accuracy:.4f}.hdf5",
    monitor = "val_accuracy",
    verbose = 1,
    save_best_only = True,
    mode = "auto",
)

# Stop once validation accuracy has failed to improve for two epochs.
earlystop = EarlyStopping(monitor = "val_accuracy", patience = 2, verbose = 1, mode = "auto")

tensorboard_callback = TensorBoard(log_dir = log_dir, histogram_freq = 1, write_graph = True)

f1 = F1_Score()

callback_list = [checkpoint, earlystop, tensorboard_callback, f1]

# The held-out split doubles as the validation set for the callbacks above.
model2.fit(
    padded_train,
    y_train,
    validation_data = (padded_test, y_test),
    batch_size = 512,
    epochs = 100,
    callbacks = callback_list,
)

In [None]:
# Open TensorBoard inline, pointed at the Model-2 log directory.
%tensorboard --logdir logs/model2