In [0]:
# Mount Google Drive under /content/drive so the project files below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# List the contents of the project folder on the mounted drive.
!ls "/content/drive/My Drive/Alda_project/"

In [0]:
import numpy as np
import keras
from keras.layers import Input, Dense, Dropout, LSTM, Bidirectional
from keras import Model
import pandas as pd
import re
import random
import email
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics 
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [0]:
from gensim.models.word2vec import Word2Vec

In [0]:
# Load the raw e-mail dump and keep only the rows whose file path contains "sent".
dataset = pd.read_csv('/content/drive/My Drive/Alda_project/emails.csv')
is_sent_path = dataset['file'].str.contains('sent')
dataset_sent_mails = dataset[is_sent_path]
# The sender is the part of the path captured before the "/" that precedes "sent".
sender_names = dataset_sent_mails["file"].map(lambda path: re.search("(.*)/.*sent", path).group(1)).values
# Attach the sender column and drop the now-redundant path column in one chain.
dataset_sent_mails = dataset_sent_mails.assign(sender=sender_names).drop("file", axis=1)

In [0]:
# Pick the 15 most frequent senders and number them from 1 in order of frequency.
sender_counts = dataset_sent_mails["sender"].value_counts()
users = sender_counts.head(15).index.values
mapping = dict(zip(users, range(1, len(users) + 1)))
# Restrict the data to e-mails written by those senders.
sent_user_dataset = dataset_sent_mails[dataset_sent_mails["sender"].isin(users)]

In [0]:
# preprocessing email content
def email_preprocessing(email_message):
    """Parse a raw e-mail string into a dict of its headers plus the plain-text body.

    Args:
        email_message: The full message (headers and body) as a single string.

    Returns:
        A dict with one entry per header name found in the message and an extra
        "content" entry holding the concatenated payloads of all 'text/plain' parts.
    """
    msg = email.message_from_string(email_message)

    # Gather the payload of every plain-text part (walk() also visits nested parts).
    plain_text_parts = [
        part.get_payload()
        for part in msg.walk()
        if part.get_content_type() == 'text/plain'
    ]

    result = {key: msg[key] for key in msg.keys()}
    result["content"] = ''.join(plain_text_parts)
    return result

# Cache of stop-word sets, so the list is built once instead of on every call.
_STOPWORD_SETS = {}


def _english_stopwords():
    """Return the English stop-word set, building it from `stopwords` on first use only."""
    if "english" not in _STOPWORD_SETS:
        _STOPWORD_SETS["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_SETS["english"]


#Function for preprocessing of text data
def content_preprocessing(content):
    """Normalise free text: keep letters only, lower-case it and drop English stop words.

    Args:
        content: The text to clean (a string).

    Returns:
        The remaining words joined by single spaces; an empty string if nothing is left.
    """
    # Every character that is not an ASCII letter becomes a space, so digits and
    # punctuation act as word separators.
    letters_only = re.sub("[^a-zA-Z]", " ", content)
    stops = _english_stopwords()
    return ' '.join(w for w in letters_only.lower().split() if w not in stops)

In [0]:
# Parse every raw message into its headers and plain-text body.
parsed_emails = pd.DataFrame(list(map(email_preprocessing, sent_user_dataset.message)))
# A message without a Subject header yields NaN in that column, which would make a plain
# ' '.join() raise TypeError, so missing pieces are treated as empty strings.
subject_and_body = (
    parsed_emails["Subject"].fillna("").astype(str)
    + " "
    + parsed_emails["content"].fillna("").astype(str)
)
# Clean the combined "subject body" text of each e-mail.
final_data = pd.DataFrame({"content": subject_and_body.map(content_preprocessing)})
# Map sender names straight to their integer ids; every sender in sent_user_dataset is a
# key of `mapping` because both were derived from the same `users` list.
final_data = final_data.assign(user_number=sent_user_dataset["sender"].map(mapping).values)
final_data.head()

In [0]:
# Tokenise each cleaned e-mail into a list of words (split on whitespace).
emails_words = final_data["content"].map(str.split)

In [0]:
# Train Word2Vec embeddings on the tokenised e-mails, leaving every other setting at its default.
model = Word2Vec(sentences=emails_words.values)

In [0]:
# Persist the learned vectors in the *binary* word2vec format: the file is read back with
# binary=True further down, whereas the writer's default (binary=False) produces a text file.
model.wv.save_word2vec_format('/content/drive/My Drive/Alda_project/model.bin', binary=True)

In [0]:
from gensim.models import KeyedVectors
# Reload the stored word vectors; binary=True means the file is parsed as binary word2vec data.
filename = '/content/drive/My Drive/Alda_project/model.bin'
model = KeyedVectors.load_word2vec_format(fname=filename, binary=True)

In [0]:
# Cleaned e-mail text and the integer author label of each e-mail, as NumPy arrays.
X = final_data["content"].values
y_data = final_data["user_number"].values

In [0]:
#converting into one hot encoded vectors
# Labels are read from final_data (not from y_data) so that re-running this cell does not
# try to encode a matrix that has already been one-hot encoded.
encoder = OneHotEncoder()
labels_column = final_data.user_number.values.reshape(-1, 1)
encoded_labels = encoder.fit_transform(labels_column)
# OneHotEncoder yields a SciPy sparse matrix by default; convert it to a dense ndarray so
# training and the later argmax-based scoring operate on a plain 2-D array.
y_data = encoded_labels.toarray() if hasattr(encoded_labels, "toarray") else np.asarray(encoded_labels)

In [0]:
X_data = []
# Take the embedding width from the model itself instead of probing one specific word,
# which would raise KeyError if that word were missing from the vocabulary.
max_vec_len = model.vector_size
# Number of tokens kept per e-mail; shorter e-mails are zero-padded up to this length.
max_seq_len = 70
max_seq_len, max_vec_len

In [0]:
# Get feature vectors of the word2vec model
# X_data is rebuilt from scratch here so that re-running the cell cannot append duplicates.
X_data = []
padding_vector = np.zeros(shape=(max_vec_len,))
# The loop variable is deliberately not called `email`: that would shadow the imported
# `email` module that email_preprocessing relies on.
for email_tokens in emails_words:
  x_arr = []
  for word in email_tokens[:max_seq_len]:
    try:
      x_arr.append(model[word])
    except KeyError:
      # The word has no vector in the model; skip it.
      continue
  # Pad with zero vectors so every e-mail has exactly max_seq_len rows.
  x_arr.extend([padding_vector] * (max_seq_len - len(x_arr)))
  X_data.append(np.array(x_arr))
  if len(X_data) % 5000 == 0:
    print("Next 5000 emails finished")
X_data = np.array(X_data)

# Cache the feature tensor on the drive (np.save appends the .npy extension).
np.save('/content/drive/My Drive/Alda_project/word2vec_data', X_data)

In [0]:
# Hold out 20% of the e-mails for testing; the fixed seed makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)

In [0]:
# LSTM model containing encoder LSTM layers and fully connected layers.
class LSTM_Model:
    """Builder for a stacked (optionally bidirectional) LSTM classifier.

    The network is: input -> `enc_layers` LSTM layers -> zero or more ReLU Dense
    layers -> one softmax Dense layer with `output_states` units.
    """

    def __init__(self, enc_seq_length, enc_unique_states, output_states, enc_layers=1,
                 dense__prev_layers_neurons=None, lstm_units = 256,
                 bidirectional=False, dropout=0, recurrent_dropout=0, bias_regularizer=None,
                 kernel_regularizer=None, activity_regularizer=None):
        """Store the hyper-parameters; no Keras objects are created here.

        Args:
            enc_seq_length: Sequence length. Stored only; the input layer leaves the
                time dimension as None.
            enc_unique_states: Size of each time step's feature vector.
            output_states: Number of units of the final softmax layer.
            enc_layers: Number of stacked LSTM layers (must be >= 1 for getModel()).
            dense__prev_layers_neurons: Unit counts of the hidden Dense layers placed
                before the softmax layer. None or empty means no hidden Dense layer.
            lstm_units: Units of every LSTM layer.
            bidirectional: Wrap every LSTM layer in Bidirectional when True.
            dropout: `dropout` argument passed to every LSTM layer.
            recurrent_dropout: `recurrent_dropout` argument passed to every LSTM layer.
            bias_regularizer: Passed to every LSTM and Dense layer.
            kernel_regularizer: Passed to every LSTM layer.
            activity_regularizer: Passed to every LSTM and Dense layer.
        """
        self.enc_seq_length = enc_seq_length
        self.enc_unique_states = enc_unique_states
        self.enc_layers = enc_layers
        self.output_states = output_states
        # Build a fresh list (hidden sizes + output size). Appending to the argument
        # itself would mutate the caller's list and, with a shared default list, leak
        # layers from one instance into the next.
        self.dense__prev_layers_neurons = list(dense__prev_layers_neurons or []) + [output_states]
        self.lstm_units = lstm_units
        self.bidirectional = bidirectional
        self.dropout = dropout
        self.recurrent_dropout = recurrent_dropout
        self.bias_regularizer = bias_regularizer
        self.kernel_regularizer = kernel_regularizer
        self.activity_regularizer = activity_regularizer

    def getModel(self):
        """Create the layers, wire them together and return the resulting Model.

        The layers and intermediate tensors are also kept on the instance
        (`encoder`, `encoder_outputs`, `decoder_dense`, `dense_outputs`, `model`).

        Raises:
            ValueError: If `enc_layers` is smaller than 1.
        """
        if self.enc_layers < 1:
            raise ValueError("enc_layers must be at least 1, got %r" % (self.enc_layers,))

        self.encoder_inputs = Input(shape=(None, self.enc_unique_states), name='encoder_inputs')

        self.encoder = []
        self.encoder_outputs = []

        # Add encoder layers
        for i in range(self.enc_layers):
            is_last_encoder = i == self.enc_layers - 1
            layer = LSTM(self.lstm_units,
                         # Only the last LSTM collapses the sequence into a single
                         # vector; the earlier ones must hand a full sequence onwards.
                         return_sequences=not is_last_encoder,
                         recurrent_dropout=self.recurrent_dropout,
                         dropout=self.dropout,
                         bias_regularizer=self.bias_regularizer,
                         activity_regularizer=self.activity_regularizer,
                         kernel_regularizer=self.kernel_regularizer,
                         name="encoder"+str(i+1))
            # Wrap Bidirectional layer if bidirectional is True
            if self.bidirectional:
                layer = Bidirectional(layer)
            self.encoder.append(layer)

        # Get encoder outputs for each encoder layer
        previous = self.encoder_inputs
        for layer in self.encoder:
            previous = layer(previous)
            self.encoder_outputs.append(previous)

        self.decoder_dense = []
        self.dense_outputs = []
        self.dense_layers = len(self.dense__prev_layers_neurons)

        # Add fully connected layers: ReLU for the hidden ones, softmax for the last.
        for i, units in enumerate(self.dense__prev_layers_neurons):
            is_output_layer = i == self.dense_layers - 1
            self.decoder_dense.append(Dense(units,
                                            bias_regularizer=self.bias_regularizer,
                                            activity_regularizer=self.activity_regularizer,
                                            activation='softmax' if is_output_layer else 'relu',
                                            name="softmax" if is_output_layer else "output_layer"+str(i+1)))

        # Get outputs of each fully connected layer, starting from the last encoder output
        previous = self.encoder_outputs[-1]
        for layer in self.decoder_dense:
            previous = layer(previous)
            self.dense_outputs.append(previous)

        self.model = Model(self.encoder_inputs, self.dense_outputs[-1])

        return self.model

In [0]:
enc_seq_length = max_seq_len
enc_unique_states = max_vec_len
# Number of classes = number of categories the encoder learned for its single label column.
# categories_ is used because get_feature_names() is no longer available in recent
# scikit-learn releases.
output_states = len(encoder.categories_[0])
# Two stacked 128-unit LSTM layers, one 64-unit hidden Dense layer, then the softmax output.
model = LSTM_Model(enc_seq_length,
              enc_unique_states,
              output_states,
              enc_layers=2,
              lstm_units = 128,
              dense__prev_layers_neurons=[64],
              dropout = 0.3).getModel()

In [0]:
# Print the layer-by-layer overview of the network built above.
model.summary()

In [0]:
# Multi-class set-up: one-hot targets with categorical cross-entropy, optimised by Adam.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [0]:
# Save a checkpoint whenever the validation loss improves. Validation loss is monitored
# (rather than training accuracy) so the kept weights are selected on held-out data; it also
# matches the {val_loss} placeholder in the file name and the quantity EarlyStopping watches.
checkpointer = keras.callbacks.ModelCheckpoint("/content/drive/My Drive/Alda_project/weights_lstm/dense.{epoch:02d}-{val_loss:.2f}.hdf5",
                                       monitor='val_loss',
                                       verbose=1,
                                       save_best_only=True,
                                       mode='min')
# Stop training once val_loss has failed to improve for 3 consecutive epochs.
early_stopping_monitor = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [0]:
# Train for at most 15 epochs, holding back 10% of the training data for validation.
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=15,
    validation_split=0.1,
    callbacks=[early_stopping_monitor, checkpointer],
)

In [0]:
# Run the trained network on the held-out test inputs.
preds = model.predict(X_test)

In [0]:
from sklearn.metrics import accuracy_score
# argmax turns each one-hot / probability row back into a class index. asarray + ravel
# flattens the (n, 1) matrix that argmax returns when y_test is a SciPy sparse matrix,
# and is a no-op when y_test is already a dense ndarray.
true_labels = np.asarray(y_test.argmax(axis=1)).ravel()
predicted_labels = preds.argmax(axis=1)
accuracy_score(true_labels, predicted_labels)