# Load Packages

In [None]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
import time
import numpy as np
import re
import matplotlib.pyplot as plt
import itertools

from collections import Counter
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from torch.utils.data import DataLoader

# Fetch the NLTK resources used by the notebook: the English stop-word list,
# the Punkt tokenizer data used by word_tokenize, and the WordNet corpus.
nltk.download('stopwords')
nltk.download('punkt')
# NOTE: recent NLTK releases load the Punkt models from the 'punkt_tab'
# package rather than the pickled 'punkt' one; without it word_tokenize raises
# LookupError on a fresh runtime. Downloading both keeps old and new NLTK
# versions working.
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Mount Google Drive

In [None]:
from google.colab import drive

# Single source of truth for the mount point, so the project path below is
# absolute and does not depend on the notebook's current working directory.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Folder (relative to "My Drive") that holds the coursework files
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'City, University of London - Laptop/MSci Computer Science/Year 3/Semester 2/IN3045 Natural Language Processing/Coursework'
GOOGLE_DRIVE_PATH = os.path.join(DRIVE_MOUNT_POINT, 'My Drive',
                                 GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)

# Sanity check: list the project folder to confirm the mount succeeded
print(os.listdir(GOOGLE_DRIVE_PATH))


Mounted at /content/drive
['Dataset', 'Resources', 'epoch_loss.png', 'Submission', 'Phishing Detection.ipynb']


# Preprocessing

## Understanding Data

In [None]:
# Read the raw email dataset from the project folder on Drive
email_dataset = os.path.join(GOOGLE_DRIVE_PATH, 'Dataset/emails.csv')
email_df = pd.read_csv(filepath_or_buffer=email_dataset)

# Optional exploratory checks on the raw data -- uncomment as needed.

# Column overview and non-null counts
#print(email_df.info())

# Data type of each column
#print('\nData types:')
#print(email_df.dtypes, '\n')

# Number of fully duplicated rows
#print('Duplicate values:', email_df.duplicated().sum(), '\n')

# Number of missing values per column
#print('Null values:')
#print(email_df.isnull().sum())

# Class balance of the labels
#print('\nLabel frequencies:')
#print(email_df['Email Type'].value_counts())

## Cleaning data

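* Text converted to lowercase
* Null and duplicate values removed
* Email bodies consisting only of the placeholder text 'empty' removed
* Line breaks, non-letter characters (including digits) and stopwords removed
* Labels converted to numbers ('Safe Email' → 0, 'Phishing Email' → 1)
* Unnamed row-number column dropped
* Index reset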

In [None]:
# Regular expressions for preprocessing
line_break_regex = r'\n'
non_word_regex = r'[^a-zA-Z]+'

# Mapping from the dataset's text labels to numeric class ids
label_to_id = {'Safe Email': 0, 'Phishing Email': 1}

# Remove the unnamed column containing the row number *before* looking for
# duplicates: its value is unique per row, so while it is present no two rows
# ever compare equal and drop_duplicates() removes nothing.
# errors='ignore' tolerates a CSV that was exported without that column.
email_df = email_df.drop(columns=['Unnamed: 0'], errors='ignore')

# Convert all text to lowercase
email_df['Email Text'] = email_df['Email Text'].str.lower()

# Remove null values
email_df = email_df.dropna(axis=0)

# Remove duplicate values (now compared on email text and label only)
email_df = email_df.drop_duplicates()

# Remove empty 'Email Text' cells
# From manual review of the dataset, some Email Text cells are filled with
# 'empty' to indicate no email body is present.
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the previous one.
email_df = email_df[email_df['Email Text'] != 'empty'].copy()

# Remove line breaks
email_df['Email Text'] = email_df['Email Text'].str.replace(line_break_regex,
                                                            ' ', regex=True)
# Remove non-word and digit characters
email_df['Email Text'] = email_df['Email Text'].str.replace(non_word_regex,
                                                            ' ', regex=True)
# Remove stop words (text is already lowercase at this point)
STOPWORDS = set(stopwords.words('english'))
email_df['Email Text'] = email_df['Email Text'].apply(lambda text: ' '.join(
    [word for word in text.split() if word not in STOPWORDS]))

# Convert categorical labels to numerical.
# map() turns any label outside the mapping into NaN, which is reported
# explicitly instead of being carried through as a string.
email_labels = email_df['Email Type'].map(label_to_id)
if email_labels.isna().any():
    unexpected_labels = sorted(
        email_df.loc[email_labels.isna(), 'Email Type'].astype(str).unique())
    raise ValueError(f'Unexpected Email Type values: {unexpected_labels}')
email_df['Email Type'] = email_labels.astype(int)

# Reset index after preprocessing
email_df = email_df.reset_index(drop=True)

print('\nData after preprocessing')
# info() prints its report itself and returns None, so it is not wrapped in
# print() (that would emit a stray "None").
email_df.info()
print()

# Save cleaned dataset
cleaned_df_path = os.path.join(GOOGLE_DRIVE_PATH, 'Dataset/emails_cleaned.csv')
email_df.to_csv(cleaned_df_path, index=False, mode='w')
if os.path.exists(cleaned_df_path):
    print('Cleaned dataset saved at:')
    print(cleaned_df_path)
else:
    print('Couldn\'t save cleaned dataset')

## Tokenisation

In [None]:
# Load cleaned dataset
cleaned_dataset = os.path.join(GOOGLE_DRIVE_PATH, 'Dataset/emails_cleaned.csv')
cleaned_df = pd.read_csv(cleaned_dataset)

# Bodies that became empty during cleaning are read back from the CSV as NaN.
# Replace them with '' so that str() below does not turn them into the literal
# token 'nan'. The rows themselves are kept, so `corpus` stays aligned with
# the labels in cleaned_df.
email_texts = cleaned_df['Email Text'].fillna('')

# WordNet lookups are comparatively slow and the same tokens recur across
# emails, so remember the verdict for each distinct token.
in_wordnet = {}

corpus = []
for text in email_texts:
    tokens = []
    for token in word_tokenize(str(text)):
        if token not in in_wordnet:
            in_wordnet[token] = len(wordnet.synsets(token)) > 0
        # Keep only tokens that have at least one WordNet synset
        if in_wordnet[token]:
            tokens.append(token)
    corpus.append(' '.join(tokens))

# Baseline Models

Baseline models include Decision Tree, Support Vector Machine and Multinomial Naive Bayes

In [None]:
# Binary bag-of-words representation of every email in the corpus
vect = CountVectorizer(binary=True)
vect_corpus = vect.fit_transform(corpus)

# Feature matrix and target labels
X, y = vect_corpus, cleaned_df['Email Type']

# Hold out 20% of the emails for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Baseline classifiers, keyed by the name shown in the printed report
models = {
    'Decision Tree': DecisionTreeClassifier(
        criterion='gini', max_depth=100, max_features=100,
        max_leaf_nodes=50, min_samples_leaf=50, min_samples_split=50,
        random_state=42),
    'SVC': SVC(kernel='rbf', C=3.0, gamma='scale'),
    'MultinomialNB': MultinomialNB(),
}

for model_name, model in models.items():
    # Fit on the training split and report how long it took
    t0 = time.time()
    model.fit(X_train, y_train)
    print(f'Model: {model_name}')
    print(f'Training time: {time.time() - t0:.3f} seconds')

    # Predict labels for the held-out split
    y_pred = model.predict(X_test)

    # Per-class precision / recall / F1 on the held-out split
    print('Classification report:')
    print(classification_report(y_test, y_pred))
    print('=======================================================')

# Generate Word Embeddings

Word embeddings are generated using CBOW Word2Vec with PyTorch

In [None]:
# Number of emails the CBOW model is trained on.
# NOTE: must match the slice of `corpus` passed to train_cbow below.
CBOW_TRAIN_EMAILS = 7240

# Building vocabulary
# Only words from the emails used for CBOW training are included. A word that
# never occurs in the training emails receives no gradient, so its embedding
# would stay at its random initial value and then be saved and averaged into
# the email features as noise.
word_counts = Counter(word for email in corpus[:CBOW_TRAIN_EMAILS]
                      for word in email.split())
#print('\nword_counts:', word_counts)
vocabulary = list(word_counts.keys())
#print('\nvocabulary:', vocabulary)
word_to_index = {word: i for i, word in enumerate(vocabulary)}
#print('\nword_to_index:', word_to_index)

# Hyperparameters
embedding_size = 150   # dimensionality of each word vector
window_size = 2        # context words taken on each side of the target
learning_rate = 0.001
num_epochs = 100
batch_size = 64

class CBOW(nn.Module):
    """Continuous bag-of-words model.

    Looks up an embedding for every context word index, sums the embeddings
    along dimension 0 and projects the sum onto one score per vocabulary word.
    """

    def __init__(self, vocabulary_size, embedding_size):
        """Create the embedding table and the output projection.

        Args:
            vocabulary_size: number of distinct words (rows of the embedding
                table and width of the output layer).
            embedding_size: length of each word vector.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size,
                                      embedding_dim=embedding_size)
        self.linear = nn.Linear(in_features=embedding_size,
                                out_features=vocabulary_size)

    def forward(self, context):
        """Return vocabulary scores for the given context word indices.

        Args:
            context: integer tensor of word indices; the context positions
                are expected along dimension 0, because that is the dimension
                summed over.

        Returns:
            Output of the linear layer applied to the summed embeddings.
        """
        context_vectors = self.embedding(context)
        return self.linear(context_vectors.sum(dim=0))

def train_cbow(corpus, word_to_index, vocabulary, embedding_size, window_size,
               learning_rate, num_epochs, batch_size):
    """Train a CBOW model on ``corpus`` and return it.

    For every position that has ``window_size`` words on both sides, the
    surrounding words form the context and the word at that position is the
    target. Emails with fewer than ``2 * window_size + 1`` words therefore
    contribute no training examples.

    Args:
        corpus: iterable of whitespace-separated email strings.
        word_to_index: mapping from word to its integer index.
        vocabulary: collection of known words; only its length is used, to
            size the model.
        embedding_size: length of each word vector.
        window_size: number of context words taken on each side of the target.
        learning_rate: learning rate passed to the Adam optimiser.
        num_epochs: number of passes over the training examples.
        batch_size: number of examples per batch.

    Returns:
        The trained CBOW model, located on the device used for training.

    Raises:
        ValueError: if ``corpus`` yields no training examples.
        KeyError: if a word used in a training example is missing from
            ``word_to_index``.
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(device)

    corpus_tensors = []
    for email in corpus:
        # Split once per email rather than on every window position
        words = email.split()
        if len(words) <= 2 * window_size:
            # Too short to hold a full window around any position
            continue
        indices = [word_to_index[word] for word in words]
        for i in range(window_size, len(indices) - window_size):
            context = (indices[i - window_size:i] +
                       indices[i + 1:i + window_size + 1])
            corpus_tensors.append((context, indices[i]))

    if not corpus_tensors:
        raise ValueError(
            'No training examples: every email has fewer than '
            f'{2 * window_size + 1} words')

    dataloader = DataLoader(corpus_tensors, batch_size=batch_size, shuffle=True)
    model = CBOW(len(vocabulary), embedding_size).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        start_time = time.time()
        total_loss = 0
        for contexts, targets in dataloader:
            # The default collate function already returns integer tensors:
            # one tensor per context position (each holding a whole batch) and
            # one tensor of targets. Stacking puts the context positions along
            # dimension 0, the dimension CBOW.forward sums over. Re-wrapping
            # them in torch.tensor() would only copy them and emit a warning.
            context_tensor = torch.stack(contexts).to(device)
            target_tensor = targets.to(device)

            optimizer.zero_grad()
            output = model(context_tensor)
            loss = criterion(output, target_tensor)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        epoch_time = time.time() - start_time
        # total_loss is the sum of the per-batch losses for this epoch
        print("Epoch %s - loss=%.4f | time=%.2f seconds"
              % (epoch+1, total_loss, epoch_time))
    return model

# Seed PyTorch so weight initialisation and batch shuffling are repeatable
# from run to run (GPU kernels may still introduce small differences).
torch.manual_seed(42)

# Train CBOW model on the first 7240 emails of the corpus
model = train_cbow(corpus[:7240], word_to_index, vocabulary, embedding_size,
                   window_size, learning_rate, num_epochs, batch_size)

# Collect learned embeddings as a NumPy array (one row per vocabulary word)
word_embeddings = model.embedding.weight.detach().cpu().numpy()

# Save embeddings
def save_embeddings(embeddings, vocabulary, file_path):
    """Write word vectors to a text file, one word per line.

    Each line has the form ``<word> <v1> <v2> ... <vn>``, with the values
    separated by single spaces.

    Args:
        embeddings: indexable collection of vectors; ``embeddings[i]`` is the
            vector for ``vocabulary[i]``.
        vocabulary: iterable of words, in embedding-row order.
        file_path: destination file; overwritten if it already exists.
    """
    # Write as UTF-8 explicitly so the file does not depend on the platform's
    # default encoding and matches how load_word_embeddings reads it back.
    with open(file_path, 'w', encoding='utf-8') as file:
        for i, word in enumerate(vocabulary):
            embedding = ' '.join(map(str, embeddings[i]))
            file.write(f"{word} {embedding}\n")

# Persist the learned word vectors in the project folder on Drive
save_embeddings(embeddings=word_embeddings,
                vocabulary=vocabulary,
                file_path=os.path.join(GOOGLE_DRIVE_PATH,
                                       'word_embeddings.txt'))

# Primary Models

In [None]:
# Generate train/test sets for word embeddings

# Reload the cleaned dataset and locate the saved embeddings file
cleaned_dataset = os.path.join(GOOGLE_DRIVE_PATH, 'Dataset/emails_cleaned.csv')
cleaned_df = pd.read_csv(filepath_or_buffer=cleaned_dataset)
embeddings_path = os.path.join(GOOGLE_DRIVE_PATH, 'word_embeddings.txt')

def load_word_embeddings(embeddings_path):
    """Read word vectors from a text file into a dictionary.

    Each non-blank line is expected to have the form
    ``<word> <v1> <v2> ... <vn>`` with whitespace-separated fields.

    Args:
        embeddings_path: path of the UTF-8 text file to read.

    Returns:
        dict mapping each word to a NumPy array of floats.

    Raises:
        ValueError: if a value field cannot be parsed as a float.
    """
    embeddings = {}
    with open(embeddings_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at the end of the file)
                continue
            word = values[0]
            embeddings[word] = np.array([float(val) for val in values[1:]])
    return embeddings

def aggregate_embeddings(text, embeddings):
    """Return the mean embedding of the known words in ``text``.

    Args:
        text: email body; any value other than ``None``/NaN is converted with
            ``str()`` and split on whitespace. ``None`` and NaN (how pandas
            represents an empty CSV cell) are treated as an empty email.
        embeddings: non-empty dict mapping words to equal-length NumPy vectors.

    Returns:
        NumPy array with the average of the vectors of all words found in
        ``embeddings`` (repeated words count each time), or a zero vector if
        none are found.

    Raises:
        ValueError: if ``embeddings`` is empty, since the vector length cannot
            be determined.
    """
    if not embeddings:
        raise ValueError('embeddings must contain at least one word vector')

    # A missing body must not be stringified: str(nan) is the word 'nan',
    # which would be looked up like any other token.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        words = []
    else:
        words = str(text).split()

    email_embedding = np.zeros(len(next(iter(embeddings.values()))))
    count = 0
    for word in words:
        if word in embeddings:
            email_embedding += embeddings[word]
            count += 1
    if count != 0:
        email_embedding /= count
    return email_embedding

# Load the saved word vectors
embeddings = load_word_embeddings(embeddings_path)

# Represent every email by the mean embedding of its known words
X_embd = np.array([aggregate_embeddings(text, embeddings) for
                   text in cleaned_df['Email Text']])
y = cleaned_df['Email Type']

# Split data into train and test sets
X_train_embd, X_test_embd, y_train_embd, y_test_embd = train_test_split(
    X_embd, y, test_size=0.2, random_state=42)

# Both estimators are randomised; fix their seed (same value as the split and
# the baseline Decision Tree) so the reported scores are repeatable.
models = {'Random Forest' : RandomForestClassifier(random_state=42),
          'MLP' : MLPClassifier(random_state=42)}

for model_name, model in models.items():
    # Train models
    t0 = time.time()
    model.fit(X_train_embd, y_train_embd)
    print(f'{model_name} model')
    print(f'Training time: {time.time() - t0:.3f} seconds')

    # Test models
    y_pred = model.predict(X_test_embd)

    # Generate statistics
    print('Classification report:')
    print(classification_report(y_test_embd, y_pred))
    print('=======================================================')

Random Forest model
Training time: 21.095 seconds
Classification report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      2237
           1       0.94      0.87      0.90      1384

    accuracy                           0.93      3621
   macro avg       0.93      0.92      0.92      3621
weighted avg       0.93      0.93      0.93      3621

MLP model
Training time: 25.242 seconds
Classification report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97      2237
           1       0.95      0.93      0.94      1384

    accuracy                           0.96      3621
   macro avg       0.96      0.95      0.95      3621
weighted avg       0.96      0.96      0.96      3621

