# **ARTI 502 – Deep Learning Project**




-------



# **Step 1: Importing Libraries**

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
from nltk import pos_tag
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense, GRU
import numpy as np
from sklearn.model_selection import train_test_split
from keras.layers import Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns

# Download NLTK resources.
# Recent NLTK releases (3.9 and later) look up the tokenizer under 'punkt_tab'
# and the POS tagger under 'averaged_perceptron_tagger_eng'; the legacy names
# are still requested so that older installations keep working.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# Fix the random seeds so that weight initialisation is repeatable when the
# notebook is executed again from a fresh kernel.
tf.random.set_seed(42)
np.random.seed(42)

In [2]:
# Read the e-mail dataset and cast the 'Email' column to str in one step,
# so every later text-processing call receives a string.
df = pd.read_csv('emails.csv').astype({'Email': str})

In [None]:
# Preview the first rows of the loaded DataFrame
df.head()

# **Step 2: Exploratory Data Analysis (EDA)**

In [None]:
# Column names, dtypes and non-null counts
df.info()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Summary statistics produced by DataFrame.describe() with its default arguments
df.describe()

In [None]:
# Missing values per column.
# NOTE(review): 'Email' was cast with astype(str) when the data was loaded, so
# originally-missing e-mails may already be the text 'nan' and not be counted
# here - confirm against the raw CSV if that matters.
df.isnull().sum()

In [None]:
# Number of e-mails per department label
print(df['Department'].value_counts())

In [None]:
# Bar chart of how many e-mails carry each department label
department_distribution = df['Department'].value_counts()
fig, ax = plt.subplots()
department_distribution.plot.bar(ax=ax, title='Distribution of Department')
plt.show()

In [None]:
# Word Count Distribution
# Number of NLTK tokens in each e-mail, stored as a new 'WordCount' column
df['WordCount'] = [len(word_tokenize(str(text))) for text in df['Email']]

# One semi-transparent histogram per department
word_counts_by_department = df.groupby('Department')['WordCount']
word_count_distribution = word_counts_by_department.plot(kind='hist', alpha=0.5, legend=True)
plt.title('Word Count Distribution by Department')
plt.show()

# **Step 3: Data Preprocessing using NLP**


* Conversion of POS Tags
* Removing Greetings
* Removing Special Characters
* Removing Common Email Signatures
* Converting to Lowercase
* Tokenization
* Part-of-Speech Tagging
* Lemmatization and Stemming
* Removing Stopwords
* Removing Specific Words



In [4]:
# Translate an NLTK (Penn Treebank) POS tag into the single-letter code accepted by WordNetLemmatizer
def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to a WordNet part-of-speech letter.

    Only the first character of the tag is inspected.

    Args:
        treebank_tag: Tag string as produced by ``pos_tag`` (e.g. 'NN', 'VBD').

    Returns:
        'a' (adjective), 'v' (verb), 'n' (noun) or 'r' (adverb). Any tag that
        does not start with J, V, N or R - including an empty string - yields 'n'.
    """
    first_letter_to_wordnet = {
        'J': 'a',  # Adjective
        'V': 'v',  # Verb
        'N': 'n',  # Noun
        'R': 'r',  # Adverb
    }
    # Slicing keeps the empty string safe; unknown prefixes default to noun
    return first_letter_to_wordnet.get(treebank_tag[:1], 'n')

# Function to clean and process email text

# Compiled once when the cell runs instead of on every call.
# Greeting phrase followed by the next two or three words (typically the recipient's name).
_GREETING_PATTERN = re.compile(r'\b(?:Hi|Hello|Dear|Greetings|Good morning|Good evening|Good day|I hope this email finds you well|I trust this email finds you well|I hope you are doing well)\b\s+(\w+\s+)(\w+\s+)?(\w+)', flags=re.IGNORECASE)
# Sign-off keyword and everything after it up to the end of that line.
_SIGNATURE_PATTERN = re.compile(r'\b(thanks|thank you|regards|cheers|sincerely|best regards|kind regards|warm regards|with regards)\b.*', flags=re.IGNORECASE)
# Boilerplate words that must not reach the model.
_BOILERPLATE_WORDS = ('email', 'finds', 'well')
# Lazily filled cache for the NLTK helpers (see _get_nlp_resources).
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return (lemmatizer, stemmer, stop_words), creating them on the first call only."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES.update(
            lemmatizer=WordNetLemmatizer(),
            stemmer=PorterStemmer(),
            stop_words=frozenset(stopwords.words('english')),
        )
    return (
        _NLP_RESOURCES['lemmatizer'],
        _NLP_RESOURCES['stemmer'],
        _NLP_RESOURCES['stop_words'],
    )


def clean_and_process_email(email_text):
    """Normalise one e-mail into a space-separated string of stemmed tokens.

    Steps, in order: remove greeting phrases (plus the two or three words that
    follow them), remove every character that is not an ASCII letter or
    whitespace, remove sign-off keywords together with the rest of their line,
    lowercase, tokenize, POS-tag, drop English stop words and boilerplate
    words, lemmatize with the POS tag and finally Porter-stem.

    Args:
        email_text: Raw e-mail body as a string.

    Returns:
        The surviving tokens joined by single spaces (may be an empty string).
    """
    lemmatizer, stemmer, stop_words = _get_nlp_resources()

    # Strip greetings, then special characters, then signatures
    processed_email = _GREETING_PATTERN.sub('', email_text)
    processed_email = re.sub(r'[^a-zA-Z\s]', '', processed_email)
    processed_email = _SIGNATURE_PATTERN.sub('', processed_email)

    # Convert to lowercase
    processed_email = processed_email.lower()

    # Tokenize the email and attach a POS tag to every token
    tagged_tokens = pos_tag(word_tokenize(processed_email))

    preprocessed_tokens = []
    for token, tag in tagged_tokens:
        # Boilerplate words are checked on the raw token as well: stemming turns
        # 'finds' into 'find', so a check on the stemmed form alone never removes it.
        if token in stop_words or token in _BOILERPLATE_WORDS:
            continue
        stemmed = stemmer.stem(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
        # Also catches inflected forms whose stem is a boilerplate word (e.g. 'emails')
        if stemmed not in _BOILERPLATE_WORDS:
            preprocessed_tokens.append(stemmed)

    # Join tokens into a single string for the tokenizer
    return ' '.join(preprocessed_tokens)

In [5]:
# Run the cleaning pipeline on every e-mail and keep the result in a new column
df['Processed_Email'] = df['Email'].map(clean_and_process_email)



* Word2Vec Word Embedding



In [6]:
# Build the word index from the cleaned e-mails (+1 because index 0 is reserved for padding)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Processed_Email'])
vocab_size = len(tokenizer.word_index)+1

# Convert the texts to sequences
sequences = tokenizer.texts_to_sequences(df['Processed_Email'])

# Pad the sequences to the length of the longest one. The computed value is used
# instead of a hard-coded number so the padded width always matches the data.
maxlen = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')

# Word2Vec expects an iterable of token lists. 'Processed_Email' holds
# space-joined strings, and passing those directly makes gensim iterate over
# single characters, so they are split back into words first.
tokenized_emails = [text.split() for text in df['Processed_Email']]
# NOTE(review): gensim documents that a fixed seed is only fully reproducible
# with a single worker thread - confirm whether workers=4 is acceptable here.
word2vec_model = Word2Vec(sentences=tokenized_emails, vector_size=30, window=3, min_count=2, workers=4, seed = 25)
word2vec_model.save("word2vec.model")



In [7]:
def target(x):
  """Return the integer class id for a department name.

  'HR' -> 0, 'IT' -> 1, 'Customer Service' -> 2. Any other value yields None.
  """
  label_codes = (('HR', 0), ('IT', 1), ('Customer Service', 2))
  for label, code in label_codes:
    if x == label:
      return code
  return None

In [8]:
# Encode the department names as integer class ids.
# The numeric-dtype guard keeps this cell idempotent: re-running it on labels
# that are already encoded would otherwise map every value to NaN.
if not pd.api.types.is_numeric_dtype(df['Department']):
    encoded_departments = df['Department'].map(target)
    # Labels that target() does not know come back as missing values; fail
    # loudly instead of feeding NaN class ids to the models.
    unknown_departments = df.loc[encoded_departments.isnull(), 'Department'].unique()
    if len(unknown_departments) > 0:
        raise ValueError(f"Unmapped Department values: {list(unknown_departments)}")
    df['Department'] = encoded_departments

In [9]:
# Fit a tokenizer on the cleaned e-mails and turn each e-mail into a list of word indices
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Processed_Email'])
sequences = tokenizer.texts_to_sequences(df['Processed_Email'])
word_index = tokenizer.word_index
max_sequence_length = max(map(len, sequences))

# Pad to the longest sequence using pad_sequences' default padding/truncating mode
data = pad_sequences(sequences, maxlen=max_sequence_length)

# Embedding matrix: row i receives the Word2Vec vector of the word whose
# tokenizer index is i. Row 0 and the rows of words that Word2Vec does not
# know keep their initial zeros.
embedding_dim = 30
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    try:
        embedding_matrix[i] = word2vec_model.wv[word]
    except KeyError:
        # Word absent from the Word2Vec vocabulary
        continue

# **Step 4: Model Development and Training**

> ## **BiGRU Model**



In [None]:
num_classes = 3

# Stacked bidirectional GRU classifier on top of the Word2Vec-initialised,
# trainable embedding layer
gru_layers = [
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_sequence_length,
        trainable=True,
    ),
    Bidirectional(GRU(64, return_sequences=True)),
    Bidirectional(GRU(128, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(GRU(64, return_sequences=False)),
    Dense(num_classes, activation = 'softmax'),
]
model = Sequential(gru_layers)

# Integer class ids are used as targets, hence the sparse categorical loss
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
    restore_best_weights=True,
    mode='min',
)

# Hold out 20% of the samples for the final test
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    df['Department'],
    test_size=0.2,
    random_state=42,
)

# Train; 10% of the training portion is used for validation
history = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=40,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# Print the layer-by-layer summary of the BiGRU model
model.summary()



> ## **BiLSTM Model**





In [None]:
num_classes = 3

# Stacked bidirectional LSTM classifier on top of the Word2Vec-initialised,
# trainable embedding layer
lstm_layers = [
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_sequence_length,
        trainable=True,
    ),
    Bidirectional(LSTM(32, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32, return_sequences=False)),
    Dense(num_classes, activation = 'softmax'),
]
model2 = Sequential(lstm_layers)

# Integer class ids are used as targets, hence the sparse categorical loss
model2.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
    restore_best_weights=True,
    mode='min',
)

# Hold out 20% of the samples for the final test
X2_train, X2_test, y2_train, y2_test = train_test_split(
    padded_sequences,
    df['Department'],
    test_size=0.2,
    random_state=42,
)

# Train; 10% of the training portion is used for validation
history2 = model2.fit(
    X2_train,
    y2_train,
    batch_size=16,
    epochs=100,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# Print the layer-by-layer summary of the BiLSTM model
model2.summary()

# **Step 5: Model Evaluation**

> ## **BiGRU Model**


In [None]:
# Loss and accuracy on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

# Class probabilities -> predicted class id (index of the largest probability)
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)

# Support-weighted averages across the classes
recall, precision, f1 = (
    scorer(y_test, y_pred_labels, average='weighted')
    for scorer in (recall_score, precision_score, f1_score)
)

print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1-Score: {f1}")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred_labels)
print("Confusion Matrix:")
print(cm)

In [None]:
# Training vs. validation accuracy per epoch for the BiGRU model
fig, ax = plt.subplots()
for history_key, curve_label in (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    ax.plot(history.history[history_key], label=curve_label)
ax.set(title='Training and Validation Accuracy', xlabel='Epoch', ylabel='Accuracy')
ax.legend()
plt.show()

In [None]:
# Heatmap of the BiGRU confusion matrix (rows: true class, columns: predicted class)
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

In [None]:
# Bar chart of the four headline scores of the BiGRU model
metrics = dict(
    zip(
        ('Accuracy', 'Precision', 'Recall', 'F1-Score'),
        (accuracy, precision, recall, f1),
    )
)
metric_names = list(metrics)
metric_values = list(metrics.values())
fig, ax = plt.subplots(figsize=(7, 6))
ax.bar(metric_names, metric_values, color=['blue', 'green', 'orange', 'cyan'])
ax.set_title('Model Performance Metrics')
ax.set_ylabel('Score')
plt.show()

> ## **BiLSTM Model**

In [None]:
# Loss and accuracy on the held-out test split
loss2, accuracy2 = model2.evaluate(X2_test, y2_test)
print(f"Test Loss: {loss2}, Test Accuracy: {accuracy2}")

# Class probabilities -> predicted class id (index of the largest probability)
y2_pred = model2.predict(X2_test)
y2_pred_labels = np.argmax(y2_pred, axis=1)

# Support-weighted averages across the classes
recall2, precision2, f12 = (
    scorer(y2_test, y2_pred_labels, average='weighted')
    for scorer in (recall_score, precision_score, f1_score)
)

print(f"Recall: {recall2}")
print(f"Precision: {precision2}")
print(f"F1-Score: {f12}")

# Confusion matrix
cm2 = confusion_matrix(y2_test, y2_pred_labels)
print("Confusion Matrix:")
print(cm2)

In [None]:
# Training vs. validation accuracy per epoch for the BiLSTM model
fig, ax = plt.subplots()
for history_key, curve_label in (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    ax.plot(history2.history[history_key], label=curve_label)
ax.set(title='Training and Validation Accuracy', xlabel='Epoch', ylabel='Accuracy')
ax.legend()
plt.show()

In [None]:
# Heatmap of the BiLSTM confusion matrix (rows: true class, columns: predicted class)
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(cm2, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

In [None]:
# Bar chart of the four headline scores of the BiLSTM model
metrics = dict(
    zip(
        ('Accuracy', 'Precision', 'Recall', 'F1-Score'),
        (accuracy2, precision2, recall2, f12),
    )
)
metric_names = list(metrics)
metric_values = list(metrics.values())
fig, ax = plt.subplots(figsize=(7, 6))
ax.bar(metric_names, metric_values, color=['blue', 'green', 'orange', 'cyan'])
ax.set_title('Model Performance Metrics')
ax.set_ylabel('Score')
plt.show()