<a href="https://colab.research.google.com/github/sohamtalukdar/Enron-Email-Analysis/blob/main/Vlabs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
from google.colab import drive
# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!mkdir "/content/drive/MyDrive/maildir/"
!tar -xzvf "/content/drive/MyDrive/enron_with_categories.tar.gz" -C "/content/drive/MyDrive/maildir/"

In [49]:
import os
import csv
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk.stem as stemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.stem import SnowballStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import gensim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
#Map folder names to label values
labels = {
    "1": "Company Business, Strategy, etc.",
    "2": "Purely Personal",
    "3": "Personal but in professional context",
    "4": "Logistic Arrangements",
    "5": "Status arrangements",
    "6": "Document editing/checking",
    "7": "Empty message (due to missing attachment)",
    "8": "Empty message"
}

root_directory = '/content/drive/MyDrive/maildir/enron_with_categories/'
rows = []

for folder_name in os.listdir(root_directory):
    folder_path = os.path.join(root_directory, folder_name)
    if os.path.isdir(folder_path):
        label = labels[folder_name]
        for filename in os.listdir(folder_path):
            if filename.endswith(".txt"):
                with open(os.path.join(folder_path, filename), 'r') as f:
                    data = f.read()
                    message_body = data.split("\n\n")[-1]
                    number = int(filename.split(".")[0])
                    rows.append({"#": number,"Label": label,"Message": message_body})

df = pd.DataFrame(rows)

# Clean the email message
df['Message'] = df['Message'].apply(lambda x: re.sub(r'[^\w\s]|\d', '', x).lower())
df['Message'] = df['Message'].apply(lambda x: re.sub(r'\S+@\S+', '', x))
df['Message'] = df['Message'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Message'] = df['Message'].apply(lambda x: " ".join([word for word in word_tokenize(x) if word.isalpha() and word not in set(stopwords.words("english"))]))
df['Message'] = df['Message'].apply(lambda x: " ".join([SnowballStemmer("english").stem(word) for word in x.split()]))



# Drop rows with "Empty message (due to missing attachment)" or "Empty message" labels
df = df[df['Label'] != 'Empty message (due to missing attachment)']
df = df[df['Label'] != 'Empty message']

# Drop rows with empty Message column
df = df.dropna(subset=['Message'])
df = df[df['Message'] != '']

# Remove duplicates
df.drop_duplicates(inplace=True)



In [40]:
df.shape
df

Unnamed: 0,#,Label,Message
1,54659,Purely Personal,hormon hostag know day month man open mouth ta...
6,9176,Purely Personal,call nbcs cant see tv cheap rate ploy work the...
7,174265,Purely Personal,test note capabl enron home comput
8,54544,Purely Personal,kevin moor
9,54545,Purely Personal,spoke friend ibm san jose offic ibm fellow adv...
...,...,...,...
1695,201643,"Company Business, Strategy, etc.",srp epng settlecvrdoc srp epng capac proposald...
1698,178056,"Company Business, Strategy, etc.",embed pictur devic independ bitmap
1699,9167,"Company Business, Strategy, etc.",perspect simpl pie face protest organ sophist ...
1700,9177,"Company Business, Strategy, etc.",jeff prepar attach chart captur idea discuss l...


In [42]:
# Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Message'])

# Normalization
scaler = StandardScaler(with_mean=False)
X = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, df['Label'], test_size=0.2, random_state=0)

In [46]:
"""
Naive Bayes
"""
# Train the model
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Predict on the test set
y_pred = nb.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.2914285714285714

Classification Report:
                                       precision    recall  f1-score   support

    Company Business, Strategy, etc.       0.79      0.29      0.43        89
           Document editing/checking       0.05      0.14      0.07         7
               Logistic Arrangements       0.62      0.30      0.40        61
Personal but in professional context       0.08      0.25      0.12         8
                     Purely Personal       0.00      0.00      0.00         4
                 Status arrangements       0.07      0.67      0.12         6

                            accuracy                           0.29       175
                           macro avg       0.27      0.27      0.19       175
                        weighted avg       0.62      0.29      0.37       175



In [44]:
#Random Forest

# Train a Random Forest classifier on the training data
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the performance of the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6171428571428571
                                      precision    recall  f1-score   support

    Company Business, Strategy, etc.       0.62      0.93      0.74        89
           Document editing/checking       0.00      0.00      0.00         7
               Logistic Arrangements       0.65      0.39      0.49        61
Personal but in professional context       0.00      0.00      0.00         8
                     Purely Personal       0.00      0.00      0.00         4
                 Status arrangements       0.33      0.17      0.22         6

                            accuracy                           0.62       175
                           macro avg       0.27      0.25      0.24       175
                        weighted avg       0.55      0.62      0.56       175



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
#Support Vector Machine


# Train a SVM classifier on the training data
clf = SVC(kernel='linear', C=1, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the performance of the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.6114285714285714
                                      precision    recall  f1-score   support

    Company Business, Strategy, etc.       0.61      0.90      0.73        89
           Document editing/checking       0.00      0.00      0.00         7
               Logistic Arrangements       0.65      0.43      0.51        61
Personal but in professional context       0.00      0.00      0.00         8
                     Purely Personal       0.00      0.00      0.00         4
                 Status arrangements       0.33      0.17      0.22         6

                            accuracy                           0.61       175
                           macro avg       0.27      0.25      0.24       175
                        weighted avg       0.55      0.61      0.56       175



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [50]:

# Get the messages as a list of lists of words
messages = df['Message'].apply(lambda x: x.split()).tolist()

# Train the CBOW word2vec model
model = Word2Vec(messages, sg=0, size=100, window=5, min_count=1, negative=10, iter=100, seed=0)

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=0)

# Get the train and test labels and messages
train_labels = train_df['Label'].tolist()
train_messages = train_df['Message'].apply(lambda x: x.split()).tolist()
test_labels = test_df['Label'].tolist()
test_messages = test_df['Message'].apply(lambda x: x.split()).tolist()

# Convert the messages to average word vectors
train_vectors = np.zeros((len(train_messages), 100))
for i, message in enumerate(train_messages):
    vectors = np.zeros((100,))
    for word in message:
        if word in model.wv:
            vectors += model.wv[word]
    vectors = vectors / len(message)
    train_vectors[i] = vectors

test_vectors = np.zeros((len(test_messages), 100))
for i, message in enumerate(test_messages):
    vectors = np.zeros((100,))
    for word in message:
        if word in model.wv:
            vectors += model.wv[word]
    vectors = vectors / len(message)
    test_vectors[i] = vectors

# Train a classifier on the train data
classifier = SVC(kernel='linear', C=1)
classifier.fit(train_vectors, train_labels)

# Predict the labels for the test data
predictions = classifier.predict(test_vectors)

# Evaluate the model using metrics such as accuracy, precision, recall, and F1-score
accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions, average='weighted', zero_division=0)
recall = recall_score(test_labels, predictions, average='weighted')
f1_score = f1_score(test_labels, predictions, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)


Accuracy: 0.6228571428571429
Precision: 0.5433916038188238
Recall: 0.6228571428571429
F1-Score: 0.5776587070471754
