<a href="https://colab.research.google.com/github/sohamtalukdar/Enron-Email-Analysis/blob/main/Vlabs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!mkdir "/content/drive/MyDrive/maildir/"
!tar -xzvf "/content/drive/MyDrive/enron_with_categories.tar.gz" -C "/content/drive/MyDrive/maildir/"

In [None]:
import os
import csv
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk.stem as stemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.stem import SnowballStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import gensim
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#Map folder names to label values
labels = {
    "1": "Company Business, Strategy, etc.",
    "2": "Purely Personal",
    "3": "Personal but in professional context",
    "4": "Logistic Arrangements",
    "5": "Status arrangements",
    "6": "Document editing/checking",
    "7": "Empty message (due to missing attachment)",
    "8": "Empty message"
}

root_directory = '/content/drive/MyDrive/maildir/enron_with_categories/'
rows = []

for folder_name in os.listdir(root_directory):
    folder_path = os.path.join(root_directory, folder_name)
    if os.path.isdir(folder_path):
        label = labels[folder_name]
        for filename in os.listdir(folder_path):
            if filename.endswith(".txt"):
                with open(os.path.join(folder_path, filename), 'r') as f:
                    data = f.read()
                    message_body = data.split("\n\n")[-1]
                    number = int(filename.split(".")[0])
                    rows.append({"#": number,"Label": label,"Message": message_body})

In [None]:
df = pd.DataFrame(rows)
# Drop rows with "Empty message (due to missing attachment)" or "Empty message" labels
df = df[df['Label'] != 'Empty message (due to missing attachment)']
df = df[df['Label'] != 'Empty message']

# Drop rows with empty Message column
df = df.dropna(subset=['Message'])
df = df[df['Message'] != '']

# Remove duplicates
df.drop_duplicates(inplace=True)

# Clean the email message
def clean_text(text):
    # Remove special characters and digits
    text = re.sub(r'[^\w\s]|\d', '', text)
    text = text.lower()
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    tokens = word_tokenize(text)
    # Stem the words
    # Initialize the stemmer
    stemmer = SnowballStemmer("english")
    text = [stemmer.stem(word) for word in tokens]
    #text = [stemmer.stem(text) for text in text]
    # Remove punctuation and stop words
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return " ".join(tokens)
df['Message'] = df['Message'].apply(lambda x: clean_text(x))
df

Unnamed: 0,#,Label,Message
1,54659,Purely Personal,hormone hostage knows days month man open mout...
5,176665,Purely Personal,
6,9176,Purely Personal,call nbcs cant see tv cheap rating ploy works ...
7,174265,Purely Personal,testing notes capabilities enron home computer
8,54544,Purely Personal,kevin moore
...,...,...,...
1696,174395,"Company Business, Strategy, etc.",please someone contact cell telephone address ...
1697,174396,"Company Business, Strategy, etc.",
1698,174397,"Company Business, Strategy, etc.",
1700,174400,"Company Business, Strategy, etc.",


In [None]:
# Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Message'])

# Normalization
scaler = StandardScaler(with_mean=False)
X = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, df['Label'], test_size=0.2, random_state=0)

In [None]:
# """
# Naive Bayes
# """
# # Train the model
# nb = MultinomialNB()
# nb.fit(X_train, y_train)

# # Predict on the test set
# y_pred = nb.predict(X_test)

# # Evaluate the model
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
#Random Forest

# Train a Random Forest classifier on the training data
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the performance of the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.6153846153846154
                                      precision    recall  f1-score   support

    Company Business, Strategy, etc.       0.62      0.90      0.74       133
           Document editing/checking       0.17      0.07      0.10        14
               Logistic Arrangements       0.74      0.36      0.49        77
Personal but in professional context       0.00      0.00      0.00        12
                     Purely Personal       1.00      0.33      0.50         6
                 Status arrangements       0.33      0.20      0.25         5

                            accuracy                           0.62       247
                           macro avg       0.48      0.31      0.35       247
                        weighted avg       0.60      0.62      0.57       247



In [None]:
# #Support Vector Machine


# # Train a SVM classifier on the training data
# clf = SVC(kernel='linear', C=1, random_state=42)
# clf.fit(X_train, y_train)

# # Make predictions on the test data
# y_pred = clf.predict(X_test)

# # Evaluate the performance of the classifier
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))


In [None]:
# Tokenize the messages
tokenized_docs = [word_tokenize(text) for text in df['Message']]

# Train the Word2Vec model
model = Word2Vec(tokenized_docs, size=100, window=5, min_count=1, workers=4)

# Get the average word vectors for each message
message_vectors = []
for doc in tokenized_docs:
    if len(doc) == 0:
        vector = 0
    else:
        vector = sum([model.wv[word] for word in doc])/len(doc)
    message_vectors.append(vector)

# Convert the list of vectors into a pandas DataFrame
message_vectors = [message_vectors]
message_vectors = pd.DataFrame(message_vectors)

# Combine the message vectors and the labels into one DataFrame
df_vectors = pd.concat([message_vectors, df['Label']], axis=1)




In [None]:
# Import the necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(message_vectors, df['Label'], test_size=0.2, random_state=0)


# Fit the Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Fit the SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Predict the labels for the test set
nb_pred = nb_model.predict(X_test)
svm_pred = svm_model.predict(X_test)

# Evaluate the performance of the models
from sklearn.metrics import accuracy_score

nb_score = accuracy_score(y_test, nb_pred)
svm_score = accuracy_score(y_test, svm_pred)

print("Naive Bayes accuracy: ", nb_score)
print("SVM accuracy: ", svm_score)


ValueError: ignored