<a href="https://colab.research.google.com/github/sohamtalukdar/Enron-Email-Analysis/blob/main/Enron.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Directory inside the Colab VM under which Google Drive is exposed.
DRIVE_MOUNT_POINT = '/content/drive'

# Mounting prompts for authorization.
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!mkdir "/content/drive/MyDrive/maildir/"
!tar -xzvf "/content/drive/MyDrive/enron_with_categories.tar.gz" -C "/content/drive/MyDrive/maildir/"

mkdir: cannot create directory ‘/content/drive/MyDrive/maildir/’: File exists
enron_with_categories/
enron_with_categories/1/
enron_with_categories/1/114715.txt
enron_with_categories/1/114715.cats
enron_with_categories/1/229405.txt
enron_with_categories/1/229405.cats
enron_with_categories/1/232795.txt
enron_with_categories/1/232795.cats
enron_with_categories/1/62815.txt
enron_with_categories/1/62815.cats
enron_with_categories/1/118871.txt
enron_with_categories/1/118871.cats
enron_with_categories/1/106588.txt
enron_with_categories/1/106588.cats
enron_with_categories/1/122973.txt
enron_with_categories/1/122973.cats
enron_with_categories/1/106590.txt
enron_with_categories/1/106590.cats
enron_with_categories/1/139359.txt
enron_with_categories/1/139359.cats
enron_with_categories/1/136551.txt
enron_with_categories/1/136551.cats
enron_with_categories/1/98429.txt
enron_with_categories/1/98429.cats
enron_with_categories/1/221314.txt
enron_with_categories/1/221314.cats
enron_with_categories/1/54

In [None]:
import os
import csv
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk.stem as stemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.stem import SnowballStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import gensim
from gensim.models import Word2Vec

In [None]:
class Preprocessor:
    """Build a cleaned, labelled DataFrame from the categorised e-mail folders.

    Every sub-directory of ``root_directory`` whose name is a key of
    ``labels`` is scanned; each ``*.txt`` file in it becomes one row with the
    columns ``#`` (integer taken from the file name), ``Label`` and
    ``Message``. The result is stored in ``self.df``.
    """

    def __init__(self):
        # Filled in by preprocess_data(); stays None until that has run.
        self.df = None
        # Folder name -> human-readable label written to the "Label" column.
        self.labels = {
            "1": "Company Business, Strategy, etc.",
            "2": "Purely Personal",
            "3": "Personal but in professional context",
            "4": "Logistic Arrangements",
            "5": "Status arrangements",
            "6": "Document editing/checking",
            "7": "Empty message (due to missing attachment)",
            "8": "Empty message"
        }
        self.root_directory = '/content/drive/MyDrive/maildir/enron_with_categories/'

    @staticmethod
    def _extract_body(raw):
        """Return the text that follows the first blank line of ``raw``.

        Assumes the files are laid out as headers, a blank line, then the
        body (TODO confirm against the data set). Splitting on the *first*
        blank line keeps multi-paragraph bodies intact instead of only their
        final paragraph. If ``raw`` has no blank line it is returned as is.
        """
        _headers, separator, body = raw.partition("\n\n")
        return body if separator else raw

    @staticmethod
    def _strip_noise(text):
        """Lower-case ``text`` and remove e-mail addresses, URLs, punctuation and digits."""
        text = text.lower()
        # Addresses and URLs have to be removed while "@", "." and "/" are
        # still present; after punctuation stripping they are unrecognisable.
        text = re.sub(r'\S+@\S+', '', text)
        text = re.sub(r'http\S+', '', text)
        return re.sub(r'[^\w\s]|\d', '', text)

    @staticmethod
    def _normalize_tokens(text, stop_words, stemmer):
        """Tokenise ``text``, drop non-alphabetic tokens and stop words, stem the rest.

        Args:
            text: Already lower-cased, noise-free text.
            stop_words: Container of words to discard.
            stemmer: Object exposing ``stem(word)``.

        Returns:
            The surviving stems joined by single spaces.
        """
        return " ".join(
            stemmer.stem(token)
            for token in word_tokenize(text)
            if token.isalpha() and token not in stop_words
        )

    def preprocess_data(self):
        """Read all labelled e-mails, clean their text and store the frame in ``self.df``.

        Steps: read every ``*.txt`` file of each labelled folder, drop the two
        "Empty message" labels, drop missing/empty messages and duplicate rows,
        clean the text, and finally drop rows whose message is empty after
        cleaning. The resulting frame is shown with ``display``.

        Raises:
            ValueError: If a ``*.txt`` file name does not start with an integer.
            OSError: If ``root_directory`` or one of its files cannot be read.
        """
        rows = []

        # Sorted traversal: os.listdir order is arbitrary, and the row order
        # decides which rows a seeded train/test split selects.
        for folder_name in sorted(os.listdir(self.root_directory)):
            folder_path = os.path.join(self.root_directory, folder_name)
            # Ignore stray files and directories that have no label.
            if folder_name not in self.labels or not os.path.isdir(folder_path):
                continue
            label = self.labels[folder_name]
            for filename in sorted(os.listdir(folder_path)):
                if not filename.endswith(".txt"):
                    continue
                file_path = os.path.join(folder_path, filename)
                # errors='replace' keeps one undecodable byte from aborting the whole load.
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    data = f.read()
                rows.append({
                    "#": int(filename.split(".")[0]),
                    "Label": label,
                    "Message": self._extract_body(data),
                })

        # Explicit columns keep the filters below valid when no file was found.
        df = pd.DataFrame(rows, columns=["#", "Label", "Message"])

        # Drop rows with "Empty message (due to missing attachment)" or "Empty message" labels
        df = df[df['Label'] != 'Empty message (due to missing attachment)']
        df = df[df['Label'] != 'Empty message']

        # Drop rows with empty Message column
        df = df.dropna(subset=['Message'])
        df = df[df['Message'] != '']

        # Remove duplicates
        # NOTE(review): "#" comes from the file name, so full-row duplicates look
        # unlikely; confirm whether de-duplicating on "Message" was intended.
        df = df.drop_duplicates()

        # Clean the email message. The stop-word set and the stemmer are built
        # once here rather than once per token.
        stop_words = frozenset(stopwords.words("english"))
        stemmer = SnowballStemmer("english")
        df = df.assign(
            Message=df['Message'].map(
                lambda text: self._normalize_tokens(self._strip_noise(text), stop_words, stemmer)
            )
        )

        # Cleaning can reduce a message to nothing; such rows carry no features.
        df = df[df['Message'] != '']

        self.df = df
        display(self.df)

    def display(self):
        """Return the processed DataFrame (``None`` before ``preprocess_data`` has run)."""
        return self.df

    def get_dataframe(self):
        """Return the processed DataFrame (``None`` before ``preprocess_data`` has run)."""
        return self.df



In [None]:
# Build the cleaned corpus once, then keep two handles to the resulting frame.
preprocessor = Preprocessor()
preprocessor.preprocess_data()
df_model, display_df = preprocessor.get_dataframe(), preprocessor.display()

In [None]:


# Split the raw text first so that the TF-IDF vocabulary/IDF weights and the
# scaler statistics are learned from the training rows only. Fitting them on
# the full corpus leaks test-set information into the features.
# test_size and random_state are unchanged, so the same rows land in each split.
messages_train, messages_test, y_train, y_test = train_test_split(
    df_model['Message'], df_model['Label'], test_size=0.2, random_state=0
)

# Vectorization: fit on the training messages, reuse the fitted vocabulary for the test messages
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(messages_train)
X_test = vectorizer.transform(messages_test)

# Normalization: with_mean=False because the TF-IDF matrices are sparse
scaler = StandardScaler(with_mean=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Feature matrix for the whole corpus, kept under its original name and
# produced with the train-fitted transformers.
X = scaler.transform(vectorizer.transform(df_model['Message']))

def run_model(clf, name):
    """Fit ``clf`` on the training split and print its scores on the test split.

    Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test`` from the
    notebook's global namespace.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        name: Heading printed above the metrics.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    print(f"\n{name} Results:")
    # Each metric is computed right before it is printed, in this order.
    for heading, metric in (
        ("Accuracy:", accuracy_score),
        ("\nClassification Report:\n", classification_report),
    ):
        print(heading, metric(y_test, predictions))

# Train and report on each classifier in turn: Naive Bayes, Random Forest,
# then a linear-kernel Support Vector Machine.
for _clf, _name in (
    (MultinomialNB(), "Naive Bayes"),
    (RandomForestClassifier(n_estimators=100, random_state=42), "Random Forest"),
    (SVC(kernel='linear', C=1, random_state=42), "Support Vector Machine"),
):
    run_model(_clf, _name)

