In [3]:
import os
import csv
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import string

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Soham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Soham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Map folder names to label values
labels = {
    "1": "Company Business, Strategy, etc.",
    "2": "Purely Personal",
    "3": "Personal but in professional context",
    "4": "Logistic Arrangements",
    "5": "Status arrangements",
    "6": "Document editing/checking",
    "7": "Empty message (due to missing attachment)",
    "8": "Empty message"
}

root_directory = 'enron_with_categories/'
rows = []

for folder_name in os.listdir(root_directory):
    folder_path = os.path.join(root_directory, folder_name)
    if os.path.isdir(folder_path):
        label = labels[folder_name]
        for filename in os.listdir(folder_path):
            if filename.endswith(".txt"):
                with open(os.path.join(folder_path, filename), 'r') as f:
                    data = f.read()
                    message_body = data.split("\n\n")[-1]
                    number = int(filename.split(".")[0])
                    rows.append({"Message": message_body, "Label": label, "#": number})

# Load the data into a pandas DataFrame
df = pd.DataFrame(rows, columns=['#', 'Label', 'Message'])

# Handle missing values
df.fillna(df.mean(), inplace=True)

# Remove duplicates
df.drop_duplicates(inplace=True)

# Encoding categorical variables
df = pd.get_dummies(df, columns=["Label"])

# Dropping rows with NaN values
df.dropna(subset=["Message"], inplace=True)
df = df[~df['Message'].isin([np.inf, -np.inf])]

# Check for NaN values
print(df[df.isnull().any(axis=1)])

# Check for infinite values
print(df.replace([np.inf, -np.inf], np.nan).isnull().sum().sum())

# Download stopwords and stemmer
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
punctuation = string.punctuation

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Tokenize the text
    words = word_tokenize(text)
    
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    
    # Stem the words
    words = [stemmer.stem(word) for word in words]
    
    # Rejoin the words into a single string
    text = " ".join(words)
    
    return text

df["Message"] = df["Message"].apply(preprocess_text)

# Transform the text data into numerical features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["Message"])
y = df.drop(["Message", "#"], axis=1)

Empty DataFrame
Columns: [#, Message, Label_Company Business, Strategy, etc., Label_Document editing/checking, Label_Empty message, Label_Empty message (due to missing attachment), Label_Logistic Arrangements, Label_Personal but in professional context, Label_Purely Personal, Label_Status arrangements]
Index: []
0


  df.fillna(df.mean(), inplace=True)
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Soham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
