# Imports and Setup

Import necessary libraries for file handling (os), text processing (nltk), feature extraction (sklearn.feature_extraction.text), label encoding (sklearn.preprocessing), model training and evaluation (sklearn.model_selection, sklearn.naive_bayes, sklearn.metrics), and numerical operations (numpy).
Ensure NLTK data (punkt and stopwords) is downloaded.

In [45]:
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [46]:
# Ensure the NLTK resources used below are available locally.
# word_tokenize needs the Punkt sentence models: recent NLTK releases look
# them up as 'punkt_tab', older ones as 'punkt', so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists read by stopwords.words('english').
# Kept last so the cell still echoes the result of the final download.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Satya
[nltk_data]     Kilani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Satya
[nltk_data]     Kilani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Function to Load Emails:

Define a function load_emails_from_directory to load email texts and their labels from a specified directory.
Loop through the `.txt` files in the directory, read each email's contents, and extract its label from the filename (the fourth dot-separated field).


In [47]:
# Load emails from a directory
def load_emails_from_directory(directory_path, label_index=3):
    """Read every ``.txt`` email in a directory and label it from its filename.

    The label is one dot-separated component of the filename. With the
    default ``label_index=3`` a name such as
    ``'0001.2000-01-01.sender.ham.txt'`` yields the label ``'ham'``.

    Args:
        directory_path: Directory containing the email files. Entries that
            do not end in ``.txt`` are ignored.
        label_index: Index (negative values allowed) of the dot-separated
            filename component that holds the label.

    Returns:
        A ``(email_texts, labels)`` tuple of two equally long lists, ordered
        by sorted filename.

    Raises:
        IndexError: If a ``.txt`` filename has no component at
            ``label_index``. The message names the offending file.
    """
    email_texts = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting makes the corpus
    # order (and therefore any seeded train/test split) reproducible.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.txt'):
            continue

        # Resolve the label first so a badly named file fails before it is read.
        try:
            label = filename.split('.')[label_index]
        except IndexError:
            raise IndexError(
                f"Cannot extract a label from {filename!r}: no dot-separated "
                f"component at index {label_index}"
            ) from None

        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='latin1') as file:
            email_texts.append(file.read())
        labels.append(label)

    return email_texts, labels

# Load emails from the directory

In [48]:
# NOTE: absolute, machine-specific path to the training-email folder;
# change it before running this notebook anywhere else.
directory_path = r"C:\Users\Satya Kilani\OneDrive\Pictures\Documents\hunarintern\task4\training emails (1)"
# Read every email body together with its filename-derived label.
emails, email_labels = load_emails_from_directory(directory_path)
# Sanity check: corpus size and the set of distinct labels that were found.
print(f"Number of emails loaded: {len(email_labels)}")
print(f"Unique labels: {set(email_labels)}")

Number of emails loaded: 5172
Unique labels: {'ham', 'spam'}


# Preprocess email texts

In [49]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _preprocessing_resources():
    """Build the English stop-word set and the Porter stemmer once, then reuse them."""
    return frozenset(stopwords.words('english')), PorterStemmer()


def preprocess_text(text):
    """Normalise one email body into a space-separated string of word stems.

    Steps: tokenize with ``word_tokenize``, keep only purely alphabetic
    tokens, lowercase them, drop English stop words, then Porter-stem what
    remains.

    Args:
        text: Raw email text.

    Returns:
        The surviving stems joined by single spaces (an empty string when
        no token survives).
    """
    # The stop-word list and stemmer do not depend on the input, so they are
    # fetched from a one-entry cache instead of being rebuilt for every email.
    stop_words, stemmer = _preprocessing_resources()

    stems = []
    for token in word_tokenize(text):
        # Skip numbers, punctuation and mixed tokens such as "e-mail".
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        stems.append(stemmer.stem(lowered))

    return ' '.join(stems)



# Apply preprocessing

In [50]:
# Clean every loaded email, preserving the original order. A comprehension
# replaces the manual append loop and avoids leaving a module-level `email`
# variable behind (a name that also shadows the stdlib `email` package).
preprocessed_emails = [preprocess_text(raw_email) for raw_email in emails]

# Trailing expression: the notebook echoes the cleaned texts as the cell output.
preprocessed_emails

['subject christma tree farm pictur',
 'subject vastar resourc inc gari product high island larger block commenc saturday p gross carlo expect gross tomorrow vastar own gross product georg x forward georg weissman hou ect daren j farmer carlo j rodriguez hou ect ect cc georg weissman hou ect ect melissa grave hou ect ect subject vastar resourc inc carlo pleas call linda get everyth set go estim come tomorrow increas follow day base convers bill fischer bmar forward daren j farmer hou ect enron north america corp georg weissman daren j farmer hou ect ect cc gari bryan hou ect ect melissa grave hou ect ect subject vastar resourc inc darren attach appear nomin vastar resourc inc high island larger block previous erron refer well vastar expect well commenc product sometim tomorrow told linda harri get telephon number ga control provid notif turn tomorrow linda number record voic fax would pleas see someon contact linda advis submit futur nomin via e mail fax voic thank georg x forward geor

In [51]:
# Encode labels: map each string label to an integer class id.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(email_labels)
print(f"Encoded labels: {y}")

# Convert texts to TF-IDF features (one row per email, one column per term).
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split in a later cell, so its vocabulary and IDF weights also
# see the test emails - confirm whether that is acceptable for this evaluation.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_emails)



Encoded labels: [0 0 0 ... 1 1 0]


In [52]:
# Echo the TF-IDF feature matrix; the notebook prints its shape and sparsity summary.
X

<5172x37896 sparse matrix of type '<class 'numpy.float64'>'
	with 307527 stored elements in Compressed Sparse Row format>

In [53]:
# Echo the integer-encoded label array produced by the label encoder.
y

array([0, 0, 0, ..., 1, 1, 0], dtype=int64)

# Split data into training and test sets

In [54]:
# Hold out 20% of the emails for testing; the fixed seed keeps the split
# repeatable for a given input order.
# NOTE(review): the split is not stratified - consider stratify=y if the
# ham/spam ratio should be preserved in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train the Naive Bayes model

In [55]:
# Multinomial Naive Bayes classifier with its default settings.
model = MultinomialNB()
# Fit on the training partition only; as the trailing expression, the call's
# return value is what the notebook echoes for this cell.
model.fit(X_train, y_train)

# Predict on the test set

In [56]:
# Predict an encoded class id for every email in the held-out test partition.
y_pred = model.predict(X_test)

In [57]:
# Print the unique classes
unique_y_test = np.unique(y_test)
unique_y_pred = np.unique(y_pred)
print(f"Number of unique classes in y_test: {len(unique_y_test)}")
print(f"Number of unique classes in y_pred: {len(unique_y_pred)}")
print(f"Unique classes in y_test: {unique_y_test}")
print(f"Unique classes in y_pred: {unique_y_pred}")

# Create the target names
# classification_report labels its rows with every class that occurs in
# y_test OR y_pred. Building the names from that union (not y_test alone)
# keeps the name list the right length even if the model predicts a class
# that happens to be absent from the test split.
reported_classes = sorted(set(unique_y_test) | set(unique_y_pred))
target_names = [label_encoder.classes_[i] for i in reported_classes]



Number of unique classes in y_test: 2
Number of unique classes in y_pred: 2
Unique classes in y_test: [0 1]
Unique classes in y_pred: [0 1]


# Evaluate the predictions

In [58]:
# Overall fraction of test emails that were classified correctly.
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Per-class precision, recall and F1, with rows named via target_names.
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)


Accuracy: 0.9265700483091788
              precision    recall  f1-score   support

         ham       0.91      1.00      0.95       739
        spam       1.00      0.75      0.85       296

    accuracy                           0.93      1035
   macro avg       0.95      0.87      0.90      1035
weighted avg       0.93      0.93      0.92      1035

