In [1]:
# Import necessary libraries
import pandas as pd  # for handling data
from nltk.tokenize import word_tokenize  # for breaking text into words
from nltk.probability import FreqDist  # for calculating word frequencies
from nltk.classify import NaiveBayesClassifier  # for building a Naive Bayes classifier
from sklearn.model_selection import train_test_split  # for splitting data into training and testing sets
from sklearn.metrics import accuracy_score  # for evaluating classifier accuracy
from nltk.corpus import stopwords  # for common English words
import nltk  # the Natural Language Toolkit library


# Download the NLTK resources this script relies on: the English stop-word
# list and the Punkt tokenizer models used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases (3.8.2 and later) make word_tokenize load the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so fetch it as
# well. On older releases that do not know this package, nltk.download only
# reports the problem and returns False, so the call is harmless there.
nltk.download('punkt_tab')

# Load the spam dataset from CSV.
# 'latin-1' is passed explicitly because the file is read with an encoding
# other than pandas' default (presumably it contains bytes that are not valid
# UTF-8 -- verify against the actual spam.csv).
df = pd.read_csv('spam.csv', encoding='latin-1')

# Rename the generic 'v1' / 'v2' columns to descriptive names.
df = df.rename(columns={'v1': 'label', 'v2': 'message'})

# Stop words are kept in a set so the membership test done for every token
# during preprocessing is a constant-time lookup.
stop_words = set(stopwords.words('english'))

def preprocess_message(message):
    """Turn a raw message into a bag-of-words feature dictionary.

    The message is tokenized with ``word_tokenize``. Tokens that are not
    purely alphabetic (numbers, punctuation, mixed tokens) are dropped, the
    remaining tokens are lower-cased, and any token found in the module-level
    ``stop_words`` set is discarded.

    Returns:
        A plain ``dict`` mapping each kept word to the number of times it
        occurs in the message.
    """
    kept = []
    for token in word_tokenize(message):
        # Skip anything containing digits, punctuation or other non-letters.
        if not token.isalpha():
            continue
        lowered = token.lower()
        # Skip stop words; the comparison is done on the lower-cased form.
        if lowered in stop_words:
            continue
        kept.append(lowered)
    # FreqDist counts occurrences; dict() converts the result into a plain
    # dictionary for use as a feature set.
    word_counts = FreqDist(kept)
    return dict(word_counts)

# Feature extraction: pair every message's word-count dictionary with its
# label, producing the (featureset, label) tuples the NLTK classifier expects.
features = [
    (preprocess_message(text), tag)
    for tag, text in zip(df['label'], df['message'])
]

# Hold out 20% of the examples for testing; the fixed random_state keeps the
# split reproducible between runs.
train_set, test_set = train_test_split(
    features,
    test_size=0.2,
    random_state=42,
)

# Train the Naive Bayes classifier on the training portion only.
classifier = NaiveBayesClassifier.train(train_set)

# Separate the held-out pairs into parallel lists of feature sets and labels.
test_features, true_labels = map(list, zip(*test_set))

# Predict a label for every held-out message.
predictions = classifier.classify_many(test_features)

# Fraction of held-out messages whose predicted label matches the true one.
accuracy = accuracy_score(true_labels, predictions)

print(f"Accuracy: {accuracy * 100:.2f}%")


[nltk_data] Downloading package stopwords to C:\Users\prathyusha
[nltk_data]     reddy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to C:\Users\prathyusha
[nltk_data]     reddy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Accuracy: 90.31%
