In [2]:
# Problem to be solved
# Build a naïve Bayes classifier for sentiment classification. We are defining
# sentiment classification as two classes: positive and negative. Our data set consists of airline reviews. The
# zip directory for the data contains training and test datasets, where each file contains one airline review
# tweet. You will build the model using training data and evaluate with test data. Each of training data and
# test data contains 4182 reviews. You will have to build the system from the scratch (e.g. numpy).

In [3]:
# Give actual examples of program input and output, along with usage instructions.
# Loading data for frequency with no stemming
# Loading data for class 'positive' - Number of documents: 1181
# Loading data for class 'negative' - Number of documents: 3000
# Loading data for class 'positive' - Number of documents: 1182
# Loading data for class 'negative' - Number of documents: 3000
# Training Naive Bayes classifier with frequency mode...
# Naive Bayes training complete.
# Calculating performance metrics...
# Metrics - Accuracy: 0.8867, Precision: 0.8800, Recall: 0.9750, F1: 0.9250
# Completed frequency with no stemming

In [4]:
#Describe the algorithm you have used to solve the problem, specified in a stepwise or point by point fashion.

# load_and_preprocess_data
    # Define the classes and prepare containers for data and vocabulary.
    # For each class, load text files, preprocess the text, and store each document’s tokens with its class label.
    # Add each unique token from the documents to the vocabulary set.
    # Output the processed data and the vocabulary.
# preprocess
    # Use BeautifulSoup to remove any HTML tags and extract plain text from HTML content.
    # Strip URLs and non-alphanumeric characters, replacing them with whitespace.
    # Convert text to lowercase, split it into tokens, and apply stemming if enabled.
    # Output the preprocessed list of tokens.
# create_bow
    # Initialize a bag of words (BoW) as a dictionary to count occurrences.
    # For each word in each document, count occurrences; if binary, only count unique appearances per document.
    # Return BoW dictionary with word counts.
# calculate_tf
    # Divide each word's count by the total number of words in the document.
    # Return term frequency (TF) dictionary.
# calculate_idf
    # Count documents containing each word to calculate document frequency.
    # For each word, compute IDF as log(total docs / (1 + document frequency)).
    # Return IDF scores.
# calculate_tfidf
    # Compute term frequency for each word in the document.
    # Multiply TF values by corresponding IDF scores to get TF-IDF values.
    # Return TF-IDF scores.
# train_naive_bayes
    # Split documents by class and calculate class priors as the probability of each class based on document count.
    # Calculate word frequencies per class and smooth probabilities (for non-TF-IDF modes).
    # If using TF-IDF, compute likelihoods with TF-IDF values; otherwise, use word frequencies.
    # Return priors and likelihoods for each class.
# predict
    # Initialize log probability scores with class priors for each class (scores[cls] = math.log(priors[cls])).
    # For each word in the document, update the log score based on the word likelihood if present in the class.
    # Return the class with the highest final score.
# evaluate_naive_bayes
    # For each test document, predict its class and store actual and predicted classes.
    # Collect detailed prediction results and return them with actuals and predictions.
# calculate_performance_metrics
    # Build a confusion matrix from actual vs. predicted classes.
    # Calculate accuracy, precision, recall, and F1 score for each class based on confusion matrix values.
    # Return performance metrics and confusion matrix.

#Set Paths and Configurations: Define training and testing data paths, representation modes (`frequency`, `binary`, `tfidf`), and stemming options.
# Run Experiments: For each combination of representation mode and stemming option:
    # Setup Logging
    # Load and Preprocess Data
    # Train Naive Bayes
    # Evaluate Model
    # Log Results
    # Output Completion

In [5]:
# Additional description: Please state whether the bonus credit questions are answered or not
# Yes we have answered the bonus credit question.

In [8]:
pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 5.3 MB/s eta 0:00:01
[?25hCollecting regex>=2021.8.3
  Downloading regex-2025.9.1-cp39-cp39-macosx_11_0_arm64.whl (286 kB)
[K     |████████████████████████████████| 286 kB 34.7 MB/s eta 0:00:01
[?25hCollecting joblib
  Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 58.8 MB/s eta 0:00:01
Collecting click
  Downloading click-8.1.8-py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 11.3 MB/s eta 0:00:01
[?25hInstalling collected packages: regex, joblib, click, nltk
Successfully installed click-8.1.8 joblib-1.5.2 nltk-3.9.1 regex-2025.9.1
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use u

In [9]:
import os
import re
import math
import glob
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
from nltk.stem import SnowballStemmer, PorterStemmer # type: ignore

# data loading from dataset folder and data distribution
def load_and_preprocess_data(base_path, stemming_enabled=False):
    classes = ["positive", "negative"]
    data = []
    vocabulary = set()

    for cls in classes:
        files = glob.glob(os.path.join(base_path, cls, '*.txt'))
        print(f"Loading data for class '{cls}' - Number of documents: {len(files)}")
        for file_path in files:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                tokens = preprocess(text, stemming_enabled=stemming_enabled)
                data.append({"class": cls, "text": tokens})
                vocabulary.update(tokens)

    return data, vocabulary

def preprocess(text, stemming_enabled=False, stemmer_type="snowball"):
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()

    text = re.sub(r'http\S+', '', text)

    text = re.sub(r'[^\w\s]', '', text)
    tokens = text.lower().split()
    if stemming_enabled:
        stemmer = SnowballStemmer('english') if stemmer_type == "snowball" else PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

    return tokens

# bag of words creation for binary and frequency
def create_bow(data, binary=False):
    bow = defaultdict(int)
    for document in data:
        seen_words = set()
        for word in document:
            if binary:
                if word not in seen_words:
                    bow[word] += 1
                    seen_words.add(word)
            else:
                bow[word] += 1
    return bow

# TF-IDF Calculation Functions
def calculate_tf(word_counts, total_words_in_doc):
    return {word: count / total_words_in_doc for word, count in word_counts.items()}

def calculate_idf(train_data, vocabulary):
    num_docs = len(train_data)
    doc_freq = defaultdict(int)
    for doc in train_data:
        unique_words = set(doc['text'])
        for word in unique_words:
            doc_freq[word] += 1
    return {word: math.log(num_docs / (1 + freq)) for word, freq in doc_freq.items()}

def calculate_tfidf(word_counts, total_words_in_doc, idf_scores):
    tf_scores = calculate_tf(word_counts, total_words_in_doc)
    return {word: tf_scores[word] * idf_scores.get(word, 0) for word in tf_scores}

# Training the Naive Bayes classifier with TF-IDF support
def train_naive_bayes(data, vocabulary, representation="frequency"):
    class_docs = defaultdict(list)
    for doc in data:
        class_docs[doc["class"]].append(doc["text"])

    total_docs = len(data)
    priors = {cls: len(class_docs[cls]) / total_docs for cls in class_docs}
    likelihoods = {}
    idf_scores = calculate_idf(data, vocabulary) if representation == "tfidf" else None

    for cls in class_docs:
        word_counts = create_bow(class_docs[cls], binary=(representation == "binary"))
        total_words_in_class = sum(word_counts.values())

        if representation == "tfidf":
            likelihoods[cls] = calculate_tfidf(word_counts, total_words_in_class, idf_scores)
        else:
            likelihoods[cls] = {
                word: (word_counts.get(word, 0) + 1) / (total_words_in_class + len(vocabulary))
                for word in vocabulary
            }

    return priors, likelihoods

# Predicting the class output of a document based on priors and likelihoods
def predict(document, priors, likelihoods):
    scores = {cls: math.log(priors[cls]) for cls in priors}
    for cls in scores:
        for word in document:
            if word in likelihoods[cls]:
                scores[cls] += math.log(likelihoods[cls][word])
    return max(scores, key=scores.get)

# Naive Bayes evaluation
def evaluate_naive_bayes(test_data, priors, likelihoods):
    actuals, predictions = [], []
    prediction_details = []

    for doc_id, doc in enumerate(test_data):
        actual = doc["class"]
        predicted = predict(doc["text"], priors, likelihoods)
        actuals.append(actual)
        predictions.append(predicted)
        prediction_details.append((doc_id, predicted, actual))

    return actuals, predictions, prediction_details

# Performance metrics calculation (Confusion Matrix, Accuracy, Precision, Recall, F1-score)
def calculate_performance_metrics(actuals, predictions):
    print("Calculating performance metrics...")
    classes = sorted(set(actuals))
    class_index = {cls: i for i, cls in enumerate(classes)}
    confusion_matrix = [[0] * len(classes) for _ in classes]

    for actual, predicted in zip(actuals, predictions):
        i = class_index[actual]
        j = class_index[predicted]
        confusion_matrix[i][j] += 1

    tp = confusion_matrix[0][0]
    fn = confusion_matrix[0][1]
    fp = confusion_matrix[1][0]
    tn = confusion_matrix[1][1]

    # Metrics calculation
    accuracy = (tp + tn) / float(sum(sum(row) for row in confusion_matrix))
    precision = tp / float(tp + fp) if tp + fp != 0 else 0
    recall = tp / float(tp + fn) if tp + fn != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    return confusion_matrix, accuracy, precision, recall, f1_score


if __name__ == "__main__":
    path_to_training_data = "/home/zoro/Workspace/GMU_Masters/Term 3/Assignments/AIT 526/Group Assignments/PA3/tweet/tweet/train"
    path_to_test_data = "/home/zoro/Workspace/GMU_Masters/Term 3/Assignments/AIT 526/Group Assignments/PA3/tweet/tweet/test"
    representations = ["frequency", "binary", "tfidf"]
    stem_options = [True, False]

    # Run experiments for each configuration
    for representation in representations:
        for stemming_enabled in stem_options:
            stem_status = "stemmed" if stemming_enabled else "non-stemmed"
            config_name = f"{representation}_{stem_status}"
            log_filename = f"log_{config_name}.txt"

            with open(log_filename, "w") as log_file:
                print(f"\nLoading data for {representation} with {'stemming' if stemming_enabled else 'no stemming'}")

                train_data, vocab = load_and_preprocess_data(path_to_training_data, stemming_enabled)
                test_data, _ = load_and_preprocess_data(path_to_test_data, stemming_enabled)

                # Training the Naive Bayes classifier
                print(f"Training Naive Bayes classifier with {representation} mode...")
                log_file.write(f"Training Naive Bayes classifier with {representation} mode...\n")
                priors, likelihoods = train_naive_bayes(train_data, vocab, representation=representation)
                print("Naive Bayes training complete.")
                log_file.write("Naive Bayes training complete.\n")
                actuals, predictions, prediction_details = evaluate_naive_bayes(test_data, priors, likelihoods)

                # Calculate performance metrics (now using the updated function for single values)
                confusion_matrix, accuracy, precision, recall, f1_score = calculate_performance_metrics(actuals, predictions)
                print(f"Metrics - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1_score:.4f}")

                log_file.write("\n=Performance Metrics:\n")
                log_file.write("Confusion Matrix:\n")
                for row in confusion_matrix:
                    log_file.write(f"{row}\n")
                log_file.write(f"Accuracy: {accuracy:.4f}\n")
                log_file.write(f"Precision: {precision:.4f}\n")
                log_file.write(f"Recall: {recall:.4f}\n")
                log_file.write(f"F1 Score: {f1_score:.4f}\n\n")
                log_file.write("Predictions Log:\nDoc ID, Predicted Class, Actual Class\n")
                for doc_id, predicted, actual in prediction_details:
                    log_file.write(f"{doc_id + 1}, {predicted}, {actual}\n")
                print(f"Completed {representation} with {'stemming' if stemming_enabled else 'no stemming'}")



Loading data for frequency with stemming
Loading data for class 'positive' - Number of documents: 0
Loading data for class 'negative' - Number of documents: 0
Loading data for class 'positive' - Number of documents: 0
Loading data for class 'negative' - Number of documents: 0
Training Naive Bayes classifier with frequency mode...
Naive Bayes training complete.
Calculating performance metrics...


IndexError: list index out of range

In [None]:
# References
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/
# https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python/51209579
# https://www.datacamp.com/tutorial/naive-bayes-scikit-learn