In [1]:
#Assignment 3 Fundamentals Of AI Shubh Sudan

In [1]:
import pandas as pd
import numpy as np
import nltk
from collections import defaultdict
import os # Added for loading data

In [4]:
# Locations of the CSV datasets used by the experiments further down.
MOVIE_TRAIN_PATH = 'reviews_polarity_train.csv'
MOVIE_TEST_PATH = 'reviews_polarity_test.csv'
NEWS_TRAIN_PATH = 'newsgroup_train.csv'
NEWS_TEST_PATH = 'newsgroup_test.csv'


def _ensure_nltk_tokenizer(resource):
    """Download the NLTK tokenizer data `resource` unless it is already installed."""
    try:
        # nltk.data.find raises LookupError when the resource is not on disk.
        nltk.data.find(f'tokenizers/{resource}')
    except LookupError:
        print(f"Downloading '{resource}' resource...")
        nltk.download(resource)


# --- Make sure the tokenizer data is available (downloaded only when missing) ---
for resource in ('punkt', 'punkt_tab'):
    _ensure_nltk_tokenizer(resource)



# CLASS 1: NaiveBayes (for Q4 - NO Smoothing)


# Let's define a Naive Bayes class that bundles the methods we need.
# Text pre-processing is included in the class itself so it can be reused wherever required.
class NaiveBayes:
    """Multinomial Naive Bayes text classifier WITHOUT smoothing (Q4).

    All probabilities are stored as base-2 logarithms. Because there is no
    smoothing, a vocabulary word that never occurs in a class would have
    probability 0 (log = -inf); it is given the floor ``UNSEEN_LOG_PROB``
    instead so that class scores stay finite and comparable.

    Attributes set by ``fit``:
        log_priors:      {label: log2 P(label)}
        log_likelihoods: {label: {word: log2 P(word | label)}}
        vocabulary:      set of every token seen during training
        classes:         set of distinct labels in the training data
        vocab_size:      len(vocabulary)
    """

    # Stand-in for log2(0) used for words unseen in a class (and for a class
    # that has no prior recorded).
    UNSEEN_LOG_PROB = np.log2(1e-9)

    def __init__(self, tokenizer=None):
        """Create an untrained model.

        Args:
            tokenizer: optional callable ``str -> iterable of str`` used to
                split lower-cased text into tokens. When None (the default)
                ``nltk.word_tokenize`` is used.
        """
        self.log_priors = {}
        self.log_likelihoods = {}
        # A set stores unique entries only, which is what a vocabulary needs.
        self.vocabulary = set()
        self.classes = set()
        self.vocab_size = 0  # Size of the vocabulary; 0 until fit() is called
        self.tokenizer = tokenizer

    def preprocess(self, text):
        """Turn raw text into a list of lower-cased, purely alphabetic tokens.

        Non-string input (e.g. NaN floats from pandas) yields an empty list.
        """
        if not isinstance(text, str):
            return []

        tokenize = self.tokenizer if self.tokenizer is not None else nltk.word_tokenize
        # Keep alphabetic tokens only: drops punctuation, numbers and symbols.
        return [token for token in tokenize(text.lower()) if token.isalpha()]

    def fit(self, X_train, y_train):
        """Estimate log-priors and (unsmoothed) log-likelihoods.

        Any state from a previous call is discarded, so the model can be
        refitted safely.

        Args:
            X_train: sequence of raw text documents.
            y_train: sequence of labels, one per document.

        Raises:
            ValueError: if X_train and y_train differ in length.
        """
        num_docs = len(X_train)
        if num_docs != len(y_train):
            raise ValueError(
                f"X_train and y_train must have the same length "
                f"(got {num_docs} and {len(y_train)})."
            )

        # Start from a clean slate so a refit does not inherit old vocabulary.
        self.log_priors = {}
        self.log_likelihoods = {}
        self.vocabulary = set()
        self.classes = set(y_train)

        # Prior probability of each class: one pass over the labels.
        docs_per_class = defaultdict(int)
        for label in y_train:
            docs_per_class[label] += 1
        for label in self.classes:
            self.log_priors[label] = np.log2(docs_per_class[label] / num_docs)

        print("Now we build vocabulary and counting words...")

        word_counts_per_class = defaultdict(lambda: defaultdict(int))
        total_words_per_class = defaultdict(int)

        # Go through every document once, collecting count(word, class).
        for text, label in zip(X_train, y_train):
            for word in self.preprocess(text):
                self.vocabulary.add(word)
                word_counts_per_class[label][word] += 1
                total_words_per_class[label] += 1

        self.vocab_size = len(self.vocabulary)
        print(f"Vocabulary size = {self.vocab_size}")

        print("Calculating Likelihood (Q4 - No Smoothing)")
        for label in self.classes:
            class_counts = word_counts_per_class[label]
            class_total = total_words_per_class[label]

            if class_total == 0:
                print(f"Warning: Class '{label}' has no words. "
                      f"Likelihoods will use the floor log-probability.")

            table = {}
            for word in self.vocabulary:
                count = class_counts.get(word, 0)
                # count > 0 implies class_total > 0, so the division is safe;
                # a zero count would be log2(0), hence the floor value.
                table[word] = np.log2(count / class_total) if count else self.UNSEEN_LOG_PROB
            self.log_likelihoods[label] = table

        print("Training done - now time to test")

    def predict(self, X_test):
        """Predict a label for every document in X_test.

        Words outside the training vocabulary are ignored. Ties between
        classes are broken deterministically in favour of the label that
        sorts first by ``str(label)``.

        Args:
            X_test: iterable of raw text documents.

        Returns:
            List of predicted labels, in the order of X_test.

        Raises:
            ValueError: if there are documents to classify but the model has
                no classes (fit() was not called or saw no data).
        """
        # Fixed evaluation order makes tie-breaking reproducible across runs.
        ordered_classes = sorted(self.classes, key=str)

        predictions = []
        for text in X_test:
            if not ordered_classes:
                raise ValueError("Cannot predict: the model has no classes; call fit() first.")

            tokens = self.preprocess(text)
            scores = {}
            for label in ordered_classes:
                score = self.log_priors.get(label, self.UNSEEN_LOG_PROB)
                table = self.log_likelihoods.get(label, {})
                for word in tokens:
                    if word in self.vocabulary and word in table:
                        score += table[word]
                scores[label] = score

            # max() returns the first maximal key in insertion order.
            predictions.append(max(scores, key=scores.get))

        return predictions


# CLASS 2: NaiveBayesSmoothed (for Q7 - Laplace Smoothing)


# We'll make a new class for this to keep things clean.
# It's mostly a copy of the first class, but with a modified 'fit' method.
class NaiveBayesSmoothed:
    """Multinomial Naive Bayes text classifier WITH Laplace smoothing (Q7).

    Likelihoods follow Eq. 3.1:

        P(w | c) = (count(w, c) + alpha) / (N_c + alpha * |V|)

    where N_c is the number of tokens in class c and |V| the vocabulary size.
    All probabilities are stored as base-2 logarithms.

    Attributes set by ``fit``:
        log_priors:            {label: log2 P(label)}
        log_likelihoods:       {label: {word: log2 P(word | label)}}
        vocabulary:            set of every token seen during training
        classes:               set of distinct labels in the training data
        vocab_size:            len(vocabulary)
        total_words_per_class: {label: N_c}, kept for the predict-time fallback
    """

    def __init__(self, alpha=1, tokenizer=None):
        """Create an untrained model.

        Args:
            alpha: additive smoothing constant; 1 gives classic Laplace
                smoothing. Must be positive.
            tokenizer: optional callable ``str -> iterable of str`` used to
                split lower-cased text into tokens. When None (the default)
                ``nltk.word_tokenize`` is used.

        Raises:
            ValueError: if alpha is not positive.
        """
        if alpha <= 0:
            raise ValueError(f"alpha must be positive, got {alpha!r}.")

        self.log_priors = {}
        self.log_likelihoods = {}
        # A set stores unique entries only, which is what a vocabulary needs.
        self.vocabulary = set()
        self.classes = set()
        self.vocab_size = 0  # Size of the vocabulary; 0 until fit() is called
        self.alpha = alpha  # The '1' in (count + 1) for Laplace smoothing
        self.total_words_per_class = {}
        self.tokenizer = tokenizer

    def preprocess(self, text):
        """Turn raw text into a list of lower-cased, purely alphabetic tokens.

        Non-string input (e.g. NaN floats from pandas) yields an empty list.
        """
        if not isinstance(text, str):
            return []

        tokenize = self.tokenizer if self.tokenizer is not None else nltk.word_tokenize
        # Keep alphabetic tokens only: drops punctuation, numbers and symbols.
        return [token for token in tokenize(text.lower()) if token.isalpha()]

    def fit(self, X_train, y_train):
        """Estimate log-priors and Laplace-smoothed log-likelihoods.

        Any state from a previous call is discarded, so the model can be
        refitted safely.

        Args:
            X_train: sequence of raw text documents.
            y_train: sequence of labels, one per document.

        Raises:
            ValueError: if X_train and y_train differ in length.
        """
        num_docs = len(X_train)
        if num_docs != len(y_train):
            raise ValueError(
                f"X_train and y_train must have the same length "
                f"(got {num_docs} and {len(y_train)})."
            )

        # Start from a clean slate so a refit does not inherit old vocabulary.
        self.log_priors = {}
        self.log_likelihoods = {}
        self.vocabulary = set()
        self.classes = set(y_train)

        # Prior probability of each class: one pass over the labels.
        docs_per_class = defaultdict(int)
        for label in y_train:
            docs_per_class[label] += 1
        for label in self.classes:
            self.log_priors[label] = np.log2(docs_per_class[label] / num_docs)

        print("Now we build vocabulary and counting words...")
        word_counts_per_class = defaultdict(lambda: defaultdict(int))
        total_words_per_class = defaultdict(int)

        for text, label in zip(X_train, y_train):
            for word in self.preprocess(text):
                self.vocabulary.add(word)
                word_counts_per_class[label][word] += 1
                total_words_per_class[label] += 1

        self.vocab_size = len(self.vocabulary)
        print(f"Vocabulary size = {self.vocab_size}")

        print(f"Calculating Likelihood (Q7 - Laplace Smoothing, alpha={self.alpha})")
        for label in self.classes:
            class_counts = word_counts_per_class[label]
            # Denominator of Eq. 3.1: N_c + alpha * |V|
            denominator = total_words_per_class[label] + self.alpha * self.vocab_size
            # Numerator of Eq. 3.1: count(w, c) + alpha, which is never zero,
            # so no log(0) special case is needed.
            self.log_likelihoods[label] = {
                word: np.log2((class_counts.get(word, 0) + self.alpha) / denominator)
                for word in self.vocabulary
            }

        # Remember N_c for every class; predict() needs it for its fallback.
        self.total_words_per_class = {
            label: total_words_per_class[label] for label in self.classes
        }

        print("Training done - now time to test")

    def _unseen_log_prob(self, label):
        """Return log2 of the smoothed probability of a zero-count word in `label`."""
        denominator = (self.total_words_per_class.get(label, 0)
                       + self.alpha * self.vocab_size)
        return np.log2(self.alpha / denominator)

    def predict(self, X_test):
        """Predict a label for every document in X_test.

        Words outside the training vocabulary are ignored. Ties between
        classes are broken deterministically in favour of the label that
        sorts first by ``str(label)``.

        Args:
            X_test: iterable of raw text documents.

        Returns:
            List of predicted labels, in the order of X_test.

        Raises:
            ValueError: if there are documents to classify but the model has
                no classes (fit() was not called or saw no data).
        """
        # Fixed evaluation order makes tie-breaking reproducible across runs.
        ordered_classes = sorted(self.classes, key=str)

        predictions = []
        for text in X_test:
            if not ordered_classes:
                raise ValueError("Cannot predict: the model has no classes; call fit() first.")

            tokens = self.preprocess(text)
            scores = {}
            for label in ordered_classes:
                score = self.log_priors.get(label, np.log2(1e-9))
                table = self.log_likelihoods.get(label, {})
                for word in tokens:
                    if word not in self.vocabulary:
                        continue
                    word_log_prob = table.get(word)
                    if word_log_prob is None:
                        # Should not happen after a normal fit(), since every
                        # vocabulary word gets an entry per class. As a safety
                        # net use the zero-count smoothed probability
                        # (0 + alpha) / (N_c + alpha * |V|).
                        word_log_prob = self._unseen_log_prob(label)
                    score += word_log_prob
                scores[label] = score

            # max() returns the first maximal key in insertion order.
            predictions.append(max(scores, key=scores.get))

        return predictions


# HELPER FUNCTIONS TO RUN EXPERIMENTS

def load_data(train_csv_path, test_csv_path):
    """
    Read the training and test CSV files and return their contents as lists.

    Both files must provide a 'Text' and a 'Label' column. Missing texts are
    replaced by empty strings. Returns the 4-tuple
    (X_train, y_train, X_test, y_test); if anything goes wrong while loading,
    the problem is printed and four empty lists are returned instead.
    """
    print(f"Loading data from {train_csv_path} and {test_csv_path}...")
    try:
        # Read both files before touching any column.
        frames = [pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path)]

        # NaN texts become '' so every document is a string.
        for frame in frames:
            frame['Text'] = frame['Text'].fillna('')

        # Collect texts then labels, train first and test second.
        extracted = []
        for frame in frames:
            extracted.append(frame['Text'].tolist())
            extracted.append(frame['Label'].tolist())

        print("Data loaded successfully.")
        return tuple(extracted)
    except Exception as e:
        print(f"Error loading data: {e}")
        print("Please make sure your CSV files are named correctly and have 'Text' and 'Label' columns.")
        return [], [], [], []

def calculate_metrics(y_true, y_pred, classes):
    """
    Calculates, prints and returns the evaluation metrics required for Q4.

    Printed: overall accuracy (under the label "Macro-Average Accuracy"),
    macro-averaged precision and recall, and a confusion matrix with actual
    labels as rows and predicted labels as columns.

    Args:
        y_true: iterable of actual labels.
        y_pred: iterable of predicted labels, aligned with y_true.
        classes: accepted for backward compatibility but not used; the label
            set is always the union of the labels found in y_true and y_pred.

    Returns:
        dict with keys 'accuracy', 'macro_precision', 'macro_recall',
        'per_class' ({label: {'precision', 'recall'}}) and
        'confusion_matrix' ({actual: {predicted: count}}).
        With empty input every score is 0.0.

    Raises:
        ValueError: if y_true and y_pred differ in length.
    """
    # Materialise once so any iterable (list, tuple, Series, ...) works.
    y_true = list(y_true)
    y_pred = list(y_pred)
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {len(y_true)} and {len(y_pred)})."
        )

    classes = sorted(set(y_true) | set(y_pred))

    # One pass gathers everything needed for all the metrics.
    pair_counts = defaultdict(int)       # (actual, predicted) -> count
    actual_totals = defaultdict(int)     # actual label -> count (tp + fn)
    predicted_totals = defaultdict(int)  # predicted label -> count (tp + fp)
    for yt, yp in zip(y_true, y_pred):
        pair_counts[(yt, yp)] += 1
        actual_totals[yt] += 1
        predicted_totals[yp] += 1

    correct = sum(pair_counts.get((c, c), 0) for c in classes)
    accuracy = correct / len(y_true) if y_true else 0.0

    metrics = {}
    for c in classes:
        tp = pair_counts.get((c, c), 0)
        n_predicted = predicted_totals.get(c, 0)
        n_actual = actual_totals.get(c, 0)
        # Undefined ratios (no predictions / no actual examples) count as 0.
        precision = tp / n_predicted if n_predicted > 0 else 0
        recall = tp / n_actual if n_actual > 0 else 0
        metrics[c] = {'precision': precision, 'recall': recall}

    if classes:
        macro_precision = sum(m['precision'] for m in metrics.values()) / len(classes)
        macro_recall = sum(m['recall'] for m in metrics.values()) / len(classes)
    else:
        macro_precision = 0.0
        macro_recall = 0.0

    confusion = {
        actual_c: {predicted_c: pair_counts.get((actual_c, predicted_c), 0)
                   for predicted_c in classes}
        for actual_c in classes
    }

    print(f"Macro-Average Accuracy:  {accuracy:.4f}")
    print(f"Macro-Average Precision: {macro_precision:.4f}")
    print(f"Macro-Average Recall:    {macro_recall:.4f}")
    print("\nConfusion Matrix (Actual vs. Predicted):")

    # str() keeps the column formatting working for any label type.
    header = " " * 12 + "".join(f"{str(c):>12}" for c in classes)
    print(header)
    print("-" * len(header))

    for actual_c in classes:
        row = f"{str(actual_c):>12}"
        for predicted_c in classes:
            row += f"{confusion[actual_c][predicted_c]:>12}"
        print(row)

    return {
        'accuracy': accuracy,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'per_class': metrics,
        'confusion_matrix': confusion,
    }


def _train_and_predict(model, X_train, y_train, X_test, announcement):
    """Fit `model` on the training split, print `announcement`, return test predictions."""
    model.fit(X_train, y_train)
    print(announcement)
    return model.predict(X_test)


# --- Load both datasets (each call returns four empty lists on failure) ---
X_movie_train, y_movie_train, X_movie_test, y_movie_test = load_data(
    MOVIE_TRAIN_PATH, MOVIE_TEST_PATH)
X_news_train, y_news_train, X_news_test, y_news_test = load_data(
    NEWS_TRAIN_PATH, NEWS_TEST_PATH)


# === 1. Movie Review Dataset (Q4 - NO Smoothing) ===
print("1.A.Running Experiment: Movie Review Dataset (Q4 - No Smoothing)")

# An empty training list means loading failed, so the experiment is skipped.
if X_movie_train:
    nb_movie = NaiveBayes()
    y_movie_pred = _train_and_predict(
        nb_movie, X_movie_train, y_movie_train, X_movie_test,
        "\nPredicting on movie test set...")
    print("\n--- Movie Review Results (Q4) ---")
    calculate_metrics(y_movie_test, y_movie_pred, nb_movie.classes)


# === 2. 20 Newsgroups Dataset (Q4 - NO Smoothing) ===
print("1.B.Running Experiment: 20 Newsgroups Dataset (Q4 - No Smoothing)")

if X_news_train:
    nb_news = NaiveBayes()
    y_news_pred = _train_and_predict(
        nb_news, X_news_train, y_news_train, X_news_test,
        "\nPredicting on newsgroup test set...")
    print("\n--- 20 Newsgroups Results (Q4) ---")
    calculate_metrics(y_news_test, y_news_pred, nb_news.classes)


# === 3. Movie Review Dataset (Q7 - WITH Smoothing) ===
print("2.A.Running Experiment: Movie Review Dataset (Q7 - Laplace Smoothing)")

if X_movie_train:
    nb_movie_smooth = NaiveBayesSmoothed()
    y_movie_pred_smooth = _train_and_predict(
        nb_movie_smooth, X_movie_train, y_movie_train, X_movie_test,
        "\nPredicting on movie test set (smoothed)...")
    print("\n--- Movie Review Results (Q7) ---")
    calculate_metrics(y_movie_test, y_movie_pred_smooth, nb_movie_smooth.classes)


# === 4. 20 Newsgroups Dataset (Q7 - WITH Smoothing) ===
print("2.B.Running Experiment: 20 Newsgroups Dataset (Q7 - Laplace Smoothing)")

if X_news_train:
    nb_news_smooth = NaiveBayesSmoothed()
    y_news_pred_smooth = _train_and_predict(
        nb_news_smooth, X_news_train, y_news_train, X_news_test,
        "\nPredicting on newsgroup test set (smoothed)...")
    print("\n--- 20 Newsgroups Results (Q7) ---")
    calculate_metrics(y_news_test, y_news_pred_smooth, nb_news_smooth.classes)



print("All experiments for Q4 and Q7 are complete.")


Loading data from reviews_polarity_train.csv and reviews_polarity_test.csv...
Data loaded successfully.
Loading data from newsgroup_train.csv and newsgroup_test.csv...
Data loaded successfully.
1.A.Running Experiment: Movie Review Dataset (Q4 - No Smoothing)
Now we build vocabulary and counting words...
Vocabulary size = 2376
Calculating Likelihood (Q4 - No Smoothing)
Training done - now time to test

Predicting on movie test set...

--- Movie Review Results (Q4) ---
Macro-Average Accuracy:  0.7700
Macro-Average Precision: 0.7703
Macro-Average Recall:    0.7700

Confusion Matrix (Actual vs. Predicted):
                     neg         pos
------------------------------------
         neg         118          32
         pos          37         113
1.B.Running Experiment: 20 Newsgroups Dataset (Q4 - No Smoothing)
Now we build vocabulary and counting words...
Vocabulary size = 26479
Calculating Likelihood (Q4 - No Smoothing)
Training done - now time to test

Predicting on newsgroup test 