## This is for question 2, parts b and c.

In [1]:
## Name: Sagar Ghimire 
## NLP Assignment

In [2]:
## Import necessary libraries
import pandas as pd 
import numpy as np
import math

# Labelled training documents: each entry is (class label, {word: count}).
train_data = [
    (
        "comedy",
        {"fun": 1, "couple": 1, "love": 2},
    ),
    (
        "action",
        {"fast": 1, "furious": 1, "shoot": 2},
    ),
    (
        "comedy",
        {"couple": 1, "fly": 1, "fast": 1, "fun": 2},
    ),
    (
        "action",
        {"furious": 1, "shoot": 2, "fun": 1},
    ),
    (
        "action",
        {"fly": 1, "fast": 1, "shoot": 1, "love": 1},
    ),
]

# Unlabelled document to classify, in the same {word: count} form.
test_features = {
    "fast": 1,
    "couple": 1,
    "shoot": 1,
    "fly": 1,
}

# The vocabulary itself (set of known words); its length is the |V| term
# used by add-one smoothing.
vocabulary = {
    "fun",
    "couple",
    "love",
    "fast",
    "furious",
    "shoot",
    "fly",
}

## Multinomial Naive Bayes classifier with add-one (Laplace) smoothing
class NaiveBayesClassifier:
    """Multinomial Naive Bayes over bag-of-words count dictionaries.

    Training happens in the constructor: class priors and smoothed
    per-class word likelihoods are computed once and stored on the
    instance.

    Attributes:
        train_data: Sequence of ``(label, {word: count})`` pairs.
        vocabulary: Collection of the words the model knows about.
        classes: Set of the distinct labels found in ``train_data``.
        priors: ``{label: P(label)}``.
        likelihoods: ``{label: {word: P(word | label)}}`` for every
            word in ``vocabulary``.
    """

    def __init__(self, train_data, vocabulary):
        """Store the data and fit priors and likelihoods immediately.

        Args:
            train_data: Sequence of ``(label, {word: count})`` pairs. It is
                iterated several times, so it must not be a one-shot
                iterator.
            vocabulary: Collection of known words; its length is the |V|
                term of the smoothing denominator.
        """
        self.train_data = train_data
        self.vocabulary = vocabulary
        # Distinct class labels present in the training data
        self.classes = set(label for label, _ in train_data)
        # Filled in by compute_priors()
        self.priors = {}
        # Placeholder zeros; overwritten by compute_likelihoods()
        self.likelihoods = {label: {word: 0 for word in vocabulary} for label in self.classes}
        self.compute_priors()
        self.compute_likelihoods()

    def compute_priors(self):
        """Set ``self.priors[label]`` to the fraction of documents with that label."""
        total_docs = len(self.train_data)
        for label in self.classes:
            docs_in_class = sum(1 for lbl, _ in self.train_data if lbl == label)
            self.priors[label] = docs_in_class / total_docs

    def compute_likelihoods(self):
        """Estimate ``P(word | label)`` with add-one smoothing.

        For each class, ``P(w | c) = (count(w, c) + 1) / (N_c + |V|)`` where
        ``N_c`` is the total count of vocabulary words in class ``c``.
        Training words that are not in ``vocabulary`` are ignored, so the
        likelihoods of each class still sum to one over the vocabulary.
        """
        vocab_size = len(self.vocabulary)
        class_word_counts = {label: {word: 0 for word in self.vocabulary} for label in self.classes}

        for label, doc in self.train_data:
            counts = class_word_counts[label]
            for word, count in doc.items():
                # Skip out-of-vocabulary words instead of failing with KeyError
                if word in counts:
                    counts[word] += count

        for label, counts in class_word_counts.items():
            denominator = sum(counts.values()) + vocab_size
            self.likelihoods[label] = {
                word: (word_count + 1) / denominator
                for word, word_count in counts.items()
            }

    def predict(self, test_features):
        """Score a document against every class.

        Args:
            test_features: ``{word: count}`` mapping for the document.

        Returns:
            ``{label: log_score}`` where ``log_score`` is the log prior plus
            ``count * log P(word | label)`` for each in-vocabulary word.
            These are unnormalised log-scores, not probabilities; words
            outside the vocabulary contribute nothing.
        """
        class_probs = {}
        for label in self.classes:
            word_likelihoods = self.likelihoods[label]
            log_score = math.log(self.priors[label])
            for word, count in test_features.items():
                if word in self.vocabulary:
                    log_score += count * math.log(word_likelihoods[word])
            class_probs[label] = log_score

        return class_probs

    def calculate_accuracy(self, test_data):
        """Return the fraction of ``test_data`` documents classified correctly.

        Args:
            test_data: Sequence of ``(label, {word: count})`` pairs.

        Returns:
            Accuracy in ``[0, 1]``; ``0.0`` when ``test_data`` is empty,
            since there is nothing to get right.
        """
        total_predictions = len(test_data)
        if total_predictions == 0:
            return 0.0

        correct_predictions = 0
        for label, features in test_data:
            # Score once and reuse the result for both the arg-max and its key
            scores = self.predict(features)
            predicted_class = max(scores, key=scores.get)
            if predicted_class == label:
                correct_predictions += 1
        return correct_predictions / total_predictions

    def calculate_prior_class_prob(self, label):
        """Return the stored prior probability of ``label``."""
        return self.priors[label]

    def calculate_log_prob(self, label):
        """Return the natural log of the prior of ``label`` (the prior only)."""
        return math.log(self.priors[label])


# Fit the model on the training documents (training runs in the constructor)
classifier = NaiveBayesClassifier(train_data, vocabulary)

# Report the prior of every class seen during training
print("Prior class probabilities:")
for cls_name in classifier.classes:
    cls_prior = classifier.calculate_prior_class_prob(cls_name)
    print("Prior class probability ({}): {:.4f}".format(cls_name, cls_prior))

# Score the test document; the printed values are log-scores
# (log prior plus summed log likelihoods), not normalised probabilities
print("\nLog probabilities for test data:")
class_probs = classifier.predict(test_features)
for cls_name in class_probs:
    print("Probability of class", cls_name, ":", class_probs[cls_name])

# Accuracy, measured here on the training documents themselves
accuracy = classifier.calculate_accuracy(train_data)
print("\nAccuracy:", accuracy)


Prior class probabilities:
Prior class probability (action): 0.6000
Prior class probability (comedy): 0.4000

Log probabilities for test data:
Probability of class action : -8.705062601975643
Probability of class comedy : -9.52173897104528

Accuracy: 1.0
