In [4]:
import pandas as pd
import numpy as np
import re

In [5]:
class MultinomialNaiveBayes:
    """Multinomial Naive Bayes text classifier with additive (Laplace) smoothing.

    Documents are lower-cased, stripped of every character that is not an
    ASCII letter or whitespace, and split on whitespace into words. The class
    prior is the share of training *documents* carrying each label; the
    likelihood of a word given a class is its smoothed relative frequency
    among all words seen in that class.
    """

    def __init__(self, alpha=1):
        """Create an unfitted classifier.

        Args:
            alpha: Additive smoothing constant added to every word count.
        """
        self.alpha = alpha  # Laplace smoothing parameter
        self.classes = None
        self.class_word_counts = {}  # {class_label: {word: count}}
        self.class_counts = {}  # {class_label: number of training documents}
        self.vocab = set()

    def preprocess_text(self, text):
        """Normalise a raw string into space-separated lower-case words.

        Args:
            text: The raw document string.

        Returns:
            The lower-cased text with non-letter characters removed, runs of
            whitespace collapsed to one space, and the ends stripped.
        """
        text = text.lower()
        # Drop everything that is neither an ASCII letter nor whitespace.
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Collapse newlines / tabs / repeated spaces into a single space.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def fit(self, X, y):
        """Learn per-class document counts, word counts and the vocabulary.

        Any state from a previous call to ``fit`` is discarded first.

        Args:
            X: Sized iterable of raw document strings.
            y: Sized iterable of class labels, aligned with ``X``.

        Raises:
            ValueError: If ``X`` and ``y`` have different lengths.
        """
        if len(X) != len(y):
            raise ValueError(
                "X and y must have the same length, got %d and %d" % (len(X), len(y))
            )

        self.classes = np.unique(y)
        # Rebuild the tables from scratch so that refitting on different labels
        # does not leave stale classes behind in the prior's denominator.
        self.class_word_counts = {c: {} for c in self.classes}
        self.class_counts = {c: 0 for c in self.classes}
        self.vocab = set()

        # zip() iterates by position, so pandas Series with a non-default
        # index work just like plain lists.
        for text, label in zip(X, y):
            # One increment per document (not per word): the prior must
            # reflect how often a class occurs, not how long its texts are.
            self.class_counts[label] += 1
            word_counts = self.class_word_counts[label]
            for word in self.preprocess_text(text).split():
                word_counts[word] = word_counts.get(word, 0) + 1
                self.vocab.add(word)

    def _calculate_log_prior(self, class_label):
        """Return log P(class): the log share of training documents in the class."""
        total_documents = sum(self.class_counts.values())
        return np.log(self.class_counts[class_label] / total_documents)

    def _calculate_log_likelihood(self, x, class_label):
        """Return the smoothed log P(words of x | class).

        Args:
            x: An already preprocessed document (space-separated words).
            class_label: The class whose word statistics are used.

        Returns:
            The sum over the words of ``x`` of
            log((count(word, class) + alpha) / (words_in_class + alpha * |vocab|)).
            An empty document yields 0.
        """
        word_counts = self.class_word_counts[class_label]
        # Smoothed denominator: all words seen in the class plus alpha per vocabulary entry.
        total_words_in_class = sum(word_counts.values()) + len(self.vocab) * self.alpha

        log_likelihood = 0
        for word in x.split():
            word_count = word_counts.get(word, 0) + self.alpha
            log_likelihood += np.log(word_count / total_words_in_class)

        return log_likelihood

    def predict(self, X):
        """Predict the most probable class for every document in ``X``.

        Args:
            X: Iterable of raw document strings.

        Returns:
            A list with one predicted label per document. On ties the class
            that comes first in ``self.classes`` wins.
        """
        # Priors do not depend on the document, so compute them once.
        log_priors = {c: self._calculate_log_prior(c) for c in self.classes}

        predictions = []
        for x in X:
            processed_text = self.preprocess_text(x)
            best_class = None
            max_posterior = float('-inf')
            for c in self.classes:
                posterior = log_priors[c] + self._calculate_log_likelihood(processed_text, c)
                # Strict '>' keeps the first class on ties.
                if posterior > max_posterior:
                    max_posterior = posterior
                    best_class = c
            predictions.append(best_class)
        return predictions

In [6]:
# Read dataset from Excel file and split into train and test sets
def read_and_split_dataset(file_path, test_size=0.2, random_state=None):
    """Read a labelled text dataset from Excel and split it into train/test lists.

    The sheet must contain a ``text`` column (documents) and a ``type`` column
    (labels). Rows missing either value are dropped. Text is lower-cased and
    every character that is not an ASCII letter or whitespace is removed.
    The rows are then shuffled and split by position.

    Args:
        file_path: Path of the Excel file, passed to ``pandas.read_excel``.
        test_size: Fraction of rows (between 0 and 1) placed in the test set.
        random_state: Seed forwarded to ``DataFrame.sample`` for the shuffle.
            ``None`` (the default) gives a different shuffle on every call.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as plain Python lists.

    Raises:
        ValueError: If ``test_size`` is outside the range [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be between 0 and 1, got %r" % (test_size,))

    df = pd.read_excel(file_path)
    # Blank Excel cells arrive as NaN; such rows can neither be cleaned nor labelled.
    df = df.dropna(subset=['text', 'type'])

    cleaned_text = (
        df['text']
        .astype(str)  # guard against numeric cells in the text column
        .str.lower()
        .str.replace(r'[^a-zA-Z\s]', '', regex=True)
    )
    df = df.assign(text=cleaned_text)

    # Shuffle rows; a fixed random_state makes the split reproducible.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    split_index = int(len(df) * (1 - test_size))
    train_data = df.iloc[:split_index]
    test_data = df.iloc[split_index:]
    X_train, y_train = train_data['text'].tolist(), train_data['type'].tolist()
    X_test, y_test = test_data['text'].tolist(), test_data['type'].tolist()
    return X_train, X_test, y_train, y_test

In [9]:
# Example usage: load the Excel dataset and split it into train/test lists.
# read_and_split_dataset shuffles the rows without a fixed seed, so the
# partitions (and everything computed from them below) differ between runs.
file_path = "./datasetA.xlsx"  # Replace with the path to your Excel file
X_train, X_test, y_train, y_test = read_and_split_dataset(file_path)

In [10]:
# Create and train the classifier.
# alpha=1 is the Laplace smoothing parameter; fit() builds the per-class word
# counts and the vocabulary from the training texts and returns nothing.
classifier = MultinomialNaiveBayes(alpha=1)
classifier.fit(X_train, y_train)

In [11]:
# Classify every held-out text, then print a short report per sample:
# the text, its true label, the predicted label and a separator line.
predicted_classes = classifier.predict(X_test)
for i, text in enumerate(X_test):
    print(
        f"Text: {text}",
        f"True Label: {y_test[i]}",
        f"Predicted Label: {predicted_classes[i]}",
        "----------------------",
        sep="\n",
    )

Text: rt wxyzalicia this just in 

bonjovi is coming in concert to pizzapizzaarena on july th 
bryanadams will be joining them 


True Label: entertainment
Predicted Label: entertainment
----------------------
Text: rt finelinetay imagine going to a paul mccartney concert with harry im so soft  httpstcoyazsgadiqn
True Label: entertainment
Predicted Label: entertainment
----------------------
Text: im actually scared as hell i never went to a concert before 
True Label: entertainment
Predicted Label: entertainment
----------------------
Text: i feel so bad for people who didnt see the cats the musical the movie movie in theaters it was truly a magical experience
True Label: entertainment
Predicted Label: entertainment
----------------------
Text: anndaparatha one of my fav movie 
True Label: entertainment
Predicted Label: entertainment
----------------------
Text: rt stuartpstevens people complaining a presidential debate was boring seems off  the whole idea of politics as entertainment