In [5]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pickle

def load_data(data_dir, filename):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Args:
        data_dir: Directory that contains the file.
        filename: Name of the file inside ``data_dir``.

    Returns:
        One entry per line of the file, in file order, with leading and
        trailing whitespace (including the newline) stripped.
    """
    path = os.path.join(data_dir, filename)
    with open(path, mode='r', encoding='utf-8') as handle:
        # Iterating the handle yields one line at a time.
        return list(map(str.strip, handle))

def train_and_evaluate_model(X_train, y_train, X_val, y_val, X_test, y_test, vectorizer, filename):
    """Tune MultinomialNB's smoothing on the validation set, then score and save the winner.

    A MultinomialNB model is fitted on the training split for each candidate
    ``alpha``; the one with the highest validation accuracy is kept (the
    smallest alpha wins ties). That model is evaluated on the test split and
    pickled together with the vectorizer settings needed to rebuild the
    feature extractor.

    Args:
        X_train, X_val, X_test: Feature matrices for the three splits.
        y_train, y_val, y_test: Labels matching the rows of each matrix.
        vectorizer: The already-fitted vectorizer that produced the matrices;
            its ``vocabulary_``, ``ngram_range`` and ``token_pattern`` are
            stored next to the model.
        filename: Path of the pickle file to write.

    Returns:
        Tuple ``(best_validation_accuracy, test_accuracy)``.
    """
    alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

    best_alpha = None
    # Start below any reachable accuracy so the first candidate is always
    # accepted. Starting at 0 left best_model as None (and crashed on
    # best_model.predict below) whenever every alpha scored exactly 0.
    best_accuracy = -1.0
    best_model = None

    # Iterate over alpha values
    for alpha in alphas:
        # Train a MultinomialNB model with the current alpha
        model = MultinomialNB(alpha=alpha)
        model.fit(X_train, y_train)

        # Score the candidate on the validation set
        accuracy = accuracy_score(y_val, model.predict(X_val))

        # Strict '>' keeps the earliest (smallest) alpha when accuracies tie
        if accuracy > best_accuracy:
            best_alpha = alpha
            best_accuracy = accuracy
            best_model = model

    print(f"Best alpha: {best_alpha}")
    print(f"Best Validation Accuracy: {best_accuracy}")

    # The test split is only touched once, with the selected model
    test_accuracy = accuracy_score(y_test, best_model.predict(X_test))

    # Save the best model and vectorizer's information to a file
    model_data = {
        'model': best_model,
        'vocabulary': vectorizer.vocabulary_,
        'ngram_range': vectorizer.ngram_range,
        'token_pattern': vectorizer.token_pattern,
    }
    with open(filename, 'wb') as file:
        pickle.dump(model_data, file)

    return best_accuracy, test_accuracy


def print_results_in_table(test_acc_uni, test_acc_bi, test_acc_uni_bi, test_acc_uni_ns, test_acc_bi_ns, test_acc_uni_bi_ns):
    """Print the six test accuracies as a left-aligned, three-column table.

    Rows for the stopword-free runs ("yes") come first, followed by the runs
    that keep stopwords ("no"); within each group the order is unigrams,
    bigrams, unigrams+bigrams.

    Args:
        test_acc_uni, test_acc_bi, test_acc_uni_bi: Test accuracies of the
            models listed under "Stopwords removed = no".
        test_acc_uni_ns, test_acc_bi_ns, test_acc_uni_bi_ns: Test accuracies
            of the models listed under "Stopwords removed = yes".
    """
    # Every column is left-aligned and padded to 20 characters.
    row_format = "{:<20} {:<20} {:<20}"
    header = ("Stopwords removed", "Text features", "Accuracy (test set)")
    feature_names = ("unigrams", "bigrams", "unigrams+bigrams")
    grouped_scores = (
        ("yes", (test_acc_uni_ns, test_acc_bi_ns, test_acc_uni_bi_ns)),
        ("no", (test_acc_uni, test_acc_bi, test_acc_uni_bi)),
    )

    print("\nClassification accuracy on the test sets:")
    print(row_format.format(*header))
    for stopwords_removed, scores in grouped_scores:
        for feature_name, score in zip(feature_names, scores):
            print(row_format.format(stopwords_removed, feature_name, score))


    
# Location of the input text/label files and of the directory that receives
# the pickled models.
data_dir = "/Users/raju/Raju Mac/UW/UW/Spring 24/MSCI 641/Assignment/a1/data"
out_dir = "/Users/raju/Raju Mac/UW/UW/Spring 24/MSCI 641/Assignment/a2/data"

# Train/validation/test text that still contains stop words
train_data, val_data, test_data = [
    load_data(data_dir, name) for name in ('train.csv', 'val.csv', 'test.csv')
]

# The same three splits with stop words removed ("_ns" = no stopwords)
train_data_ns, val_data_ns, test_data_ns = [
    load_data(data_dir, name) for name in ('train_ns.csv', 'val_ns.csv', 'test_ns.csv')
]

# Labels for the three splits; the same labels are used for both text variants
train_labels, val_labels, test_labels = [
    load_data(data_dir, name)
    for name in ('train_labels.csv', 'val_labels.csv', 'test_labels.csv')
]

# Define unigram, bigram and unigram+bigram vectorizers.
# The custom token_pattern (\w+ instead of scikit-learn's default \w\w+) keeps
# single-character tokens.
#
# Each corpus variant gets its OWN vectorizer instances. Previously the same
# three objects were fitted on the with-stopwords data and then re-fitted on
# the stopword-free data, so by the time a with-stopwords model was pickled,
# `vectorizer.vocabulary_` already belonged to the stopword-free corpus and
# did not match the feature columns the model had been trained on.
uni_vectorizer = CountVectorizer(ngram_range=(1, 1), token_pattern=r"(?u)\b\w+\b")
bi_vectorizer = CountVectorizer(ngram_range=(2, 2), token_pattern=r"(?u)\b\w+\b")
uni_bi_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b")

uni_vectorizer_ns = CountVectorizer(ngram_range=(1, 1), token_pattern=r"(?u)\b\w+\b")
bi_vectorizer_ns = CountVectorizer(ngram_range=(2, 2), token_pattern=r"(?u)\b\w+\b")
uni_bi_vectorizer_ns = CountVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b")

# Vectorize the data with stop words.
# The vocabulary is learned from the training split only; validation and test
# are projected onto it with transform().
train_uni = uni_vectorizer.fit_transform(train_data)
val_uni = uni_vectorizer.transform(val_data)
test_uni = uni_vectorizer.transform(test_data)

train_bi = bi_vectorizer.fit_transform(train_data)
val_bi = bi_vectorizer.transform(val_data)
test_bi = bi_vectorizer.transform(test_data)

train_uni_bi = uni_bi_vectorizer.fit_transform(train_data)
val_uni_bi = uni_bi_vectorizer.transform(val_data)
test_uni_bi = uni_bi_vectorizer.transform(test_data)


# Vectorize the data without stop words (using the dedicated *_ns vectorizers)
train_ns_uni = uni_vectorizer_ns.fit_transform(train_data_ns)
val_ns_uni = uni_vectorizer_ns.transform(val_data_ns)
test_ns_uni = uni_vectorizer_ns.transform(test_data_ns)

train_ns_bi = bi_vectorizer_ns.fit_transform(train_data_ns)
val_ns_bi = bi_vectorizer_ns.transform(val_data_ns)
test_ns_bi = bi_vectorizer_ns.transform(test_data_ns)

train_ns_uni_bi = uni_bi_vectorizer_ns.fit_transform(train_data_ns)
val_ns_uni_bi = uni_bi_vectorizer_ns.transform(val_data_ns)
test_ns_uni_bi = uni_bi_vectorizer_ns.transform(test_data_ns)

# Only the unigram/with-stopwords run is currently enabled; the remaining runs
# and the result printing are kept below, disabled. Each call passes the
# vectorizer that produced its feature matrices so the pickled vocabulary
# matches the model.
print("Training MNB with unigrams w stopwords...")
val_acc_uni, test_acc_uni = train_and_evaluate_model(train_uni, train_labels, val_uni, val_labels, test_uni, test_labels, uni_vectorizer, os.path.join(out_dir, 'mnb_uni.pkl'))
# print("Training MNB with bigrams w stopwords...")
# val_acc_bi, test_acc_bi = train_and_evaluate_model(train_bi, train_labels, val_bi, val_labels, test_bi, test_labels, bi_vectorizer, os.path.join(out_dir, 'mnb_bi.pkl'))
# print("Training MNB with unigrams and bigrams w stopwords...")
# val_acc_uni_bi, test_acc_uni_bi = train_and_evaluate_model(train_uni_bi, train_labels, val_uni_bi, val_labels, test_uni_bi, test_labels, uni_bi_vectorizer, os.path.join(out_dir, 'mnb_uni_bi.pkl'))

# print("Training MNB with unigrams w/o stopwords...")
# val_acc_uni_ns, test_acc_uni_ns = train_and_evaluate_model(train_ns_uni, train_labels, val_ns_uni, val_labels, test_ns_uni, test_labels, uni_vectorizer_ns, os.path.join(out_dir, 'mnb_uni_ns.pkl'))
# print("Training MNB with bigrams w/o stopwords...")
# val_acc_bi_ns, test_acc_bi_ns = train_and_evaluate_model(train_ns_bi, train_labels, val_ns_bi, val_labels, test_ns_bi, test_labels, bi_vectorizer_ns, os.path.join(out_dir, 'mnb_bi_ns.pkl'))
# print("Training MNB with unigrams and bigrams w/o stopwords...")
# val_acc_uni_bi_ns, test_acc_uni_bi_ns = train_and_evaluate_model(train_ns_uni_bi, train_labels, val_ns_uni_bi, val_labels, test_ns_uni_bi, test_labels, uni_bi_vectorizer_ns, os.path.join(out_dir, 'mnb_uni_bi_ns.pkl'))

# # Print the accuracy results
# print("\nClassification accuracy on the test sets:")
# print(f"Unigrams w stopwords - Test: {test_acc_uni}")
# print(f"Bigrams w stopwords - Test: {test_acc_bi}")
# print(f"Unigrams+bigrams w stopwords - Test: {test_acc_uni_bi}")
# print(f"Unigrams w/o stopwords - Test: {test_acc_uni_ns}")
# print(f"Bigrams w/o stopwords - Test: {test_acc_bi_ns}")
# print(f"Unigrams+bigrams w/o stopwords - Test: {test_acc_uni_bi_ns}")
# print_results_in_table(test_acc_uni, test_acc_bi, test_acc_uni_bi, test_acc_uni_ns, test_acc_bi_ns, test_acc_uni_bi_ns)

Training MNB with unigrams w stopwords...
Best alpha: 1
Best Validation Accuracy: 0.8074375
