In [1]:
import polars as pl
from sklearn.model_selection import train_test_split

# Tab-separated file without a header row; the two columns are named "text" and "class".
f_name = "amazon_cells_labelled.txt"
df = pl.read_csv(f_name, has_header=False, sep="\t", new_columns=["text", "class"])

# Normalise the text: drop every character that is not whitespace, a word character
# or a digit, then lowercase. The pattern is a raw string so the backslashes reach
# the regex engine unchanged instead of being read as (invalid) Python string escapes.
df = df.with_columns(pl.col("text").str.replace_all(r"[^\s\w\d]", "").str.to_lowercase())

# 80/20 split; the fixed random_state keeps the split identical across runs.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=2023)

df_train.tail()

text,class
str,i64
"""as i said abov...",0
"""do not buy if ...",0
"""ive also had p...",0
"""small sleek im...",1
"""just does not ...",0


In [2]:
def count_words(df):
    """Count how often each distinct word occurs in the "text" column of ``df``.

    Every text is split on single spaces, the resulting lists are flattened into
    one column named "words", and the occurrences of each value are counted.

    Returns:
        Whatever ``Series.value_counts()`` produces for the flattened words.
    """
    words_expr = pl.col("text").str.split(" ").alias("words").flatten()
    all_words = df.select(words_expr).to_series()
    return all_words.value_counts()

# Display names for the integer labels stored in the "class" column.
LABEL_TO_NAME = {
    0: "Negative",
    1: "Positive",
}

# Number of distinct words in the entire dataset (computed on `df`, i.e. before
# the train/test split); used as the smoothing term in `conditional_prob`.
VOCAB_SIZE = len(count_words(df))

In [3]:
def build_categories(df_train):
    """Build one ``Category`` per distinct value of the "class" column.

    Args:
        df_train: training frame with a "text" and a "class" column.

    Returns:
        A list of ``Category`` objects. Each one carries the class label, the
        class prior (share of training rows belonging to the class), the total
        number of word occurrences in the class, and a word -> count table.
    """
    cats = []
    for label, class_df in df_train.partition_by(groups="class", as_dict=True).items():
        # Count the words once and reuse the result for both the total and the table.
        counts = count_words(class_df)
        total_words = counts.get_column("counts").sum()
        table = dict(counts.rows())
        prior = len(class_df) / len(df_train)
        cats.append(Category(label, prior, total_words, table))
    return cats

class Category:
    """Per-class statistics consumed by the naive Bayes classifier.

    A plain data holder: the constructor stores its arguments unchanged and
    performs no validation.
    """

    label: int  # class label this category describes
    prior: float  # prior probability of the class
    word_count: int  # total number of word occurrences in the class
    table: dict[str, int]  # word -> number of occurrences within the class

    def __init__(self, label, prior, word_count, table):
        """Store the label, prior, total word count and word-count table."""
        self.label = label
        self.prior = prior
        self.word_count = word_count
        self.table = table

def conditional_prob(word_class_count, word_count):
    """Add-one smoothed probability of a word given a class.

    Args:
        word_class_count: value added to ``VOCAB_SIZE`` to form the denominator
            (callers pass the total number of word occurrences in the class).
        word_count: value the numerator is built from (callers pass how often
            the word occurs in the class; 0 for unseen words).

    Returns:
        ``(word_count + 1) / (word_class_count + VOCAB_SIZE)``.
    """
    numerator = word_count + 1
    denominator = word_class_count + VOCAB_SIZE
    return numerator / denominator

def sentence_prob(sentence: list[str], class_dict, word_class_count):
    """Multiply the smoothed per-word probabilities of ``sentence`` for one class.

    Args:
        sentence: the words to score.
        class_dict: word -> count mapping for the class; missing words count as 0.
        word_class_count: passed through to ``conditional_prob`` for every word.

    Returns:
        The product of ``conditional_prob`` over all words; 1.0 for an empty sentence.

    Note:
        This is a plain floating-point product, so a very long sentence can
        drive the result towards 0.0.
    """
    factors = (
        conditional_prob(word_class_count, class_dict.get(token, 0))
        for token in sentence
    )
    likelihood = 1.0
    for factor in factors:
        likelihood *= factor
    return likelihood

def nb(sentence: list[str], cats: list[Category]):
    """Classify ``sentence`` with naive Bayes and return the winning label.

    Each category is scored as ``prior * sentence_prob(...)``; the label of the
    highest-scoring category is returned. On a tie the category that appears
    first in ``cats`` wins.

    Args:
        sentence: the words of the sentence to classify.
        cats: candidate categories; must not be empty.

    Returns:
        The ``label`` attribute of the best-scoring category.

    Raises:
        ValueError: if ``cats`` is empty (there is nothing to choose from).
    """
    if not cats:
        raise ValueError("cats must contain at least one category")

    def score(cat):
        return cat.prior * sentence_prob(sentence, cat.table, cat.word_count)

    # max() returns the first maximal element, matching a strict `>` scan.
    return max(cats, key=score).label

# Fit the per-class statistics on the training split.
cats = build_categories(df_train)

# Quick sanity check on a hand-written, already tokenised sentence.
demo_sentence = ["best", "product", "ever"]
LABEL_TO_NAME[nb(demo_sentence, cats)]

'Positive'

In [4]:
def validation_acc(test_df, cats):
    """Print the share of rows in ``test_df`` that ``nb`` classifies correctly.

    For every row, the first field is stripped and split on single spaces to
    obtain the words, and the prediction is compared with the second field.

    Args:
        test_df: frame whose rows hold the text first and the true label second.
        cats: categories passed to ``nb`` for every prediction.

    Returns:
        None; the accuracy (in percent) is printed.
    """
    hits = 0
    for record in test_df.rows():
        text, expected = record[0], record[1]
        predicted = nb(text.strip().split(" "), cats)
        hits += 1 if predicted == expected else 0
    print(f"{100.0 * hits / len(test_df)}% Accuracy")

# Score the classifier on the held-out split; the accuracy is printed as a percentage.
validation_acc(df_test, cats)

79.0% Accuracy
