# Naive Bayes

## 1. Import libraries and load data

In [12]:
import pandas as pd
import numpy as np

train = pd.read_csv("../cleaned_data/train_clean.csv")
valid = pd.read_csv("../cleaned_data/validation_clean.csv")

## 2. Data Pre-Processing
### 2.1 Combine and clean text features

In [13]:
# Combine text columns into a single column because Naive Bayes works on a single text input
text_cols = ["tasks_use_model", "suboptimal_example", "verify_method"]

def combine_text(df):
    df["full_text"] = df[text_cols].fillna("").agg(" ".join, axis=1)
    return df

train = combine_text(train)
valid = combine_text(valid)

In [14]:
# Text preprocessing: lowercase, remove punctuation, extra spaces
import re

def clean_text(s):
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()

train["full_text"] = train["full_text"].apply(clean_text)
valid["full_text"] = valid["full_text"].apply(clean_text)


### 2.2 Select numeric and categorical features

In [15]:
# List of numeric features
ordinal_cols = [
    "academic_use_likelihood",
    "suboptimal_frequency",
    "reference_expectation",
    "verify_frequency"
]

# List of categorical features
categorical_cols = [c for c in train.columns
                    if c.startswith("best_task_types_") 
                    or c.startswith("suboptimal_task_types_")]

# Combine all feature columns
feature_cols = ordinal_cols + categorical_cols

# Prepare numeric feature matrices
X_train_numeric = train[feature_cols].values
X_valid_numeric = valid[feature_cols].values

## 3. Vectorize text and build feature matrice

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorize text data using Bag-of-Words
vectorizer = CountVectorizer(
    max_features=3000,
    ngram_range=(1, 2),
    min_df=2
)

X_train_text = vectorizer.fit_transform(train["full_text"])         # Fit on training data -> create vocabulary
X_valid_text = vectorizer.transform(valid["full_text"])             # Transform validation data -> use same vocabulary

X_train_text.shape, X_valid_text.shape


((576, 3000), (123, 3000))

In [17]:
from scipy.sparse import hstack

# Combine text and numeric features
X_train = hstack([X_train_text, X_train_numeric])
X_valid = hstack([X_valid_text, X_valid_numeric])

# Prepare labels
y_train = train["label"].values
y_valid = valid["label"].values


## 4. Train Naive Bayes baseline 

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Simple baseline with alpha = 1.0 -> Laplace smoothing factor
nb = MultinomialNB(alpha=1.0)
nb.fit(X_train, y_train)

y_train_pred = nb.predict(X_train)
y_valid_pred = nb.predict(X_valid)

print("Training accuracy (alpha=1.0):", accuracy_score(y_train, y_train_pred))
print("Validation accuracy (alpha=1.0):", accuracy_score(y_valid, y_valid_pred))
print(classification_report(y_valid, y_valid_pred, digits=3))


Training accuracy (alpha=1.0): 0.890625
Validation accuracy (alpha=1.0): 0.6097560975609756
              precision    recall  f1-score   support

           0      0.696     0.780     0.736        41
           1      0.556     0.610     0.581        41
           2      0.562     0.439     0.493        41

    accuracy                          0.610       123
   macro avg      0.605     0.610     0.603       123
weighted avg      0.605     0.610     0.603       123



## 5. With vs without stopwords
### 5.1 Compare Bag-of-Words with vs without stopwords

In [19]:
best_use_stopwords = None
best_acc = -np.inf

for use_stopwords in [False, True]:
    print("\n--- stop_words =", use_stopwords, "---")
    
    if use_stopwords:
        vectorizer = CountVectorizer(
            max_features=3000,
            ngram_range=(1, 2),
            min_df=2,
            stop_words='english'
        )
    else:
        vectorizer = CountVectorizer(
            max_features=3000,
            ngram_range=(1, 2),
            min_df=2
        )

    # Recompute text features for this setting
    X_train_text = vectorizer.fit_transform(train["full_text"])
    X_valid_text = vectorizer.transform(valid["full_text"])

    # Combine with numeric features
    X_train_tmp = hstack([X_train_text, X_train_numeric])
    X_valid_tmp = hstack([X_valid_text, X_valid_numeric])

    # Train a quick NB with fixed alpha
    nb = MultinomialNB(alpha=1.0)
    nb.fit(X_train_tmp, y_train)
    y_pred = nb.predict(X_valid_tmp)

    acc = accuracy_score(y_valid, y_pred)
    print("Validation accuracy:", acc)

    if acc > best_acc:
        best_acc = acc
        best_use_stopwords = use_stopwords

print("\nBest setting: stop_words =", best_use_stopwords, "with accuracy", best_acc)



--- stop_words = False ---
Validation accuracy: 0.6097560975609756

--- stop_words = True ---
Validation accuracy: 0.6016260162601627

Best setting: stop_words = False with accuracy 0.6097560975609756


### 5.2 Rebuild features using the best stopword setting


In [20]:
# Use the best stopword configuration for the rest of the notebook
if best_use_stopwords:
    vectorizer = CountVectorizer(
        max_features=3000,
        ngram_range=(1, 2),
        min_df=2,
        stop_words='english'
    )
else:
    vectorizer = CountVectorizer(
        max_features=3000,
        ngram_range=(1, 2),
        min_df=2
    )

# Recompute text features with the best setting
X_train_text = vectorizer.fit_transform(train["full_text"])
X_valid_text = vectorizer.transform(valid["full_text"])

# Combine with numeric features
from scipy.sparse import hstack
X_train = hstack([X_train_text, X_train_numeric])
X_valid = hstack([X_valid_text, X_valid_numeric])


## 6. 5-fold GroupKFold cross-validation to tune alpha

In [21]:
from sklearn.model_selection import GroupKFold
from scipy.sparse import csr_matrix

# csr_matrix format for efficient row slicing
X_train = csr_matrix(X_train)

alphas = [0.1, 0.5, 1.0, 2.0]
groups = train["student_id"].values   # group by student

gkf = GroupKFold(n_splits=5)
cv_results = {}

# Cross-validation to find best alpha
for a in alphas:
    fold_accuracies = []

    # Split data into training and validation folds
    for train_idx, val_idx in gkf.split(X_train, y_train, groups=groups):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]

        # Train Naive Bayes model
        nb = MultinomialNB(alpha=a)
        nb.fit(X_tr, y_tr)
        y_val_pred = nb.predict(X_val)

        # Evaluate accuracy
        acc = accuracy_score(y_val, y_val_pred)
        fold_accuracies.append(acc)

    # Compute mean accuracy across folds
    mean_acc = np.mean(fold_accuracies)
    cv_results[a] = mean_acc
    print(f"alpha = {a}, mean CV accuracy = {mean_acc:.4f}")

# Select best alpha
best_alpha = max(cv_results, key=cv_results.get)
print("Best alpha from CV:", best_alpha, "with accuracy", cv_results[best_alpha])


alpha = 0.1, mean CV accuracy = 0.6026
alpha = 0.5, mean CV accuracy = 0.6306
alpha = 1.0, mean CV accuracy = 0.6426
alpha = 2.0, mean CV accuracy = 0.6268
Best alpha from CV: 1.0 with accuracy 0.6426000899685109


## 7. Final model trained on full training set and evaluated on validation set

In [22]:
final_nb = MultinomialNB(alpha=best_alpha)
final_nb.fit(X_train, y_train)

y_train_pred = nb.predict(X_train)
y_valid_pred = final_nb.predict(X_valid)

print("Training accuracy:", accuracy_score(y_train, y_train_pred))
print("Final validation accuracy:", accuracy_score(y_valid, y_valid_pred))
print(classification_report(y_valid, y_valid_pred, digits=3))

Training accuracy: 0.8472222222222222
Final validation accuracy: 0.6097560975609756
              precision    recall  f1-score   support

           0      0.696     0.780     0.736        41
           1      0.556     0.610     0.581        41
           2      0.562     0.439     0.493        41

    accuracy                          0.610       123
   macro avg      0.605     0.610     0.603       123
weighted avg      0.605     0.610     0.603       123

