# Naive Bayes

## 1. Import libraries and load data

In [1]:
import pandas as pd
import numpy as np

# Locations of the pre-cleaned splits, relative to this notebook
TRAIN_CSV = "../cleaned_data/train_clean.csv"
VALID_CSV = "../cleaned_data/validation_clean.csv"

# Read the training and validation splits into DataFrames
train = pd.read_csv(TRAIN_CSV)
valid = pd.read_csv(VALID_CSV)

## 2. Data Pre-Processing
### 2.1 Combine and clean text features

In [2]:
# Free-text columns that are merged into one document per row, so that a
# single vectorizer can be fitted on all of the text at once.
text_cols = ["tasks_use_model", "suboptimal_example", "verify_method"]

def combine_text(df, cols=None):
    """Join the free-text columns of ``df`` into a single ``full_text`` column.

    Missing values are replaced by empty strings and every remaining value is
    cast to ``str`` before joining, so a column that pandas parsed as numeric
    does not make ``" ".join`` raise ``TypeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to combine. It is modified in place:
        a ``full_text`` column is added (or overwritten).
    cols : sequence of str, optional
        Columns to join, in order, separated by single spaces. Defaults to
        the module-level ``text_cols``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    cols = text_cols if cols is None else list(cols)
    # fillna first so missing entries become "" rather than the string "nan"
    df["full_text"] = df[cols].fillna("").astype(str).agg(" ".join, axis=1)
    return df

# Add the combined `full_text` column to both splits
train, valid = combine_text(train), combine_text(valid)

In [None]:
# Text normalisation: lowercase, replace punctuation with spaces, squeeze whitespace
import re

def clean_text(s):
    """Return ``s`` lowercased, stripped of punctuation, with whitespace collapsed.

    Every character other than ``a-z``, ``0-9`` or whitespace is replaced by a
    space (this includes non-ASCII letters), runs of whitespace are collapsed
    to a single space, and leading/trailing whitespace is removed.
    """
    without_symbols = re.sub(r"[^a-z0-9\s]", " ", s.lower())
    return re.sub(r"\s+", " ", without_symbols).strip()

# Normalise the combined text of both splits with `clean_text`
for split_df in (train, valid):
    split_df["full_text"] = split_df["full_text"].apply(clean_text)


### 2.2 Select numeric and categorical features

In [None]:
# Ordinal features (used below as plain numeric columns)
ordinal_cols = [
    "academic_use_likelihood",
    "suboptimal_frequency",
    "reference_expectation",
    "verify_frequency",
]

# Categorical features: every training column whose name starts with one of
# these prefixes (presumably one-hot indicators -- confirm against the cleaning step)
_task_type_prefixes = ("best_task_types_", "suboptimal_task_types_")
categorical_cols = [c for c in train.columns if c.startswith(_task_type_prefixes)]

# All non-text feature columns, ordinal first
feature_cols = ordinal_cols + categorical_cols

# Dense arrays of the non-text features, same column order for both splits
X_train_numeric = train[feature_cols].to_numpy()
X_valid_numeric = valid[feature_cols].to_numpy()

TypeError: expected string or bytes-like object, got 'Series'

## 3. Vectorize text and build feature matrices

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-Words settings for the baseline: unigrams and bigrams, terms must
# occur in at least 2 documents, vocabulary capped at 3000 entries
bow_params = {
    "max_features": 3000,
    "ngram_range": (1, 2),
    "min_df": 2,
}
vectorizer = CountVectorizer(**bow_params)

# The vocabulary is learned from the training text only; validation text is
# encoded with that same vocabulary.
X_train_text = vectorizer.fit_transform(train["full_text"])
X_valid_text = vectorizer.transform(valid["full_text"])

X_train_text.shape, X_valid_text.shape


((576, 3000), (123, 3000))

In [6]:
from scipy.sparse import hstack

# Target labels for each split
y_train = train["label"].values
y_valid = valid["label"].values

# Append the non-text feature columns to the right of the Bag-of-Words counts
X_train = hstack((X_train_text, X_train_numeric))
X_valid = hstack((X_valid_text, X_valid_numeric))


## 4. Train Naive Bayes baseline 

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Baseline model: alpha = 1.0, i.e. Laplace (add-one) smoothing
nb = MultinomialNB(alpha=1.0).fit(X_train, y_train)

# Predictions on the data the model was fitted on and on the validation split
y_train_pred, y_valid_pred = nb.predict(X_train), nb.predict(X_valid)

train_acc = accuracy_score(y_train, y_train_pred)
valid_acc = accuracy_score(y_valid, y_valid_pred)
print("Training accuracy (alpha=1.0):", train_acc)
print("Validation accuracy (alpha=1.0):", valid_acc)
print(classification_report(y_valid, y_valid_pred, digits=3))


Training accuracy (alpha=1.0): 0.890625
Validation accuracy (alpha=1.0): 0.6097560975609756
              precision    recall  f1-score   support

           0      0.696     0.780     0.736        41
           1      0.556     0.610     0.581        41
           2      0.562     0.439     0.493        41

    accuracy                          0.610       123
   macro avg      0.605     0.610     0.603       123
weighted avg      0.605     0.610     0.603       123



## 5. With vs without stopwords
### 5.1 Grid search over Bag-of-Words settings and alpha, with vs without stopwords

In [8]:
# Search space: every vectorizer setting is swept jointly with the smoothing
# strength alpha (2 * 4 * 3 * 3 vectorizers, 4 alphas each).
stopword_options = [False, True]
max_features_list = [1000, 2000, 3000, 5000]
ngram_ranges = [(1, 1), (1, 2), (1, 3)]
min_dfs = [1, 2, 3]
alphas = [0.1, 0.5, 1.0, 2.0]

# Best validation accuracy seen so far and the settings that produced it.
# Only a strictly better score replaces the incumbent, so ties keep the
# configuration that was tried first.
best_acc = -np.inf
best_config = None

for use_stopwords in stopword_options:
    print(f"\n=== stop_words = {use_stopwords} ===")

    for mf in max_features_list:
        for ngram in ngram_ranges:
            for md in min_dfs:
                # One vectorizer per (stopwords, mf, ngram, md) combination.
                # stop_words=None is CountVectorizer's default (no filtering).
                # Loop-local names are used on purpose: the notebook-level
                # `vectorizer`, `X_train_text`, `X_valid_text` and `nb` must not
                # end up holding whichever configuration happened to run last.
                grid_vectorizer = CountVectorizer(
                    max_features=mf,
                    ngram_range=ngram,
                    min_df=md,
                    stop_words="english" if use_stopwords else None,
                )

                # Vocabulary is learned from the training text only
                grid_train_text = grid_vectorizer.fit_transform(train["full_text"])
                grid_valid_text = grid_vectorizer.transform(valid["full_text"])

                # Append the non-text features; built once and reused for every alpha
                X_train_tmp = hstack([grid_train_text, X_train_numeric])
                X_valid_tmp = hstack([grid_valid_text, X_valid_numeric])

                for alpha in alphas:
                    grid_nb = MultinomialNB(alpha=alpha)
                    grid_nb.fit(X_train_tmp, y_train)
                    y_pred = grid_nb.predict(X_valid_tmp)

                    acc = accuracy_score(y_valid, y_pred)
                    print(
                        f"acc={acc:.4f}, stopwords={use_stopwords}, "
                        f"mf={mf}, ngram={ngram}, min_df={md}, alpha={alpha}"
                    )

                    if acc > best_acc:
                        best_acc = acc
                        best_config = {
                            "stopwords": use_stopwords,
                            "max_features": mf,
                            "ngram_range": ngram,
                            "min_df": md,
                            "alpha": alpha,
                            "vectorizer": grid_vectorizer,  # keep the fitted one
                        }

print("\nBest validation accuracy:", best_acc)
print("Best config:", best_config)



=== stop_words = False ===
acc=0.5691, stopwords=False, mf=1000, ngram=(1, 1), min_df=1, alpha=0.1
acc=0.5772, stopwords=False, mf=1000, ngram=(1, 1), min_df=1, alpha=0.5
acc=0.6098, stopwords=False, mf=1000, ngram=(1, 1), min_df=1, alpha=1.0
acc=0.6423, stopwords=False, mf=1000, ngram=(1, 1), min_df=1, alpha=2.0
acc=0.5610, stopwords=False, mf=1000, ngram=(1, 1), min_df=2, alpha=0.1
acc=0.5854, stopwords=False, mf=1000, ngram=(1, 1), min_df=2, alpha=0.5
acc=0.6016, stopwords=False, mf=1000, ngram=(1, 1), min_df=2, alpha=1.0
acc=0.6341, stopwords=False, mf=1000, ngram=(1, 1), min_df=2, alpha=2.0
acc=0.5528, stopwords=False, mf=1000, ngram=(1, 1), min_df=3, alpha=0.1
acc=0.5854, stopwords=False, mf=1000, ngram=(1, 1), min_df=3, alpha=0.5
acc=0.6098, stopwords=False, mf=1000, ngram=(1, 1), min_df=3, alpha=1.0
acc=0.6179, stopwords=False, mf=1000, ngram=(1, 1), min_df=3, alpha=2.0
acc=0.5854, stopwords=False, mf=1000, ngram=(1, 2), min_df=1, alpha=0.1
acc=0.5854, stopwords=False, mf=1000

### 5.2 Rebuild features and final model using best config

In [9]:
from scipy.sparse import hstack
from sklearn.metrics import classification_report, accuracy_score

# Unpack the winning grid-search settings into individual names
best_use_stopwords = best_config["stopwords"]
best_mf = best_config["max_features"]
best_ngram = best_config["ngram_range"]
best_min_df = best_config["min_df"]
best_alpha = best_config["alpha"]

print("Using best config:")
print(" stopwords:", best_use_stopwords)
print(" max_features:", best_mf)
print(" ngram_range:", best_ngram)
print(" min_df:", best_min_df)
print(" alpha:", best_alpha)

# Build a fresh vectorizer with those settings and refit it (instead of
# reusing the fitted best_config['vectorizer']). The stop-word list is only
# passed when the best configuration used it.
vectorizer_kwargs = {
    "max_features": best_mf,
    "ngram_range": best_ngram,
    "min_df": best_min_df,
}
if best_use_stopwords:
    vectorizer_kwargs["stop_words"] = 'english'
vectorizer = CountVectorizer(**vectorizer_kwargs)

X_train_text = vectorizer.fit_transform(train["full_text"])
X_valid_text = vectorizer.transform(valid["full_text"])

# Text counts followed by the non-text feature columns
X_train = hstack((X_train_text, X_train_numeric))
X_valid = hstack((X_valid_text, X_valid_numeric))

# Fit with the best alpha and score on the validation split
final_nb = MultinomialNB(alpha=best_alpha).fit(X_train, y_train)

y_valid_pred = final_nb.predict(X_valid)
print("Final validation accuracy:", accuracy_score(y_valid, y_valid_pred))
print(classification_report(y_valid, y_valid_pred, digits=3))


Using best config:
 stopwords: False
 max_features: 1000
 ngram_range: (1, 3)
 min_df: 1
 alpha: 2.0
Final validation accuracy: 0.6666666666666666
              precision    recall  f1-score   support

           0      0.761     0.854     0.805        41
           1      0.615     0.585     0.600        41
           2      0.605     0.561     0.582        41

    accuracy                          0.667       123
   macro avg      0.661     0.667     0.662       123
weighted avg      0.661     0.667     0.662       123



## 6. 5-fold GroupKFold cross-validation to tune alpha

In [10]:
from sklearn.model_selection import GroupKFold
from scipy.sparse import csr_matrix

# CSR layout so rows can be selected with the fold index arrays
X_train = csr_matrix(X_train)

alphas = [0.1, 0.5, 1.0, 2.0]
groups = train["student_id"].values   # rows sharing a student_id stay in the same fold

gkf = GroupKFold(n_splits=5)


def _mean_cv_accuracy(alpha):
    """Mean held-out accuracy of MultinomialNB(alpha) across the GroupKFold splits.

    NOTE(review): the columns of X_train come from a vectorizer that was fitted
    on the whole training set, so each fold's held-out rows already influenced
    the vocabulary.
    """
    fold_scores = []
    for fit_rows, held_out_rows in gkf.split(X_train, y_train, groups=groups):
        fold_model = MultinomialNB(alpha=alpha).fit(X_train[fit_rows], y_train[fit_rows])
        held_out_pred = fold_model.predict(X_train[held_out_rows])
        fold_scores.append(accuracy_score(y_train[held_out_rows], held_out_pred))
    return np.mean(fold_scores)


# Mean cross-validated accuracy for each candidate alpha
cv_results = {}
for a in alphas:
    cv_results[a] = _mean_cv_accuracy(a)
    print(f"alpha = {a}, mean CV accuracy = {cv_results[a]:.4f}")

# Alpha with the highest mean accuracy (first one wins on ties)
best_alpha = max(cv_results, key=cv_results.get)
print("Best alpha from CV:", best_alpha, "with accuracy", cv_results[best_alpha])


alpha = 0.1, mean CV accuracy = 0.6406
alpha = 0.5, mean CV accuracy = 0.6371
alpha = 1.0, mean CV accuracy = 0.6424
alpha = 2.0, mean CV accuracy = 0.6372
Best alpha from CV: 1.0 with accuracy 0.6423751686909582


## 7. Final model trained on full training set and evaluated on validation set

In [11]:
# Refit on the complete training set with the alpha chosen by cross-validation
final_nb = MultinomialNB(alpha=best_alpha).fit(X_train, y_train)

y_train_pred, y_valid_pred = final_nb.predict(X_train), final_nb.predict(X_valid)

train_acc = accuracy_score(y_train, y_train_pred)
valid_acc = accuracy_score(y_valid, y_valid_pred)
print("Training accuracy:", train_acc)
print("Final validation accuracy:", valid_acc)
print(classification_report(y_valid, y_valid_pred, digits=3))

Training accuracy: 0.796875
Final validation accuracy: 0.6422764227642277
              precision    recall  f1-score   support

           0      0.744     0.780     0.762        41
           1      0.585     0.585     0.585        41
           2      0.590     0.561     0.575        41

    accuracy                          0.642       123
   macro avg      0.640     0.642     0.641       123
weighted avg      0.640     0.642     0.641       123



## 8. Evaluating on test set
### 8.1 Load and preprocess test set

In [12]:
# Load the test split and run it through the same text pipeline as
# train/valid: merge the text columns (same text_cols), then normalise.
test = combine_text(pd.read_csv("../cleaned_data/test_clean.csv"))
test["full_text"] = test["full_text"].apply(clean_text)

# Labels, and the non-text features in the same feature_cols order as before
y_test = test["label"].values
X_test_numeric = test[feature_cols].to_numpy()


### 8.2 Build test feature matrix and evaluate on test set

In [13]:

# Encode the test text with the already-fitted vectorizer (transform only,
# no refit) and append the non-text features
X_test_text = vectorizer.transform(test["full_text"])
X_test = hstack((X_test_text, X_test_numeric))

# Score the final Naive Bayes model on the test split
y_test_pred = final_nb.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred)

print("Test accuracy:", test_acc)
print(classification_report(y_test, y_test_pred, digits=3))


Test accuracy: 0.5952380952380952
              precision    recall  f1-score   support

           0      0.655     0.857     0.742        42
           1      0.535     0.548     0.541        42
           2      0.571     0.381     0.457        42

    accuracy                          0.595       126
   macro avg      0.587     0.595     0.580       126
weighted avg      0.587     0.595     0.580       126

