### Importing libraries and required modules

In [4]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import time

# Importing our models

from src.models.naive_bayes_model import NaiveBayesRecommender
from src.models.knn_model import KNNRecommender
from src.models.logistic_regression_model import LogisticRegressionRecommender


# Setting style: apply the 'seaborn-v0_8-darkgrid' matplotlib style sheet to
# every figure in this notebook, then make seaborn's 'husl' palette the default
# colour cycle.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

### Loading Preprocessed Data

In [5]:
print("Loading processed data...")

# Every preprocessed artifact lives in the same directory (relative to the
# notebook's working directory), so the location is defined once.
PROCESSED_DIR = Path('../data/processed')

# The three splits produced by the preprocessing step.
train = pd.read_csv(PROCESSED_DIR / 'processed_train.csv')
val = pd.read_csv(PROCESSED_DIR / 'processed_val.csv')
test = pd.read_csv(PROCESSED_DIR / 'processed_test.csv')

# Loading category mapping.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which is not UTF-8 everywhere and could mis-decode
# (or fail on) any non-ASCII category name.
with open(PROCESSED_DIR / 'category_mapping.json', 'r', encoding='utf-8') as f:
    category_mapping = json.load(f)

print(f"Training set: {len(train)} books")
print(f"Validation set: {len(val)} books")
print(f"Test set: {len(test)} books")
print(f"\nCategories: {list(category_mapping.keys())}")

Loading processed data...
Training set: 766 books
Validation set: 110 books
Test set: 220 books

Categories: ['Biography & Autobiography', 'Business & Economics', 'Computers', 'Education', 'Fiction', 'History', 'Juvenile Fiction', 'Language Arts & Disciplines', 'Literary Criticism', 'Philosophy', 'Science']


### Preparing Features and Labels

In [6]:
# Split each dataset into its model input (the 'combined_text' column) and its
# target (the 'category_encoded' column).
X_train, y_train = train['combined_text'], train['category_encoded']
X_val, y_val = val['combined_text'], val['category_encoded']
X_test, y_test = test['combined_text'], test['category_encoded']

# Quick sanity check: features and labels of the training split should line up.
print(
    "Features prepared!",
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    sep="\n",
)

Features prepared!
X_train shape: (766,)
y_train shape: (766,)


### Training Naive Bayes Model

In [7]:
print("="*60)
print("TRAINING NAIVE BAYES MODEL")
print("="*60)

# Time the fit with perf_counter(): it is a monotonic, high-resolution clock
# intended for measuring short durations, whereas time.time() is the wall clock
# and can jump (or be too coarse) for a fit that takes well under a second.
start_time = time.perf_counter()

nb_model = NaiveBayesRecommender(model_type='multinomial')
nb_model.train(X_train, y_train)

# Elapsed training time in seconds; reused later in the model comparison table.
nb_train_time = time.perf_counter() - start_time
print(f"\nTraining time: {nb_train_time:.2f} seconds")

# Evaluating on validation set
print("\n--- Validation Set Performance ---")
nb_val_metrics = nb_model.evaluate(X_val, y_val)

# Evaluating on test set (these metrics feed the model comparison table)
print("\n--- Test Set Performance ---")
nb_test_metrics = nb_model.evaluate(X_test, y_test)

# Saving model
nb_model.save_model()
print("\n✓ Naive Bayes model saved!")

TRAINING NAIVE BAYES MODEL
Training Naive Bayes model...
Training completed!

Training time: 0.09 seconds

--- Validation Set Performance ---

MULTINOMIAL Naive Bayes Evaluation:
Accuracy: 0.4727
Precision: 0.3631
Recall: 0.4727
F1-Score: 0.3311

--- Test Set Performance ---

MULTINOMIAL Naive Bayes Evaluation:
Accuracy: 0.4909
Precision: 0.4579
Recall: 0.4909
F1-Score: 0.3560
Model saved to /Users/nirdeshsubedi/Documents/Courseworks/book-rec-ai/ai-book-recommendation/trained_models/naive_bayes.pkl

✓ Naive Bayes model saved!


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Training Logistic Regression Model

In [8]:
print("\n" + "="*60)
print("TRAINING LOGISTIC REGRESSION MODEL")
print("="*60)

# Time the fit with perf_counter(): it is a monotonic, high-resolution clock
# intended for measuring durations, whereas time.time() is the wall clock and
# can jump if the system time is adjusted while the model is training.
start_time = time.perf_counter()

lr_model = LogisticRegressionRecommender(max_iter=1000)
lr_model.train(X_train, y_train)

# Elapsed training time in seconds; reused later in the model comparison table.
lr_train_time = time.perf_counter() - start_time
print(f"\nTraining time: {lr_train_time:.2f} seconds")

# Evaluating on validation set
print("\n--- Validation Set Performance ---")
lr_val_metrics = lr_model.evaluate(X_val, y_val)

# Evaluating on test set (these metrics feed the model comparison table)
print("\n--- Test Set Performance ---")
lr_test_metrics = lr_model.evaluate(X_test, y_test)

# Saving model
lr_model.save_model()
print("\n✓ Logistic Regression model saved!")



TRAINING LOGISTIC REGRESSION MODEL
Training Logistic Regression model...




Training completed!

Training time: 1.59 seconds

--- Validation Set Performance ---

Logistic Regression Evaluation:
Accuracy: 0.7091
Precision: 0.6972
Recall: 0.7091
F1-Score: 0.6927

--- Test Set Performance ---

Logistic Regression Evaluation:
Accuracy: 0.6591
Precision: 0.6278
Recall: 0.6591
F1-Score: 0.6350
Model saved to /Users/nirdeshsubedi/Documents/Courseworks/book-rec-ai/ai-book-recommendation/trained_models/logistic_regression.pkl

✓ Logistic Regression model saved!


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Training KNN Model

In [9]:
print("\n" + "="*60)
print("TRAINING KNN MODEL")
print("="*60)

# Time the fit with perf_counter(): it is a monotonic, high-resolution clock
# intended for measuring short durations, whereas time.time() is the wall clock
# and can jump (or be too coarse) for a fit that takes well under a second.
start_time = time.perf_counter()

knn_model = KNNRecommender(n_neighbors=15)
knn_model.train(X_train, y_train)

# Elapsed training time in seconds; reused later in the model comparison table.
knn_train_time = time.perf_counter() - start_time
print(f"\nTraining time: {knn_train_time:.2f} seconds")

# Evaluating on validation set
print("\n--- Validation Set Performance ---")
knn_val_metrics = knn_model.evaluate(X_val, y_val)

# Evaluating on test set (these metrics feed the model comparison table)
print("\n--- Test Set Performance ---")
knn_test_metrics = knn_model.evaluate(X_test, y_test)

# Saving model
knn_model.save_model()
print("\n✓ KNN model saved!")


TRAINING KNN MODEL
Training KNN model with 15 neighbors...
Training completed!

Training time: 0.08 seconds

--- Validation Set Performance ---

KNN Evaluation (k=15):
Accuracy: 0.6364
Precision: 0.5841
Recall: 0.6364
F1-Score: 0.5718

--- Test Set Performance ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



KNN Evaluation (k=15):
Accuracy: 0.6318
Precision: 0.5706
Recall: 0.6318
F1-Score: 0.5601
Model saved to /Users/nirdeshsubedi/Documents/Courseworks/book-rec-ai/ai-book-recommendation/trained_models/knn.pkl

✓ KNN model saved!


### Comparing all models

In [10]:
print("\n" + "="*60)
print("MODEL COMPARISON - TEST SET RESULTS")
print("="*60)

# One (display name, test-set metrics, training seconds) entry per model,
# listed in the order the rows appear in the table.
model_results = [
    ('Naive Bayes', nb_test_metrics, nb_train_time),
    ('KNN', knn_test_metrics, knn_train_time),
    ('Logistic Regression', lr_test_metrics, lr_train_time),
]

# Table column header -> key of that value in each metrics dict.
metric_columns = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1_score',
}

# Build the table row by row so each model's numbers stay together instead of
# being spread across parallel per-column lists.
comparison_df = pd.DataFrame([
    {
        'Model': model_name,
        **{column: metrics[key] for column, key in metric_columns.items()},
        'Training Time (s)': train_seconds,
    }
    for model_name, metrics, train_seconds in model_results
])

print("\n" + comparison_df.to_string(index=False))

# Find best model: the row with the highest test-set accuracy.
best_accuracy_idx = comparison_df['Accuracy'].idxmax()
best_model = comparison_df.at[best_accuracy_idx, 'Model']
best_accuracy = comparison_df.at[best_accuracy_idx, 'Accuracy']

print(f"\nBest Model: {best_model} with {best_accuracy:.4f} accuracy")

# Saving comparison next to the trained models.
comparison_df.to_csv('../trained_models/model_comparison.csv', index=False)
print("\nComparison saved to trained_models/model_comparison.csv")



MODEL COMPARISON - TEST SET RESULTS

              Model  Accuracy  Precision   Recall  F1-Score  Training Time (s)
        Naive Bayes  0.490909   0.457889 0.490909  0.356031           0.090000
                KNN  0.631818   0.570650 0.631818  0.560075           0.082499
Logistic Regression  0.659091   0.627763 0.659091  0.635027           1.589647

Best Model: Logistic Regression with 0.6591 accuracy

Comparison saved to trained_models/model_comparison.csv
