In [10]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import load_svmlight_file
import joblib
import os 

# --- Configuration ---
RAW_DATA_DIR = "../data/raw/"
PROCESSED_DATA_DIR = "../data/processed/"
MODEL_DIR = "../models/"
OUTPUT_DIR = "../output/"

# --- Create directories if they don't exist ---
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

INPUT_MATRIX_FILE = "user_item_rating_matrix.libsvm"
MODEL_FILE = "knn_recommender.pkl"
OUTPUT_RECOMMENDATIONS_FILE = "recommendations_output.csv"

# --- Load Data and Mappings ---
print("Loading processed data and creating mappings...")

# Load the user-item matrix
user_item_matrix, _ = load_svmlight_file(f"{PROCESSED_DATA_DIR}{INPUT_MATRIX_FILE}")

# To get real book titles, we need to recreate the mappings from the preprocessing step
ratings_df = pd.read_csv(f"{RAW_DATA_DIR}Ratings.csv", delimiter=";", low_memory=False, on_bad_lines='skip', encoding='latin-1')
ratings_df.columns = ['UserID', 'ISBN', 'Rating']
books_df = pd.read_csv(f"{RAW_DATA_DIR}Books.csv", delimiter=";", low_memory=False, on_bad_lines='skip', encoding='latin-1')


Loading processed data and creating mappings...


In [11]:
# --- Train or Load Model ---
print("Initializing Nearest Neighbors model...")
try:
    # Try to load a pre-trained model to save time
    model_knn = joblib.load(f"{MODEL_DIR}{MODEL_FILE}")
    print("Loaded pre-trained model from disk.")
except FileNotFoundError:
    print("No pre-trained model found. Training a new one...")
    # Setup K-Nearest Neighbors model with cosine similarity
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
    model_knn.fit(user_item_matrix)
    # Save the newly trained model for future use
    joblib.dump(model_knn, f"{MODEL_DIR}{MODEL_FILE}")
    print(f"Model trained and saved to {MODEL_DIR}{MODEL_FILE}")

Initializing Nearest Neighbors model...
No pre-trained model found. Training a new one...
Model trained and saved to ../models/knn_recommender.pkl


In [12]:
# --- Recommendation Generation ---

def generate_recommendations(user_idx, k_neighbors=10, top_n=5):
    """Generates book recommendations for a single user."""
    
    # Find K nearest neighbors (excluding the user itself)
    distances, indices = model_knn.kneighbors(user_item_matrix[user_idx], n_neighbors=k_neighbors + 1)
    
    # Get neighbor indexes and their similarity scores
    neighbor_indices = indices.flatten()[1:]
    neighbor_similarities = 1 - distances.flatten()[1:]

    # Get the ratings of the neighbors
    neighbor_ratings = user_item_matrix[neighbor_indices].toarray()
    
    # Calculate the weighted average score for each book
    weighted_scores = np.dot(neighbor_similarities, neighbor_ratings)
    
    # Exclude books the user has already rated
    user_rated_books = user_item_matrix[user_idx].toarray().flatten() > 0
    weighted_scores[user_rated_books] = 0
    
    # Get the top N book indexes
    top_book_indices = np.argsort(-weighted_scores)[:top_n]
    
    # Create a list of recommendations
    recommendations = []
    for book_idx in top_book_indices:
        isbn = index_to_book.get(book_idx)
        recommendations.append({
            "User_Index": user_idx,
            "Book_Title": isbn_to_title.get(isbn, "Unknown Title"),
            "Recommendation_Score": weighted_scores[book_idx],
            "ISBN": isbn
        })
    return recommendations

In [None]:
# --- Generate for All Users and Save ---
print("\nGenerating recommendations for all users...")
all_recommendations = []
num_users = user_item_matrix.shape[0]

# Remove `[:1000]` to run for all users.
for user_idx in range(num_users)[:1000]:
    if user_idx % 100 == 0:
        print(f"Processing user {user_idx}/{num_users}...")
    recs = generate_recommendations(user_idx)
    all_recommendations.extend(recs)

recommendations_df = pd.DataFrame(all_recommendations)

# --- Save Output ---
recommendations_df.to_csv(f"{OUTPUT_DIR}{OUTPUT_RECOMMENDATIONS_FILE}", index=False)
print(f"\nRecommendations saved successfully to {OUTPUT_DIR}{OUTPUT_RECOMMENDATIONS_FILE}")
display(recommendations_df.head(10))


Generating recommendations for all users...
Processing user 0/105283...
Processing user 100/105283...
Processing user 200/105283...
Processing user 300/105283...
Processing user 400/105283...
Processing user 500/105283...
Processing user 600/105283...
Processing user 700/105283...
Processing user 800/105283...
Processing user 900/105283...
Processing user 1000/105283...
Processing user 1100/105283...
Processing user 1200/105283...
Processing user 1300/105283...
Processing user 1400/105283...
Processing user 1500/105283...
Processing user 1600/105283...
Processing user 1700/105283...
Processing user 1800/105283...
Processing user 1900/105283...
Processing user 2000/105283...
Processing user 2100/105283...
Processing user 2200/105283...
Processing user 2300/105283...
Processing user 2400/105283...
Processing user 2500/105283...
Processing user 2600/105283...
Processing user 2700/105283...
Processing user 2800/105283...
Processing user 2900/105283...
Processing user 3000/105283...
Proces

Unnamed: 0,User_Index,Book_Title,Recommendation_Score,ISBN
0,0,Flesh Tones: A Novel,0.0,034545104X
1,0,American Primitive,0.0,0316650048
2,0,Angels Go Naked: A Novel,0.0,1582430624
3,0,Oliver Twist (Oxford World's Classics),0.0,0192834398
4,0,Unknown Title,0.0,9780684800714
5,1,Flesh Tones: A Novel,0.0,034545104X
6,1,American Primitive,0.0,0316650048
7,1,Angels Go Naked: A Novel,0.0,1582430624
8,1,Oliver Twist (Oxford World's Classics),0.0,0192834398
9,1,Unknown Title,0.0,9780684800714
