In [1]:
# !pip uninstall numpy pandas scikit-surprise scipy -y

In [2]:
# !pip install numpy==1.26.4 pandas scikit-surprise

In [3]:
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import re 

from surprise import Reader, Dataset, SVD, BaselineOnly # Import BaselineOnly
from surprise.model_selection import train_test_split
from surprise import accuracy
import numpy as np

In [4]:
import pandas as pd
import os

# Path to extracted MovieLens dataset
DATA_DIR = './data/ml-25m'

# Optimized dtypes
rating_dtypes = {
    'userId': 'int32',
    'movieId': 'int32',
    'rating': 'float32',
    'timestamp': 'int32'
}
movie_dtypes = {
    'movieId': 'int32',
    'title': 'string',
    'genres': 'string'
}
tag_dtypes = {
    'userId': 'int32',
    'movieId': 'int32',
    'tag': 'string',
    'timestamp': 'int32'
}
link_dtypes = {
    'movieId': 'int32',
    'imdbId': 'float32',
    'tmdbId': 'float32'
}


def load_csv(filename, dtypes):
    path = os.path.join(DATA_DIR, filename)
    print(f"Loading {filename} ...")
    df = pd.read_csv(path, dtype=dtypes)
    print(f"Loaded {filename}: shape={df.shape}")
    return df

ratings = load_csv('ratings.csv', rating_dtypes)
movies  = load_csv('movies.csv', movie_dtypes)
tags    = load_csv('tags.csv', tag_dtypes)
links   = load_csv('links.csv', link_dtypes)
# genome_tags = load_csv('genome-tags.csv')
# genome_scores = pd.read_csv('./data/ml-25m/genome-scores.csv-scores.csv')


Loading ratings.csv ...
Loaded ratings.csv: shape=(25000095, 4)
Loading movies.csv ...
Loaded movies.csv: shape=(62423, 3)
Loading tags.csv ...
Loaded tags.csv: shape=(1093360, 4)
Loading links.csv ...
Loaded links.csv: shape=(62423, 3)


In [5]:
data = pd.merge(ratings,movies)
display(data)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance
...,...,...,...,...,...,...
25000090,162541,50872,4.5,1240953372,Ratatouille (2007),Animation|Children|Drama
25000091,162541,55768,2.5,1240951998,Bee Movie (2007),Animation|Comedy
25000092,162541,56176,2.0,1240950697,Alvin and the Chipmunks (2007),Children|Comedy
25000093,162541,58559,4.0,1240953434,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX


In [6]:

# --- BASELINE MODEL CALCULATIONS ---

def calculate_baseline_biases(data: pd.DataFrame):
    """
    Calculates the Global Mean, User Bias, and Item Bias.
    These three components form the complete baseline model prediction.
    """

    # 1. Global Mean (mu)
    # The simplest possible prediction for any unknown rating
    global_mean = data['rating'].mean()

    # 2. Item Bias (b_i)
    # Average deviation of each movie's ratings from the global mean
    
    # Calculate Mean Rating for each Movie
    movie_mean = data.groupby('movieId')['rating'].mean().reset_index(name='movie_mean')
    
    # Calculate Item Bias
    movie_mean['item_bias'] = movie_mean['movie_mean'] - global_mean
    item_bias_df = movie_mean[['movieId', 'item_bias']]

    # 3. User Bias (b_u)
    # Average deviation of each user's ratings from the global mean (after removing item effect)
    
    # Merge Item Bias back into the main data to calculate adjusted user ratings
    data_with_bias = pd.merge(data, item_bias_df, on='movieId', how='left')
    
    # Adjusted Rating = Original Rating - (Global Mean + Item Bias)
    # Adjusted Rating should ideally only contain the User Bias (plus noise)
    data_with_bias['adjusted_rating'] = data_with_bias['rating'] - (global_mean + data_with_bias['item_bias'])
    
    # Calculate User Bias (mean of the adjusted ratings for each user)
    user_bias_df = data_with_bias.groupby('userId')['adjusted_rating'].mean().reset_index(name='user_bias')

    return global_mean, item_bias_df, user_bias_df

def baseline_predict(global_mean, user_id, movie_id, item_bias_df, user_bias_df):
    """
    Predicts a rating using the baseline formula: mu + b_u + b_i
    """
    # 1. Get Global Mean (mu)
    mu = global_mean

    # 2. Get Item Bias (b_i)
    b_i_series = item_bias_df[item_bias_df['movieId'] == movie_id]['item_bias']
    b_i = b_i_series.iloc[0] if not b_i_series.empty else 0.0

    # 3. Get User Bias (b_u)
    b_u_series = user_bias_df[user_bias_df['userId'] == user_id]['user_bias']
    b_u = b_u_series.iloc[0] if not b_u_series.empty else 0.0
    
    # Baseline Prediction Formula:
    predicted_rating = mu + b_u + b_i
    
    return predicted_rating, mu, b_u, b_i


# 1. Calculate the Biases
global_mean, item_bias_df, user_bias_df = calculate_baseline_biases(data)

if global_mean is not None:
    print("\n--- BASELINE MODEL RESULTS ---")
    print(f"1. Global Mean (\u03bc): {global_mean:.4f}")

    print("\n2. User Biases (b_u): The User's Leniency/Harshness")
    print(user_bias_df)

    print("\n3. Item Biases (b_i): The Movie's Popularity/Quality")
    print(item_bias_df.sort_values('item_bias', ascending=False))

    


--- BASELINE MODEL RESULTS ---
1. Global Mean (μ): 3.5339

2. User Biases (b_u): The User's Leniency/Harshness
        userId  user_bias
0            1   0.009460
1            2  -0.025164
2            3   0.244178
3            4  -0.262870
4            5   0.271491
...        ...        ...
162536  162537   0.712264
162537  162538  -0.202443
162538  162539   0.736999
162539  162540   0.497151
162540  162541  -0.188255

[162541 rows x 2 columns]

3. Item Biases (b_i): The Movie's Popularity/Quality
       movieId  item_bias
29523   136782   1.466145
49654   186119   1.466145
29643   137032   1.466145
49041   184643   1.466145
29646   137038   1.466145
...        ...        ...
5693      5805  -3.033855
55757   199922  -3.033855
53387   194608  -3.033855
58517   207153  -3.033855
45900   177419  -3.033855

[59047 rows x 2 columns]


In [7]:
# 4. Example Prediction
    
user_to_predict = 1
movie_to_predict = 899

predicted_rating, mu, b_u, b_i = baseline_predict(
    global_mean, user_to_predict, movie_to_predict, item_bias_df, user_bias_df
)

actual_rating = data[(data['userId'] == user_to_predict) & (data['movieId'] == movie_to_predict)]['rating'].iloc[0]

print(f"\n--- BASELINE PREDICTION EXAMPLE (User {user_to_predict} for Movie {movie_to_predict}) ---")
print(f"Actual Rating: {actual_rating:.2f}")
print(f"Prediction Components:")
print(f"  \u03bc (Global Mean): {mu:.4f}")
print(f"  b_u (User Bias):   {b_u:.4f}")
print(f"  b_i (Item Bias):   {b_i:.4f}")
print(f"  Baseline Prediction (\u03bc + b_u + b_i): {predicted_rating:.4f}")


--- BASELINE PREDICTION EXAMPLE (User 1 for Movie 899) ---
Actual Rating: 3.50
Prediction Components:
  μ (Global Mean): 3.5339
  b_u (User Bias):   0.0095
  b_i (Item Bias):   0.5171
  Baseline Prediction (μ + b_u + b_i): 4.0604


In [8]:
# --- 1. Data Preparation for Surprise ---

reader = Reader(rating_scale=(0.5, 5.0))

# Load the DataFrame into the Surprise Dataset format
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# --- 2. Train-Test Split ---
# Split data into 80% training set and 20% test set
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# --- 3. Matrix Factorization (SVD Implementation) ---

# SVD is the implementation of Matrix Factorization trained via SGD
algo = SVD(
    n_factors=100,  # Number of latent factors (k)
    n_epochs=20,    # Number of training iterations
    lr_all=0.005,   # Learning rate
    reg_all=0.02,   # Regularization term
    random_state=42
)

print("\n--- 4. Training Matrix Factorization (SVD) Model ---")
# Train the algorithm on the training set
algo.fit(trainset)
print("Training complete.")


--- 4. Training Matrix Factorization (SVD) Model ---
Training complete.


In [9]:

# --- 5. Evaluation ---

# Predict ratings for the test set
predictions = algo.test(testset)

# Calculate RMSE (Root Mean Squared Error) and MAE
rmse = accuracy.rmse(predictions, verbose=False)
mae = accuracy.mae(predictions, verbose=False)

print("\n--- 5. Evaluation Results ---")
print(f"SVD RMSE (Root Mean Squared Error): {rmse:.4f}")
print(f"SVD MAE (Mean Absolute Error): {mae:.4f}")



--- 5. Evaluation Results ---
SVD RMSE (Root Mean Squared Error): 0.7773
SVD MAE (Mean Absolute Error): 0.5865


In [10]:

# --- 6. Get Sample of Predictions ---

def get_prediction_sample(predictions, n_samples=5):
    """Converts Surprise predictions into a DataFrame and returns a sample."""
    # Convert predictions (named tuple list) to a dictionary list
    pred_data = [
        (uid, iid, r_ui, est, details['was_impossible'])
        for (uid, iid, r_ui, est, details) in predictions
    ]
    
    # Create the DataFrame
    pred_df = pd.DataFrame(
        pred_data,
        columns=['userId', 'movieId', 'Actual_Rating', 'Predicted_Rating', 'Impossible']
    )
    
    # Merge with movie titles (assuming the global movies_df is available, though 
    # we'll use a placeholder here for independence)
    
    # NOTE: You will need to load the 'movies.csv' file earlier in your full script 
    # and use it here to map movieIds to titles for better interpretation.
    
    print(f"Total test set predictions generated: {len(pred_df):,}")
    return pred_df.sample(n=n_samples, random_state=42)
    
N_SAMPLE = 10
prediction_sample_df = get_prediction_sample(predictions, N_SAMPLE)

print(f"\n--- 6. Sample of {N_SAMPLE} Predictions from Test Set ---")
print("This shows where the model's prediction (Predicted_Rating) matched the user's actual rating.")
print(prediction_sample_df[['userId', 'movieId', 'Actual_Rating', 'Predicted_Rating']])


Total test set predictions generated: 5,000,019

--- 6. Sample of 10 Predictions from Test Set ---
This shows where the model's prediction (Predicted_Rating) matched the user's actual rating.
         userId  movieId  Actual_Rating  Predicted_Rating
547077    70745     3507            3.5          3.352319
260763     2076     3133            4.0          3.618980
2148015   10397    69844            3.0          2.792635
3627539   92953     3624            3.0          3.094113
4692539   81695     6539            3.5          3.555909
4717853   33960      986            4.0          3.422375
967463   108059     1663            1.5          2.892461
3240406    9015     1258            4.5          4.602095
3217118   87138     8961            3.0          3.838990
1206106   35316     1389            2.0          2.189074


In [11]:
# --- 7. Accessing Learned Biases (for comparison to your EDA) ---

test_user_id = 1
test_movie_id = 899

def get_learned_biases(algo, test_user_id, test_movie_id):
    """Retrieves the learned user and item bias values from the SVD model."""
    try:
        # Get the internal ID (the index) for the movie and user from the trained model
        # This is necessary because the bias arrays are indexed by internal ID, not raw MovieLens ID
        internal_movie_id = algo.trainset.to_inner_iid(test_movie_id)
        internal_user_id = algo.trainset.to_inner_uid(test_user_id)

        # Access the bias arrays directly (bu for user bias, bi for item bias)
        # These are NumPy arrays indexed by the internal ID
        item_bias = algo.bi[internal_movie_id]
        user_bias = algo.bu[internal_user_id]
        
        return user_bias, item_bias

    except ValueError:
        # This happens if the user or movie ID was not in the training set (a cold start case)
        print(f"User {test_user_id} or Movie {test_movie_id} was not in the training set (Cold Start).")
        return None, None

prediction = algo.predict(test_user_id, test_movie_id)
predicted_rating = prediction.est

# Get the learned bias terms using the new helper function
user_bias, item_bias = get_learned_biases(algo, test_user_id, test_movie_id)
model_mu = algo.default_prediction() 

print(f"\n--- 6. Example Prediction for User {test_user_id} on Movie {test_movie_id} ---")
print(f"Predicted rating: {predicted_rating:.4f}")

# --- 7. Display Learned Biases ---

if user_bias is not None:
    # We use the prediction.details dictionary to retrieve the interaction term 
    # (which is the residual error after accounting for biases)
    interaction_term = predicted_rating - (model_mu + user_bias + item_bias)

    print("\n--- 7. Learned Biases from SVD ---")
    print(f"Learned Global Mean (\u03bc): {model_mu:.4f}")
    print(f"Learned User Bias (b_u):   {user_bias:.4f}")
    print(f"Learned Item Bias (b_i):   {item_bias:.4f}")
    print(f"Learned Interaction Term:  {interaction_term:.4f}")
    print(f"\nPrediction Check: \u03bc + b_u + b_i + interaction_term = {model_mu + user_bias + item_bias + interaction_term:.4f}")



--- 6. Example Prediction for User 1 on Movie 899 ---
Predicted rating: 3.5558

--- 7. Learned Biases from SVD ---
Learned Global Mean (μ): 3.5336
Learned User Bias (b_u):   -0.1323
Learned Item Bias (b_i):   0.3434
Learned Interaction Term:  -0.1889

Prediction Check: μ + b_u + b_i + interaction_term = 3.5558


In [12]:
def calculate_baseline_rmse(trainset, testset):
    """Trains the BaselineOnly model and calculates its RMSE on the test set."""
    print("\n--- 4A. Baseline Model (mu + b_u + b_i) ---")
    bsl_options = {'method': 'sgd'} # Use Stochastic Gradient Descent for optimization
    
    # Instantiate the BaselineOnly model
    algo_baseline = BaselineOnly(bsl_options=bsl_options)
    
    # Train and predict
    algo_baseline.fit(trainset)
    predictions_baseline = algo_baseline.test(testset)
    
    rmse_baseline = accuracy.rmse(predictions_baseline, verbose=False)
    mae_baseline = accuracy.mae(predictions_baseline, verbose=False)
    
    print(f"Baseline RMSE: {rmse_baseline:.4f}")
    print(f"Baseline MAE: {mae_baseline:.4f}")
    
    return rmse_baseline, mae_baseline, predictions_baseline

In [13]:
rmse_baseline, mae_baseline, predictions_baseline = calculate_baseline_rmse(trainset, testset)


--- 4A. Baseline Model (mu + b_u + b_i) ---
Estimating biases using sgd...
Baseline RMSE: 0.8641
Baseline MAE: 0.6593


In [14]:
print("\n=======================================================")
print(f"| Performance Improvement: {rmse_baseline - rmse:.4f} RMSE reduction |")
print(f"| SVD is {'BETTER' if rmse < rmse_baseline else 'WORSE'} than Baseline |")
print("=======================================================")


| Performance Improvement: 0.0868 RMSE reduction |
| SVD is BETTER than Baseline |


### Changing evaluation metric from accuracy to Precision@k and recall@k

In [15]:
def get_top_n(predictions, n=10, threshold=4.0):
    """
    Returns the top N recommendations for each user, based on predicted rating (est).
    
    A movie is considered "relevant" if the ACTUAL rating (r_ui) is above the threshold.
    """
    # 1. Map the predictions to each user
    top_n = defaultdict(list)
    for uid, iid, r_ui, est, _ in predictions:
        top_n[uid].append((iid, r_ui, est)) # Store item ID, actual rating, predicted rating

    # 2. Sort the predictions for each user and retrieve the k highest ones
    for uid, user_ratings in top_n.items():
        # Sort by predicted rating (est) in descending order
        user_ratings.sort(key=lambda x: x[2], reverse=True)
        # Keep only the top N items
        top_n[uid] = user_ratings[:n]

    return top_n

def precision_recall_at_k(predictions, k=10, threshold=4.0):
    """
    Calculates Precision@k and Recall@k for the predictions.
    Relevance is defined by the actual rating (r_ui) being >= threshold.
    """
    user_to_items = defaultdict(list)
    
    # 1. Map all relevant items (Actual Ratings >= Threshold) per user
    for uid, iid, r_ui, est, _ in predictions:
        if r_ui >= threshold:
            user_to_items[uid].append(iid) # The list of items the user ACTUALLY liked (Relevant Items)

    # 2. Get the top N predictions based on the *predicted score*
    top_n = get_top_n(predictions, n=k, threshold=threshold)

    precisions = dict()
    recalled_items = dict()
    
    # 3. Calculate Precision and Recall for each user
    for uid, recommended_items in top_n.items():
        # Recommended items are tuples: (iid, r_ui, est)
        
        # True Positives: Recommended items that were actually relevant (r_ui >= threshold)
        n_relevant_and_recommended = sum(1 for (iid, r_ui, est) in recommended_items if r_ui >= threshold)
        
        # Denominator for Recall: Total number of relevant items in the Test Set
        n_relevant_total = len(user_to_items[uid])
        
        # Precision@k: (Relevant and Recommended) / (Total Recommended @ k)
        precisions[uid] = n_relevant_and_recommended / k if k > 0 else 0
        
        # Recall@k: (Relevant and Recommended) / (Total Relevant in Test Set)
        recalled_items[uid] = n_relevant_and_recommended / n_relevant_total if n_relevant_total > 0 else 0

    # Return the average across all users
    return np.mean(list(precisions.values())), np.mean(list(recalled_items.values()))

In [16]:
K_VAL = 10
THRESHOLD = 4.0 # Define a rating threshold for 'relevance' (e.g., ratings >= 4.0 are considered a 'like')

In [17]:
# Calculate P@k and R@k for SVD
precision_svd, recall_svd = precision_recall_at_k(predictions, k=K_VAL, threshold=THRESHOLD)
# Calculate P@k and R@k for Baseline
precision_baseline, recall_baseline = precision_recall_at_k(predictions_baseline, k=K_VAL, threshold=THRESHOLD)

print("\n--- 4. Ranking Evaluation (P@k, R@k) ---")
print(f"Threshold for Relevance: Actual Rating >= {THRESHOLD}")
print(f"K (List Size): {K_VAL}")
print("-" * 35)
print(f"| Model | Precision@{K_VAL} | Recall@{K_VAL} |")
print("-" * 35)
print(f"| SVD   | {precision_svd:.4f}      | {recall_svd:.4f}     |")
print(f"| Baseline | {precision_baseline:.4f}   | {recall_baseline:.4f}     |")
print("-" * 35)

if precision_svd > precision_baseline:
    print("\n✅ SVD performed better on Precision@k, meaning its top 10 recommendations are more relevant.")
else:
    print("\n⚠️ Baseline performed better or equal on Precision@k. SVD needs tuning.")


--- 4. Ranking Evaluation (P@k, R@k) ---
Threshold for Relevance: Actual Rating >= 4.0
K (List Size): 10
-----------------------------------
| Model | Precision@10 | Recall@10 |
-----------------------------------
| SVD   | 0.6027      | 0.6965     |
| Baseline | 0.5694   | 0.6773     |
-----------------------------------

✅ SVD performed better on Precision@k, meaning its top 10 recommendations are more relevant.


In [18]:
merged_data_df = pd.merge(ratings, movies, on='movieId', how='left')
display(merged_data_df)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance
...,...,...,...,...,...,...
25000090,162541,50872,4.5,1240953372,Ratatouille (2007),Animation|Children|Drama
25000091,162541,55768,2.5,1240951998,Bee Movie (2007),Animation|Comedy
25000092,162541,56176,2.0,1240950697,Alvin and the Chipmunks (2007),Children|Comedy
25000093,162541,58559,4.0,1240953434,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX


In [19]:
print("\n--- STEP 1: Calculating Genre-Based Content Score ---")

# 1.1 Explode the ratings data to link every rating to every genre
ratings_exploded = merged_data_df[['movieId', 'rating', 'genres']].copy()
ratings_exploded['genres'] = ratings_exploded['genres'].str.split('|')
ratings_exploded = ratings_exploded.explode('genres')

# 1.2 Calculate the global mean rating for each genre
genre_mean_ratings = ratings_exploded.groupby('genres')['rating'].mean().reset_index(name='genre_avg_rating')
print(f"Calculated average rating for {len(genre_mean_ratings)} genres.")
print("Top 5 Genres by Average Rating:\n", genre_mean_ratings.sort_values('genre_avg_rating', ascending=False).head(5))

# 1.3 Create a function to calculate a movie's genre score
genre_dict = genre_mean_ratings.set_index('genres')['genre_avg_rating'].to_dict()

def calculate_movie_genre_score(genres_str):
    """Calculates the mean of the average ratings of all a movie's genres."""
    if pd.isna(genres_str) or genres_str == '(no genres listed)':
        return np.nan
    
    genres = genres_str.split('|')
    scores = [genre_dict.get(g, np.nan) for g in genres]
    
    # Return the mean of non-NaN genre scores
    return np.mean([s for s in scores if not np.isnan(s)])

# Apply the function to all movies (use movies_df for unique movies)
movies['G_score'] = movies['genres'].apply(calculate_movie_genre_score)

print("\nMovie Feature (G_score) created:")
print(movies[['title', 'genres', 'G_score']].head(10))



--- STEP 1: Calculating Genre-Based Content Score ---
Calculated average rating for 20 genres.
Top 5 Genres by Average Rating:
          genres  genre_avg_rating
10    Film-Noir          3.925728
18          War          3.791466
7   Documentary          3.705281
6         Crime          3.685044
8         Drama          3.677185

Movie Feature (G_score) created:
                                title  \
0                    Toy Story (1995)   
1                      Jumanji (1995)   
2             Grumpier Old Men (1995)   
3            Waiting to Exhale (1995)   
4  Father of the Bride Part II (1995)   
5                         Heat (1995)   
6                      Sabrina (1995)   
7                 Tom and Huck (1995)   
8                 Sudden Death (1995)   
9                    GoldenEye (1995)   

                                        genres   G_score  
0  Adventure|Animation|Children|Comedy|Fantasy  3.500096  
1                   Adventure|Children|Fantasy  3.487180  
2   

In [20]:
# --- STEP 2: The Hybrid Prediction Logic ---

def hybrid_cold_start_prediction(user_id, movie_id, svd_model, min_ratings_threshold=50):
    """
    Predicts a rating using SVD if warm, or the Genre Score if cold.
    
    NOTE: This function needs access to 'merged_data_df', 'movies_df', and the trained 'svd_model'.
    """
    # 1. Determine User Status (Cold/Warm) and get b_u
    is_user_cold = False
    try:
        internal_user_id = svd_model.trainset.to_inner_uid(user_id)
        user_bias = svd_model.bu[internal_user_id] 
    except ValueError:
        is_user_cold = True
        user_bias = 0.0 # New users have zero learned user bias initially

    # 2. Determine Item Status (Cold/Warm)
    movie_stats = merged_data_df.groupby('movieId')['rating'].count()
    rating_count = movie_stats.get(movie_id, 0)
    is_item_cold = rating_count < min_ratings_threshold
    
    mu = svd_model.default_prediction()
    prediction = 0.0
    source = ""

    # --- SCENARIO A: WARM USER, WARM ITEM -> Full Personalization ---
    if not is_user_cold and not is_item_cold:
        prediction = svd_model.predict(user_id, movie_id).est
        source = "SVD (Warm)"
        
    # --- SCENARIO B: NEW USER, WARM ITEM -> Popularity Baseline (mu + b_i) ---
    elif is_user_cold and not is_item_cold:
        try:
            # Use learned item bias from SVD for the popularity baseline
            internal_movie_id = svd_model.trainset.to_inner_iid(movie_id)
            item_bias = svd_model.bi[internal_movie_id]
            
            prediction = mu + item_bias # User bias is 0.0
            source = "Popularity Baseline (New User: mu + b_i)"
        except ValueError:
            # Should not happen if item is warm, but defensive coding
            prediction = mu 
            source = "Global Mean (New User, Error fetching b_i)"
            
    # --- SCENARIO C: WARM USER, NEW ITEM -> Personalized Content Fallback (mu + b_u + G-Score) ---
    elif not is_user_cold and is_item_cold:
        genre_score = movies_df[movies_df['movieId'] == movie_id]['G_score'].iloc[0]
        # The item bias is derived from the genre score relative to the global mean
        item_content_bias = genre_score - mu
        
        prediction = mu + user_bias + item_content_bias
        source = "Personalized Content (New Item: mu + b_u + G_score_Bias)"
        
    # --- SCENARIO D: NEW USER, NEW ITEM -> Pure Content Fallback (mu + G-Score) ---
    elif is_user_cold and is_item_cold:
        genre_score = movies_df[movies_df['movieId'] == movie_id]['G_score'].iloc[0]
        # The item bias is derived from the genre score relative to the global mean
        item_content_bias = genre_score - mu
        
        prediction = mu + 0.0 + item_content_bias # User bias is 0.0
        source = "Popularity-Content Fallback (Both Cold: mu + G_score_Bias)"

    # Cap the final prediction score
    prediction = max(0.5, min(5.0, prediction))

    return prediction, source

In [21]:
TEST_USER_ID= 162541
COLD_MOVIE_ID = 296

# TEST_USER_ID= 162541
# COLD_MOVIE_ID = 899

In [22]:
print(f"\n--- DEMO: Hybrid Prediction for Cold Start Movie {COLD_MOVIE_ID} ---")
predicted_score, source = hybrid_cold_start_prediction(
    user_id=TEST_USER_ID, 
    movie_id=COLD_MOVIE_ID, 
    svd_model=algo 
)
print(f"Final Prediction: {predicted_score:.4f} (Source: {source})")


--- DEMO: Hybrid Prediction for Cold Start Movie 296 ---
Final Prediction: 4.8430 (Source: SVD (Warm))


In [23]:
def get_cold_testset(testset, trainset):
    """
    Filters the testset to include only ratings where either user OR item 
    was NOT present in the training set (Cold-Only Evaluation).
    Returns the cold set and the warm set (for verification).
    """
    warm_testset = []
    cold_testset = []
    
    train_users = set(trainset.ur.keys())
    train_items = set(trainset.ir.keys())
    
    for uid, iid, r_ui in testset:
        try:
            inner_uid = trainset.to_inner_uid(uid)
            inner_iid = trainset.to_inner_iid(iid)
            
            is_warm = (inner_uid in train_users) and (inner_iid in train_items)
            
            if is_warm:
                warm_testset.append((uid, iid, r_ui))
            else:
                cold_testset.append((uid, iid, r_ui))
        except ValueError:
            # Item/User ID was not in the original dataset's universe (very cold)
            cold_testset.append((uid, iid, r_ui))

    print(f"Total test ratings: {len(testset)}")
    print(f"Warm-Only ratings: {len(warm_testset)}")
    print(f"Cold-Only ratings: {len(cold_testset)}")
    
    return cold_testset, warm_testset

In [24]:
def get_cold_predictions(cold_testset, svd_model, movies_df, global_mean):
    """
    Generates predictions on the cold test set using two strategies:
    1. SVD Default: Predicts the Global Mean (mu) - Represents SVD failure.
    2. Hybrid Fallback: Uses the mu + b_i/G_score logic - Represents the solution.
    """
    predictions_svd_default = []
    predictions_hybrid_fallback = []
    
    for uid, iid, r_ui in cold_testset:
        # --- 1. SVD DEFAULT (Pure Global Mean) ---
        # SVD model always defaults to mu when user/item factors are missing/zero
        svd_default_est = global_mean 
        details = (uid, iid, r_ui, svd_default_est, {})
        predictions_svd_default.append(details)
        
        # --- 2. CUSTOM HYBRID FALLBACK ---
        
        # Replicate the logic from the hybrid_cold_start_prediction function
        user_bias = 0.0
        try:
            # Check for Warm User Bias (Scenario C)
            internal_user_id = svd_model.trainset.to_inner_uid(uid)
            user_bias = svd_model.bu[internal_user_id] 
        except ValueError:
            # User is Cold (Scenario B or D)
            user_bias = 0.0 
        
        # Get G-Score for Item Bias
        movie_row = movies_df[movies_df['movieId'] == iid]
        if not movie_row.empty and 'G_score' in movie_row.columns:
            genre_score = movie_row['G_score'].iloc[0]
            if np.isnan(genre_score):
                # Fallback if genre data is missing even in the movie list (rare)
                item_bias_term = 0.0
            else:
                item_bias_term = genre_score - global_mean
        else:
            # If no G_score is available, revert to simple mu (mu + 0)
            item_bias_term = 0.0

        # Hybrid Prediction: mu + b_u + Item_Bias_Term (derived from G-Score)
        hybrid_est = global_mean + user_bias + item_bias_term
        
        # Cap the score
        hybrid_est = max(0.5, min(5.0, hybrid_est))

        details = (uid, iid, r_ui, hybrid_est, {})
        predictions_hybrid_fallback.append(details)

    return predictions_svd_default, predictions_hybrid_fallback

# Check if hybrid fallback is effective

In [26]:

# --- 1. Filter the Test Set into Cold and Warm ---
cold_testset, warm_testset = get_cold_testset(testset, trainset)

# --- 3. WARM SET Evaluation (Standard Benchmark) ---
predictions_svd_warm = algo.test(warm_testset)
predictions_baseline_warm = BaselineOnly(bsl_options={'method': 'sgd'}).fit(trainset).test(warm_testset)
rmse_svd_warm = accuracy.rmse(predictions_svd_warm, verbose=False)
rmse_baseline_warm = accuracy.rmse(predictions_baseline_warm, verbose=False)

print("\n--- 3. WARM SET (Standard SVD vs. Baseline) ---")
print(f"Warm SVD RMSE: {rmse_svd_warm:.4f} | Warm Baseline RMSE: {rmse_baseline_warm:.4f}")

# --- 4. COLD SET Evaluation (Hybrid Fallback vs. SVD Failure) ---
svd_default_preds, hybrid_fallback_preds = get_cold_predictions(
    cold_testset, 
    algo, 
    movies, 
    global_mean
)

# RMSE for the SVD failure (prediction = mu for all cold ratings)
rmse_svd_default = accuracy.rmse(svd_default_preds, verbose=False) 
# RMSE for the custom Hybrid Fallback logic (prediction = mu + b_u + G_score_Bias)
rmse_hybrid_fallback = accuracy.rmse(hybrid_fallback_preds, verbose=False)

print("\n--- 4. COLD SET (SVD Failure vs. Hybrid Solution) ---")
print("This comparison proves the value of the custom fallback logic:")
print(f"| SVD Default (RMSE on Cold Set): {rmse_svd_default:.4f}")
print(f"| Hybrid Fallback (RMSE on Cold Set): {rmse_hybrid_fallback:.4f}")

if rmse_hybrid_fallback < rmse_svd_default:
    print(f"\n✅ **Conclusion:** The Hybrid Fallback reduces the Cold Set RMSE by {rmse_svd_default - rmse_hybrid_fallback:.4f}, demonstrating robust problem-solving.")
else:
     print("\n⚠️ **Conclusion:** The Hybrid Fallback did not beat the SVD default.")

Total test ratings: 5000019
Warm-Only ratings: 4997258
Cold-Only ratings: 2761
Estimating biases using sgd...

--- 3. WARM SET (Standard SVD vs. Baseline) ---
Warm SVD RMSE: 0.7772 | Warm Baseline RMSE: 0.8641

--- 4. COLD SET (SVD Failure vs. Hybrid Solution) ---
This comparison proves the value of the custom fallback logic:
| SVD Default (RMSE on Cold Set): 1.2685
| Hybrid Fallback (RMSE on Cold Set): 0.9532

✅ **Conclusion:** The Hybrid Fallback reduces the Cold Set RMSE by 0.3152, demonstrating robust problem-solving.
