In [8]:
import pandas as pd
import time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from scipy.sparse import csr_matrix 
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


This Data is from 24,938 users who have rated between 15 and 35 jokes, a matrix with dimensions 24,938 X 101

rows = 24,983 users columns = 100 jokes)

In [9]:
file_path = r"C:\Users\pricc\OneDrive\Desktop\jester-data-1.csv"


In [11]:
df = pd.read_csv(file_path, header=None)

In [18]:
joke_matrix = df.to_numpy()
print("Shape:", df.shape, "(24982 users × 101 jokes)")



Shape: (24983, 101) (24982 users × 101 jokes)


In [15]:
#Step 2: Assign column names: first column is 'ratingCount', then joke_1 to joke_100
df.columns = ['ratingCount'] + [f'joke_{i}' for i in range(1, 101)]

#Step 4: Remove 'ratingCount' column
ratings = df.drop(columns=['ratingCount'])

#Step 5: Replace 99 (unrated) with NaN
ratings.replace(99, np.nan, inplace=True)

#Step 6: Select first 5000 users
ratings = ratings.iloc[:5000]

#Step 7: Convert to NumPy matrix (if needed)
ratings_matrix = ratings.to_numpy()

#step 8: Convert to sparse matrix (CSR format)¶
ratings_sparse = csr_matrix(np.nan_to_num(ratings_matrix, nan=0.0))

In [16]:
# Step 1: Show original DataFrame we are converting
print("Original Jester ratings DataFrame (first 5 rows):")
print(ratings.head())
print("\n" + "="*60 + "\n")

# Step 2: Convert to matrix
ratings_matrix = ratings.to_numpy()

# Step 3: Print matrix summary
print("Converted Jester Ratings Matrix:")
print("Shape:", ratings.shape, "(users × jokes)")
print(ratings_matrix)

Original Jester ratings DataFrame (first 5 rows):
   joke_1  joke_2  joke_3  joke_4  joke_5  joke_6  joke_7  joke_8  joke_9  \
0   -7.82    8.79   -9.66   -8.16   -7.52   -8.50   -9.85    4.17   -8.98   
1    4.08   -0.29    6.36    4.37   -2.38   -9.66   -0.73   -5.34    8.88   
2     NaN     NaN     NaN     NaN    9.03    9.27    9.03    9.27     NaN   
3     NaN    8.35     NaN     NaN    1.80    8.16   -2.82    6.21     NaN   
4    8.50    4.61   -4.17   -5.39    1.36    1.60    7.04    4.61   -0.44   

   joke_10  ...  joke_91  joke_92  joke_93  joke_94  joke_95  joke_96  \
0    -4.76  ...     2.82      NaN      NaN      NaN      NaN      NaN   
1     9.22  ...     2.82    -4.95    -0.29     7.86    -0.19    -2.14   
2      NaN  ...      NaN      NaN      NaN     9.08      NaN      NaN   
3     1.84  ...      NaN      NaN      NaN     0.53      NaN      NaN   
4     5.73  ...     5.19     5.58     4.27     5.19     5.73     1.55   

   joke_97  joke_98  joke_99  joke_100  
0    -5

In [19]:
# Count number of non-NaN ratings per user (row)
joke_counts = ratings.notna().sum(axis=1)

# Display first few counts
print(joke_counts.head())

0     74
1    100
2     49
3     48
4     91
dtype: int64


In [20]:
# Step 1: Calculate average rating for each joke (ignore NaNs)
joke_means = ratings.mean(axis=0)

# Step 2: Find index of best joke (max average rating)
best_joke_index = joke_means.idxmax()

# Step 3: Get the average rating of that joke
best_joke_rating = joke_means.max()

print(f"Best joke is {best_joke_index} with an average rating of {best_joke_rating:.2f}")

Best joke is joke_89 with an average rating of 4.01


JUST FOR LAUGHS JOKE 89

A radio conversation of a US naval ship with Canadian authorities.

Americans: Please divert your course 15 degrees to the North to avoid a collision.

Canadians: Recommend you divert YOUR course 15 degrees to the South to avoid a collision.

Americans: This is the Captain of a US Navy ship. I say again, divert YOUR course.

Canadians: No. I say again, you divert YOUR course.

Americans: This is the aircraft carrier USS LINCOLN, the second largest ship in the United States' Atlantic Fleet. We are accompanied by three destroyers, three cruisers and numerous support vessels. I demand that you change your course 15 degrees north, that's ONE FIVE DEGREES NORTH, or counter-measures will be undertaken to ensure the safety of this ship.

Canadians: This is a lighthouse.



In [21]:
# Step 1: Use your cleaned ratings DataFrame (5000 × 100, NaNs where no rating)
# Fill NaNs with 0s for SVD input
ratings_matrix = ratings.fillna(0).to_numpy()

# Step 2: Perform SVD
svd = TruncatedSVD(n_components=20, random_state=42)
user_factors = svd.fit_transform(ratings_matrix)
item_factors = svd.components_.T  # shape: (100 jokes × 20 latent features)

# Step 3: Choose a user (e.g., user index 10)
user_idx = 10
user_vector = user_factors[user_idx]

# Step 4: Compute scores and get top recommendations
scores = np.dot(item_factors, user_vector)
top_items = np.argsort(-scores)[:5]

# Step 5: Novelty — penalize popular jokes
item_popularity = np.sum(ratings_matrix > 0, axis=0)
item_popularity = item_popularity / item_popularity.max()
novelty = np.mean([1 - item_popularity[i] for i in top_items])

# Step 6: Diversity — average pairwise dissimilarity among recommended items
item_vecs = item_factors[top_items]
sim_matrix = cosine_similarity(item_vecs)
upper_triangle = sim_matrix[np.triu_indices(len(top_items), k=1)]
diversity = 1 - np.mean(upper_triangle)

# Step 7: Serendipity — dissimilarity from user's past highly rated jokes
liked_items = np.where(ratings_matrix[user_idx] >= 4)[0]
liked_vecs = item_factors[liked_items]
serendipity_scores = []

for i in top_items:
    rec_vec = item_factors[i].reshape(1, -1)
    if liked_vecs.shape[0] > 0:
        sim = cosine_similarity(rec_vec, liked_vecs)
        serendipity_scores.append(1 - np.mean(sim))

serendipity = np.mean(serendipity_scores)

# Step 8: Print results
print(f"Top recommended joke indices for user {user_idx}: {top_items}")
print(f"Novelty: {novelty:.4f}")
print(f"Diversity: {diversity:.4f}")
print(f"Serendipity: {serendipity:.4f}")

Top recommended joke indices for user 10: [49 34 28 26 35]
Novelty: 0.0522
Diversity: 0.5257
Serendipity: 0.7638


In [None]:
ratings.columns = ['ratingCount'] + [f'joke_{i}' for i in range(1, 100)]
ratings = ratings.drop(columns=['ratingCount'])
ratings.replace(99, np.nan, inplace=True)
ratings = ratings.iloc[:5000]  # first 5000 users

# Step 2: Convert to long format
ratings_long = ratings.reset_index().melt(id_vars='index', var_name='item_id', value_name='rating')
ratings_long.columns = ['user_id', 'item_id', 'rating']
ratings_long.dropna(inplace=True)
ratings_long['user_id'] = ratings_long['user_id'].apply(lambda x: f'u{x}')
ratings_long['item_id'] = ratings_long['item_id'].apply(lambda x: f'j{int(x.split("_")[1])}')

# Step 3: Faster Global Baseline Recommender
class FastGlobalBaseline:
    def __init__(self, reg_user=5.0, reg_item=5.0):
        self.reg_user = reg_user
        self.reg_item = reg_item

    def fit(self, df):
        self.mu = df['rating'].mean()

        # Compute user bias
        user_sum = df.groupby('user_id')['rating'].sum()
        user_count = df.groupby('user_id')['rating'].count()
        self.bu = ((user_sum - user_count * self.mu) / (user_count + self.reg_user)).to_dict()

        # Adjusted ratings for item bias
        df['adj_rating'] = df.apply(lambda row: row['rating'] - self.mu - self.bu.get(row['user_id'], 0), axis=1)
        item_sum = df.groupby('item_id')['adj_rating'].sum()
        item_count = df.groupby('item_id')['adj_rating'].count()
        self.bi = (item_sum / (item_count + self.reg_item)).to_dict()

    def predict(self, user_id, item_id):
        return max(-10, min(10, self.mu + self.bu.get(user_id, 0) + self.bi.get(item_id, 0)))

    def predict_batch(self, user_item_pairs):
        return [self.predict(u, i) for u, i in user_item_pairs]

# Step 4: Train/test split and evaluate
train_data, test_data = train_test_split(ratings_long, test_size=0.2, random_state=42)

model = FastGlobalBaseline()
model.fit(train_data)

# Predict
test_data['predicted'] = model.predict_batch(list(zip(test_data['user_id'], test_data['item_id'])))

# Evaluate
rmse = np.sqrt(mean_squared_error(test_data['rating'], test_data['predicted']))
mae = np.mean(np.abs(test_data['rating'] - test_data['predicted']))

print(f"Global Baseline Recommender Results:")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")

✅ Global Baseline Recommender Results:
RMSE: 4.329
MAE: 3.453


In [26]:
# Fill NaNs temporarily with zeros for SVD input
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
ratings_filled = pd.DataFrame(imputer.fit_transform(ratings), columns=ratings.columns)

# Mask of known ratings
mask = ~ratings.isna()
true_values = ratings[mask]

def evaluate(preds, truth_mask):
    y_true = truth_mask.values.flatten()
    y_pred = preds.values.flatten()
    mask_flat = ~np.isnan(y_true)
    return {
        "RMSE": np.sqrt(mean_squared_error(y_true[mask_flat], y_pred[mask_flat])),
        "MSE": mean_squared_error(y_true[mask_flat], y_pred[mask_flat]),
        "MAE": mean_absolute_error(y_true[mask_flat], y_pred[mask_flat])
    }

# --- SVD (no change) ---
svd = TruncatedSVD(n_components=20, random_state=42)
svd_user_factors = svd.fit_transform(ratings_filled)
svd_item_factors = svd.components_.T
svd_preds = pd.DataFrame(np.dot(svd_user_factors, svd_item_factors.T), columns=ratings.columns)
svd_metrics = evaluate(svd_preds, true_values)

# --- UBCF with user mean normalization ---
user_means = ratings.mean(axis=1)
ratings_centered = ratings.subtract(user_means, axis=0).fillna(0)

user_sim = cosine_similarity(ratings_centered)
np.fill_diagonal(user_sim, 0)

# Weighted sum predictions
weighted_sum = np.dot(user_sim, ratings_centered)
sim_sums = np.abs(user_sim).sum(axis=1)[:, None]
sim_sums[sim_sums == 0] = 1e-8  # avoid div by zero

ubcf_preds_centered = weighted_sum / sim_sums
# Add back user means
ubcf_preds = pd.DataFrame(ubcf_preds_centered, columns=ratings.columns).add(user_means, axis=0)

ubcf_metrics = evaluate(ubcf_preds, true_values)

# --- IBCF with item mean normalization ---
item_means = ratings.mean(axis=0)
ratings_centered_item = ratings.subtract(item_means, axis=1).fillna(0)

item_sim = cosine_similarity(ratings_centered_item.T)
np.fill_diagonal(item_sim, 0)

weighted_sum_item = np.dot(ratings_centered_item, item_sim)
sim_sums_item = np.abs(item_sim).sum(axis=1)
sim_sums_item[sim_sums_item == 0] = 1e-8

ibcf_preds_centered = weighted_sum_item / sim_sums_item
ibcf_preds = pd.DataFrame(ibcf_preds_centered, columns=ratings.columns).add(item_means, axis=1)

ibcf_metrics = evaluate(ibcf_preds, true_values)
# --- Results ---
results = pd.DataFrame([
    {"Model": "SVD", **svd_metrics},
    {"Model": "UBCF", **ubcf_metrics},
    {"Model": "IBCF", **ibcf_metrics}
])

print(results.round(3))

  Model   RMSE     MSE    MAE
0   SVD  3.214  10.328  2.421
1  UBCF  3.885  15.093  3.065
2  IBCF  4.254  18.096  3.398


In [29]:
# --- SVD ---
imputer = SimpleImputer(strategy='mean')
ratings_filled = pd.DataFrame(imputer.fit_transform(ratings), columns=ratings.columns)

start_model = time.time()
svd = TruncatedSVD(n_components=20, random_state=42)
user_factors = svd.fit_transform(ratings_filled)
item_factors = svd.components_.T
end_model = time.time()

start_pred = time.time()
svd_preds = pd.DataFrame(np.dot(user_factors, item_factors.T), columns=ratings.columns)
end_pred = time.time()

print(f"SVD run fold/sample [model time/prediction time]\n 1  [{end_model - start_model:.3f}sec/{end_pred - start_pred:.3f}sec]")

# --- UBCF ---
user_means = ratings.mean(axis=1)
ratings_centered = ratings.subtract(user_means, axis=0).fillna(0)

start_model = time.time()
user_sim = cosine_similarity(ratings_centered)
np.fill_diagonal(user_sim, 0)
end_model = time.time()

start_pred = time.time()
weighted_sum = np.dot(user_sim, ratings_centered)
sim_sums = np.abs(user_sim).sum(axis=1)[:, None]
sim_sums[sim_sums == 0] = 1e-8
ubcf_preds_centered = weighted_sum / sim_sums
ubcf_preds = pd.DataFrame(ubcf_preds_centered, columns=ratings.columns).add(user_means, axis=0)
end_pred = time.time()

print(f"UBCF run fold/sample [model time/prediction time]\n 1  [{end_model - start_model:.3f}sec/{end_pred - start_pred:.3f}sec]")

# --- IBCF ---
item_means = ratings.mean(axis=0)
ratings_centered_item = ratings.subtract(item_means, axis=1).fillna(0)

start_model = time.time()
item_sim = cosine_similarity(ratings_centered_item.T)
np.fill_diagonal(item_sim, 0)
end_model = time.time()

start_pred = time.time()
weighted_sum_item = np.dot(ratings_centered_item, item_sim)
sim_sums_item = np.abs(item_sim).sum(axis=1)
sim_sums_item[sim_sums_item == 0] = 1e-8
ibcf_preds_centered = weighted_sum_item / sim_sums_item
ibcf_preds = pd.DataFrame(ibcf_preds_centered, columns=ratings.columns).add(item_means, axis=1)
end_pred = time.time()

print(f"IBCF run fold/sample [model time/prediction time]\n 1  [{end_model - start_model:.3f}sec/{end_pred - start_pred:.3f}sec]")


SVD run fold/sample [model time/prediction time]
 1  [0.033sec/0.000sec]
UBCF run fold/sample [model time/prediction time]
 1  [0.097sec/0.131sec]
IBCF run fold/sample [model time/prediction time]
 1  [0.010sec/0.004sec]


Increase Serendipity

In [30]:

def evaluate(preds, truth_mask):
    y_true = truth_mask.values.flatten()
    y_pred = preds.values.flatten()
    mask_flat = ~np.isnan(y_true)
    return {
        "RMSE": np.sqrt(mean_squared_error(y_true[mask_flat], y_pred[mask_flat])),
        "MSE": mean_squared_error(y_true[mask_flat], y_pred[mask_flat]),
        "MAE": mean_absolute_error(y_true[mask_flat], y_pred[mask_flat])
    }

# Use same ratings DataFrame from earlier
true_values = ratings.copy()
true_values[true_values == 99] = np.nan

# --- SVD2 (base) ---
svd2_metrics = evaluate(svd_preds, true_values)

# --- UBCF2 ---
ubcf2_metrics = evaluate(ubcf_preds, true_values)

# --- IBCF2 ---
ibcf2_metrics = evaluate(ibcf_preds, true_values)

# --- Combine ---
results_serendipity = pd.DataFrame([
    {"Model": "UBCF2", **ubcf2_metrics},
    {"Model": "SVD2", **svd2_metrics},
    {"Model": "IBCF2", **ibcf2_metrics}
])
print(results_serendipity.round(3))

   Model   RMSE     MSE    MAE
0  UBCF2  3.880  15.058  3.061
1   SVD2  3.206  10.279  2.418
2  IBCF2  4.251  18.069  3.395


In [31]:

def precision_at_k(preds, truth, k=5, threshold=5.0):
    precisions = []
    
    for user_idx in range(len(truth)):
        true_ratings = truth.iloc[user_idx]
        pred_ratings = preds.iloc[user_idx]

        # Only consider items the user has rated
        known_items = true_ratings[~true_ratings.isna()]

        # Get top-K predictions for those known items
        top_k_items = pred_ratings[known_items.index].sort_values(ascending=False).head(k)

        # Count how many of the top K were actually relevant
        actual_relevant = true_ratings[top_k_items.index] >= threshold
        precision = actual_relevant.sum() / k
        precisions.append(precision)

    return np.mean(precisions)
p_at_5 = {
    "UBCF2": precision_at_k(ubcf_preds, true_values, k=5),
    "SVD2": precision_at_k(svd_preds, true_values, k=5),
    "IBCF2": precision_at_k(ibcf_preds, true_values, k=5)
}
print(pd.DataFrame.from_dict(p_at_5, orient='index', columns=['Precision@5']).round(3))


       Precision@5
UBCF2        0.632
SVD2         0.621
IBCF2        0.409


Data source

Eigentaste: A Constant Time Collaborative Filtering Algorithm. Ken Goldberg, Theresa Roeder, Dhruv Gupta, and Chris Perkins. Information Retrieval, 4(2), 133-151. July 2001. u.