# Collaborative Filtering — User-User Similarity Analysis

This notebook implements and compares three user-user similarity metrics for collaborative filtering:

| Metric | Description | Range |
|--------|-------------|-------|
| **Mean Squared Difference (MSD)** | Measures average squared rating difference between users over co-rated items | (0, 1] after the 1 / (1 + MSD) transformation; 0 when no items are co-rated |
| **Pearson Correlation** | Captures linear correlation between co-rated items | [-1, 1] → normalized to [0, 1] |
| **Cosine Similarity** | Measures angle between user rating vectors (unrated items filled with 0) | [0, 1] for non-negative ratings |

We visualize each similarity matrix as a heatmap and then evaluate all three metrics by using them to generate rating predictions via **k-Nearest Neighbors (k-NN)**.

## 1. Setup & Data Loading

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

# Reproducibility: seed NumPy's legacy global RNG. None of the cells visible in
# this notebook draw random numbers, so this is a safeguard for future additions
# rather than something the current results depend on.
np.random.seed(42)

In [None]:
# Load the sparse ratings dataset. The first line of the file is skipped
# (presumably a header row) and replaced by the explicit column names below;
# the timestamp column is not used anywhere in this notebook.
column_names = ['userId', 'movieId', 'rating', 'timestamp']
data = pd.read_csv(
    '../data/rating_sparse.csv', names=column_names, skiprows=1
).drop(columns='timestamp')

# Build user-item rating matrix: one row per user, one column per movie,
# NaN where the user has not rated the movie. `pivot` raises if the same
# (userId, movieId) pair occurs more than once.
rating_matrix = data.pivot(index='userId', columns='movieId', values='rating')

n_users, n_items = rating_matrix.shape
missing_share = rating_matrix.isna().sum().sum() / rating_matrix.size

print(f"Users: {n_users}, Items: {n_items}")
print(f"Sparsity: {missing_share:.2%}")
rating_matrix.head()

## 2. Similarity Functions

Each function operates on a pair of user rating vectors and only considers **co-rated items** (items both users have rated).

In [None]:
def mean_square_difference(user1, user2):
    """Mean squared rating gap between two users on the items both have rated.

    Args:
        user1, user2: pandas Series of ratings indexed by item, with NaN
            marking an unrated item.

    Returns:
        The mean of the squared rating differences over the shared items,
        or ``np.inf`` when the two users share no rated item (an infinite
        MSD maps to 0 under the ``1 / (1 + MSD)`` transformation).
    """
    rated_by_first = user1.dropna().index
    rated_by_second = user2.dropna().index
    shared = rated_by_first.intersection(rated_by_second)

    # No overlap: there is nothing to average over
    if shared.size == 0:
        return np.inf

    gap = user1[shared] - user2[shared]
    return np.mean(gap ** 2)

In [None]:
def pearson_similarity(user1, user2):
    """Pearson correlation of two users' ratings, restricted to co-rated items.

    Args:
        user1, user2: pandas Series of ratings indexed by item, with NaN
            marking an unrated item.

    Returns:
        The correlation coefficient computed over the shared items. Returns
        0 when the users share no rated item, and also when either user's
        shared ratings have zero variance (which includes the case of a
        single shared item), since the correlation is undefined there.
    """
    shared = user1.dropna().index.intersection(user2.dropna().index)
    if shared.size == 0:
        return 0

    # Centre each user's shared ratings on that user's own mean over them
    first = user1[shared].values
    second = user2[shared].values
    first_centred = first - np.mean(first)
    second_centred = second - np.mean(second)

    covariance_term = np.sum(first_centred * second_centred)
    spread_first = np.sqrt(np.sum(first_centred ** 2))
    spread_second = np.sqrt(np.sum(second_centred ** 2))
    scale = spread_first * spread_second

    # Zero spread on either side would mean dividing by zero
    return covariance_term / scale if scale != 0 else 0

## 3. Computing Similarity Matrices

In [None]:
# --- MSD Similarity ---
# Raw MSD for every ordered pair of users (diagonal included), built row by row.
user_ids = rating_matrix.index
msd_raw = pd.DataFrame(
    [
        [
            mean_square_difference(rating_matrix.loc[row_user], rating_matrix.loc[col_user])
            for col_user in user_ids
        ]
        for row_user in user_ids
    ],
    index=user_ids,
    columns=user_ids,
    dtype=float,
)

# Transform: lower MSD → higher similarity (an infinite MSD becomes 0);
# any remaining NaN is treated as "no similarity".
msd_similarity = (1 / (1 + msd_raw)).fillna(0)

print("MSD similarity matrix computed.")

In [None]:
# --- Pearson Similarity ---
# Raw correlation for every ordered pair of users; the diagonal is set to 1
# directly instead of being computed.
user_ids = rating_matrix.index
pearson_raw = pd.DataFrame(
    [
        [
            1  # Self-similarity
            if row_user == col_user
            else pearson_similarity(rating_matrix.loc[row_user], rating_matrix.loc[col_user])
            for col_user in user_ids
        ]
        for row_user in user_ids
    ],
    index=user_ids,
    columns=user_ids,
    dtype=float,
).fillna(0)

# Normalize from [-1, 1] → [0, 1]. A raw score of 0 (which pearson_similarity
# also returns for pairs with no co-rated items) lands on the midpoint 0.5.
pearson_similarity_matrix = (pearson_raw + 1) / 2

print("Pearson similarity matrix computed.")

In [None]:
# --- Cosine Similarity ---
# Unlike the MSD and Pearson functions, this does not restrict itself to
# co-rated items: unrated entries are replaced by 0 so that every user has a
# dense vector of the same length.
filled_rating_matrix = rating_matrix.fillna(0)

user_ids = rating_matrix.index
pairwise_cosine = cosine_similarity(filled_rating_matrix)
cosine_similarity_matrix = pd.DataFrame(pairwise_cosine, index=user_ids, columns=user_ids)

print("Cosine similarity matrix computed.")

## 4. Visualization

Heatmaps provide a quick visual inspection of how similar users are across the three metrics.  
**Green** = high similarity, **Red** = low similarity.

In [None]:
# Shared colormap for all heatmaps: red (low) → white (mid) → green (high).
# The colormap's name lists the colours in a different order than the ramp;
# only the colour list determines what is drawn.
heatmap_ramp = ['red', 'white', 'green']
cmap = LinearSegmentedColormap.from_list('green_red_white', heatmap_ramp, N=256)

In [None]:
# Annotated MSD heatmap; the colour scale is centred on 0.5 (white).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    msd_similarity,
    ax=ax,
    cmap=cmap,
    center=0.5,
    annot=True,
    fmt='.2f',
    linewidths=0.5,
    cbar_kws={'label': 'MSD Similarity'},
)
ax.set_title("User–User MSD Similarity")
ax.set_xlabel("User ID")
ax.set_ylabel("User ID")
fig.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the normalized Pearson scores; the colour scale is
# centred on 0.5 (white), which corresponds to a raw correlation of 0.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    pearson_similarity_matrix,
    ax=ax,
    cmap=cmap,
    center=0.5,
    annot=True,
    fmt='.2f',
    linewidths=0.5,
    cbar_kws={'label': 'Pearson Similarity (normalized)'},
)
ax.set_title("User–User Pearson Similarity (Normalized to [0, 1])")
ax.set_xlabel("User ID")
ax.set_ylabel("User ID")
fig.tight_layout()
plt.show()

In [None]:
# Annotated cosine heatmap; the colour scale is centred on 0.5 (white).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cosine_similarity_matrix,
    ax=ax,
    cmap=cmap,
    center=0.5,
    annot=True,
    fmt='.2f',
    linewidths=0.5,
    cbar_kws={'label': 'Cosine Similarity'},
)
ax.set_title("User–User Cosine Similarity")
ax.set_xlabel("User ID")
ax.set_ylabel("User ID")
fig.tight_layout()
plt.show()

In [None]:
# Similarity score distributions (excluding self-similarity diagonal).
# Only the strict upper triangle is used, so each user pair is counted once.
named_matrices = {
    'MSD': msd_similarity,
    'Pearson': pearson_similarity_matrix,
    'Cosine': cosine_similarity_matrix,
}

fig, axes = plt.subplots(1, 3, figsize=(16, 4))

for ax, (name, matrix) in zip(axes, named_matrices.items()):
    scores = matrix.values
    pair_scores = scores[np.triu_indices_from(scores, k=1)]
    sns.histplot(pair_scores, kde=True, ax=ax, color='steelblue', bins=15)
    ax.set_title(f'{name} Distribution')
    ax.set_xlabel('Similarity Score')
    ax.set_ylabel('Frequency')

fig.suptitle('Similarity Score Distributions (Upper Triangle)', y=1.02, fontsize=13)
fig.tight_layout()
plt.show()

## 5. Evaluation — k-NN Rating Prediction

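We evaluate each similarity metric by using a **k-Nearest Neighbors** approach to predict ratings:  
for each known rating, we predict it from the ratings that the top-*k* most similar *other* users gave the same item, and measure **MAE** and **RMSE** against the true value. Note that the similarity matrices are computed once from the full rating matrix and are not recomputed for each prediction, so the rating being predicted has already contributed to the similarities; the reported errors may therefore be optimistic.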

In [None]:
def predict_rating_knn(user_id, item_id, sim_matrix, rating_mat, k=5):
    """Predict one rating as a similarity-weighted mean over the nearest neighbours.

    Args:
        user_id: Label of the target user (row of both matrices).
        item_id: Label of the target item (column of ``rating_mat``).
        sim_matrix: User-by-user similarity DataFrame.
        rating_mat: User-by-item rating DataFrame, NaN where unrated.
        k: Maximum number of neighbours to use.

    Returns:
        The weighted average of the neighbours' ratings for ``item_id``,
        weighted by their similarity to ``user_id``. Returns ``np.nan`` when
        no other user rated the item or when the selected neighbours'
        similarities sum to zero.
    """
    # Everyone except the target user, on both the similarity and rating side
    neighbour_sims = sim_matrix.loc[user_id].drop(user_id)
    neighbour_ratings = rating_mat[item_id].drop(user_id)

    # Only users who actually rated this item can contribute
    has_rating = neighbour_ratings.notna()
    if not has_rating.any():
        return np.nan

    candidate_sims = neighbour_sims[has_rating].astype(float)
    candidate_ratings = neighbour_ratings[has_rating].astype(float)

    # The k candidates with the highest similarity to the target user
    nearest = candidate_sims.nlargest(k)
    total_weight = nearest.sum()
    if total_weight == 0:
        return np.nan

    weighted = nearest * candidate_ratings[nearest.index]
    return weighted.sum() / total_weight


def evaluate_similarity(sim_matrix, rating_mat, k=5):
    """Score a similarity matrix by re-predicting every observed rating with k-NN.

    For each observed (user, item) rating, ``predict_rating_knn`` estimates it
    from the other users' ratings of that item, and the estimates are compared
    with the true values. ``sim_matrix`` is used as given and is not recomputed
    per prediction, so a similarity matrix derived from ``rating_mat`` has
    already seen the rating being predicted.

    Args:
        sim_matrix: User-by-user similarity DataFrame.
        rating_mat: User-by-item rating DataFrame, NaN where unrated.
        k: Maximum number of neighbours passed to ``predict_rating_knn``.

    Returns:
        A tuple ``(mae, rmse, n)`` where ``n`` is the number of ratings for
        which a prediction could be made. Ratings whose prediction is NaN are
        skipped. When no rating can be predicted at all, returns
        ``(nan, nan, 0)`` rather than passing empty inputs to the metric
        functions.
    """
    actuals, predictions = [], []

    for user_id in rating_mat.index:
        observed = rating_mat.loc[user_id].dropna()
        for item_id, actual in observed.items():
            pred = predict_rating_knn(user_id, item_id, sim_matrix, rating_mat, k)
            if not np.isnan(pred):
                actuals.append(actual)
                predictions.append(pred)

    # The metric functions reject empty inputs; report "nothing to score" explicitly
    if not actuals:
        return np.nan, np.nan, 0

    mae = mean_absolute_error(actuals, predictions)
    rmse = np.sqrt(mean_squared_error(actuals, predictions))
    return mae, rmse, len(actuals)

In [None]:
# Run the same k-NN evaluation for each similarity matrix and collect the scores.
similarity_matrices = {
    'MSD': msd_similarity,
    'Pearson': pearson_similarity_matrix,
    'Cosine': cosine_similarity_matrix,
}

results = {}
for name, sim_mat in similarity_matrices.items():
    mae, rmse, n = evaluate_similarity(sim_mat, rating_matrix, k=5)
    results[name] = dict(MAE=mae, RMSE=rmse, Predictions=n)
    print(f"{name:>8s}  |  MAE: {mae:.4f}  |  RMSE: {rmse:.4f}  |  n={n}")

# One row per metric, one column per score
results_df = pd.DataFrame(results).transpose()
results_df

In [None]:
# Side-by-side bar charts of the two error measures, one bar per similarity metric.
bar_colors = ['#e74c3c', '#3498db', '#2ecc71']
panels = [
    ('MAE', 'Mean Absolute Error'),
    ('RMSE', 'Root Mean Squared Error'),
]

fig, axes = plt.subplots(1, 2, figsize=(10, 4))

for ax, (metric, axis_label) in zip(axes, panels):
    results_df[metric].plot.bar(ax=ax, color=bar_colors, edgecolor='black')
    ax.set_title(f'{metric} by Similarity Metric')
    ax.set_ylabel(axis_label)
    # Leave 30% headroom above the tallest bar
    ax.set_ylim(0, results_df[metric].max() * 1.3)
    ax.tick_params(axis='x', rotation=0)

fig.suptitle('k-NN Rating Prediction Accuracy (k=5)', fontsize=13, y=1.02)
fig.tight_layout()
plt.show()

## 6. Observations

- **MSD** tends to produce the most uniform similarity scores — it treats all rating differences equally regardless of direction.
- **Pearson** captures relative preference patterns (e.g., both users rate action movies higher than comedy) even when their absolute scales differ.
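- **Cosine** is computed here on zero-filled rating vectors, so every unrated item counts as a rating of 0. The score is therefore driven largely by how many items two users have both rated, which can skew results for users with very few ratings.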
- The **k-NN evaluation** provides a quantitative comparison of how well each metric translates into actual rating predictions.

For a deeper approach using generative models, see the companion **CFGAN notebook**.