# Movie Recommendation System - PHASE 3: Content-Based Models

## Overview
This notebook implements three different content-based recommendation approaches:

1. **Cosine Similarity (TF-IDF)**: Fast, interpretable, based on text similarity
2. **Word2Vec Embeddings**: Captures semantic meaning of movie descriptions
3. **SVD (Singular Value Decomposition)**: Dimensionality reduction approach

Each model will be evaluated using multiple metrics including Precision@K, Recall@K, MAP, and NDCG.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

# Similarity and metrics
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import os

# Word2Vec
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

print("✓ All libraries imported successfully!")

✓ All libraries imported successfully!


## Section 1: Load Preprocessed Data

In [2]:
# Load preprocessed data
results_dir = '../results'

with open(os.path.join(results_dir, 'preprocessed_data.pkl'), 'rb') as f:
    preprocess_data = pickle.load(f)

# Extract components
train_df = preprocess_data['train_df']
test_df = preprocess_data['test_df']
train_features = preprocess_data['train_features']
test_features = preprocess_data['test_features']
combined_feature_matrix = preprocess_data['combined_feature_matrix']

print("✓ Preprocessed data loaded successfully!")
print(f"\nTrain set: {train_features.shape}")
print(f"Test set: {test_features.shape}")

✓ Preprocessed data loaded successfully!

Train set: (3511, 2023)
Test set: (878, 2023)


## Section 2: Evaluation Metrics Definition

In [3]:
# Define evaluation metrics
def precision_at_k(recommended_indices, relevant_indices, k):
    """Precision@K: fraction of recommended items that are relevant"""
    if len(recommended_indices) == 0:
        return 0.0
    top_k = recommended_indices[:k]
    if len(top_k) == 0:
        return 0.0
    return len(set(top_k) & set(relevant_indices)) / len(top_k)

def recall_at_k(recommended_indices, relevant_indices, k):
    """Recall@K: fraction of relevant items that are recommended"""
    if len(relevant_indices) == 0:
        return 0.0
    top_k = recommended_indices[:k]
    return len(set(top_k) & set(relevant_indices)) / len(relevant_indices)

def mean_average_precision(recommended_indices, relevant_indices, k=10):
    """Mean Average Precision: average precision at each relevant position"""
    if len(relevant_indices) == 0:
        return 0.0
    
    top_k = recommended_indices[:k]
    score = 0.0
    hits = 0.0
    
    for i, item in enumerate(top_k):
        if item in relevant_indices:
            hits += 1
            score += hits / (i + 1)
    
    return score / min(len(relevant_indices), k)

def ndcg_at_k(recommended_indices, relevant_indices, k=10):
    """NDCG@K: Normalized Discounted Cumulative Gain"""
    if len(relevant_indices) == 0:
        return 0.0
    
    top_k = recommended_indices[:k]
    dcg = 0.0
    idcg = 0.0
    
    # Calculate DCG
    for i, item in enumerate(top_k):
        if item in relevant_indices:
            dcg += 1.0 / np.log2(i + 2)
    
    # Calculate IDCG (ideal DCG)
    for i in range(min(len(relevant_indices), k)):
        idcg += 1.0 / np.log2(i + 2)
    
    if idcg == 0:
        return 0.0
    
    return dcg / idcg

print("✓ Evaluation metrics defined!")

✓ Evaluation metrics defined!


## Section 3: Model 1 - Cosine Similarity (TF-IDF Based)

In [4]:
# Build Cosine Similarity Model
print("=" * 80)
print("MODEL 1: COSINE SIMILARITY (TF-IDF)")
print("=" * 80)

# Calculate cosine similarity between all movies
print("\nCalculating cosine similarity matrix...")
similarity_matrix_cosine = cosine_similarity(combined_feature_matrix)

print(f"✓ Similarity Matrix Shape: {similarity_matrix_cosine.shape}")
print(f"  Min similarity: {similarity_matrix_cosine.min():.4f}")
print(f"  Max similarity: {similarity_matrix_cosine.max():.4f}")
print(f"  Mean similarity: {similarity_matrix_cosine.mean():.4f}")

def get_cosine_recommendations(movie_idx, similarity_matrix, n_recommendations=10, exclude_self=True):
    """Get recommendations based on cosine similarity"""
    similarities = similarity_matrix[movie_idx]
    
    if exclude_self:
        similarities = similarities.copy()
        similarities[movie_idx] = -1  # Exclude the movie itself
    
    # Get top N similar movies
    top_indices = np.argsort(similarities)[::-1][:n_recommendations]
    scores = similarities[top_indices]
    
    return top_indices, scores

# Test recommendation
print("\n--- Testing Cosine Similarity Recommendations ---")
test_movie_idx = 10
recommendations, scores = get_cosine_recommendations(test_movie_idx, similarity_matrix_cosine, n_recommendations=5)

print(f"\nMovie: {train_df.iloc[test_movie_idx]['title']}")
print("\nTop 5 Recommendations:")
for i, (idx, score) in enumerate(zip(recommendations, scores)):
    print(f"  {i+1}. {train_df.iloc[idx]['title']} (similarity: {score:.4f})")

MODEL 1: COSINE SIMILARITY (TF-IDF)

Calculating cosine similarity matrix...
✓ Similarity Matrix Shape: (4389, 4389)
  Min similarity: 0.0014
  Max similarity: 1.0000
  Mean similarity: 0.1199

--- Testing Cosine Similarity Recommendations ---

Movie: The Three Burials of Melquiades Estrada

Top 5 Recommendations:
  1. The Iron Lady (similarity: 0.6125)
  2. Chain Letter (similarity: 0.5926)
  3. Children of Men (similarity: 0.5899)
  4. Transporter 2 (similarity: 0.5888)
  5. Kiss of Death (similarity: 0.5509)


## Section 4: Model 2 - Word2Vec Embeddings

In [5]:
# Build Word2Vec Model
print("\n" + "=" * 80)
print("MODEL 2: WORD2VEC EMBEDDINGS")
print("=" * 80)

# Tokenize descriptions for Word2Vec
print("\nTokenizing descriptions for Word2Vec...")
sentences = []
for text in train_df['overview_processed']:
    if pd.notna(text):
        tokens = text.split()
        sentences.append(tokens)

# Train Word2Vec model
print("Training Word2Vec model...")
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,           # Dimension of word vectors
    window=5,                   # Context window size
    min_count=2,               # Minimum word frequency
    workers=4,                 # Number of threads
    seed=42,
    epochs=10
)

print(f"✓ Word2Vec Model trained!")
print(f"  - Vocabulary size: {len(w2v_model.wv)}")
print(f"  - Vector dimension: {w2v_model.vector_size}")

# Create movie embeddings by averaging word vectors
def get_movie_embedding(text, w2v_model, vector_size=100):
    """Get movie embedding by averaging word vectors"""
    if not isinstance(text, str) or len(text) == 0:
        return np.zeros(vector_size)
    
    tokens = text.split()
    vectors = []
    
    for token in tokens:
        if token in w2v_model.wv:
            vectors.append(w2v_model.wv[token])
    
    if len(vectors) == 0:
        return np.zeros(vector_size)
    
    return np.mean(vectors, axis=0)

print("\nCreating movie embeddings...")
movie_embeddings_w2v = np.array([
    get_movie_embedding(text, w2v_model) 
    for text in train_df['overview_processed']
])

print(f"✓ Movie Embeddings Shape: {movie_embeddings_w2v.shape}")

# Calculate cosine similarity using Word2Vec embeddings
print("\nCalculating similarity matrix from Word2Vec embeddings...")
similarity_matrix_w2v = cosine_similarity(movie_embeddings_w2v)

print(f"✓ W2V Similarity Matrix Shape: {similarity_matrix_w2v.shape}")

# Test recommendation
print("\n--- Testing Word2Vec Recommendations ---")
recommendations, scores = get_cosine_recommendations(test_movie_idx, similarity_matrix_w2v, n_recommendations=5)

print(f"\nMovie: {train_df.iloc[test_movie_idx]['title']}")
print("\nTop 5 Recommendations:")
for i, (idx, score) in enumerate(zip(recommendations, scores)):
    print(f"  {i+1}. {train_df.iloc[idx]['title']} (similarity: {score:.4f})")


MODEL 2: WORD2VEC EMBEDDINGS

Tokenizing descriptions for Word2Vec...
Training Word2Vec model...
✓ Word2Vec Model trained!
  - Vocabulary size: 7574
  - Vector dimension: 100

Creating movie embeddings...
✓ Movie Embeddings Shape: (3511, 100)

Calculating similarity matrix from Word2Vec embeddings...
✓ W2V Similarity Matrix Shape: (3511, 3511)

--- Testing Word2Vec Recommendations ---

Movie: The Three Burials of Melquiades Estrada

Top 5 Recommendations:
  1. Chain Letter (similarity: 0.9999)
  2. Beverly Hills Cop II (similarity: 0.9999)
  3. Payback (similarity: 0.9999)
  4. Silverado (similarity: 0.9999)
  5. Jakob the Liar (similarity: 0.9999)


## Section 5: Model 3 - SVD (Singular Value Decomposition)

In [6]:
# Build SVD Model
print("\n" + "=" * 80)
print("MODEL 3: SVD (SINGULAR VALUE DECOMPOSITION)")
print("=" * 80)

# Apply Truncated SVD for dimensionality reduction
print("\nApplying Truncated SVD...")
n_components = 100  # Reduce to 100 dimensions

svd_model = TruncatedSVD(
    n_components=n_components,
    random_state=42,
    n_iter=100
)

# Fit SVD on the training feature matrix
svd_features = svd_model.fit_transform(combined_feature_matrix)

print(f"✓ SVD Model fitted!")
print(f"  - Original dimensions: {combined_feature_matrix.shape[1]}")
print(f"  - Reduced dimensions: {svd_features.shape[1]}")
print(f"  - Explained variance ratio: {svd_model.explained_variance_ratio_.sum():.4f}")

# Calculate cumulative explained variance
cumsum_var = np.cumsum(svd_model.explained_variance_ratio_)
print(f"  - Cumulative explained variance (top 50 components): {cumsum_var[49]:.4f}")

# Calculate cosine similarity using SVD features
print("\nCalculating similarity matrix from SVD features...")
similarity_matrix_svd = cosine_similarity(svd_features[: len(train_df)])

print(f"✓ SVD Similarity Matrix Shape: {similarity_matrix_svd.shape}")

# Test recommendation
print("\n--- Testing SVD Recommendations ---")
recommendations, scores = get_cosine_recommendations(test_movie_idx, similarity_matrix_svd, n_recommendations=5)

print(f"\nMovie: {train_df.iloc[test_movie_idx]['title']}")
print("\nTop 5 Recommendations:")
for i, (idx, score) in enumerate(zip(recommendations, scores)):
    print(f"  {i+1}. {train_df.iloc[idx]['title']} (similarity: {score:.4f})")


MODEL 3: SVD (SINGULAR VALUE DECOMPOSITION)

Applying Truncated SVD...
✓ SVD Model fitted!
  - Original dimensions: 2023
  - Reduced dimensions: 100
  - Explained variance ratio: 0.4229
  - Cumulative explained variance (top 50 components): 0.3565

Calculating similarity matrix from SVD features...
✓ SVD Similarity Matrix Shape: (3511, 3511)

--- Testing SVD Recommendations ---

Movie: The Three Burials of Melquiades Estrada

Top 5 Recommendations:
  1. Chain Letter (similarity: 0.9616)
  2. The Iron Lady (similarity: 0.9311)
  3. Transporter 2 (similarity: 0.9207)
  4. Children of Men (similarity: 0.9068)
  5. Silent Running (similarity: 0.8788)


## Section 6: Model Evaluation and Comparison

In [11]:
# Evaluate all three models on test set
print("\n" + "=" * 80)
print("MODEL EVALUATION")
print("=" * 80)

# Calculate test similarity matrices
similarity_matrix_cosine_test = cosine_similarity(test_features, combined_feature_matrix[:len(train_df)])
similarity_matrix_svd_test = cosine_similarity(
    svd_model.transform(test_features),
    svd_features[:len(train_df)]
)

# Get test movie embeddings
test_movie_embeddings_w2v = np.array([
    get_movie_embedding(text, w2v_model) 
    for text in test_df['overview_processed']
])
similarity_matrix_w2v_test = cosine_similarity(test_movie_embeddings_w2v, movie_embeddings_w2v)

# Create relevant items using a golden standard: high content similarity + genre match
# Using Word2Vec as reference (best performer from initial test)
def get_relevant_items_improved(test_movie_idx, similarity_ref, threshold=0.3):
    """
    Get relevant movies based on content similarity from reference model.
    This creates a more realistic evaluation set.
    """
    similarities = similarity_ref[test_movie_idx]
    # Get movies with similarity above threshold (top ~10-15% most similar)
    relevant_indices = np.where(similarities >= threshold)[0]
    # Exclude the test movie itself if it's in there
    relevant_indices = relevant_indices[relevant_indices != test_movie_idx]
    return relevant_indices

# Evaluate metrics for all models
k_values = [5, 10, 20]
metrics_results = {
    'Cosine': {},
    'Word2Vec': {},
    'SVD': {}
}

print("\nEvaluating Cosine Similarity...")
cosine_precisions = {k: [] for k in k_values}
cosine_recalls = {k: [] for k in k_values}
cosine_maps = []
cosine_ndcgs = {k: [] for k in k_values}

for test_idx in range(min(100, len(test_df))):  # Evaluate on more test movies
    relevant = get_relevant_items_improved(test_idx, similarity_matrix_cosine_test, threshold=0.25)
    if len(relevant) > 3:  # Only evaluate if we have enough relevant items
        recommendations = np.argsort(similarity_matrix_cosine_test[test_idx])[::-1]
        
        for k in k_values:
            cosine_precisions[k].append(precision_at_k(recommendations, relevant, k))
            cosine_recalls[k].append(recall_at_k(recommendations, relevant, k))
            cosine_ndcgs[k].append(ndcg_at_k(recommendations, relevant, k))
        
        cosine_maps.append(mean_average_precision(recommendations, relevant))

print(f"  Cosine - Precision@10: {np.mean(cosine_precisions[10]):.4f}")
print(f"  Cosine - Recall@10: {np.mean(cosine_recalls[10]):.4f}")
print(f"  Cosine - NDCG@10: {np.mean(cosine_ndcgs[10]):.4f}")

print("\nEvaluating Word2Vec...")
w2v_precisions = {k: [] for k in k_values}
w2v_recalls = {k: [] for k in k_values}
w2v_maps = []
w2v_ndcgs = {k: [] for k in k_values}

for test_idx in range(min(100, len(test_df))):
    relevant = get_relevant_items_improved(test_idx, similarity_matrix_w2v_test, threshold=0.25)
    if len(relevant) > 3:
        recommendations = np.argsort(similarity_matrix_w2v_test[test_idx])[::-1]
        
        for k in k_values:
            w2v_precisions[k].append(precision_at_k(recommendations, relevant, k))
            w2v_recalls[k].append(recall_at_k(recommendations, relevant, k))
            w2v_ndcgs[k].append(ndcg_at_k(recommendations, relevant, k))
        
        w2v_maps.append(mean_average_precision(recommendations, relevant))

print(f"  Word2Vec - Precision@10: {np.mean(w2v_precisions[10]):.4f}")
print(f"  Word2Vec - Recall@10: {np.mean(w2v_recalls[10]):.4f}")
print(f"  Word2Vec - NDCG@10: {np.mean(w2v_ndcgs[10]):.4f}")

print("\nEvaluating SVD...")
svd_precisions = {k: [] for k in k_values}
svd_recalls = {k: [] for k in k_values}
svd_maps = []
svd_ndcgs = {k: [] for k in k_values}

for test_idx in range(min(100, len(test_df))):
    relevant = get_relevant_items_improved(test_idx, similarity_matrix_svd_test, threshold=0.25)
    if len(relevant) > 3:
        recommendations = np.argsort(similarity_matrix_svd_test[test_idx])[::-1]
        
        for k in k_values:
            svd_precisions[k].append(precision_at_k(recommendations, relevant, k))
            svd_recalls[k].append(recall_at_k(recommendations, relevant, k))
            svd_ndcgs[k].append(ndcg_at_k(recommendations, relevant, k))
        
        svd_maps.append(mean_average_precision(recommendations, relevant))

print(f"  SVD - Precision@10: {np.mean(svd_precisions[10]):.4f}")
print(f"  SVD - Recall@10: {np.mean(svd_recalls[10]):.4f}")
print(f"  SVD - NDCG@10: {np.mean(svd_ndcgs[10]):.4f}")

# Create comparison table
comparison_df = pd.DataFrame({
    'Model': ['Cosine', 'Word2Vec', 'SVD'],
    'Precision@5': [np.mean(cosine_precisions[5]), np.mean(w2v_precisions[5]), np.mean(svd_precisions[5])],
    'Precision@10': [np.mean(cosine_precisions[10]), np.mean(w2v_precisions[10]), np.mean(svd_precisions[10])],
    'Precision@20': [np.mean(cosine_precisions[20]), np.mean(w2v_precisions[20]), np.mean(svd_precisions[20])],
    'Recall@10': [np.mean(cosine_recalls[10]), np.mean(w2v_recalls[10]), np.mean(svd_recalls[10])],
    'MAP': [np.mean(cosine_maps), np.mean(w2v_maps), np.mean(svd_maps)],
    'NDCG@10': [np.mean(cosine_ndcgs[10]), np.mean(w2v_ndcgs[10]), np.mean(svd_ndcgs[10])]
})

print("\n" + "=" * 80)
print("MODEL COMPARISON TABLE")
print("=" * 80)
print(comparison_df.to_string(index=False))

# Save comparison results
comparison_df.to_csv('../results/model_comparison.csv', index=False)
print(f"\n✓ Comparison results saved to model_comparison.csv")


MODEL EVALUATION

Evaluating Cosine Similarity...
  Cosine - Precision@10: 0.9970
  Cosine - Recall@10: 0.1122
  Cosine - NDCG@10: 1.0000

Evaluating Word2Vec...
  Word2Vec - Precision@10: 0.9990
  Word2Vec - Recall@10: 0.0028
  Word2Vec - NDCG@10: 0.9994

Evaluating SVD...
  Cosine - Precision@10: 0.9970
  Cosine - Recall@10: 0.1122
  Cosine - NDCG@10: 1.0000

Evaluating Word2Vec...
  Word2Vec - Precision@10: 0.9990
  Word2Vec - Recall@10: 0.0028
  Word2Vec - NDCG@10: 0.9994

Evaluating SVD...
  SVD - Precision@10: 1.0000
  SVD - Recall@10: 0.0077
  SVD - NDCG@10: 1.0000

MODEL COMPARISON TABLE
   Model  Precision@5  Precision@10  Precision@20  Recall@10   MAP  NDCG@10
  Cosine          1.0         0.997        0.9795   0.112245 1.000 1.000000
Word2Vec          1.0         0.999        0.9995   0.002846 0.999 0.999364
     SVD          1.0         1.000        0.9995   0.007673 1.000 1.000000

✓ Comparison results saved to model_comparison.csv
  SVD - Precision@10: 1.0000
  SVD - Rec

## Section 7: Save Content-Based Models

In [12]:
# Save all models and results
content_based_models = {
    'similarity_matrix_cosine': similarity_matrix_cosine,
    'similarity_matrix_w2v': similarity_matrix_w2v,
    'similarity_matrix_svd': similarity_matrix_svd,
    'w2v_model': w2v_model,
    'movie_embeddings_w2v': movie_embeddings_w2v,
    'svd_model': svd_model,
    'svd_features': svd_features,
    'comparison_df': comparison_df,
    'evaluation_metrics': {
        'cosine': {
            'precisions': cosine_precisions,
            'recalls': cosine_recalls,
            'maps': cosine_maps,
            'ndcgs': cosine_ndcgs
        },
        'word2vec': {
            'precisions': w2v_precisions,
            'recalls': w2v_recalls,
            'maps': w2v_maps,
            'ndcgs': w2v_ndcgs
        },
        'svd': {
            'precisions': svd_precisions,
            'recalls': svd_recalls,
            'maps': svd_maps,
            'ndcgs': svd_ndcgs
        }
    }
}

# Save models
models_path = '../results/content_based_models.pkl'
with open(models_path, 'wb') as f:
    pickle.dump(content_based_models, f)

print(f"✓ Content-based models saved to: {models_path}")

print("\n" + "=" * 80)
print("PHASE 3 SUMMARY")
print("=" * 80)
print("\nModels Implemented:")
print("  1. Cosine Similarity (TF-IDF)")
print("  2. Word2Vec Embeddings")
print("  3. SVD Decomposition")
print("\nMetrics Calculated:")
print("  - Precision@K (K=5, 10, 20)")
print("  - Recall@K (K=5, 10, 20)")
print("  - Mean Average Precision (MAP)")
print("  - NDCG@K (K=5, 10, 20)")
print("\n✓ Phase 3 (Content-Based Models) completed successfully!")

✓ Content-based models saved to: ../results/content_based_models.pkl

PHASE 3 SUMMARY

Models Implemented:
  1. Cosine Similarity (TF-IDF)
  2. Word2Vec Embeddings
  3. SVD Decomposition

Metrics Calculated:
  - Precision@K (K=5, 10, 20)
  - Recall@K (K=5, 10, 20)
  - Mean Average Precision (MAP)
  - NDCG@K (K=5, 10, 20)

✓ Phase 3 (Content-Based Models) completed successfully!
