In [1]:
# 1. Environment Setup and Dependencies Installation
import subprocess
import sys
import os

def install_requirements():
    """Install the project's minimum-version dependencies with pip.

    Every requirement is installed separately by running ``pip`` through the
    current interpreter (``sys.executable -m pip install ... --quiet``).
    A failing install is reported on stdout and the remaining requirements
    are still attempted; nothing is returned.
    """
    minimum_versions = (
        'pandas>=1.3.0',
        'numpy>=1.21.0',
        'scikit-learn>=1.0.0',
        'matplotlib>=3.4.0',
        'seaborn>=0.11.0',
    )

    print("Installing project dependencies...")
    for spec in minimum_versions:
        pip_command = [sys.executable, '-m', 'pip', 'install', spec, '--quiet']
        try:
            subprocess.check_call(pip_command)
        except subprocess.CalledProcessError as e:
            print(f"[ERROR] {spec} installation failed: {e}")
        else:
            # Report only the package name, i.e. the text before the '>=' pin
            package_name = spec.partition('>=')[0]
            print(f"[SUCCESS] {package_name} installed")

    print("\nDependencies installation completed!")

# Runs before the third-party imports below, presumably so that they can
# resolve on a fresh environment.
install_requirements()

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import VarianceThreshold
import json
import pickle
from datetime import datetime
import warnings
# NOTE: no category or module filter is given, so this hides every warning
# raised from here on.
warnings.filterwarnings('ignore')

# Set plot style
plt.rcParams['font.sans-serif'] = ['DejaVu Sans']
# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus sign
plt.rcParams['axes.unicode_minus'] = False
sns.set_style("whitegrid")

print("Environment setup completed!")

Installing project dependencies...
[SUCCESS] pandas installed
[SUCCESS] numpy installed
[SUCCESS] scikit-learn installed
[SUCCESS] matplotlib installed
[SUCCESS] seaborn installed

Dependencies installation completed!
Environment setup completed!


In [2]:
# 2. Configuration Parameters
# Experiment-wide settings read by the cells that follow.
CONFIG = dict(
    RANDOM_SEED=42,
    TEST_SIZE=0.2,
    OUTPUT_PATH='english_output',
    DATA_PATH='ml-100k',
    K_NEIGHBORS=20,
    N_RECOMMENDATIONS=10,
)

# Create output directories: the root folder first, then its sub-folders
os.makedirs(CONFIG['OUTPUT_PATH'], exist_ok=True)
for subdir in ('train_data', 'result'):
    os.makedirs(os.path.join(CONFIG['OUTPUT_PATH'], subdir), exist_ok=True)

print(f"Output directories created: {CONFIG['OUTPUT_PATH']}")
print(f"Random seed: {CONFIG['RANDOM_SEED']}")

Output directories created: english_output
Random seed: 42


In [3]:
# 3. Data Loading
print("Loading MovieLens 100k dataset...")


def _read_ml100k(filename, sep, names):
    """Read one delimited file from CONFIG['DATA_PATH'] as latin-1 with the given column names."""
    return pd.read_csv(
        f"{CONFIG['DATA_PATH']}/{filename}",
        sep=sep,
        names=names,
        encoding='latin-1'
    )


try:
    # Ratings: tab-separated (user, movie, rating, timestamp) rows
    ratings = _read_ml100k('u.data', '\t', ['user_id', 'movie_id', 'rating', 'timestamp'])

    # Movies: five descriptive columns followed by 19 genre flag columns
    movie_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
    movie_columns = movie_columns + [f'genre_{i}' for i in range(19)]
    movies = _read_ml100k('u.item', '|', movie_columns)

    # Users: demographic attributes
    users = _read_ml100k('u.user', '|', ['user_id', 'age', 'gender', 'occupation', 'zipcode'])

    # Genres: rows with missing values are dropped before the names are listed
    genres = _read_ml100k('u.genre', '|', ['genre', 'genre_id']).dropna()
    genre_names = genres['genre'].tolist()

    print("Data loading completed!")
    for label, frame in (('Ratings', ratings), ('Movies', movies), ('Users', users)):
        print(f"{label} data: {frame.shape}")
    print(f"Number of genres: {len(genre_names)}")

except Exception as e:
    # Report the failure, then re-raise so the notebook stops at this cell
    print(f"Data loading failed: {e}")
    raise

Loading MovieLens 100k dataset...
Data loading completed!
Ratings data: (100000, 4)
Movies data: (1682, 24)
Users data: (943, 5)
Number of genres: 19


In [4]:
# 4. Enhanced User Feature Engineering
def create_user_features_enhanced():
    """Add derived columns to the global ``users`` frame and build feature names.

    Side effects:
        * adds ``age_group`` (bucketed age) and ``region`` (first two
          characters of ``zipcode``) columns to the module-level ``users``;
        * (re)binds the module-level ``user_feature_names`` list;
        * prints distribution statistics.

    The names are produced in the column order that one-hot encoding with
    ``pd.get_dummies`` yields (sorted category values), so that position ``i``
    in ``user_feature_names`` labels column ``i`` of a feature matrix stacked
    as: normalized age, age group, gender, occupation, bucketed region.

    Returns:
        The (mutated) global ``users`` DataFrame.
    """
    print("=" * 80)
    print("Enhanced User Feature Engineering")
    print("=" * 80)

    # User age grouping
    def age_group(age):
        if age < 18: return 'Teenager(7-17)'
        elif age < 25: return 'Young(18-24)'
        elif age < 35: return 'Adult(25-34)'
        elif age < 45: return 'Middle-aged(35-44)'
        elif age < 50: return 'Pre-senior(45-49)'
        else: return 'Senior(50+)'

    users['age_group'] = users['age'].apply(age_group)
    users['region'] = users['zipcode'].str[:2]  # Extract region prefix

    print("User feature distribution:")
    print(f"   Age range: {users['age'].min()}-{users['age'].max()} (mean: {users['age'].mean():.1f})")
    print(f"   Gender distribution: {users['gender'].value_counts().to_dict()}")
    print(f"   Age group distribution: {users['age_group'].value_counts().to_dict()}")
    print(f"   Number of occupations: {users['occupation'].nunique()}")
    print(f"   Number of regions: {users['region'].nunique()}")

    # Create feature name mapping for analysis
    def create_user_feature_names():
        """Return one label per user-feature column, in one-hot column order."""
        feature_names = ['Age(normalized)']

        # Age group one-hot (one column per distinct group, sorted)
        feature_names.extend(f'AgeGroup_{ag}' for ag in sorted(users['age_group'].unique()))

        # Gender one-hot: anything other than 'F' is labelled 'Male'
        for g in sorted(users['gender'].unique()):
            gender_name = 'Female' if g == 'F' else 'Male'
            feature_names.append(f'Gender_{gender_name}')

        # Occupation one-hot
        feature_names.extend(f'Occupation_{occ.title()}' for occ in sorted(users['occupation'].unique()))

        # Region one-hot: the 20 most frequent regions, everything else bucketed
        # as 'other'. The buckets are rebuilt here exactly as the feature matrix
        # builds them and then SORTED, because get_dummies orders its columns by
        # sorted category value, not by frequency.
        top_regions = users['region'].value_counts().head(20).index
        region_bucket = users['region'].where(users['region'].isin(top_regions), 'other')
        for region in sorted(region_bucket.unique()):
            feature_names.append('Region_Other' if region == 'other' else f'Region_{region}')

        return feature_names

    global user_feature_names
    user_feature_names = create_user_feature_names()
    print(f"\nFeature name mapping created, total features: {len(user_feature_names)}")

    return users

# The function mutates the module-level `users` frame and returns that same
# frame; it also sets the global `user_feature_names`.
users = create_user_features_enhanced()

Enhanced User Feature Engineering
User feature distribution:
   Age range: 7-73 (mean: 34.1)
   Gender distribution: {'M': 670, 'F': 273}
   Age group distribution: {'Adult(25-34)': 310, 'Young(18-24)': 198, 'Middle-aged(35-44)': 194, 'Senior(50+)': 125, 'Pre-senior(45-49)': 80, 'Teenager(7-17)': 36}
   Number of occupations: 21
   Number of regions: 111

Feature name mapping created, total features: 51


In [5]:
# 5. Enhanced Movie Feature Engineering
def create_movie_features_enhanced():
    """Derive year/decade columns on the global ``movies`` frame and name the movie features.

    Side effects:
        * adds ``year`` (four digits in parentheses at the very end of the
          title; missing values replaced by the median year) and ``decade``
          columns to the module-level ``movies``;
        * (re)binds the module-level ``movie_feature_names`` list
          (genres, normalized year, one entry per distinct decade);
        * prints distribution statistics.

    Returns:
        The (mutated) global ``movies`` DataFrame.
    """
    rule = "=" * 80
    print(rule)
    print("Enhanced Movie Feature Engineering")
    print(rule)

    # Extract year information; titles without a trailing "(YYYY)" become NaN
    # and are then imputed with the median of the parsed years
    parsed_year = pd.to_numeric(
        movies['title'].str.extract(r'\((\d{4})\)$', expand=False),
        errors='coerce',
    )
    movies['year'] = parsed_year.fillna(parsed_year.median())
    movies['decade'] = (movies['year'] // 10 * 10).astype(int)

    year_low, year_high = movies['year'].min(), movies['year'].max()
    decade_counts = movies['decade'].value_counts().sort_index().to_dict()
    print("Movie feature distribution:")
    print(f"   Total movies: {len(movies)}")
    print(f"   Year range: {year_low:.0f}-{year_high:.0f}")
    print(f"   Number of genres: {len(genre_names)}")
    print(f"   Decade distribution: {decade_counts}")

    # Create movie feature name mapping: genres, then year, then sorted decades
    names = [f'Genre_{genre}' for genre in genre_names]
    names.append('Year(normalized)')
    names.extend(f'Decade_{int(decade)}s' for decade in sorted(movies['decade'].unique()))

    global movie_feature_names
    movie_feature_names = names
    print(f"\nMovie feature name mapping created, total features: {len(movie_feature_names)}")

    return movies

# The function mutates the module-level `movies` frame and returns that same
# frame; it also sets the global `movie_feature_names`.
movies = create_movie_features_enhanced()

Enhanced Movie Feature Engineering
Movie feature distribution:
   Total movies: 1682
   Year range: 1922-1998
   Number of genres: 19
   Decade distribution: {1920: 2, 1930: 29, 1940: 45, 1950: 57, 1960: 46, 1970: 55, 1980: 109, 1990: 1339}

Movie feature name mapping created, total features: 28


In [6]:
# 6. Feature Engineering and PCA/SVD Comparison Analysis (Enhanced: Including Genre-only Features)
def create_features_with_detailed_analysis():
    """Build user/movie feature matrices, reduce them with PCA and TruncatedSVD, and print a comparison.

    Reads the module-level ``ratings``, ``users``, ``movies``, ``genre_names``,
    ``user_feature_names``, ``movie_feature_names`` and ``CONFIG``. Adds a
    ``region_top`` column to ``users`` as a side effect.

    User feature rows follow the row order of ``users``; movie feature rows
    follow the column order of the returned rating matrix.

    Returns:
        A 13-tuple ``(rating_matrix, user_features_pca, user_features_svd,
        movie_features_pca, movie_features_svd, movie_genre_features_pca,
        movie_genre_features_svd, user_pca, movie_pca, user_svd, movie_svd,
        movie_genre_pca, movie_genre_svd)``.
    """
    seed = CONFIG['RANDOM_SEED']
    wide_rule = "=" * 80
    narrow_rule = "-" * 50

    def report_reduction(label, raw, reduced, reducer):
        """Print input/output width and total explained variance of one fitted reducer."""
        print(f"{label}: {raw.shape[1]}D -> {reduced.shape[1]}D (explained variance: {reducer.explained_variance_ratio_.sum():.3%})")

    def report_top_weights(heading, reducer, names, unnamed_prefix, shown_prefix='', width=30):
        """Print the (up to) 10 largest absolute loadings of the first component, largest first."""
        print(heading)
        weights = np.abs(reducer.components_[0])
        for rank, idx in enumerate(np.argsort(weights)[-10:][::-1], start=1):
            label = names[idx] if idx < len(names) else f'{unnamed_prefix}{idx}'
            print(f"     {rank:2d}. {shown_prefix}{label:<{width}} - weight={weights[idx]:.4f}")

    def report_leading_ratios(heading, reducer, tag):
        """Print the explained-variance ratio of at most the first three components."""
        print(heading)
        for position, ratio in enumerate(reducer.explained_variance_ratio_[:3], start=1):
            print(f"     {tag}{position}: {ratio:.3%}")

    def first_axis_corr(left, right):
        """Pearson correlation between the first columns of two reduced matrices."""
        return np.corrcoef(left[:, 0], right[:, 0])[0, 1]

    print(wide_rule)
    print("Feature Engineering and Detailed PCA/SVD Comparison Analysis (Enhanced)")
    print(wide_rule)

    # Create rating matrix (users x movies, unrated cells filled with 0)
    rating_matrix = ratings.pivot_table(index='user_id', columns='movie_id', values='rating', fill_value=0)
    print(f"Rating matrix shape: {rating_matrix.shape}")

    # === User Feature Engineering ===
    print("\nUser feature engineering:")
    # Standardized age followed by one-hot age group, gender and occupation
    user_blocks = [
        StandardScaler().fit_transform(users[['age']]),
        pd.get_dummies(users['age_group']).values,
        pd.get_dummies(users['gender']).values,
        pd.get_dummies(users['occupation']).values,
    ]

    # Region one-hot: keep the 20 most common regions, bucket the rest as 'other'
    frequent_regions = users['region'].value_counts().head(20).index
    users['region_top'] = users['region'].where(users['region'].isin(frequent_regions), 'other')
    user_blocks.append(pd.get_dummies(users['region_top']).values)

    user_features_raw = np.hstack(user_blocks)
    print(f"Raw user feature dimensions: {user_features_raw.shape[1]}")

    # === Movie Feature Engineering ===
    print("\nMovie feature engineering:")

    # Restrict to movies present in the rating matrix, in the matrix's column order
    matrix_movie_ids = rating_matrix.columns
    movies_in_matrix = (
        movies[movies['movie_id'].isin(matrix_movie_ids)]
        .copy()
        .set_index('movie_id')
        .reindex(matrix_movie_ids)
        .reset_index()
    )

    # Genre-only features
    genre_columns = [f'genre_{i}' for i in range(19)]
    genre_features = movies_in_matrix[genre_columns].values
    print(f"Genre-only feature dimensions: {genre_features.shape[1]}")

    # Complete movie features: genre flags + standardized year + one-hot decade
    movie_features_raw = np.hstack([
        genre_features,
        StandardScaler().fit_transform(movies_in_matrix[['year']]),
        pd.get_dummies(movies_in_matrix['decade'], prefix='decade').values,
    ])
    print(f"Complete movie feature dimensions: {movie_features_raw.shape[1]}")

    # === PCA Dimensionality Reduction ===
    print("\nPCA dimensionality reduction analysis:")

    # A fractional n_components asks PCA for enough components to reach that
    # share of explained variance
    user_pca = PCA(n_components=0.85, random_state=seed)
    user_features_pca = user_pca.fit_transform(user_features_raw)
    report_reduction("User PCA", user_features_raw, user_features_pca, user_pca)

    movie_pca = PCA(n_components=0.90, random_state=seed)
    movie_features_pca = movie_pca.fit_transform(movie_features_raw)
    report_reduction("Movie complete feature PCA", movie_features_raw, movie_features_pca, movie_pca)

    # Genre-only PCA uses a fixed component count derived from the genre width
    genre_width = genre_features.shape[1]
    genre_pca_components = min(genre_width - 1, int(genre_width * 0.95))
    movie_genre_pca = PCA(n_components=genre_pca_components, random_state=seed)
    movie_genre_features_pca = movie_genre_pca.fit_transform(genre_features)
    report_reduction("Movie genre-only PCA", genre_features, movie_genre_features_pca, movie_genre_pca)

    # === SVD Dimensionality Reduction ===
    print("\nSVD dimensionality reduction analysis:")

    # Each SVD reuses the width its PCA counterpart ended up with, capped at
    # (input width - 1); each gets its own seed offset
    user_svd_components = min(user_features_raw.shape[1] - 1, user_features_pca.shape[1])
    user_svd = TruncatedSVD(n_components=user_svd_components, random_state=seed + 1)
    user_features_svd = user_svd.fit_transform(user_features_raw)
    report_reduction("User SVD", user_features_raw, user_features_svd, user_svd)

    movie_svd_components = min(movie_features_raw.shape[1] - 1, movie_features_pca.shape[1])
    movie_svd = TruncatedSVD(n_components=movie_svd_components, random_state=seed + 2)
    movie_features_svd = movie_svd.fit_transform(movie_features_raw)
    report_reduction("Movie complete feature SVD", movie_features_raw, movie_features_svd, movie_svd)

    genre_svd_components = min(genre_width - 1, movie_genre_features_pca.shape[1])
    movie_genre_svd = TruncatedSVD(n_components=genre_svd_components, random_state=seed + 3)
    movie_genre_features_svd = movie_genre_svd.fit_transform(genre_features)
    report_reduction("Movie genre-only SVD", genre_features, movie_genre_features_svd, movie_genre_svd)

    # === Detailed Feature Importance Analysis ===
    print("\n" + wide_rule)
    print("Detailed Feature Importance Analysis (PCA vs SVD)")
    print(wide_rule)

    # User feature importance analysis
    print("User Feature Importance Analysis")
    print(narrow_rule)
    report_top_weights("PCA - Most important original features for User PC1:",
                       user_pca, user_feature_names, 'Feature')
    report_leading_ratios("PCA - First 3 principal components contribution:", user_pca, 'PC')
    report_top_weights("SVD - Most important original features for User dimension 1:",
                       user_svd, user_feature_names, 'Feature')
    report_leading_ratios("SVD - First 3 dimensions contribution:", user_svd, 'SVD')

    # Movie feature importance analysis
    print("Movie Feature Importance Analysis")
    print(narrow_rule)
    report_top_weights("PCA - Most important original features for Movie complete PC1:",
                       movie_pca, movie_feature_names, 'Feature')
    report_top_weights("PCA - Most important original features for Movie genre-only PC1:",
                       movie_genre_pca, genre_names, 'Genre', shown_prefix='Genre_', width=25)
    report_top_weights("SVD - Most important original features for Movie complete dimension 1:",
                       movie_svd, movie_feature_names, 'Feature')
    report_top_weights("SVD - Most important original features for Movie genre-only dimension 1:",
                       movie_genre_svd, genre_names, 'Genre', shown_prefix='Genre_', width=25)

    # PCA vs SVD difference analysis
    print("PCA vs SVD Feature Difference Analysis")
    print(narrow_rule)

    # Correlation between the first PCA axis and the first SVD axis
    user_pca_svd_corr = first_axis_corr(user_features_pca, user_features_svd)
    movie_pca_svd_corr = first_axis_corr(movie_features_pca, movie_features_svd)
    genre_pca_svd_corr = first_axis_corr(movie_genre_features_pca, movie_genre_features_svd)
    print(f"User feature first dimension correlation (PCA vs SVD): {user_pca_svd_corr:.4f}")
    print(f"Movie complete feature first dimension correlation (PCA vs SVD): {movie_pca_svd_corr:.4f}")
    print(f"Movie genre-only feature first dimension correlation (PCA vs SVD): {genre_pca_svd_corr:.4f}")

    print("Detailed feature analysis completed!")

    return (rating_matrix,
            user_features_pca, user_features_svd,
            movie_features_pca, movie_features_svd,
            movie_genre_features_pca, movie_genre_features_svd,
            user_pca, movie_pca, user_svd, movie_svd,
            movie_genre_pca, movie_genre_svd)

# Execute feature engineering; the names are bound in the order of the function's return tuple
(
    rating_matrix,
    user_features_pca, user_features_svd,
    movie_features_pca, movie_features_svd,
    movie_genre_features_pca, movie_genre_features_svd,
    user_pca, movie_pca, user_svd, movie_svd,
    movie_genre_pca, movie_genre_svd,
) = create_features_with_detailed_analysis()

Feature Engineering and Detailed PCA/SVD Comparison Analysis (Enhanced)
Rating matrix shape: (943, 1682)

User feature engineering:
Raw user feature dimensions: 51

Movie feature engineering:
Genre-only feature dimensions: 19
Complete movie feature dimensions: 28

PCA dimensionality reduction analysis:
User PCA: 51D -> 18D (explained variance: 85.654%)
Movie complete feature PCA: 28D -> 13D (explained variance: 90.575%)
Movie genre-only PCA: 19D -> 18D (explained variance: 99.913%)

SVD dimensionality reduction analysis:
User SVD: 51D -> 18D (explained variance: 84.787%)
Movie complete feature SVD: 28D -> 13D (explained variance: 89.649%)
Movie genre-only SVD: 19D -> 18D (explained variance: 99.912%)

Detailed Feature Importance Analysis (PCA vs SVD)
User Feature Importance Analysis
--------------------------------------------------
PCA - Most important original features for User PC1:
      1. Age(normalized)                - weight=0.9078
      2. AgeGroup_Young(18-24)          - weig

In [7]:
# 7. Recommendation Algorithm Implementation - Complete Comparison Experiment
print("=" * 80)
print("Recommendation Algorithm Implementation - Complete Comparison Experiment")
print("=" * 80)

class UserBasedCF:
    """User-based collaborative filtering on a zero-filled user x movie rating matrix.

    A rating of 0 in ``rating_matrix`` means "not rated". Predictions are a
    similarity-weighted average of the ratings that the most similar users
    gave to the target movie.

    Args:
        rating_matrix: DataFrame indexed by user id with movie ids as columns.
        user_features: optional array with one row per row of ``rating_matrix``.
        use_user_features: blend feature similarity into the rating similarity.
        feature_type: label used only in the model name (e.g. 'PCA', 'SVD').
        k_neighbors: maximum number of neighbours used for one prediction.

    Attributes set here: ``user_similarity`` (None until
    ``compute_similarities`` runs), ``global_mean`` and ``name``.
    """

    # Fallback when the matrix holds no rating at all: midpoint of the 1-5
    # range that predictions are clipped to.
    _EMPTY_MATRIX_FALLBACK = 3.0

    def __init__(self, rating_matrix, user_features=None, use_user_features=False, feature_type='PCA', k_neighbors=20):
        self.rating_matrix = rating_matrix
        self.user_features = user_features
        self.use_user_features = use_user_features
        self.feature_type = feature_type
        self.k_neighbors = k_neighbors
        self.user_similarity = None

        # Mean of the OBSERVED ratings only, computed once. Averaging the
        # zero-filled matrix would count every unrated cell as a 0 and yield
        # a fallback far below the valid rating range.
        observed = np.asarray(rating_matrix, dtype=float)
        observed = observed[observed > 0]
        self.global_mean = float(observed.mean()) if observed.size else self._EMPTY_MATRIX_FALLBACK

        feature_str = f'+{feature_type}_user_features' if use_user_features else ''
        self.name = f'UserCF(rating_only{feature_str})'
        print(f"[SUCCESS] {self.name} initialized")

    def compute_similarities(self):
        """Compute the user-user cosine similarity matrix.

        With user features enabled the result is 0.6 * rating similarity +
        0.4 * feature similarity. The diagonal is zeroed so a user is never
        their own neighbour.
        """
        if self.use_user_features and self.user_features is not None:
            rating_sim = cosine_similarity(self.rating_matrix)
            feature_sim = cosine_similarity(self.user_features)
            self.user_similarity = 0.6 * rating_sim + 0.4 * feature_sim
        else:
            self.user_similarity = cosine_similarity(self.rating_matrix)
        np.fill_diagonal(self.user_similarity, 0)

    def predict_rating(self, user_id, movie_id):
        """Predict the rating of ``movie_id`` by ``user_id``.

        Returns ``global_mean`` when the user or movie is unknown, when
        nobody rated the movie, or when no selected neighbour has a positive
        similarity; otherwise the weighted average clipped to [1, 5].
        Requires ``compute_similarities`` to have been called.
        """
        try:
            user_idx = self.rating_matrix.index.get_loc(user_id)
            movie_idx = self.rating_matrix.columns.get_loc(movie_id)
        except KeyError:
            return self.global_mean

        movie_ratings = self.rating_matrix.iloc[:, movie_idx].to_numpy()
        rated_mask = movie_ratings > 0
        if not rated_mask.any():
            return self.global_mean

        similarities = self.user_similarity[user_idx][rated_mask]
        ratings = movie_ratings[rated_mask]

        # Keep only the k most similar users who rated this movie
        if len(similarities) > self.k_neighbors:
            top_k_idx = np.argsort(similarities)[-self.k_neighbors:]
            similarities = similarities[top_k_idx]
            ratings = ratings[top_k_idx]

        # Non-positive similarities carry no usable weight
        positive_mask = similarities > 0
        if not positive_mask.any():
            return self.global_mean

        weighted = np.average(ratings[positive_mask], weights=similarities[positive_mask])
        return float(np.clip(weighted, 1, 5))

class ItemBasedCF:
    """Item-based collaborative filtering on a zero-filled user x movie rating matrix.

    A rating of 0 in ``rating_matrix`` means "not rated". Predictions are a
    similarity-weighted average of the target user's own ratings of the
    movies most similar to the target movie.

    Args:
        rating_matrix: DataFrame indexed by user id with movie ids as columns.
        movie_features: optional array with one row per column of ``rating_matrix``.
        use_content_features: blend content similarity into the rating similarity.
        feature_type: label used only in the model name (e.g. 'PCA', 'SVD').
        feature_scope: label used only in the model name (e.g. 'complete', 'genre_only').
        k_neighbors: maximum number of neighbours used for one prediction.

    Attributes set here: ``item_similarity`` (None until
    ``compute_similarities`` runs), ``global_mean`` and ``name``.
    """

    # Fallback when the matrix holds no rating at all: midpoint of the 1-5
    # range that predictions are clipped to.
    _EMPTY_MATRIX_FALLBACK = 3.0

    def __init__(self, rating_matrix, movie_features=None, use_content_features=False, feature_type='PCA', feature_scope='complete', k_neighbors=20):
        self.rating_matrix = rating_matrix
        self.movie_features = movie_features
        self.use_content_features = use_content_features
        self.feature_type = feature_type
        self.feature_scope = feature_scope
        self.k_neighbors = k_neighbors
        self.item_similarity = None

        # Mean of the OBSERVED ratings only, computed once. Averaging the
        # zero-filled matrix would count every unrated cell as a 0 and yield
        # a fallback far below the valid rating range.
        observed = np.asarray(rating_matrix, dtype=float)
        observed = observed[observed > 0]
        self.global_mean = float(observed.mean()) if observed.size else self._EMPTY_MATRIX_FALLBACK

        if use_content_features:
            feature_str = f'+{feature_type}_{feature_scope}_movie_features'
        else:
            feature_str = ''
        self.name = f'ItemCF(rating_only{feature_str})'
        print(f"[SUCCESS] {self.name} initialized")

    def compute_similarities(self):
        """Compute the item-item cosine similarity matrix.

        With content features enabled the result is 0.6 * rating similarity +
        0.4 * content similarity. The diagonal is zeroed so a movie is never
        its own neighbour.
        """
        rating_item_sim = cosine_similarity(self.rating_matrix.T)

        if self.use_content_features and self.movie_features is not None:
            content_sim = cosine_similarity(self.movie_features)
            self.item_similarity = 0.6 * rating_item_sim + 0.4 * content_sim
        else:
            self.item_similarity = rating_item_sim
        np.fill_diagonal(self.item_similarity, 0)

    def predict_rating(self, user_id, movie_id):
        """Predict the rating of ``movie_id`` by ``user_id``.

        Returns ``global_mean`` when the user or movie is unknown, when the
        user rated nothing, or when no selected neighbour has a positive
        similarity; otherwise the weighted average clipped to [1, 5].
        Requires ``compute_similarities`` to have been called.
        """
        try:
            user_idx = self.rating_matrix.index.get_loc(user_id)
            movie_idx = self.rating_matrix.columns.get_loc(movie_id)
        except KeyError:
            return self.global_mean

        user_ratings = self.rating_matrix.iloc[user_idx, :].to_numpy()
        rated_mask = user_ratings > 0
        if not rated_mask.any():
            return self.global_mean

        similarities = self.item_similarity[movie_idx][rated_mask]
        ratings = user_ratings[rated_mask]

        # Keep only the k rated movies most similar to the target movie
        if len(similarities) > self.k_neighbors:
            top_k_idx = np.argsort(similarities)[-self.k_neighbors:]
            similarities = similarities[top_k_idx]
            ratings = ratings[top_k_idx]

        # Non-positive similarities carry no usable weight
        positive_mask = similarities > 0
        if not positive_mask.any():
            return self.global_mean

        weighted = np.average(ratings[positive_mask], weights=similarities[positive_mask])
        return float(np.clip(weighted, 1, 5))

class HybridRecommender:
    """Weighted blend of a user-based and an item-based model.

    Both component models must expose ``name``, ``compute_similarities()``
    and ``predict_rating(user_id, movie_id)``. The two weights are rescaled
    to sum to 1.
    """
    def __init__(self, user_cf_model, item_cf_model, user_weight=0.5, item_weight=0.5):
        self.user_cf, self.item_cf = user_cf_model, item_cf_model

        # Weight normalization
        weight_sum = user_weight + item_weight
        self.user_weight = user_weight / weight_sum
        self.item_weight = item_weight / weight_sum

        self.name = f'HybridRec({self.user_cf.name}+{self.item_cf.name})'
        print(f"[SUCCESS] {self.name} initialized")

    def compute_similarities(self):
        """Train both components, user model first."""
        for component in (self.user_cf, self.item_cf):
            component.compute_similarities()

    def predict_rating(self, user_id, movie_id):
        """Return the weighted sum of the two component predictions."""
        user_part = self.user_weight * self.user_cf.predict_rating(user_id, movie_id)
        item_part = self.item_weight * self.item_cf.predict_rating(user_id, movie_id)
        return user_part + item_part

print("Recommendation algorithm classes defined successfully")

Recommendation Algorithm Implementation - Complete Comparison Experiment
Recommendation algorithm classes defined successfully


In [8]:
# 8. Complete Model Initialization and Training (Including Genre-only and SVD Options)
print("=" * 80)
print("Complete Recommendation Model Initialization and Training (Including Genre-only and SVD Options)")
print("=" * 80)

# Data splitting
train_ratings, test_ratings = train_test_split(ratings, test_size=CONFIG['TEST_SIZE'], random_state=CONFIG['RANDOM_SEED'])
train_matrix = train_ratings.pivot_table(index='user_id', columns='movie_id', values='rating', fill_value=0)
print(f"Data splitting completed:")
print(f"   Training set: {train_matrix.shape} (users x movies)")
print(f"   Test set: {len(test_ratings)} ratings")
print(f"   Training set sparsity: {(1 - np.count_nonzero(train_matrix) / train_matrix.size) * 100:.2f}%")

# Adjust feature dimensions to match training matrix
print("Adjusting feature dimensions to match training matrix...")
# The feature-engineering cell builds user feature rows in the row order of
# `users`, and movie feature rows in the column order of the full
# `rating_matrix`. Positions are therefore looked up against exactly those
# orderings (one vectorised lookup each), not against DataFrame index labels.
user_train_idx = pd.Index(users['user_id']).get_indexer(train_matrix.index)
movie_train_idx = rating_matrix.columns.get_indexer(train_matrix.columns)
# get_indexer marks unknown ids with -1; fail loudly instead of mis-indexing
assert (user_train_idx >= 0).all(), "Training matrix contains user_ids without user features"
assert (movie_train_idx >= 0).all(), "Training matrix contains movie_ids without movie features"

train_user_features_pca = user_features_pca[user_train_idx]
train_user_features_svd = user_features_svd[user_train_idx]

train_movie_features_pca = movie_features_pca[movie_train_idx]
train_movie_features_svd = movie_features_svd[movie_train_idx]
train_movie_genre_features_pca = movie_genre_features_pca[movie_train_idx]
train_movie_genre_features_svd = movie_genre_features_svd[movie_train_idx]

print(f"   Training user features: PCA{train_user_features_pca.shape}, SVD{train_user_features_svd.shape}")
print(f"   Training movie complete features: PCA{train_movie_features_pca.shape}, SVD{train_movie_features_svd.shape}")
print(f"   Training movie genre-only features: PCA{train_movie_genre_features_pca.shape}, SVD{train_movie_genre_features_svd.shape}")

# Validate dimension matching
assert train_user_features_pca.shape[0] == train_matrix.shape[0], "User feature dimension mismatch"
assert train_movie_features_pca.shape[0] == train_matrix.shape[1], "Movie feature dimension mismatch"
print("Feature dimension validation passed")

# Display feature statistics
print(f"\nFeature statistics:")
print(f"   User PCA feature mean: {train_user_features_pca.mean():.4f}, std: {train_user_features_pca.std():.4f}")
print(f"   User SVD feature mean: {train_user_features_svd.mean():.4f}, std: {train_user_features_svd.std():.4f}")
print(f"   Movie complete PCA feature mean: {train_movie_features_pca.mean():.4f}, std: {train_movie_features_pca.std():.4f}")
print(f"   Movie complete SVD feature mean: {train_movie_features_svd.mean():.4f}, std: {train_movie_features_svd.std():.4f}")
print(f"   Movie genre-only PCA feature mean: {train_movie_genre_features_pca.mean():.4f}, std: {train_movie_genre_features_pca.std():.4f}")
print(f"   Movie genre-only SVD feature mean: {train_movie_genre_features_svd.mean():.4f}, std: {train_movie_genre_features_svd.std():.4f}")

# Initialize all models
print("Initializing all recommendation models...")
models = {}
# Neighbourhood size comes from the shared configuration instead of the class default
k_neighbors = CONFIG['K_NEIGHBORS']

# Basic User CF models (3 types)
print("\n1. User-based Collaborative Filtering models:")
models['UserCF(rating_only)'] = UserBasedCF(train_matrix, use_user_features=False, k_neighbors=k_neighbors)
models['UserCF(rating+PCA_user_features)'] = UserBasedCF(train_matrix, train_user_features_pca, use_user_features=True, feature_type='PCA', k_neighbors=k_neighbors)
models['UserCF(rating+SVD_user_features)'] = UserBasedCF(train_matrix, train_user_features_svd, use_user_features=True, feature_type='SVD', k_neighbors=k_neighbors)

# Basic Item CF models (5 types)
print("\n2. Item-based Collaborative Filtering models:")
models['ItemCF(rating_only)'] = ItemBasedCF(train_matrix, use_content_features=False, k_neighbors=k_neighbors)
models['ItemCF(rating+PCA_complete_features)'] = ItemBasedCF(train_matrix, train_movie_features_pca, use_content_features=True, feature_type='PCA', feature_scope='complete', k_neighbors=k_neighbors)
models['ItemCF(rating+SVD_complete_features)'] = ItemBasedCF(train_matrix, train_movie_features_svd, use_content_features=True, feature_type='SVD', feature_scope='complete', k_neighbors=k_neighbors)
models['ItemCF(rating+PCA_genre_only)'] = ItemBasedCF(train_matrix, train_movie_genre_features_pca, use_content_features=True, feature_type='PCA', feature_scope='genre_only', k_neighbors=k_neighbors)
models['ItemCF(rating+SVD_genre_only)'] = ItemBasedCF(train_matrix, train_movie_genre_features_svd, use_content_features=True, feature_type='SVD', feature_scope='genre_only', k_neighbors=k_neighbors)

# Hybrid recommendation models (selected 6 representative combinations).
# Each entry: (hybrid key, user model key, item model key). The hybrids wrap the
# SAME model objects registered above, so the components are shared.
print("\n3. Hybrid Recommendation models:")
hybrid_specs = [
    ('HybridRec(rating_only)', 'UserCF(rating_only)', 'ItemCF(rating_only)'),
    ('HybridRec(PCA_user+rating_only_item)', 'UserCF(rating+PCA_user_features)', 'ItemCF(rating_only)'),
    ('HybridRec(SVD_user+rating_only_item)', 'UserCF(rating+SVD_user_features)', 'ItemCF(rating_only)'),
    ('HybridRec(rating_only_user+PCA_genre_only)', 'UserCF(rating_only)', 'ItemCF(rating+PCA_genre_only)'),
    ('HybridRec(PCA_user+PCA_complete_features)', 'UserCF(rating+PCA_user_features)', 'ItemCF(rating+PCA_complete_features)'),
    ('HybridRec(SVD_user+SVD_genre_only)', 'UserCF(rating+SVD_user_features)', 'ItemCF(rating+SVD_genre_only)'),
]
for hybrid_key, user_key, item_key in hybrid_specs:
    models[hybrid_key] = HybridRecommender(
        models[user_key],
        models[item_key],
        user_weight=0.5, item_weight=0.5
    )

print(f"\nModel initialization completed! Total {len(models)} recommendation models:")
for i, name in enumerate(models.keys(), 1):
    print(f"   {i:2d}. {name}")

# Train all models and display detailed information
print("\n" + "="*60)
print("Starting training for all models...")
print("="*60)

for i, (name, model) in enumerate(models.items(), 1):
    print(f"[{i}/{len(models)}] Training model: {name}")

    is_hybrid = hasattr(model, 'user_cf')

    # Display model configuration information
    if is_hybrid:
        # Hybrid model
        print(f"   └─ User CF: {model.user_cf.name}")
        print(f"   └─ Item CF: {model.item_cf.name}")
        print(f"   └─ Weights: user{model.user_weight:.1f} + item{model.item_weight:.1f}")
    else:
        # Single model
        if hasattr(model, 'use_user_features') and model.use_user_features:
            print(f"   └─ User feature dimensions: {model.user_features.shape}")
        if hasattr(model, 'use_content_features') and model.use_content_features:
            print(f"   └─ Movie feature dimensions: {model.movie_features.shape} ({model.feature_scope})")

    # Train model. A hybrid only wraps base models that are themselves entries
    # of `models`; recomputing their (deterministic) similarity matrices for
    # every hybrid is wasted work, so a hybrid trains only untrained components.
    if is_hybrid:
        if model.user_cf.user_similarity is None or model.item_cf.item_similarity is None:
            model.compute_similarities()
    else:
        model.compute_similarities()

    # Display similarity matrix information
    if hasattr(model, 'user_similarity') and model.user_similarity is not None:
        print(f"   └─ User similarity matrix: {model.user_similarity.shape} (non-zero elements: {np.count_nonzero(model.user_similarity)})")
    if hasattr(model, 'item_similarity') and model.item_similarity is not None:
        print(f"   └─ Item similarity matrix: {model.item_similarity.shape} (non-zero elements: {np.count_nonzero(model.item_similarity)})")

    print(f"   [SUCCESS] Completed\n")

print("All models training completed!")

Complete Recommendation Model Initialization and Training (Including Genre-only and SVD Options)
Data splitting completed:
   Training set: (943, 1653) (users x movies)
   Test set: 20000 ratings
   Training set sparsity: 94.87%
Adjusting feature dimensions to match training matrix...
   Training user features: PCA(943, 18), SVD(943, 18)
   Training movie complete features: PCA(1653, 13), SVD(1653, 13)
   Training movie genre-only features: PCA(1653, 18), SVD(1653, 18)
Feature dimension validation passed

Feature statistics:
   User PCA feature mean: -0.0000, std: 0.4273
   User SVD feature mean: 0.0884, std: 0.4874
   Movie complete PCA feature mean: -0.0003, std: 0.4354
   Movie complete SVD feature mean: 0.0118, std: 0.5150
   Movie genre-only PCA feature mean: -0.0001, std: 0.2744
   Movie genre-only SVD feature mean: 0.0506, std: 0.3055
Initializing all recommendation models...

1. User-based Collaborative Filtering models:
[SUCCESS] UserCF(rating_only) initialized
[SUCCESS] UserC

In [9]:
# 9. Model Evaluation and Results Comparison (Enhanced)
def _lookup_rmse(results_df, model_name):
    """Return the RMSE recorded for `model_name` in `results_df`, or None when that model has no row."""
    matches = results_df.loc[results_df['Model'] == model_name, 'RMSE']
    return matches.iloc[0] if len(matches) > 0 else None


def _report_feature_gain(label, baseline_rmse, candidate_rmse):
    """Print whether a feature-augmented model improves on its rating-only baseline RMSE."""
    # Explicit None checks: a legitimate RMSE of 0.0 must not be mistaken for "model missing".
    if baseline_rmse is None or candidate_rmse is None:
        return
    if baseline_rmse == 0:
        # A relative improvement over a zero baseline is undefined.
        return
    gain = ((baseline_rmse - candidate_rmse) / baseline_rmse) * 100
    print(f"   {label}: {'Effective' if gain > 0 else 'Ineffective'} (RMSE improvement {gain:.2f}%)")


def evaluate_models_enhanced(models, test_ratings, train_matrix, max_test_samples=500):
    """Evaluate all models and generate detailed comparison report (Enhanced).

    Every model is scored on the same random sample of `test_ratings` (seeded with
    CONFIG['RANDOM_SEED']). Rows whose user is not in `train_matrix.index` or whose
    movie is not in `train_matrix.columns` are counted as cold-start cases and skipped.

    Args:
        models: Mapping of display name -> model object. Each model must expose
            `predict_rating(user_id, movie_id)`.
        test_ratings: DataFrame with `user_id`, `movie_id` and `rating` columns.
        train_matrix: Training matrix; only its `index` (user ids) and `columns`
            (movie ids) are used here, for cold-start detection.
        max_test_samples: Upper bound on the number of test rows sampled for
            evaluation. Pass None to evaluate on every row of `test_ratings`.

    Returns:
        Tuple `(results_df, detailed_results)`:
        - results_df: one row per evaluable model with columns Model, RMSE, MAE,
          Test_Samples and Avg_Pred_Time_ms, sorted by ascending RMSE. Empty (but with
          those columns) when no model produced a prediction.
        - detailed_results: dict mapping model name to its predictions, the matching
          actual ratings, and the cold-start user / movie counts.

    Side effects:
        Prints a progress log and analysis report, and writes the results table to
        `<CONFIG['OUTPUT_PATH']>/result/comprehensive_evaluation_results.json`.
    """
    # Imported once at function scope (the original re-imported it for every test row).
    import time

    print("=" * 80)
    print("Model Evaluation and Results Comparison (Enhanced)")
    print("=" * 80)

    # Prepare test data
    if max_test_samples is None:
        test_sample_size = len(test_ratings)
    else:
        test_sample_size = min(max_test_samples, len(test_ratings))
    test_sample = test_ratings.sample(test_sample_size, random_state=CONFIG['RANDOM_SEED'])
    print(f"Test settings:")
    print(f"   Test sample size: {test_sample_size} / {len(test_ratings)}")
    print(f"   Training set coverage: users{len(train_matrix.index)}, movies{len(train_matrix.columns)}")

    # Statistics of test samples in training set coverage
    test_users_in_train = test_sample['user_id'].isin(train_matrix.index).sum()
    test_movies_in_train = test_sample['movie_id'].isin(train_matrix.columns).sum()
    print(f"   Test user coverage: {test_users_in_train}/{len(test_sample)} ({test_users_in_train/len(test_sample)*100:.1f}%)")
    print(f"   Test movie coverage: {test_movies_in_train}/{len(test_sample)} ({test_movies_in_train/len(test_sample)*100:.1f}%)")

    results = []
    detailed_results = {}

    for i, (name, model) in enumerate(models.items(), 1):
        print(f"\n[{i}/{len(models)}] Evaluating model: {name}")

        predictions = []
        actual_ratings = []
        prediction_times = []

        # Prediction process statistics
        cold_start_users = 0
        cold_start_movies = 0

        for _, row in test_sample.iterrows():
            user_id, movie_id, actual_rating = row['user_id'], row['movie_id'], row['rating']

            # Cold start detection: the user check runs first, so a row with both an
            # unseen user and an unseen movie is counted as a cold-start user only.
            if user_id not in train_matrix.index:
                cold_start_users += 1
                continue
            if movie_id not in train_matrix.columns:
                cold_start_movies += 1
                continue

            # Predict rating; perf_counter is the high-resolution clock intended for
            # timing short intervals such as a single prediction call.
            start_time = time.perf_counter()
            pred_rating = model.predict_rating(user_id, movie_id)
            prediction_times.append(time.perf_counter() - start_time)

            predictions.append(pred_rating)
            actual_ratings.append(actual_rating)

        if len(predictions) > 0:
            # Calculate evaluation metrics
            rmse = np.sqrt(mean_squared_error(actual_ratings, predictions))
            mae = mean_absolute_error(actual_ratings, predictions)
            avg_pred_time = np.mean(prediction_times) * 1000  # Convert to milliseconds

            results.append({
                'Model': name,
                'RMSE': rmse,
                'MAE': mae,
                'Test_Samples': len(predictions),
                'Avg_Pred_Time_ms': avg_pred_time
            })

            detailed_results[name] = {
                'predictions': predictions,
                'actual_ratings': actual_ratings,
                'cold_start_users': cold_start_users,
                'cold_start_movies': cold_start_movies
            }

            print(f"   RMSE: {rmse:.4f}, MAE: {mae:.4f} (samples: {len(predictions)}, time: {avg_pred_time:.2f}ms)")
        else:
            print(f"   [WARNING] Model cannot be evaluated (valid test samples: {len(predictions)}/{len(test_sample)})")

    # Generate detailed results comparison table. Columns are fixed up front so the
    # frame keeps its schema even when no model could be evaluated.
    result_columns = ['Model', 'RMSE', 'MAE', 'Test_Samples', 'Avg_Pred_Time_ms']
    results_df = pd.DataFrame(results, columns=result_columns)
    if results_df.empty:
        # Nothing to rank or compare; the original crashed here with KeyError('RMSE').
        print("\n[WARNING] No model produced a valid prediction; skipping leaderboard and analysis.")
        return results_df, detailed_results
    results_df = results_df.sort_values('RMSE')

    print("\n" + "="*100)
    print("Model Performance Leaderboard (sorted by RMSE)")
    print("="*100)
    print(results_df.to_string(index=False, float_format='%.4f'))

    # Detailed experimental results analysis
    print("\n" + "="*80)
    print("Detailed Experimental Results Analysis")
    print("="*80)

    best_model = results_df.iloc[0]
    worst_model = results_df.iloc[-1]
    print(f"Best model: {best_model['Model']} (RMSE: {best_model['RMSE']:.4f})")
    print(f"Worst model: {worst_model['Model']} (RMSE: {worst_model['RMSE']:.4f})")
    performance_gap = ((worst_model['RMSE'] - best_model['RMSE']) / worst_model['RMSE']) * 100
    print(f"Performance gap: {performance_gap:.2f}%")

    # Feature value analysis
    print(f"\nFeature Value Analysis:")

    # 1. User feature value (PCA vs SVD)
    user_rating_only = _lookup_rmse(results_df, 'UserCF(rating_only)')
    _report_feature_gain("User PCA features", user_rating_only,
                         _lookup_rmse(results_df, 'UserCF(rating+PCA_user_features)'))
    _report_feature_gain("User SVD features", user_rating_only,
                         _lookup_rmse(results_df, 'UserCF(rating+SVD_user_features)'))

    # 2. Movie feature value (complete vs genre-only)
    item_rating_only = _lookup_rmse(results_df, 'ItemCF(rating_only)')
    _report_feature_gain("Movie complete features", item_rating_only,
                         _lookup_rmse(results_df, 'ItemCF(rating+PCA_complete_features)'))
    _report_feature_gain("Movie genre-only features", item_rating_only,
                         _lookup_rmse(results_df, 'ItemCF(rating+PCA_genre_only)'))

    # 3. PCA vs SVD comparison (average RMSE over every model whose name mentions the method)
    pca_models = results_df[results_df['Model'].str.contains('PCA')]
    svd_models = results_df[results_df['Model'].str.contains('SVD')]

    if len(pca_models) > 0 and len(svd_models) > 0:
        pca_avg_rmse = pca_models['RMSE'].mean()
        svd_avg_rmse = svd_models['RMSE'].mean()

        if pca_avg_rmse < svd_avg_rmse:
            avg_gain = ((svd_avg_rmse - pca_avg_rmse) / svd_avg_rmse) * 100
            print(f"   PCA outperforms SVD: average RMSE improvement {avg_gain:.2f}%")
        else:
            avg_gain = ((pca_avg_rmse - svd_avg_rmse) / pca_avg_rmse) * 100
            print(f"   SVD outperforms PCA: average RMSE improvement {avg_gain:.2f}%")

    # Save detailed results
    results_file = os.path.join(CONFIG['OUTPUT_PATH'], 'result', 'comprehensive_evaluation_results.json')
    # Make sure the target directory exists so the function does not depend on an earlier cell.
    os.makedirs(os.path.dirname(results_file), exist_ok=True)
    results_df.to_json(results_file, orient='records', indent=2, force_ascii=False)
    print(f"\nComprehensive evaluation results saved to: {results_file}")

    return results_df, detailed_results

# Execute comprehensive model evaluation.
# `evaluation_results` is the per-model metrics table sorted by ascending RMSE (best model first);
# `detailed_results` maps each model name to its raw predictions, actual ratings and cold-start counts.
evaluation_results, detailed_results = evaluate_models_enhanced(models, test_ratings, train_matrix)

Model Evaluation and Results Comparison (Enhanced)
Test settings:
   Test sample size: 500 / 20000
   Training set coverage: users943, movies1653
   Test user coverage: 500/500 (100.0%)
   Test movie coverage: 500/500 (100.0%)

[1/14] Evaluating model: UserCF(rating_only)
   RMSE: 0.9898, MAE: 0.7879 (samples: 500, time: 0.18ms)

[2/14] Evaluating model: UserCF(rating+PCA_user_features)
   RMSE: 0.9835, MAE: 0.7825 (samples: 500, time: 0.18ms)

[3/14] Evaluating model: UserCF(rating+SVD_user_features)
   RMSE: 0.9733, MAE: 0.7804 (samples: 500, time: 0.18ms)

[4/14] Evaluating model: ItemCF(rating_only)
   RMSE: 0.9259, MAE: 0.7267 (samples: 500, time: 0.18ms)

[5/14] Evaluating model: ItemCF(rating+PCA_complete_features)
   RMSE: 0.9547, MAE: 0.7568 (samples: 500, time: 0.20ms)

[6/14] Evaluating model: ItemCF(rating+SVD_complete_features)
   RMSE: 0.9499, MAE: 0.7495 (samples: 500, time: 0.19ms)

[7/14] Evaluating model: ItemCF(rating+PCA_genre_only)
   RMSE: 0.9859, MAE: 0.7747 (sam

In [None]:
# 10. Project Summary
summary_banner = "=" * 80
print(summary_banner)
print("MovieLens Recommendation System Project Summary")
print(summary_banner)

# Dataset-level counts, computed once and reused for the sparsity figure
summary_user_count = ratings['user_id'].nunique()
summary_movie_count = ratings['movie_id'].nunique()
summary_sparsity_pct = (1 - len(ratings) / (summary_user_count * summary_movie_count)) * 100

# evaluation_results is sorted by RMSE, so its first row is the best model
summary_best_row = evaluation_results.iloc[0]

dataset_section = {
    'rating_count': len(ratings),
    'user_count': summary_user_count,
    'movie_count': summary_movie_count,
    'sparsity': f"{summary_sparsity_pct:.2f}%"
}

feature_engineering_section = {
    'user_features': ['age', 'age_group', 'gender', 'occupation', 'region'],
    'movie_complete_features': ['genres(19D)', 'year', 'decade'],
    'movie_genre_only_features': ['genres(19D)'],
    'dimensionality_reduction_methods': ['PCA', 'SVD'],
    'PCA_user_feature_dimensions': user_features_pca.shape[1],
    'SVD_user_feature_dimensions': user_features_svd.shape[1],
    'PCA_movie_complete_feature_dimensions': movie_features_pca.shape[1],
    'SVD_movie_complete_feature_dimensions': movie_features_svd.shape[1],
    'PCA_movie_genre_only_feature_dimensions': movie_genre_features_pca.shape[1],
    'SVD_movie_genre_only_feature_dimensions': movie_genre_features_svd.shape[1]
}

experimental_design_section = {
    'algorithm_types': ['User-based Collaborative Filtering', 'Item-based Collaborative Filtering', 'Hybrid Recommendation'],
    'feature_combinations': ['rating_only', 'rating+user_features', 'rating+movie_complete_features', 'rating+movie_genre_only_features'],
    'dimensionality_reduction_comparison': ['PCA vs SVD'],
    'movie_feature_comparison': ['complete_features vs genre_only_features'],
    'total_model_count': len(models)
}

best_model_section = {
    'name': summary_best_row['Model'],
    'RMSE': f"{summary_best_row['RMSE']:.4f}",
    'MAE': f"{summary_best_row['MAE']:.4f}"
}

output_files_section = {
    'result_directory': CONFIG['OUTPUT_PATH'],
    'comprehensive_evaluation_results': 'result/comprehensive_evaluation_results.json'
}

# Key order here is the order used for both the printed report and the JSON file
summary = {
    'project_name': 'MovieLens 100k Recommendation System Comprehensive Comparison Experiment',
    'experiment_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'dataset': dataset_section,
    'feature_engineering': feature_engineering_section,
    'experimental_design': experimental_design_section,
    'evaluation_metrics': ['RMSE', 'MAE', 'prediction_time'],
    'best_model': best_model_section,
    'output_files': output_files_section
}


def _print_summary_entry(entry_name, entry_value):
    """Print one top-level summary entry, followed by a blank line.

    Dicts are printed as a heading plus one indented bullet per item, lists as a
    comma-separated line, and anything else as a plain `heading: value` line.
    """
    heading = entry_name.replace('_', ' ').title()
    if isinstance(entry_value, dict):
        print(f"{heading}:")
        for field_name, field_value in entry_value.items():
            print(f"   - {field_name.replace('_', ' ')}: {field_value}")
    elif isinstance(entry_value, list):
        print(f"{heading}: {', '.join(map(str, entry_value))}")
    else:
        print(f"{heading}: {entry_value}")
    print()


# Print summary
for entry_name, entry_value in summary.items():
    _print_summary_entry(entry_name, entry_value)

print("Experiment completed!")
print("\nKey Findings:")
key_findings = (
    f"   1. Compared {len(models)} different recommendation algorithm configurations",
    "   2. Analyzed the value of user features and movie features",
    "   3. Compared the effects of PCA and SVD dimensionality reduction methods",
    "   4. Contrasted movie complete features vs genre-only features effects",
    "   5. Provided detailed feature importance analysis and algorithm performance comparison",
)
for finding_line in key_findings:
    print(finding_line)

# Save project summary
summary_file = os.path.join(CONFIG['OUTPUT_PATH'], 'comprehensive_project_summary.json')
with open(summary_file, 'w', encoding='utf-8') as summary_handle:
    json.dump(summary, summary_handle, ensure_ascii=False, indent=2)

print(f"Project summary saved to: {summary_file}")