# Movie Recommendation System - Data Preprocessing
## MovieLens Dataset Processing

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import joblib
import re
import os

## 1. Load Data

In [None]:
# Read the movie catalogue CSV into a DataFrame and preview it.
movies_csv_path = '../data/movie.csv'
movies = pd.read_csv(movies_csv_path)
print(f"Movies: {len(movies)} entries")
movies.head()

In [None]:
# Read the ratings CSV into a DataFrame and preview it.
# NOTE: no sampling is applied here -- the entire file is loaded into memory
# (the full dataset is reported to hold 20M+ ratings).
ratings_csv_path = '../data/rating.csv'
ratings = pd.read_csv(ratings_csv_path)
print(f"Ratings: {len(ratings)} entries")
ratings.head()

## 2. Extract Year from Title

In [None]:
def extract_year(title):
    """Extract the release year from a title like 'Toy Story (1995)'.

    The input is converted with ``str()`` first, so non-string values
    (e.g. ``None`` or NaN) are tolerated and simply yield no match.

    When the title contains several parenthesised four-digit groups, the
    LAST one is returned. This matches ``clean_title``, which strips the
    trailing ``(dddd)`` group, so both helpers agree on which group is the
    year.

    Returns:
        The year as an ``int``, or ``None`` when no ``(dddd)`` group exists.
    """
    candidates = re.findall(r'\((\d{4})\)', str(title))
    if candidates:
        # The year suffix comes last; earlier groups belong to the title.
        return int(candidates[-1])
    return None

def clean_title(title):
    """Return the title without its trailing '(dddd)' year suffix.

    The input is converted with ``str()``; only a four-digit parenthesised
    group at the very end (optionally surrounded by whitespace) is removed,
    and the result is stripped of leading/trailing whitespace.
    """
    raw_text = str(title)
    without_year = re.sub(r'\s*\(\d{4}\)\s*$', '', raw_text)
    return without_year.strip()

# Derive the release year and the year-free title from the raw title column.
raw_titles = movies['title']
movies['year'] = raw_titles.apply(extract_year)
movies['clean_title'] = raw_titles.apply(clean_title)
movies.head()

## 3. Calculate Average Ratings

In [None]:
# Per-movie rating statistics: mean rating and number of ratings.
ratings_by_movie = ratings.groupby('movieId')
rating_stats = ratings_by_movie.agg(
    avg_rating=pd.NamedAgg(column='rating', aggfunc='mean'),
    rating_count=pd.NamedAgg(column='rating', aggfunc='count'),
).reset_index()

# Left join keeps every movie; movies that have no ratings come back with
# missing statistics, which are replaced by 0 below.
movies = movies.merge(rating_stats, on='movieId', how='left')
for stat_col in ('avg_rating', 'rating_count'):
    movies[stat_col] = movies[stat_col].fillna(0)

rated_mask = movies['rating_count'] > 0
print(f"Movies with ratings: {rated_mask.sum()}")
movies.head(10)

## 4. Process Genres

In [None]:
# Collect every distinct genre label found in the pipe-separated 'genres'
# column (rows with a missing value are skipped).
all_genres = set()
for genre_field in movies['genres'].dropna():
    all_genres.update(genre_field.split('|'))

print(f"Total genres: {len(all_genres)}")
print(sorted(all_genres))

In [None]:
# One-hot encode genres: one 0/1 column per genre, named 'genre_<label>'.
# The '(no genres listed)' placeholder is excluded from the feature set.
genre_list = sorted(all_genres - {'(no genres listed)'})

# Split each row's genre string once; missing values become [''], which
# matches no genre.
genre_tokens = movies['genres'].fillna('').str.split('|')

for genre in genre_list:
    movies[f'genre_{genre}'] = genre_tokens.apply(
        lambda tokens: int(genre in tokens)
    )

movies.head()

## 5. Create Feature Matrix for Similarity

In [None]:
# Assemble the similarity features: genre indicator columns, average rating
# and release year.
genre_cols = [name for name in movies.columns if name.startswith('genre_')]

scaler = MinMaxScaler()

# Missing years are replaced by the median year. Despite their names, the
# two '*_normalized' columns hold raw values here; the actual rescaling is
# done by the scaler below.
median_year = movies['year'].median()
movies['year_normalized'] = movies['year'].fillna(median_year)
movies['rating_normalized'] = movies['avg_rating']

feature_cols = [*genre_cols, 'rating_normalized', 'year_normalized']
X = movies[feature_cols].to_numpy()

# Fit the scaler on the full matrix and rescale every feature column.
X_scaled = scaler.fit_transform(X)

print(f"Feature matrix shape: {X_scaled.shape}")

## 6. Clustering for Faster Recommendations

In [None]:
# Group movies into clusters with KMeans on the scaled feature matrix.
n_clusters = 50  # Number of clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
kmeans.fit(X_scaled)
# labels_ holds the cluster index assigned to each row during fitting.
movies['cluster'] = kmeans.labels_

print(f"Clusters created: {n_clusters}")
cluster_sizes = movies['cluster'].value_counts()
print(cluster_sizes.head(10))

## 7. Save Processed Data

In [None]:
# Make sure the output directory exists before writing anything.
os.makedirs('../models', exist_ok=True)

# Processed movie table (including cluster labels) as CSV.
movies.to_csv('../models/movies_with_clusters.csv', index=False)
print("Saved: movies_with_clusters.csv")

# Remaining artifacts are pickled with joblib: the scaled feature matrix,
# the fitted scaler and the ordered genre list.
pickled_artifacts = (
    (X_scaled, '../models/movies_X_scaled.pkl', "Saved: movies_X_scaled.pkl"),
    (scaler, '../models/movies_scaler.pkl', "Saved: movies_scaler.pkl"),
    (genre_list, '../models/genre_list.pkl', "Saved: genre_list.pkl"),
)
for artifact, target_path, done_message in pickled_artifacts:
    joblib.dump(artifact, target_path)
    print(done_message)

## 8. Summary Statistics

In [None]:
# Print a short overview of the processed dataset.
summary_lines = [
    "=" * 50,
    "DATASET SUMMARY",
    "=" * 50,
    f"Total Movies: {len(movies)}",
    f"Total Ratings: {len(ratings)}",
    f"Year Range: {movies['year'].min()} - {movies['year'].max()}",
    f"Genres: {len(genre_list)}",
    f"Clusters: {n_clusters}",
    f"Movies with 100+ ratings: {(movies['rating_count'] >= 100).sum()}",
    "=" * 50,
]
for summary_line in summary_lines:
    print(summary_line)

In [None]:
# Rank movies by average rating, restricted to those with >= 1000 ratings.
has_enough_ratings = movies['rating_count'] >= 1000
popular_movies = movies[has_enough_ratings].sort_values('avg_rating', ascending=False)

display_cols = ['clean_title', 'year', 'genres', 'avg_rating', 'rating_count']
print("\nTop 20 Highest Rated Movies (1000+ ratings):")
top_twenty = popular_movies[display_cols].head(20)
print(top_twenty.to_string())