In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Phase 1 — Data Selection & Understanding  

In this first phase we:
- Select a manageable subset of the Netflix Prize data.
- Learn the structure of the files.
- Stream large files in chunks (to fit into RAM).
- Merge ratings with movie titles for readability.

We'll end up with a clean `DataFrame` of **UserID, MovieID, Rating, Title**.


In [6]:
import pandas as pd
import numpy as np
from pathlib import Path

pd.set_option('display.max_columns', 50)

# Path to your dataset (adjust as needed for Kaggle)
DATA_DIR = Path("/kaggle/input/netflix-prize-data/").resolve()
COMBINED_FILE = DATA_DIR / "combined_data_1.txt"
MOVIE_TITLES = DATA_DIR / "movie_titles.csv"

print("Files exist?", COMBINED_FILE.exists(), MOVIE_TITLES.exists())


Files exist? True True


In [7]:
def stream_combined_ratings(path: Path, max_rows: int = None) -> pd.DataFrame:
    """
    Stream-read combined_data file.
    Returns DataFrame with UserID, MovieID, Rating, Date.
    max_rows lets you limit rows for memory reasons.
    """
    rows = []
    current_movie = None
    count = 0
    with open(path, 'r', encoding='latin-1') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.endswith(':'):  # new movie block
                current_movie = int(line[:-1])
                continue
            user_id, rating, date = line.split(',')
            rows.append((int(user_id), current_movie, float(rating), date))
            count += 1
            if max_rows is not None and count >= max_rows:
                break
    return pd.DataFrame(rows, columns=['UserID','MovieID','Rating','Date'])

# Example: read first 500k rows (adjust based on RAM)
ratings_df = stream_combined_ratings(COMBINED_FILE, max_rows=500_000)
ratings_df.head()


Unnamed: 0,UserID,MovieID,Rating,Date
0,1488844,1,3.0,2005-09-06
1,822109,1,5.0,2005-05-13
2,885013,1,4.0,2005-10-19
3,30878,1,4.0,2005-12-26
4,823519,1,3.0,2004-05-03


In [10]:
rows = []
with open(MOVIE_TITLES, 'r', encoding='latin-1') as f:
    for line in f:
        # split only first two commas, rest stays in title
        parts = line.strip().split(',', 2)
        if len(parts) < 3:
            continue
        movie_id = int(parts[0])
        year = int(parts[1]) if parts[1].isdigit() else None
        title = parts[2]
        rows.append((movie_id, year, title))

titles_df = pd.DataFrame(rows, columns=['MovieID','Year','Title'])
titles_df.head()


Unnamed: 0,MovieID,Year,Title
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [11]:
ratings_df = ratings_df.merge(
    titles_df[['MovieID','Title']], on='MovieID', how='left'
)
ratings_df.head()


Unnamed: 0,UserID,MovieID,Rating,Date,Title
0,1488844,1,3.0,2005-09-06,Dinosaur Planet
1,822109,1,5.0,2005-05-13,Dinosaur Planet
2,885013,1,4.0,2005-10-19,Dinosaur Planet
3,30878,1,4.0,2005-12-26,Dinosaur Planet
4,823519,1,3.0,2004-05-03,Dinosaur Planet


# Phase 2 — Pre-processing  

We’ll clean and type-convert our data to make it ready for modelling:
- Convert IDs to integers.
- Convert Rating to float.
- Parse Date to datetime.
- Drop obvious missing values if any.
- Split into train/test per user (80/20) to simulate a recommendation scenario.


In [12]:
# Ensure correct types
ratings_df['UserID'] = ratings_df['UserID'].astype('int32')
ratings_df['MovieID'] = ratings_df['MovieID'].astype('int32')
ratings_df['Rating'] = ratings_df['Rating'].astype('float32')

# Convert date to datetime
ratings_df['Date'] = pd.to_datetime(ratings_df['Date'], errors='coerce')

# Drop rows with missing values (should be minimal)
ratings_df = ratings_df.dropna(subset=['UserID','MovieID','Rating'])

ratings_df.info()
ratings_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   UserID   500000 non-null  int32         
 1   MovieID  500000 non-null  int32         
 2   Rating   500000 non-null  float32       
 3   Date     500000 non-null  datetime64[ns]
 4   Title    500000 non-null  object        
dtypes: datetime64[ns](1), float32(1), int32(2), object(1)
memory usage: 13.4+ MB


Unnamed: 0,UserID,MovieID,Rating,Date,Title
0,1488844,1,3.0,2005-09-06,Dinosaur Planet
1,822109,1,5.0,2005-05-13,Dinosaur Planet
2,885013,1,4.0,2005-10-19,Dinosaur Planet
3,30878,1,4.0,2005-12-26,Dinosaur Planet
4,823519,1,3.0,2004-05-03,Dinosaur Planet


In [14]:
from tqdm.notebook import tqdm

def split_train_test_per_user(df, test_size=0.2, seed=42):
    np.random.seed(seed)
    train_parts = []
    test_parts = []
    for user_id, user_df in tqdm(df.groupby('UserID'), desc="Splitting users"):
        # random permute indices for this user
        idx = np.arange(len(user_df))
        np.random.shuffle(idx)
        cutoff = int(len(idx) * (1 - test_size))
        train_idx = idx[:cutoff]
        test_idx = idx[cutoff:]
        train_parts.append(user_df.iloc[train_idx])
        test_parts.append(user_df.iloc[test_idx])
    train_df = pd.concat(train_parts).reset_index(drop=True)
    test_df = pd.concat(test_parts).reset_index(drop=True)
    return train_df, test_df

train_df, test_df = split_train_test_per_user(ratings_df, test_size=0.2)

print("Train size:", len(train_df), "Test size:", len(test_df))
train_df.head()


Splitting users:   0%|          | 0/215008 [00:00<?, ?it/s]

Train size: 266701 Test size: 233299


Unnamed: 0,UserID,MovieID,Rating,Date,Title
0,7,28,4.0,2005-05-23,Lilo and Stitch
1,7,83,5.0,2005-10-30,Silkwood
2,7,8,5.0,2005-07-30,What the #$*! Do We Know!?
3,79,84,3.0,2004-07-06,The Powerpuff Girls Movie
4,87,95,1.0,2005-05-19,Dona Herlinda and Her Son


# Phase 3 — Baseline Models  

We start with simple baselines to understand the recommendation concept.

## 1. Popularity-based Recommendation  
Recommend the movies that are most popular (highest average rating or most ratings overall).  
No personalization yet.

## 2. User-based Collaborative Filtering (CF)  
Find users who rated things similarly (cosine similarity) and recommend what they liked.  

These baselines give us a sense of how well “trivial” models perform before we try more advanced methods.


In [15]:
# Compute average rating and count per movie from the training set
movie_stats = (
    train_df.groupby(['MovieID','Title'])['Rating']
    .agg(['mean','count'])
    .reset_index()
    .rename(columns={'mean':'AvgRating','count':'NumRatings'})
)

# Top-N by number of ratings (or by avg rating if you prefer)
top_movies = movie_stats.sort_values('NumRatings', ascending=False).head(20)
top_movies[['Title','NumRatings','AvgRating']].head(10)


Unnamed: 0,Title,NumRatings,AvgRating
29,Something's Gotta Give,46142,3.723159
142,The Game,22541,3.823122
27,Lilo and Stitch,21337,3.796035
110,Duplex (Widescreen),16802,3.082907
117,Rambo: First Blood Part II,11857,3.410137
82,Silkwood,11319,3.706246
57,Dragonheart,11156,3.592954
107,Spartan,11009,3.180034
76,Congo,8644,2.843359
7,What the #$*! Do We Know!?,6888,3.1491


In [16]:
def recommend_popular(n=10):
    return top_movies.head(n)[['Title','NumRatings','AvgRating']]

recommend_popular(10)


Unnamed: 0,Title,NumRatings,AvgRating
29,Something's Gotta Give,46142,3.723159
142,The Game,22541,3.823122
27,Lilo and Stitch,21337,3.796035
110,Duplex (Widescreen),16802,3.082907
117,Rambo: First Blood Part II,11857,3.410137
82,Silkwood,11319,3.706246
57,Dragonheart,11156,3.592954
107,Spartan,11009,3.180034
76,Congo,8644,2.843359
7,What the #$*! Do We Know!?,6888,3.1491


In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# Build a small user-item matrix for demo (e.g., first 5,000 users)
sample_users = train_df['UserID'].drop_duplicates().sample(5000, random_state=42)
user_item = (
    train_df[train_df['UserID'].isin(sample_users)]
    .pivot_table(index='UserID', columns='MovieID', values='Rating')
    .fillna(0)
)

# Compute cosine similarity between users
user_sim = cosine_similarity(user_item)
user_sim_df = pd.DataFrame(user_sim, index=user_item.index, columns=user_item.index)

def recommend_user_based(user_id, top_n=10):
    if user_id not in user_sim_df.index:
        return recommend_popular(top_n)  # fallback
    # get most similar users (excluding self)
    similar_users = user_sim_df.loc[user_id].drop(user_id).sort_values(ascending=False).index[:10]
    # gather movies those similar users rated highly
    similar_ratings = train_df[train_df['UserID'].isin(similar_users)]
    # exclude movies the target user has already rated
    seen_movies = set(train_df.loc[train_df['UserID']==user_id, 'MovieID'])
    # compute average rating per movie among similar users
    recs = (
        similar_ratings[~similar_ratings['MovieID'].isin(seen_movies)]
        .groupby(['MovieID','Title'])['Rating'].mean()
        .reset_index()
        .sort_values('Rating', ascending=False)
        .head(top_n)
    )
    return recs[['Title','Rating']]

# Example usage:
sample_user = user_item.index[0]
recommend_user_based(sample_user, top_n=10)


Unnamed: 0,Title,Rating


# Phase 4 — Matrix Factorization (SVD)

Matrix Factorization captures hidden (latent) factors for users and movies.

- Each user and each movie is represented by a vector of *k* latent factors.
- The dot product of these vectors gives the predicted rating.

We’ll use the [`surprise`](https://surprise.readthedocs.io/) library’s **SVD** implementation to train a simple model and compare it to our baselines.


In [18]:
# Install scikit-surprise if not already installed
# (In Kaggle you may need to uncomment)
# !pip install scikit-surprise

from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split as surprise_split
from surprise.model_selection import cross_validate
from surprise import accuracy


In [19]:
# Surprise expects a dataframe with at least user, item, rating columns.
# We'll use our train_df/test_df prepared earlier.

reader = Reader(rating_scale=(1, 5))  # Netflix ratings 1–5
data = Dataset.load_from_df(train_df[['UserID', 'MovieID', 'Rating']], reader)

# Surprise has its own train/test split, but we can also build a Trainset from ours:
trainset = data.build_full_trainset()

# Build testset from our test_df
testset = list(test_df[['UserID','MovieID','Rating']].itertuples(index=False, name=None))


In [20]:
# Train SVD
algo = SVD(n_factors=50, reg_all=0.02, lr_all=0.005, n_epochs=20)
algo.fit(trainset)

# Predict on testset
predictions = algo.test(testset)

# Evaluate RMSE
rmse = accuracy.rmse(predictions)
print("Test RMSE:", rmse)


RMSE: 1.0000
Test RMSE: 1.0000445726603848


In [21]:
def recommend_svd(user_id, n=10):
    """Recommend n movies to user_id using trained SVD model."""
    # Get all movie IDs
    all_movie_ids = train_df['MovieID'].unique()
    # Movies user has already rated
    seen_movies = set(train_df.loc[train_df['UserID']==user_id, 'MovieID'])
    # Predict rating for all unseen movies
    preds = []
    for mid in all_movie_ids:
        if mid not in seen_movies:
            preds.append((mid, algo.predict(user_id, mid).est))
    # Sort by predicted rating
    top_n = sorted(preds, key=lambda x: x[1], reverse=True)[:n]
    # Attach titles
    recs = titles_df[titles_df['MovieID'].isin([m for m,_ in top_n])][['MovieID','Title']]
    scores = pd.DataFrame(top_n, columns=['MovieID','PredRating'])
    return recs.merge(scores, on='MovieID').sort_values('PredRating', ascending=False)

# Example usage:
user_example = train_df['UserID'].iloc[0]
recommend_svd(user_example, n=10)


Unnamed: 0,MovieID,Title,PredRating
0,13,Lord of the Rings: The Return of the King: Ext...,4.82693
3,33,Aqua Teen Hunger Force: Vol. 1,4.726719
9,106,Stevie Ray Vaughan and Double Trouble: Live at...,4.549478
2,32,ABC Primetime: Mel Gibson's The Passion of the...,4.456822
7,97,Mostly Martha,4.407733
6,85,Elfen Lied,4.3951
8,98,The Battle of Algiers: Bonus Material,4.372178
5,76,I Love Lucy: Season 2,4.352892
1,25,Inspector Morse 31: Death Is Now My Neighbour,4.312666
4,37,Zatoichi's Conspiracy,4.297958


# Phase 5 — Evaluation & Metrics

We already computed **RMSE** for rating prediction.
Now we’ll add ranking metrics:

- **Precision@k:** Of the top-k recommendations, what fraction are actually relevant?
- **Recall@k:** Of all the relevant items for the user, how many did we recommend in the top-k?

This shows how well our model does at recommending good movies rather than just predicting ratings.


In [23]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a list of Surprise predictions."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # sort each user's predictions and keep top n
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

# Build top-N recommendations for all users
top_n_preds = get_top_n(predictions, n=10)


In [24]:
def precision_recall_at_k(predictions, k=10, threshold=4.0):
    """Return precision@k and recall@k for each user."""
    # map user to list of predictions sorted by est descending
    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    precisions = dict()
    recalls = dict()
    
    for uid, user_ratings in user_est_true.items():
        # sort by estimated rating
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        # number of relevant items in top k
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((true_r >= threshold) for (_, true_r) in user_ratings[:k])
        n_rel_k = sum((true_r >= threshold) for (_, true_r) in user_ratings[:k])
        
        precisions[uid] = n_rec_k / k if k != 0 else 1
        recalls[uid] = n_rel_k / n_rel if n_rel != 0 else 1
    
    return precisions, recalls

precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4.0)

print("Mean Precision@10:", np.mean(list(precisions.values())))
print("Mean Recall@10:", np.mean(list(recalls.values())))


Mean Precision@10: 0.06254930049114453
Mean Recall@10: 0.999990720819508


In [25]:
# You can peek at similar movies by looking at item factors (algo.qi)
# For demonstration: compute cosine similarity between movie latent factors

from sklearn.metrics.pairwise import cosine_similarity

movie_factors = algo.qi  # latent factors for each movie (ordered by inner id)
similarities = cosine_similarity(movie_factors)

# Map back to MovieIDs:
inner_to_raw = {inner: algo.trainset.to_raw_iid(inner) for inner in range(algo.trainset.n_items)}

def similar_movies(movie_id, top_n=10):
    """Find top_n most similar movies (by latent factors) to a given movie_id."""
    inner_id = algo.trainset.to_inner_iid(movie_id)
    sim_scores = list(enumerate(similarities[inner_id]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # skip the first one (itself)
    sim_scores = sim_scores[1:top_n+1]
    sim_movie_ids = [int(inner_to_raw[idx]) for idx, _ in sim_scores]
    return titles_df[titles_df['MovieID'].isin(sim_movie_ids)][['MovieID','Title']]

# Example: find movies similar to a specific MovieID
example_movie_id = train_df['MovieID'].iloc[0]
similar_movies(example_movie_id, top_n=5)


Unnamed: 0,MovieID,Title
17,18,Immortal Beloved
83,84,The Powerpuff Girls Movie
117,118,Rambo: First Blood Part II
118,119,Travel the World by Train: Africa
121,122,Cube 2: Hypercube
