In [15]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

# 1. Load raw data
# Each rating row is joined with its movie metadata so the title travels with it.
df = pd.read_csv('../data/ratings.csv')
movies = pd.read_csv('../data/movies.csv')
df = pd.merge(df, movies, on='movieId')


# 2. Filter out movies with < 10 ratings
# Count ratings per movieId (the key the model is trained on), not per title:
# two different movieIds that share a title would otherwise have their counts
# pooled, letting a rarely-rated movie pass the threshold.
movie_stats = df.groupby('movieId')['rating'].count()
popular_movies = movie_stats[movie_stats >= 10].index  # movieIds, not titles
df_clean = df[df['movieId'].isin(popular_movies)]

print(f"SVD Training Data: {df_clean['movieId'].nunique()} Movies")


# 3. Load into Surprise
# Surprise expects exactly (user, item, rating) columns, in that order.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(df_clean[['userId', 'movieId', 'rating']], reader)

SVD Training Data: 2269 Movies


In [18]:
# Hold out 20% of the ratings for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
trainset, testset = train_test_split(
    data,
    test_size=0.20,
    random_state=42,
)

# Matrix-factorization model with 100 latent factors per user and per movie
# (hidden features the model learns on its own, e.g. Action, Romance, etc.).
svd_model = SVD(
    n_factors=100,
    random_state=42,
)

# Fit on the training split only.
print("Training SVD model (Matrix Factorization)...")
svd_model.fit(trainset)
print(" Model Trained Successfully!")

Training SVD model (Matrix Factorization)...
 Model Trained Successfully!


In [21]:
# Score every held-out (user, movie) pair with the trained model.
predictions = svd_model.test(testset)

# Root-mean-squared error over the held-out ratings. accuracy.rmse prints its
# own "RMSE: ..." line and returns the value, which is then reported again at
# full precision.
print("Evaluating SVD Model...")
rmse_svd = accuracy.rmse(predictions, verbose=True)

print(f"Final SVD RMSE: {rmse_svd}")

Evaluating SVD Model...
RMSE: 0.8517
Final SVD RMSE: 0.8517464529669303


In [19]:
def get_svd_recommendations(user_id, n_recommendations=5, model=None,
                            ratings=None, movie_titles=None):
    """Print the movies the model predicts ``user_id`` would rate highest.

    Every movie in ``ratings`` that the user has not rated is scored with
    ``model.predict(user_id, movie_id).est``; the ``n_recommendations``
    highest-scoring movies are printed with their titles.

    Args:
        user_id: Raw user id, as it appears in the ``userId`` column.
        n_recommendations: Number of movies to print.
        model: Object exposing ``predict(uid, iid)`` whose result has an
            ``est`` attribute. Defaults to the notebook-level ``svd_model``.
        ratings: DataFrame with ``userId`` and ``movieId`` columns. Defaults
            to the notebook-level ``df_clean``.
        movie_titles: DataFrame with ``movieId`` and ``title`` columns.
            Defaults to the notebook-level ``movies``.

    Returns:
        None. The recommendations are written to stdout.
    """
    # Fall back to the objects built in the earlier notebook cells.
    if model is None:
        model = svd_model
    if ratings is None:
        # df_clean, not df: movies dropped by the popularity filter were never
        # part of the data the model was fitted on, so it has learned nothing
        # about them and they should not compete for a recommendation slot.
        ratings = df_clean
    if movie_titles is None:
        # Reuse the frame already in memory instead of re-reading the CSV on
        # every call.
        movie_titles = movies

    # 1. Candidate pool: every distinct movie, in first-appearance order
    candidate_ids = pd.Series(ratings['movieId'].unique())

    # 2. Movies the user has ALREADY rated
    already_rated = ratings.loc[ratings['userId'] == user_id, 'movieId']

    # 3. Keep only unseen movies. A vectorised isin replaces a per-candidate
    #    linear scan and preserves candidate order, so ties still sort stably.
    unseen_ids = candidate_ids[~candidate_ids.isin(already_rated)]

    # 4. Predict the rating for every unseen movie
    scored = [
        (movie_id, model.predict(user_id, movie_id).est)
        for movie_id in unseen_ids
    ]

    # 5. Sort by highest predicted rating and keep the top N
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top_n = scored[:n_recommendations]

    # 6. Build the id -> title lookup once rather than filtering per movie.
    #    drop_duplicates keeps the first title for an id, like `.values[0]` did.
    title_by_id = (
        movie_titles.drop_duplicates('movieId').set_index('movieId')['title']
    )

    # 7. Print the results
    print(f"Top {n_recommendations} Recommendations for User {user_id}:")
    for movie_id, estimated_rating in top_n:
        title = title_by_id.get(movie_id, f"<unknown title, movieId={movie_id}>")
        print(f"{title} (Est. Rating: {estimated_rating:.2f})")


# Recommend 5 movies (the default n_recommendations) for User 2
get_svd_recommendations(user_id=2)

Top 5 Recommendations for User 2:
Lord of the Rings: The Return of the King, The (2003) (Est. Rating: 4.61)
Clockwork Orange, A (1971) (Est. Rating: 4.60)
L.A. Confidential (1997) (Est. Rating: 4.58)
Seven Samurai (Shichinin no samurai) (1954) (Est. Rating: 4.55)
Seventh Seal, The (Sjunde inseglet, Det) (1957) (Est. Rating: 4.55)
