# Movie Recommendation System

This notebook implements a movie recommendation system using the MovieLens 100K dataset.

In [6]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import urllib.request
import zipfile

# Create data directory if it doesn't exist
data_dir = 'Ttask-4.ml-100k'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# Download and extract dataset if not already present
dataset_url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
zip_path = os.path.join(data_dir, 'ml-100k.zip')

if not os.path.exists(os.path.join(data_dir, 'u.data')):
    print('Downloading MovieLens 100K dataset...')
    try:
        urllib.request.urlretrieve(dataset_url, zip_path)
        print('Download complete. Extracting files...')
        
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
        print('Extraction complete!')
        
        # Clean up zip file
        os.remove(zip_path)
    except Exception as e:
        print(f'Error downloading or extracting dataset: {str(e)}')
        raise

# Load the data
try:
    ratings_df = pd.read_csv(os.path.join(data_dir, 'ml-100k', 'u.data'), 
                            sep='\t', 
                            names=['user_id', 'movie_id', 'rating', 'timestamp'])
    
    movies_df = pd.read_csv(os.path.join(data_dir, 'ml-100k', 'u.item'),
                           sep='|',
                           encoding='latin-1',
                           names=['movie_id', 'title', 'release_date', 'video_release_date',
                                 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
                                 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
                                 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                                 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])
    
    print('Dataset loaded successfully!')
except Exception as e:
    print(f'Error loading dataset: {str(e)}')
    raise

Downloading MovieLens 100K dataset...
Download complete. Extracting files...
Extraction complete!
Dataset loaded successfully!


## Data Preparation

Create the user-movie ratings matrix

In [7]:
# Create the user-movie ratings matrix
ratings_matrix = ratings_df.pivot(index='user_id', 
                                 columns='movie_id', 
                                 values='rating').fillna(0)

print(f'Shape of ratings matrix: {ratings_matrix.shape}')

Shape of ratings matrix: (943, 1682)


## Implement User-Based Collaborative Filtering

In [9]:
def get_user_similarities(ratings_matrix):
    """Calculate similarity between users using cosine similarity."""
    return cosine_similarity(ratings_matrix)

def get_user_recommendations(user_id, ratings_matrix, similarity_matrix, n_recommendations=5):
    """Generate movie recommendations for a user.
    
    Args:
        user_id: The ID of the user to generate recommendations for
        ratings_matrix: The user-movie ratings matrix
        similarity_matrix: Matrix of user similarities
        n_recommendations: Number of recommendations to generate
        
    Returns:
        List of recommended movie IDs
    """
    user_idx = ratings_matrix.index.get_loc(user_id)
    user_similarities = similarity_matrix[user_idx]
    
    # Get movies the user hasn't rated
    user_ratings = ratings_matrix.iloc[user_idx]
    unrated_movies = user_ratings[user_ratings == 0].index
    
    # Calculate predicted ratings
    predictions = []
    for movie_id in unrated_movies:
        movie_ratings = ratings_matrix[movie_id]
        pred_rating = np.sum(user_similarities * movie_ratings) / np.sum(np.abs(user_similarities))
        predictions.append((movie_id, pred_rating))
    
    # Sort and return top N recommendations
    predictions.sort(key=lambda x: x[1], reverse=True)
    return predictions[:n_recommendations]

# Calculate user similarities
similarity_matrix = get_user_similarities(ratings_matrix)

# Example: Get recommendations for user 1
user_id = 1
recommendations = get_user_recommendations(user_id, ratings_matrix, similarity_matrix)

print(f'\nTop 5 movie recommendations for user {user_id}:')
for movie_id, pred_rating in recommendations:
    movie_title = movies_df[movies_df['movie_id'] == movie_id]['title'].iloc[0]
    print(f'Movie: {movie_title}, Predicted Rating: {pred_rating:.2f}')


Top 5 movie recommendations for user 1:
Movie: Schindler's List (1993), Predicted Rating: 2.04
Movie: E.T. the Extra-Terrestrial (1982), Predicted Rating: 1.87
Movie: One Flew Over the Cuckoo's Nest (1975), Predicted Rating: 1.79
Movie: English Patient, The (1996), Predicted Rating: 1.74
Movie: Scream (1996), Predicted Rating: 1.70


## Evaluate the Recommendation System

In [10]:
def calculate_rmse(ratings_matrix, similarity_matrix, test_size=0.2):
    """Calculate RMSE for the recommendation system."""
    n_users, n_movies = ratings_matrix.shape
    test_ratings = []
    predictions = []
    
    # For each user, hide some ratings and try to predict them
    for user_idx in range(n_users):
        user_ratings = ratings_matrix.iloc[user_idx]
        rated_movies = user_ratings[user_ratings > 0].index
        
        if len(rated_movies) > 0:
            # Select random movies to test
            n_test = max(1, int(len(rated_movies) * test_size))
            test_movies = np.random.choice(rated_movies, n_test, replace=False)
            
            for movie_id in test_movies:
                actual_rating = ratings_matrix.iloc[user_idx][movie_id]
                
                # Temporarily set rating to 0 for prediction
                temp_ratings = ratings_matrix.copy()
                temp_ratings.iloc[user_idx][movie_id] = 0
                
                # Calculate predicted rating
                movie_ratings = temp_ratings[movie_id]
                pred_rating = np.sum(similarity_matrix[user_idx] * movie_ratings) / \
                             np.sum(np.abs(similarity_matrix[user_idx]))
                
                test_ratings.append(actual_rating)
                predictions.append(pred_rating)
    
    return np.sqrt(mean_squared_error(test_ratings, predictions))

# Calculate RMSE
rmse = calculate_rmse(ratings_matrix, similarity_matrix)
print(f'\nRoot Mean Square Error: {rmse:.4f}')

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  temp_ratings.iloc[user_idx][movie_id] = 0



Root Mean Square Error: 2.8176
