In [1]:
!pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162674 sha256=6a99ad8052f986eb3a8747592258ed4977f87918bad8dd96fbdfcbdbff954d49
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [3]:
import pandas as pd
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy  # Add this import

# Load MovieLens dataset (you can download it from https://grouplens.org/datasets/movielens/)
# For this example, we'll use the small dataset.
data = pd.read_csv('/content/ratings_small.csv')

# Surprise library requires a specific format for the dataset
reader = Reader(rating_scale=(0.5, 5))
dataset = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)

# Split the dataset into training and testing sets
trainset, testset = train_test_split(dataset, test_size=0.2, random_state=42)

# Define the parameter grid for tuning
param_grid = {'k': [20, 30, 40], 'sim_options': {'name': ['cosine'], 'user_based': [True]}}

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv=3)
grid_search.fit(dataset)

# Get the best parameters
best_params = grid_search.best_params['rmse']
print("Best Parameters:", best_params)

# Use the best parameters to train the model
best_model = KNNBasic(k=best_params['k'], sim_options=best_params['sim_options'])
best_model.fit(trainset)

# Get predictions for the test set
predictions = best_model.test(testset)

# Evaluate the model's performance
accuracy.rmse(predictions)

# Example: Get movie recommendations for a specific user
user_id = 1
user_movies = data[data['userId'] == user_id]['movieId'].unique()

# Exclude movies the user has already rated
unrated_movies = data['movieId'].unique()
unrated_movies = [movie_id for movie_id in unrated_movies if movie_id not in user_movies]

# Get predictions for unrated movies
user_predictions = [best_model.predict(user_id, movie_id) for movie_id in unrated_movies]

# Sort predictions by estimated rating in descending order
sorted_predictions = sorted(user_predictions, key=lambda x: x.est, reverse=True)

# Display the top N recommended movies
top_n = 10
top_recommendations = [(pred.iid, pred.est) for pred in sorted_predictions[:top_n]]
print(f"\nTop {top_n} Recommendations for User {user_id}:\n", top_recommendations)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Best Parameters: {'k': 40, 'sim_options': {'name': 'cosine', 'user_based': True}}
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9925

Top 10 Recommendations for User 1:
 [(2086, 5), (1860, 5), (5017, 5), (5062, 5), (51471, 5), (7136, 5), (8955, 5), (3038, 5), (4088, 5), (4522, 