In [None]:
!pip install numpy
! pip install scikit-surprise



In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, SVD, KNNBaseline
from surprise.model_selection import cross_validate, train_test_split
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import Reader


In [None]:
# Load the MovieLens dataset (ratings and movie metadata) from the hosted CSV files
movies_raw='https://raw.githubusercontent.com/Sumanth-Guduru/movie_recomendation_system/main/movies.csv'
ratings_raw='https://raw.githubusercontent.com/Sumanth-Guduru/movie_recomendation_system/main/ratings.csv'
# NOTE(review): 'unicode_escape' also interprets backslash sequences in the file;
# confirm this is the intended encoding for these CSVs (vs. 'utf-8' or 'latin-1').
ratings = pd.read_csv(ratings_raw,encoding='unicode_escape')
movies = pd.read_csv(movies_raw,encoding='unicode_escape')
# Preview only the first rows instead of rendering the whole frame
movies.head()

In [None]:
# Content-based features: turn each movie's genre string into a TF-IDF vector
# and precompute the movie-to-movie cosine similarity matrix.
# Missing genres become empty strings so the vectorizer never sees NaN.
movies['genres'] = movies['genres'].fillna('')
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])
# With a single argument the similarities are computed between all rows of the matrix itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
# Surprise needs to know the rating range: lowest and highest possible rating
rating_bounds = (0.5, 5)
reader = Reader(rating_scale=rating_bounds)

In [None]:
# Wrap the ratings frame in a Surprise Dataset; the three columns are passed
# in the order user, item, rating.
rating_columns = ['userId', 'movieId', 'rating']
data = Dataset.load_from_df(ratings[rating_columns], reader)

In [None]:
# Create a train/test split (80% train, 20% test).
# The fixed random_state makes the shuffle reproducible, so the same ratings
# are held out every time the notebook is re-run.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Configure the collaborative filtering model: user-based KNN with baseline
# estimates, using the pearson_baseline similarity measure.
sim_options = {
    'name': 'pearson_baseline',
    'user_based': True,
}
cf_algo = KNNBaseline(sim_options=sim_options)

In [None]:
# Train the collaborative filtering algorithm on the training set produced by
# the train/test split. `trainset` is deliberately NOT rebuilt from the full
# dataset here: doing so would fold the held-out `testset` ratings back into
# the training data and make the split meaningless.
cf_algo.fit(trainset)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7de1d816d600>

In [None]:
# Create a content-based filtering function
def content_based_recommendations(title, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Similarity is read from the precomputed ``cosine_sim`` matrix, whose rows
    are positionally aligned with the rows of ``movies``.

    Args:
        title: Exact value to look up in ``movies['title']``. When several
            rows share the title, the first one is used.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A pandas Series of titles ordered from most to least similar, or the
        string ``"Movie not found"`` when no row matches ``title``.
    """
    # Work with row positions (not index labels) because cosine_sim is
    # addressed positionally.
    matches = np.flatnonzero((movies['title'] == title).to_numpy())
    if matches.size == 0:
        return "Movie not found"
    idx = int(matches[0])

    scores = np.asarray(cosine_sim[idx])
    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query movie explicitly. It is not guaranteed to be ranked
    # first: any earlier row with an identical similarity score sorts ahead
    # of it, so slicing off position 0 could remove the wrong movie.
    ranked = ranked[ranked != idx][:top_n]
    return movies.iloc[ranked]['title']


In [None]:
def collaborative_filtering_recommendations(user_id):
    """Predict ratings for every movie the given user has not rated yet.

    A user-based ``KNNWithMeans`` model (``pearson_baseline`` similarity) is
    fitted on all rows of the notebook's ``ratings`` frame each time the
    function is called, so one call costs a full model fit.

    Args:
        user_id: Raw user id, i.e. a value that appears in
            ``ratings['userId']``.

    Returns:
        A list of ``(movie_id, estimated_rating)`` tuples sorted by estimated
        rating, highest first. The list is empty when ``user_id`` has no
        ratings in ``ratings``.
    """
    # Train on the same ratings the rest of the notebook uses, so the movie
    # ids line up with `movies` and no interactive download prompt is
    # triggered by a built-in dataset.
    reader = Reader(rating_scale=(0.5, 5))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    trainset = data.build_full_trainset()

    # Surprise works with internal ("inner") ids; translate the caller's raw
    # id and bail out before the expensive fit if the user is unknown.
    try:
        inner_uid = trainset.to_inner_uid(user_id)
    except ValueError:
        return []

    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    cf_algo = KNNWithMeans(sim_options=sim_options)
    cf_algo.fit(trainset)

    # Inner ids of the items this user already rated (built once, not per movie).
    rated_items = {inner_iid for inner_iid, _ in trainset.ur[inner_uid]}

    user_recommendations = []
    for inner_iid in trainset.all_items():
        if inner_iid in rated_items:
            continue
        raw_iid = trainset.to_raw_iid(inner_iid)
        # predict() takes raw ids and returns the clipped estimate in `.est`.
        estimate = cf_algo.predict(user_id, raw_iid).est
        user_recommendations.append((int(raw_iid), estimate))

    user_recommendations.sort(key=lambda pair: pair[1], reverse=True)

    return user_recommendations


In [None]:
# Evaluate a user-based KNNWithMeans model with 5-fold cross-validation.

# Rebuild the Surprise dataset from the ratings frame
rating_columns = ['userId', 'movieId', 'rating']
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[rating_columns], reader)

# Model under evaluation
sim_options = {
    'name': 'pearson_baseline',
    'user_based': True,
}
cf_algo = KNNWithMeans(sim_options=sim_options)

# Per-fold RMSE and MAE; verbose=True also prints the per-fold table
cv_results = cross_validate(
    cf_algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

# Report the scores averaged over the folds
mean_rmse = np.mean(cv_results['test_rmse'])
mean_mae = np.mean(cv_results['test_mae'])
print("Average RMSE:", mean_rmse)
print("Average MAE:", mean_mae)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8924  0.9032  0.8938  0.9013  0.8915  0.8964  0.0049  
MAE (testset)     0.6785  0.6846  0.6783  0.6834  0.6759  0.6802  0.0033  
Fit time          0.61    0.89    0.76    0.32    0.28    0.57    0.24    
Test time         2.75    4.76    2.31    1.00    0.

In [None]:
# Create a hybrid filtering function
def hybrid_recommendations(title, user_id, top_n=5):
    """Combine content-based candidates with collaborative-filtering scores.

    The movies most similar to ``title`` are taken as candidates. Candidates
    for which the collaborative model produced an estimated rating for
    ``user_id`` come first, ordered by that estimate (highest first); the
    remaining candidates follow in their content-similarity order.

    Args:
        title: Movie title passed to ``content_based_recommendations``.
        user_id: User id passed to ``collaborative_filtering_recommendations``.
        top_n: Maximum number of titles to return (default 5).

    Returns:
        A list of at most ``top_n`` movie titles; empty when ``title`` is not
        found by the content-based step.
    """
    content_based_movies = content_based_recommendations(title)
    # The content-based step signals an unknown title with a plain string;
    # iterating over it would yield single characters, so stop here (and skip
    # the expensive collaborative step).
    if isinstance(content_based_movies, str):
        return []

    # movie id -> estimated rating for this user
    predicted_ratings = dict(collaborative_filtering_recommendations(user_id))

    # The collaborative step is keyed by movie id, the content step by title:
    # translate via the rows the candidate titles came from.
    # NOTE(review): assumes `movies` has a 'movieId' column using the same ids
    # as the collaborative step - confirm against the loaded CSV.
    candidate_ids = movies.loc[content_based_movies.index, 'movieId']

    candidates = []
    seen_titles = set()
    for movie, movie_id in zip(content_based_movies, candidate_ids):
        if movie in seen_titles:
            continue
        seen_titles.add(movie)
        candidates.append((movie, predicted_ratings.get(int(movie_id))))

    # Scored candidates first (best estimate first); the sort is stable, so
    # ties and unscored candidates keep their content-similarity order.
    candidates.sort(key=lambda item: (item[1] is None, -(item[1] or 0.0)))
    return [movie for movie, _ in candidates[:top_n]]

In [None]:
# Example run: hybrid recommendations for one user, seeded with one movie title
user_id, movie_title = 42, "Jumanji (1995)"
recommended_movies = hybrid_recommendations(title=movie_title, user_id=user_id)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [None]:
# Show which user the list belongs to, then the recommended titles
heading = ("Recommended movies for user", user_id)
print(*heading)
print(recommended_movies)

Recommended movies for user 42
['Indian in the Cupboard, The (1995)', 'NeverEnding Story III, The (1994)', 'Escape to Witch Mountain (1975)', "Darby O'Gill and the Little People (1959)", 'Return to Oz (1985)']
