**Import library **bold text**

In [None]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162732 sha256=4d4c947bb37beeec3a44494556cce0c79f097de60b76d17d087d80385d760b3c
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

**Excess the dataset from github**

In [2]:
# URL of the CSV file hosted on GitHub
movie_url = "https://media.githubusercontent.com/media/pujan08/Movie_Recommender_System/main/movies.csv"
rating_url ="https://media.githubusercontent.com/media/pujan08/Movie_Recommender_System/main/ratings.csv"
tags_url = "https://media.githubusercontent.com/media/pujan08/Movie_Recommender_System/main/tags.csv"


**# Read the CSV files into DataFrames**

In [3]:
movies_df = pd.read_csv(movie_url)
ratings_df = pd.read_csv(rating_url,nrows=1000000)
tags_df = pd.read_csv(tags_url)

**#Print the first 10 rows**

In [4]:
print(movies_df.head(10))
print("\n")
print(ratings_df.head(10))
print("\n")
print(tags_df.head(10))


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   
5        6                         Heat (1995)   
6        7                      Sabrina (1995)   
7        8                 Tom and Huck (1995)   
8        9                 Sudden Death (1995)   
9       10                    GoldenEye (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
5                        Action|Crime|Thriller  
6                               Comedy|Romance  
7                           Adventure|Children  
8       

**Merge movies and tags for content filtering**

In [5]:
tags_grouped = tags_df.groupby('movieId')['tag'].apply(lambda x: ' '.join(x.dropna())).reset_index()
movie_tags_df = pd.merge(movies_df, tags_grouped, on='movieId', how='left')


**#Form metadata by joining all tag fields for each movie_title**

In [6]:
movie_tags_df['metadata'] = movie_tags_df['title'] + ' ' + movie_tags_df['tag'].fillna('')
movie_tags_df['metadata'] = movie_tags_df['metadata'].str.strip()


**# Build TF-IDF Vectorizer model and TruncatedSVD for content filtering**


In [7]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie_tags_df['metadata'])

lda = LatentDirichletAllocation(n_components=100)
latent_matrix_1 = lda.fit_transform(tfidf_matrix)

**# Compute item-item similarity matrix**

In [8]:
item_similarity_matrix = cosine_similarity(tfidf_matrix)

**Define hybrid model.**

In [9]:
def hybrid_recommendation(user_id, movie_title, item_similarity_matrix, ratings_df, movies_df, top_n=10, content_alpha=0.5, collab_alpha=0.5):
    # Get content-based scores
    content_scores = content_based_scores(movie_title, latent_matrix_1, movie_tags_df)

    # Get collaborative filtering scores
    collab_scores = collaborative_filtering_scores(user_id, movie_title, item_similarity_matrix, ratings_df, movies_df)

    # Combine scores using weighted average
    hybrid_scores = content_alpha * content_scores + collab_alpha * collab_scores

    # Get indices of top recommendations
    top_indices = np.argsort(hybrid_scores)[::-1][:top_n]

    # Get movie titles corresponding to the top indices
    recommended_movies = [movies_df.iloc[index]['title'] for index in top_indices]

    return recommended_movies

def content_based_scores(movie_title, latent_matrix_1, movie_tags_df):
    # Get movie metadata
    movie_metadata = movie_tags_df.loc[movie_tags_df['title'] == movie_title, 'metadata'].values[0]

    # Transform movie metadata
    tfidf_vector = tfidf.transform([movie_metadata])
    content_scores = cosine_similarity(tfidf_vector, tfidf_matrix)

    return content_scores.flatten()

def collaborative_filtering_scores(user_id, movie_title, item_similarity_matrix, ratings_df, movies_df):
    # Get the user's ratings
    user_ratings = ratings_df[ratings_df['userId'] == user_id]

    # Get the movie's index in the movies DataFrame
    movie_index = movies_df[movies_df['title'] == movie_title].index

    # If the movie is not found in the DataFrame, return a default score
    if len(movie_index) == 0:
        return 0.0

    movie_index = movie_index[0]  # Extract the index value

    # Get similarity scores for the target movie with all other movies
    similarity_scores = item_similarity_matrix[movie_index]

    # Get indices of movies the user has rated
    rated_movies_indices = [movies_df[movies_df['movieId'] == movie_id].index[0] for movie_id in user_ratings['movieId']]

    # Compute weighted sum of similarity scores with the user's ratings
    collab_scores = np.sum(similarity_scores[rated_movies_indices] * user_ratings['rating'].values) / np.sum(similarity_scores[rated_movies_indices])

    return collab_scores

**#call the functions**

In [10]:
# Example usage
user_id = 1
movie_title = 'Toy Story (1995)'
recommended_movies = hybrid_recommendation(user_id, movie_title, item_similarity_matrix, ratings_df, movies_df)
print("Hybrid Recommendations:")
print(recommended_movies)

Hybrid Recommendations:
['Toy Story (1995)', 'Toy Story 2 (1999)', "Bug's Life, A (1998)", 'Monsters, Inc. (2001)', 'Finding Nemo (2003)', 'Ice Age (2002)', 'Toy Story 3 (2010)', 'Ratatouille (2007)', 'Incredibles, The (2004)', 'Monsters University (2013)']


**Popularity-based Recommender**

In [None]:
def popularity_recommender(df, top_n=10):
    popular_movies = df.groupby('movieId')['rating'].count().sort_values(ascending=False).index[:top_n]
    return df[df['movieId'].isin(popular_movies)]['movieId'].unique()

**Content-based Filtering**

In [None]:
def content_based_recommender(df, movie_title, top_n=10):
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(df['metadata'])

    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    movie_indices = pd.Series(df.index, index=df['title'])

    def get_recommendations(title):
        idx = movie_indices[title]
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:top_n+1]
        movie_indices_local = [i[0] for i in sim_scores]  # Change variable name to avoid UnboundLocalError
        return df['title'].iloc[movie_indices_local].values

    return get_recommendations(movie_title)


**Collaborative Filtering**

In [None]:
def collaborative_filtering_recommender(df, user_id, top_n=10):
    label_encoders = {}
    for column in ['userId', 'movieId']:
        label_encoders[column] = LabelEncoder()
        df[column] = label_encoders[column].fit_transform(df[column])

    user_movie_matrix = df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

    sparse_user_movie_matrix = csr_matrix(user_movie_matrix.values)
    svd_collab = TruncatedSVD(n_components=100)
    latent_matrix_2 = svd_collab.fit_transform(sparse_user_movie_matrix)

    def get_recommendations(user_id):
        user_idx = label_encoders['userId'].transform([user_id])[0]
        user_ratings = df[df['userId'] == user_id]
        user_unrated_movies = df[~df['movieId'].isin(user_ratings['movieId'])]['movieId'].unique()

        predictions = [(mid, latent_matrix_2[user_idx][mid]) for mid in user_unrated_movies if mid < len(latent_matrix_2[user_idx])]
        predictions.sort(key=lambda x: x[1], reverse=True)
        return [df[df['movieId'] == mid]['movieId'].values[0] for mid, _ in predictions[:top_n]]

    return get_recommendations(user_id)


***Matrix Factorization***

In [None]:
def matrix_factorization_recommender(df, user_id, top_n=10):
    label_encoders = {}
    for column in ['userId', 'movieId']:
        label_encoders[column] = LabelEncoder()
        df[column] = label_encoders[column].fit_transform(df[column])

    user_movie_matrix = df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

    sparse_user_movie_matrix = csr_matrix(user_movie_matrix.values)
    svd_collab = TruncatedSVD(n_components=100)
    latent_matrix_2 = svd_collab.fit_transform(sparse_user_movie_matrix)

    def get_recommendations(user_id):
        user_idx = label_encoders['userId'].transform([user_id])[0]
        user_ratings = df[df['userId'] == user_id]
        user_unrated_movies = df[~df['movieId'].isin(user_ratings['movieId'])]['movieId'].unique()

        predictions = [(mid, latent_matrix_2[user_idx][mid]) for mid in user_unrated_movies if mid < len(latent_matrix_2[user_idx])]
        predictions.sort(key=lambda x: x[1], reverse=True)
        return [df[df['movieId'] == mid]['movieId'].values[0] for mid, _ in predictions[:top_n]]

    return get_recommendations(user_id)


**Example usage of each recommender**

In [None]:
user_id = 1
movie_title = 'Toy Story (1995)'
print("Popularity-based Recommender:")
print(popularity_recommender(ratings_df))
print("\nContent-based Filtering:")
print(content_based_recommender(movie_tags_df, movie_title))
print("\nCollaborative Filtering:")
print(collaborative_filtering_recommender(ratings_df, user_id))
print("\nMatrix Factorization:")
print(matrix_factorization_recommender(ratings_df, user_id))

Popularity-based Recommender:
[ 260  296  318  589  593  110  480 2571  356  527]

Content-based Filtering:
['Toy Story 2 (1999)' "Bug's Life, A (1998)" 'Monsters, Inc. (2001)'
 'Finding Nemo (2003)' 'Ice Age (2002)' 'Toy Story 3 (2010)'
 'Ratatouille (2007)' 'Incredibles, The (2004)'
 'Monsters University (2013)' 'Up (2009)']

Collaborative Filtering:
[0, 13, 6, 19, 25, 96, 42, 34, 35, 15]

Matrix Factorization:
[0, 15, 28, 17, 6, 50, 10, 14, 1, 40]
