In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162989 sha256=904d76c083a08c7606c3e45d2391cb52ad5e5805f5de14072ebacc771946c96f
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


## Task 1: Creating Recommendation Systems


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import KNNBasic
from surprise import accuracy

### Content-Based Filtering (using movies.csv)

In [3]:
# Load movies data
movies = pd.read_csv('movies.csv')


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [6]:
# Extract genres
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))
movies['genres'] = movies['genres'].apply(lambda x: ' '.join(x))

# Load a subset of movies data
# movies_subset = movies.head(10000)  # Using a smaller subset
movies_subset = movies.head(20000)


In [7]:
# Initialize TF-IDF Vectorizer with sparse matrix output
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_subset['genres'])

# Calculate cosine similarity in batches
batch_size = 100                        # Adjust as needed based on memory constraints
num_movies = tfidf_matrix.shape[0]
cosine_sim = []

for start in range(0, num_movies, batch_size):
    end = min(start + batch_size, num_movies)
    cosine_batch = cosine_similarity(tfidf_matrix[start:end], tfidf_matrix)
    cosine_sim.append(cosine_batch)

# Concatenate cosine similarity batches
cosine_sim = np.concatenate(cosine_sim, axis=0)


### Collaborative Filtering (using ratings.csv)

In [8]:
# Load ratings data
ratings = pd.read_csv('ratings.csv')

In [9]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880000.0
1,1,306,3.5,1147869000.0
2,1,307,5.0,1147869000.0
3,1,665,5.0,1147879000.0
4,1,899,3.5,1147869000.0


In [10]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5339064 entries, 0 to 5339063
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  float64
dtypes: float64(2), int64(2)
memory usage: 162.9 MB


In [11]:
ratings.shape

(5339064, 4)

In [12]:
ratings_subset = ratings.head(20000)  # Use a smaller subset for testing

# Define the Reader object
reader = Reader(rating_scale=(0.5, 5))

# Load data from DataFrame into Dataset object
data = Dataset.load_from_df(ratings_subset[['userId', 'movieId', 'rating']], reader)

# Split the dataset into train and test sets
trainset, testset = train_test_split(data, test_size=0.2)

In [13]:
# Convert trainset to Pandas DataFrame
df_ratings = pd.DataFrame(trainset.all_ratings(), columns=['user_id', 'item_id', 'rating'])
df_ratings.head()

Unnamed: 0,user_id,item_id,rating
0,0,0,4.0
1,0,524,3.0
2,0,528,4.0
3,0,822,3.5
4,0,845,4.5


In [14]:
# Use a basic collaborative filtering algorithm
sim_options = {
    'name': 'cosine',
    'user_based': True
}

model = KNNBasic(sim_options=sim_options)
model.fit(trainset)


Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7b0ad07d4400>

## Task 2: Generating Recommendations

### Content-Based Recommendations

* Defines a function named get_recommendations that takes movie_title, cosine_sim, and movies as input parameters.

* Finds the index of the input movie_title and Retrieves the genre of the input movie_title from the movies DataFrame

* Retrieves the precomputed cosine similarity scores and Sorts the similarity scores in descending order based on the cosine similarity value

* Retrieves the top 10 similar movies (excluding the input movie itself) using sim_scores = sim_scores[1:11].

In [15]:
def get_recommendations(movie_title, cosine_sim=cosine_sim, movies=movies):
    # Get movie index
    movie_index = movies[movies['title'] == movie_title].index[0]

    # Get movie genre
    movie_genre = movies.loc[movie_index, 'genres']
    print(f"Genre of '{movie_title}': {movie_genre}")
    print("")

    # Get similarity scores for the movie
    sim_scores = list(enumerate(cosine_sim[movie_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Top 10 similar movies

    # Get indices of recommended movies
    recommended_movie_indices = [i[0] for i in sim_scores]

    # Print recommended movies
    print("Recommended Movies:")
    for idx, movie_idx in enumerate(recommended_movie_indices, 1):
        recommended_movie_title = movies.loc[movie_idx, 'title']
        # recommended_movie_genre = movies.loc[movie_idx, 'genres']
        print(f"{idx}. {recommended_movie_title}")


In [16]:
# Example usage
movie_title = 'Toy Story (1995)'
get_recommendations(movie_title)

Genre of 'Toy Story (1995)': Adventure Animation Children Comedy Fantasy

Recommended Movies:
1. Antz (1998)
2. Toy Story 2 (1999)
3. Adventures of Rocky and Bullwinkle, The (2000)
4. Emperor's New Groove, The (2000)
5. Monsters, Inc. (2001)
6. DuckTales: The Movie - Treasure of the Lost Lamp (1990)
7. Wild, The (2006)
8. Shrek the Third (2007)
9. Tale of Despereaux, The (2008)
10. Asterix and the Vikings (Astérix et les Vikings) (2006)


In [17]:
# Example usage
movie_title = 'GoldenEye (1995)'
get_recommendations(movie_title)

Genre of 'GoldenEye (1995)': Action Adventure Thriller

Recommended Movies:
1. Broken Arrow (1996)
2. Cliffhanger (1993)
3. Executive Decision (1996)
4. Surviving the Game (1994)
5. Rock, The (1996)
6. Chain Reaction (1996)
7. Maximum Risk (1996)
8. Die Hard 2 (1990)
9. Anaconda (1997)
10. Con Air (1997)


### Collaborative Filtering Recommendations


The below codes does the following:
* It first identifies movies that have not been rated by the specified user_id by comparing the movie_Id values in the ratings DataFrame.

* For each unrated movie, the function uses the trained collaborative filtering "**model**" to predict the rating that the user_id might give to that movie.
The predicted ratings along with the corresponding movieId values are collected in a list.

* The list of predicted ratings is sorted in descending order to identify the top-rated unrated movies. The function selects the top 10 recommended movies based on the predicted ratings.

* For each recommended movieId, the function retrieves the corresponding movie title from the movies DataFrame.

In [18]:
def get_collaborative_filtering_recommendations(user_id, model, movies, ratings):
    # Get unrated movies by the user
    rated_movies = ratings[ratings['userId'] == user_id]['movieId']
    all_movies = ratings['movieId'].unique()
    unrated_movies = list(set(all_movies) - set(rated_movies))

    # Predict ratings for unrated movies
    predicted_ratings = []
    for movie_id in unrated_movies:
        prediction = model.predict(user_id, movie_id)
        predicted_ratings.append((prediction.est, movie_id))

    # Sort predicted ratings and get top recommendations
    predicted_ratings.sort(reverse=True, key=lambda x: x[0])
    top_recommendations = predicted_ratings[:10]

    # Get movie titles and ratings of top recommendations
    recommendations = []
    for rating, movie_id in top_recommendations:
        movie_title = movies[movies['movieId'] == movie_id]['title'].values[0]
        recommendations.append((movie_title, rating))

    # Print recommended movies with ratings
    print("Recommended Movies and Ratings:")
    for idx, (movie_title, rating) in enumerate(recommendations, 1):
        print(f"{idx}. {movie_title} - Rating: {rating:.2f}")


In [19]:
# Example usage:
user_id = 1  # Specify the user ID for recommendations
get_collaborative_filtering_recommendations(user_id, model, movies, ratings)

Recommended Movies and Ratings:
1. Bed of Roses (1996) - Rating: 5.00
2. Things to Do in Denver When You're Dead (1995) - Rating: 5.00
3. Once Upon a Time... When We Were Colored (1995) - Rating: 5.00
4. Beautiful Girls (1996) - Rating: 5.00
5. Amateur (1994) - Rating: 5.00
6. Blue in the Face (1995) - Rating: 5.00
7. Speechless (1994) - Rating: 5.00
8. Body Snatchers (1993) - Rating: 5.00
9. Endless Summer 2, The (1994) - Rating: 5.00
10. Fearless (1993) - Rating: 5.00


In [20]:
# Example usage:
user_id = 70
get_collaborative_filtering_recommendations(user_id, model, movies, ratings)

Recommended Movies and Ratings:
1. Once Upon a Time... When We Were Colored (1995) - Rating: 5.00
2. White Squall (1996) - Rating: 5.00
3. Amateur (1994) - Rating: 5.00
4. 8 Seconds (1994) - Rating: 5.00
5. Being Human (1993) - Rating: 5.00
6. Body Snatchers (1993) - Rating: 5.00
7. Endless Summer 2, The (1994) - Rating: 5.00
8. Ref, The (1994) - Rating: 5.00
9. Song of the Little Road (Pather Panchali) (1955) - Rating: 5.00
10. World of Apu, The (Apur Sansar) (1959) - Rating: 5.00


## Task 3: Evaluation and Comparison

### For content-based Recommendations

In [41]:
import pandas as pd
import numpy as np

# Load movies and ratings data
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

movies = movies.head(20000)
ratings = ratings.head(20000)

# Merge movies and ratings data
merged_data = pd.merge(ratings, movies, on='movieId', how='left')


In [42]:
merged_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance


In [43]:
merged_data.shape

(20000, 6)

In [44]:
def prepare_test_data(merged_data):
    test_data = {}

    if merged_data.empty:
        print("Error: Merged data is empty. Cannot prepare test data.")
        return test_data

    # Group ratings by userId
    grouped = merged_data.groupby('userId')

    # Iterate over each user's ratings
    for user_id, group in grouped:
        # Get rated movies and ratings for the user
        rated_movies = list(group['title'])
        test_data[user_id] = rated_movies

    return test_data

In [48]:
# Example usage for the test_data and printing the first 2 elements of this dictionary
test_data = prepare_test_data(merged_data)
for idx, (key, value) in enumerate(list(test_data.items())[:2], 1):
    print(f"{idx}. {key}: {value}")

1. 1: ['Pulp Fiction (1994)', 'Three Colors: Red (Trois couleurs: Rouge) (1994)', 'Three Colors: Blue (Trois couleurs: Bleu) (1993)', 'Underground (1995)', "Singin' in the Rain (1952)", 'Dirty Dancing (1987)', 'Delicatessen (1991)', 'Ran (1985)', 'Seventh Seal, The (Sjunde inseglet, Det) (1957)', 'Bridge on the River Kwai, The (1957)', 'M (1931)', 'Gattaca (1997)', 'Back to the Future Part II (1989)', 'Back to the Future Part III (1990)', 'Fanny and Alexander (Fanny och Alexander) (1982)', 'NeverEnding Story, The (1984)', 'Nights of Cabiria (Notti di Cabiria, Le) (1957)', 'Tango (1998)', 'Saragossa Manuscript, The (Rekopis znaleziony w Saragossie) (1965)', 'Run Lola Run (Lola rennt) (1998)', 'Black Cat, White Cat (Crna macka, beli macor) (1998)', 'Good Morning, Vietnam (1987)', 'Idiots, The (Idioterne) (1998)', 'Requiem for a Dream (2000)', 'In the Mood For Love (Fa yeung nin wa) (2000)', 'Moulin Rouge (2001)', 'Night, The (Notte, La) (1960)', 'Cries and Whispers (Viskningar och rop) (

In [50]:
import numpy as np

def evaluate_content_based_recommender(test_data, get_recommendations_func):
    precision_scores = []
    recall_scores = []

    for user_id, rated_movies in test_data.items():
        # Initialize sets to store all rated and recommended movies for the user
        all_rated_movies = set(rated_movies)
        all_recommended_movies = set()

        # Generate recommendations based on each rated movie
        for movie_title in rated_movies:
            recommendations = get_recommendations_func(movie_title)
            if recommendations is not None:
                all_recommended_movies.update(recommendations)

        # Compute precision and recall based on all rated and recommended movies
        relevant_and_recommended = all_rated_movies.intersection(all_recommended_movies)
        precision = len(relevant_and_recommended) / len(all_recommended_movies) if len(all_recommended_movies) > 0 else 0
        recall = len(relevant_and_recommended) / len(all_rated_movies) if len(all_rated_movies) > 0 else 0

        precision_scores.append(precision)
        recall_scores.append(recall)

    # Calculate mean precision and recall
    mean_precision = np.mean(precision_scores)
    mean_recall = np.mean(recall_scores)

    return mean_precision, mean_recall


In [56]:
def get_content_recommendation(movie_title, cosine_sim=cosine_sim, movies=movies):
    # Check if movies DataFrame is empty
    if movies.empty:
        print("Error: Movies DataFrame is empty.")
        return None

    try:
        # Get movie index
        movie_index = movies.index[movies['title'] == movie_title].tolist()[0]

        # Get similarity scores for the movie
        sim_scores = list(enumerate(cosine_sim[movie_index]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:11]  # Top 10 similar movies

        # Get indices of recommended movies
        recommended_movie_indices = [i[0] for i in sim_scores]

        # Collect recommended movie titles in a list
        recommended_movies = [movies.loc[movie_idx, 'title'] for movie_idx in recommended_movie_indices]

        return recommended_movies

    except IndexError:
        # print(f"Error: Movie title '{movie_title}' not found in the movies DataFrame.")
        return None


In [57]:
mean_precision, mean_recall = evaluate_content_based_recommender(test_data, get_content_recommendation)
print(f"Mean Precision: {mean_precision:.4f}")
print(f"Mean Recall: {mean_recall:.4f}")

Mean Precision: 0.0672
Mean Recall: 0.3862


### For Collabarative filtering-based Recommendations

Using **Root mean square error** loss function for evaluation:
* RMSE < 1: Typically considered very good, indicating accurate predictions with small errors.

* 1 ≤ RMSE < 2: Reasonable performance, with predictions somewhat close to actual ratings.

* RMSE ≥ 2: May indicate poor performance, with predictions significantly deviating from actual ratings.

In [58]:
# Make predictions
predictions = model.test(testset)

# Evaluate model
accuracy.rmse(predictions)


RMSE: 1.0267


1.026719819746358

Therefore, our Collabarative filtering-based Recommendations system is making somewhat **Reasonable Predictions** which are **close to actual ratings**



### Comparing the Recommendation Systems

In [None]:
# Compare Systems
def compare_recommendation_systems(user_id, actual_ratings):
    # Evaluate collaborative filtering
    precision_cf, recall_cf, f1_score_cf = evaluate_content_based(user_id, actual_ratings)

    # Evaluate content-based filtering
    precision_cb, recall_cb, f1_score_cb = evaluate_collaborative_filtering(user_id, actual_ratings)

    # Print results
    print("Collaborative Filtering Metrics:")
    print(f"Precision: {precision_cf:.4f}, Recall: {recall_cf:.4f}, F1-Score: {f1_score_cf:.4f}")

    print("\nContent-Based Filtering Metrics:")
    print(f"Precision: {precision_cb:.4f}, Recall: {recall_cb:.4f}, F1-Score: {f1_score_cb:.4f}")

# Example Usage
user_id = 1  # Specify the user for evaluation
actual_user_ratings = ratings  # Use actual ratings for evaluation

# Compare recommendation systems
compare_recommendation_systems(user_id, actual_user_ratings)