Imports and Warnings

In [8]:
!pip install implicit
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import implicit
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Load Data

In [5]:
# Load the data
ratings = pd.read_csv('/content/drive/MyDrive/FINAL PROJECT/movies.csv')
movies = pd.read_csv('//content/drive/MyDrive/FINAL PROJECT/ratings.csv')

# Display the first few rows of the ratings dataframe
ratings.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Create Matrix Function

In [20]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

def create_X(df):
    # Ensure the column names are correct based on your DataFrame
    user_column = 'userId'  # Change if necessary (e.g., 'user_id')
    movie_column = 'movieId'  # Change if necessary (e.g., 'movie_id')
    rating_column = 'rating'  # Change if necessary (e.g., 'score')

    # Get the number of unique users and movies
    N = df[user_column].nunique()  # Unique users
    M = df[movie_column].nunique()  # Unique movies

    # Create mappings from userId/movieId to index
    user_mapper = {user: idx for idx, user in enumerate(df[user_column].unique())}
    movie_mapper = {movie: idx for idx, movie in enumerate(df[movie_column].unique())}

    # Reverse mappings from index to userId/movieId
    user_inv_mapper = {idx: user for idx, user in enumerate(df[user_column].unique())}
    movie_inv_mapper = {idx: movie for idx, movie in enumerate(df[movie_column].unique())}

    # Map the user and movie IDs to their corresponding indices
    user_index = [user_mapper[user] for user in df[user_column]]
    movie_index = [movie_mapper[movie] for movie in df[movie_column]]

    # Create the sparse matrix
    X = csr_matrix((df[rating_column], (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

# Assuming you have a DataFrame `ratings` that has the required columns
# Replace 'ratings' with your actual DataFrame variable if it's named differently
ratings = pd.DataFrame({
    'userId': [1, 2, 3, 1, 2, 3],  # Example data
    'movieId': [101, 102, 103, 104, 105, 106],
    'rating': [5, 4, 3, 2, 5, 1]
})

# Now, call the create_X function and pass the ratings DataFrame
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings)

# Optionally, print the resulting matrix and mappings
print(X)
print(user_mapper)
print(movie_mapper)


  (0, 0)	5
  (1, 1)	4
  (2, 2)	3
  (3, 0)	2
  (4, 1)	5
  (5, 2)	1
{1: 0, 2: 1, 3: 2}
{101: 0, 102: 1, 103: 2, 104: 3, 105: 4, 106: 5}


Install and Import thefuzz

In [22]:
# Install thefuzz if not already installed
!pip install thefuzz

from thefuzz import fuzz
from thefuzz import process

# Example movie DataFrame
# Ensure the column name is correct (e.g., 'title' column in your `movies` DataFrame)
import pandas as pd

# Example structure of 'movies' DataFrame (modify this based on your actual DataFrame)
movies = pd.DataFrame({
    'movieId': [1, 2, 3],
    'title': ['The Matrix', 'Legally Blonde', 'Inception']
})

def movie_finder(title):
    # Ensure the column name 'title' exists in your DataFrame
    all_titles = movies['title'].tolist()  # If the column is named differently, change it here
    closest_match = process.extractOne(title, all_titles)
    return closest_match[0]

# Creating the mapping of movie titles to movie IDs
movie_title_mapper = dict(zip(movies['title'], movies['movieId']))
movie_title_inv_mapper = dict(zip(movies['movieId'], movies['title']))

# Mapping movieId to index (Make sure you have `movie_mapper` defined elsewhere in your code)
movie_mapper = {movie_id: idx for idx, movie_id in enumerate(movies['movieId'].unique())}

def get_movie_index(title):
    fuzzy_title = movie_finder(title)
    movie_id = movie_title_mapper[fuzzy_title]
    movie_idx = movie_mapper.get(movie_id, None)  # Use .get to handle missing keys
    return movie_idx

def get_movie_title(movie_idx):
    movie_id = movie_inv_mapper.get(movie_idx, None)  # Use .get to handle missing keys
    if movie_id is None:
        return "Unknown Movie"
    title = movie_title_inv_mapper.get(movie_id, "Unknown Movie")  # Use .get to handle missing keys
    return title

# Test the functions
movie_index = get_movie_index('Legally Blonde')
movie_title = get_movie_title(1)  # Assuming 1 is a valid movie index

print(f"Movie Index for 'Legally Blonde': {movie_index}")
print(f"Movie Title for Index 1: {movie_title}")


Movie Index for 'Legally Blonde': 1
Movie Title for Index 1: Unknown Movie


Restrict BLAS Thread Usage

In [23]:
import threadpoolctl

# Restrict BLAS thread usage
threadpoolctl.threadpool_limits(1, "blas")


<threadpoolctl.threadpool_limits at 0x7ddf3a47bb50>

Train the ALS Model

In [24]:
# Train the ALS model with more iterations and factors if necessary
model = implicit.als.AlternatingLeastSquares(factors=100, iterations=50)
model.fit(X)


  0%|          | 0/50 [00:00<?, ?it/s]

Find Related Movies

In [25]:
movie_of_interest = 'Strange Days'
movie_index = get_movie_index(movie_of_interest)
related = model.similar_items(movie_index)
print(list(zip(related[0], related[1])))


[(0, 1.0), (1, -1.7851681e-07), (2, -4.5610344e-05), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0)]


In [31]:
print(f"Because you watched {movie_finder(movie_of_interest)}...")
for r in zip(related[0], related[1]):
    recommended_title = get_movie_title(int(r[0]))  # Convert r[0] to int
    if recommended_title != movie_finder(movie_of_interest):
        print(recommended_title)


Because you watched The Matrix...
Unknown Movie
Unknown Movie
Unknown Movie
Unknown Movie
Unknown Movie
Unknown Movie
Unknown Movie
Unknown Movie
Unknown Movie
Unknown Movie


User Ratings

In [27]:
user_id = 96
user_ratings = ratings[ratings['userId'] == user_id].merge(movies[['movieId', 'title']], on='movieId')
user_ratings = user_ratings.sort_values('rating', ascending=False)
print(f"Number of movies rated by user {user_id}: {user_ratings['movieId'].nunique()}")



Number of movies rated by user 96: 0


In [28]:
# Sorting and selecting top 5 movies
user_ratings = ratings[ratings['userId'] == user_id].merge(movies[['movieId', 'title']], on='movieId')
user_ratings = user_ratings.sort_values('rating', ascending=False)
top_5 = user_ratings.head(5)
top_5



Unnamed: 0,userId,movieId,rating,title


In [29]:
bottom_5 = user_ratings[user_ratings['rating'] < 3].tail()
bottom_5


Unnamed: 0,userId,movieId,rating,title


Recommendations for a User

In [30]:
# Generate a transpose of the X matrix
X_t = X.T.tocsr()

# Check if the user_id is in user_mapper
if user_id in user_mapper:
    user_idx = user_mapper[user_id]

    # Get recommendations for the specified user index
    recommendations = model.recommend(user_idx, X_t, N=10, filter_already_liked_items=False)

    for r in recommendations:
        movie_idx = int(r[0])  # Ensure movie_idx is an integer
        if movie_idx in movie_inv_mapper:
            recommended_title = get_movie_title(movie_idx)
            print(recommended_title)
        else:
            print(f"Movie index {movie_idx} not found in movie_inv_mapper")
else:
    print(f"User ID {user_id} not found in user_mapper")


User ID 96 not found in user_mapper
