In [1]:
import numpy as np
import pandas as pd
import cupy as cp
import cudf
import dask_cudf as ddf
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
file_path= r"C:\Users\sandi\Desktop\My Git\Netflix Recommender System\data.csv"
data = cudf.read_csv(file_path, names=['Movie_Id', 'User_Id', 'User_Rating', 'Date'])

data.head()

Unnamed: 0,Movie_Id,User_Id,User_Rating,Date
0,1,1488844,3,2005-09-06
1,1,822109,5,2005-05-13
2,1,885013,4,2005-10-19
3,1,30878,4,2005-12-26
4,1,823519,3,2004-05-03


In [4]:
data.head()

Unnamed: 0,Movie_Id,User_Id,User_Rating,Date
0,1,1488844,3,2005-09-06
1,1,822109,5,2005-05-13
2,1,885013,4,2005-10-19
3,1,30878,4,2005-12-26
4,1,823519,3,2004-05-03


In [5]:
data.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 4 columns):
 #   Column       Non-Null Count    Dtype
---  ------       --------------    -----
 0   Movie_Id     1000000 non-null  int64
 1   User_Id      1000000 non-null  int64
 2   User_Rating  1000000 non-null  int64
 3   Date         1000000 non-null  object
dtypes: int64(3), object(1)
memory usage: 36.2+ MB


In [6]:
data.nunique()

Movie_Id          225
User_Id        283705
User_Rating         5
Date             2166
dtype: int64

In [7]:
numeric_columns = data.select_dtypes(include=['int64']).columns

# Convert numeric columns to int32
for col in numeric_columns:
    data[col] = data[col].astype('int32')

data.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 4 columns):
 #   Column       Non-Null Count    Dtype
---  ------       --------------    -----
 0   Movie_Id     1000000 non-null  int32
 1   User_Id      1000000 non-null  int32
 2   User_Rating  1000000 non-null  int32
 3   Date         1000000 non-null  object
dtypes: int32(3), object(1)
memory usage: 24.8+ MB


<p style="color: yellow;">By converting to int32 datatype we are saving much memory.</p>


<p style="color: yellow;">The 'Date' column is not in the datetime format so let's convert it into datetime format first.</p>

In [8]:
data['Date'] = cudf.to_datetime(data['Date'])
data.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 4 columns):
 #   Column       Non-Null Count    Dtype
---  ------       --------------    -----
 0   Movie_Id     1000000 non-null  int32
 1   User_Id      1000000 non-null  int32
 2   User_Rating  1000000 non-null  int32
 3   Date         1000000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int32(3)
memory usage: 19.1 MB


<p style="color: yellow;">Let's sort the ratings by date in ascending order.</p>

In [9]:
data= data.sort_values(by='Date')
data.head()

Unnamed: 0,Movie_Id,User_Id,User_Rating,Date
254722,55,1972971,1,1999-12-09
241446,46,510180,3,1999-12-20
290861,77,830363,3,1999-12-21
233922,45,355883,2,1999-12-25
312691,81,1435350,4,1999-12-27


In [1]:
file_path= r"/mnt/c/Users/sandi/Desktop/My Git/Netflix Recommender System/data.csv"

# Read the CSV file line by line and split each line using the first two commas
with open(file_path, 'r', encoding='ISO-8859-1') as file:
    lines = file.readlines()

data_list = []
for line in lines:
    parts = line.strip().split(',', 2)  # Split into three parts at most: Movie_Id, Year, Movie_Title
    data_list.append(parts)

# Create a DataFrame from the processed data
movie_data = pd.DataFrame(data_list, columns=['Movie_Id', 'Year', 'Movie_Title'])

# Convert 'Movie_Id' and 'Year' columns to appropriate data types
movie_data['Movie_Id'] = movie_data['Movie_Id'].astype(int)
movie_data['Year'] = pd.to_numeric(movie_data['Year'], errors='coerce')

movie_data.head()

: 

: 


<p style="color: yellow;">Let's load the movie_titles dataframe now</p>

In [None]:
movie_data.shape

(17770, 3)

In [None]:
movie_data.isnull().mean()

Movie_Id       0.000000
Year           0.000394
Movie_Title    0.000000
dtype: float64

As the 'Year' column in movie_data has some null values for now we will fill those values with 0.

In [None]:
movie_data= movie_data.fillna(0)

movie_data.head()

Unnamed: 0,Movie_Id,Year,Movie_Title
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [None]:
movie_data.isnull().mean()

Movie_Id       0.0
Year           0.0
Movie_Title    0.0
dtype: float64

In [None]:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Movie_Id     17770 non-null  int64  
 1   Year         17770 non-null  float64
 2   Movie_Title  17770 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 416.6+ KB


In [None]:
movie_data.memory_usage()

Index             128
Movie_Id       142160
Year           142160
Movie_Title    142160
dtype: int64

In [None]:
type(movie_data)

pandas.core.frame.DataFrame

In [None]:
Unique_Movie_Titles_List = list(data['Movie_Id'].unique().to_pandas())

In [None]:
len(Unique_Movie_Titles_List)

225

In [None]:
# Filter movie_data based on Unique_Movie_Titles_List
filtered_movie_data = movie_data[movie_data['Movie_Id'].isin(Unique_Movie_Titles_List)]

filtered_movie_data.head()


Unnamed: 0,Movie_Id,Year,Movie_Title
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [None]:
movie_data_cudf= cudf.from_pandas(filtered_movie_data)

movie_data_cudf.head()

Unnamed: 0,Movie_Id,Year,Movie_Title
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [None]:
movie_data_cudf.shape

(225, 3)

In [165]:
from imdb import IMDb

ia = IMDb()

# Searching for the movie
movies = ia.search_movie('Isle of Man TT 2004 Review')

# Assuming you want to work with the first search result
if movies:
    movie = movies[0]

    # Fetching the complete information for the movie
    ia.update(movie, info=['main', 'cast'])

    # Printing the names of the directors of the movie
    print('Directors:')
    for director in movie['directors']:
        print(director['name'])

    # Printing the first three actor names separated by commas
    print('First Three Actors:')
    actor_names = [actor['name'] for actor in movie['cast'][:3]]
    print(', '.join(actor_names))

    # Printing the first three genres separated by commas
    print('First Three Genres:')
    genre_names = movie['genres'][:3]
    print(', '.join(genre_names))

    # Printing the year of release
    print('Year of Release:', movie['year'])
else:
    print('No movie found with the given title.')


Directors:
David Niblock
First Three Actors:
Simon McGregor-Wood, John Ingram, Gary Johnson
First Three Genres:
Documentary
Year of Release: 2012


In [166]:
from imdb import IMDb

ia = IMDb()

# Searching for the movie
movies = ia.search_movie('The Rise and Fall of ECW')

# Assuming you want to work with the first search result
if movies:
    movie = movies[0]

    # Fetching the complete information for the movie
    ia.update(movie, info=['main', 'cast'])

    # Printing the names of the directors of the movie
    print('Directors:')
    for director in movie['directors']:
        print(director['name'])

    # Printing the first three actor names separated by commas
    print('First Three Actors:')
    actor_names = [actor['name'] for actor in movie['cast'][:3]]
    print(', '.join(actor_names))

    # Printing the first three genres separated by commas
    print('First Three Genres:')
    genre_names = movie['genres'][:3]
    print(', '.join(genre_names))

    # Printing the year of release
    print('Year of Release:', movie['year'])
else:
    print('No movie found with the given title.')

Directors:
Kevin Dunn
First Three Actors:
Abdullah the Butcher, Donna Adamo, Bill Alfonso
First Three Genres:
Documentary, Sport
Year of Release: 2004


In [167]:
type(movie_data_cudf)

cudf.core.dataframe.DataFrame

In [168]:
movie_data_cudf.info()

<class 'cudf.core.dataframe.DataFrame'>
Int64Index: 225 entries, 0 to 224
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Movie_Id     225 non-null    int64
 1   Year         225 non-null    float64
 2   Movie_Title  225 non-null    object
dtypes: float64(1), int64(1), object(1)
memory usage: 10.8+ KB


In [169]:
import cudf
import pandas as pd
from imdb import IMDb

# IMDb instance
ia = IMDb()

# Define a function to extract and format director names
def get_directors(movie_id):
    movie = ia.get_movie(movie_id)
    ia.update(movie, info=['main'])

    directors = movie.get('directors', [])
    director_names = [director.get('name', None) for director in directors]
    return ", ".join(director_names[:2]) if director_names else None

# Define a function to extract and format actor names
def get_actors(movie_id):
    movie = ia.get_movie(movie_id)
    ia.update(movie, info=['cast'])

    cast = movie.get('cast', [])
    actor_names = [actor.get('name', None) for actor in cast[:3]]
    return ", ".join(actor_names)

# Define a function to extract and format genre names
def get_genres(movie_id):
    movie = ia.get_movie(movie_id)
    ia.update(movie, info=['main'])

    genres = movie.get('genres', [])
    return ", ".join(genres[:3])

# Assuming you already have 'movie_data_cudf' DataFrame

# Convert cudf DataFrame to pandas DataFrame
movie_data_pandas = movie_data_cudf.to_pandas()

# Apply the defined functions using pandas
movie_data_pandas['Director1'] = movie_data_pandas['Movie_Id'].apply(get_directors)
movie_data_pandas[['Actor1', 'Actor2', 'Actor3']] = movie_data_pandas['Movie_Id'].apply(get_actors).str.split(', ', expand=True)
movie_data_pandas[['Genre1', 'Genre2', 'Genre3']] = movie_data_pandas['Movie_Id'].apply(get_genres).str.split(', ', expand=True)

# Convert back to cudf DataFrame
movie_data_cudf = cudf.from_pandas(movie_data_pandas)

# Display the first few rows of the resulting cudf DataFrame
movie_data_cudf.head()

Unnamed: 0,Movie_Id,Year,Movie_Title,Director1,Actor1,Actor2,Actor3,Genre1,Genre2,Genre3
0,1,2003.0,Dinosaur Planet,William K.L. Dickson,Carmencita,,,Documentary,Short,
1,2,2004.0,Isle of Man TT 2004 Review,Émile Reynaud,,,,Animation,Short,
2,3,1997.0,Character,Émile Reynaud,,,,Animation,Comedy,Short
3,4,1994.0,Paula Abdul's Get Up & Dance,Émile Reynaud,,,,Animation,Short,
4,5,2004.0,The Rise and Fall of ECW,William K.L. Dickson,Charles Kayser,John Ott,,Short,Comedy,


In [170]:
movie_data_cudf.isnull().mean()

Movie_Id       0.000000
Year           0.000000
Movie_Title    0.000000
Director1      0.048889
Actor1         0.000000
Actor2         0.880000
Actor3         0.955556
Genre1         0.000000
Genre2         0.288889
Genre3         0.924444
dtype: float64

In [None]:
path=r'C:\Users\sandi\Desktop\My Git\Netflix Recommender System'

movie_data_cudf.to_csv(path+movie_data_cudf.csv)

From the above we can see features such as 'Actor2', 'Actor3', 'Genre3' etc. has a large percentage of null values, so we will remove those columns. For 'Genre2' also there is a large number of null values, so we have to remove that column too. But before that we will save a copy of this dataframe into csv so taht we can resume our work form at this point where we want.

We will extract some more features using Cinemagoer library.

In [None]:
merged_movie_data = data.merge(movie_data, on='Movie_Id', how='left')

merged_movie_data.head()

Unnamed: 0,Movie_Id,User_Id,User_Rating,Date,Year,Movie_Title
0,1144,555812,5,2000-01-09,1991,Fried Green Tomatoes
1,1144,295485,4,2000-01-09,1991,Fried Green Tomatoes
2,1144,2292741,3,2000-01-09,1991,Fried Green Tomatoes
3,1144,2480395,5,2000-01-09,1991,Fried Green Tomatoes
4,1144,1566905,3,2000-01-09,1991,Fried Green Tomatoes


In [None]:
merged_movie_data.nunique()

Movie_Id         1962
User_Id        447845
User_Rating         5
Date             2179
Year               82
Movie_Title      1955
dtype: int64

As we can see there are 447845 users and 1962 Movie_Ids but 1955 Movie_titles. So let's see if there is any null values.

In [None]:
merged_movie_data.isnull().mean()

Movie_Id       0.0
User_Id        0.0
User_Rating    0.0
Date           0.0
Year           0.0
Movie_Title    0.0
dtype: float64

There is some null value in the 'Year' column but there are no null values in 'Movie_Id' and Movie_Title' column thus there may be same Movie_Id for more than one Movie_Title. Let's check it.

In [None]:
# Group by Movie_Title and check the number of unique Movie_Id values
title_id_counts = merged_movie_data.groupby('Movie_Title')['Movie_Id'].nunique()

# Filter titles with more than one unique Movie_Id
titles_with_multiple_ids = title_id_counts[title_id_counts > 1]

# Print the titles and the associated counts
print("Movie Titles with Multiple Movie Ids:")
print(titles_with_multiple_ids)
len(merged_movie_data)

Movie Titles with Multiple Movie Ids:
Movie_Title
Crash Dive                            2
Dr. Quinn                             2
Hamlet                                2
Jack                                  2
Journey to the Center of the Earth    2
The Alamo                             2
The In-Laws                           2
Name: Movie_Id, dtype: int32


10000000

In [None]:
# Convert cuDF DataFrame to pandas DataFrame
merged_movie_data_pandas = merged_movie_data.to_pandas()

# Group by Movie_Title and get the list of unique Movie_Id values
title_id_lists = merged_movie_data_pandas.groupby('Movie_Title')['Movie_Id'].unique()

# Filter titles with more than one unique Movie_Id
titles_with_multiple_ids = title_id_lists[title_id_lists.apply(len) > 1]

print("the number of movies with non-unique Movie_Ids are: ", len(titles_with_multiple_ids))

# Print the titles and the corresponding Movie_Id lists
for title, ids in titles_with_multiple_ids.items():
    print(f"Movie Title: {title}, Movie Ids: {ids}")


the number of movies with non-unique Movie_Ids are:  7
Movie Title: Crash Dive, Movie Ids: [ 63 379]
Movie Title: Dr. Quinn, Movie Ids: [ 350 1015]
Movie Title: Hamlet, Movie Ids: [1505  903]
Movie Title: Jack, Movie Ids: [305 172]
Movie Title: Journey to the Center of the Earth, Movie Ids: [1260  486]
Movie Title: The Alamo, Movie Ids: [ 535 1918]
Movie Title: The In-Laws, Movie Ids: [ 569 1645]


So as we can see there are 7 movies which has two or more Movie_Ids, so it would be better to retain only one of their Movie_Id by replacing with the other (preferably the first one).

In [None]:
# Mapping of Movie_Title to the corresponding first Movie_Id
first_ids_mapping = merged_movie_data_pandas.drop_duplicates(subset=['Movie_Title']).set_index('Movie_Title')['Movie_Id']

# Update Movie_Id values for duplicates
merged_movie_data_pandas['Movie_Id'] = merged_movie_data_pandas['Movie_Title'].map(first_ids_mapping)

merged_movie_data_pandas.head()

Unnamed: 0,Movie_Id,User_Id,User_Rating,Date,Year,Movie_Title
0,1144,555812,5,2000-01-09,1991,Fried Green Tomatoes
1,1144,295485,4,2000-01-09,1991,Fried Green Tomatoes
2,1144,2292741,3,2000-01-09,1991,Fried Green Tomatoes
3,1144,2480395,5,2000-01-09,1991,Fried Green Tomatoes
4,1144,1566905,3,2000-01-09,1991,Fried Green Tomatoes


In [None]:
merged_movie_data_pandas.nunique()

Movie_Id         1955
User_Id        447845
User_Rating         5
Date             2179
Year               82
Movie_Title      1955
dtype: int64

Now the number of unique Movie_Id and Movie_Title do match

In [None]:
merged_movie_data_pandas.isnull().mean()

Movie_Id       0.0
User_Id        0.0
User_Rating    0.0
Date           0.0
Year           0.0
Movie_Title    0.0
dtype: float64

In [None]:
deduplicated_data = merged_movie_data_pandas.drop_duplicates()

deduplicated_data.head()

Unnamed: 0,Movie_Id,User_Id,User_Rating,Date,Year,Movie_Title
0,1144,555812,5,2000-01-09,1991,Fried Green Tomatoes
1,1144,295485,4,2000-01-09,1991,Fried Green Tomatoes
2,1144,2292741,3,2000-01-09,1991,Fried Green Tomatoes
3,1144,2480395,5,2000-01-09,1991,Fried Green Tomatoes
4,1144,1566905,3,2000-01-09,1991,Fried Green Tomatoes


In [None]:
duplicate_rows_deleted= merged_movie_data_pandas.shape[0]- deduplicated_data.shape[0]
print("Number of dulicate rows deleted=", duplicate_rows_deleted)

Number of dulicate rows deleted= 0


In [None]:
merged_movie_data_pandas= deduplicated_data

merged_movie_data_pandas.head()

Unnamed: 0,Movie_Id,User_Id,User_Rating,Date,Year,Movie_Title
0,1144,555812,5,2000-01-09,1991,Fried Green Tomatoes
1,1144,295485,4,2000-01-09,1991,Fried Green Tomatoes
2,1144,2292741,3,2000-01-09,1991,Fried Green Tomatoes
3,1144,2480395,5,2000-01-09,1991,Fried Green Tomatoes
4,1144,1566905,3,2000-01-09,1991,Fried Green Tomatoes


In [None]:
merged_movie_data= cudf.from_pandas(merged_movie_data_pandas)

merged_movie_data.head()

Unnamed: 0,Movie_Id,User_Id,User_Rating,Date,Year,Movie_Title
0,1144,555812,5,2000-01-09,1991,Fried Green Tomatoes
1,1144,295485,4,2000-01-09,1991,Fried Green Tomatoes
2,1144,2292741,3,2000-01-09,1991,Fried Green Tomatoes
3,1144,2480395,5,2000-01-09,1991,Fried Green Tomatoes
4,1144,1566905,3,2000-01-09,1991,Fried Green Tomatoes


In [None]:
merged_movie_data= merged_movie_data.reset_index(drop=True)

merged_movie_data.head()

Unnamed: 0,Movie_Id,User_Id,User_Rating,Date,Year,Movie_Title
0,1144,555812,5,2000-01-09,1991,Fried Green Tomatoes
1,1144,295485,4,2000-01-09,1991,Fried Green Tomatoes
2,1144,2292741,3,2000-01-09,1991,Fried Green Tomatoes
3,1144,2480395,5,2000-01-09,1991,Fried Green Tomatoes
4,1144,1566905,3,2000-01-09,1991,Fried Green Tomatoes


Now we will use cinemagoer library to add some more feature columns like- 'Genre', 'Director', 'Actor' etc. Genre, in merged_movie_data_pandas

In [None]:
pip install git+https://github.com/cinemagoer/cinemagoer

Collecting git+https://github.com/cinemagoer/cinemagoer
  Cloning https://github.com/cinemagoer/cinemagoer to /tmp/pip-req-build-j68jnp54
  Running command git clone --filter=blob:none --quiet https://github.com/cinemagoer/cinemagoer /tmp/pip-req-build-j68jnp54
  Resolved https://github.com/cinemagoer/cinemagoer to commit dc7964e067533fbe7cc1f0b43fb2d09efbdf8328
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
from imdb import IMDb

ia = IMDb()

# Searching for the movie
movies = ia.search_movie('The Hound of the Baskervilles')

# Assuming you want to work with the first search result
if movies:
    movie = movies[0]

    # Fetching the complete information for the movie
    ia.update(movie, info=['main', 'cast'])

    # Printing the names of the directors of the movie
    print('Directors:')
    for director in movie['directors']:
        print(director['name'])

    # Printing the first three actor names separated by commas
    print('First Three Actors:')
    actor_names = [actor['name'] for actor in movie['cast'][:3]]
    print(', '.join(actor_names))

    # Printing the first three genres separated by commas
    print('First Three Genres:')
    genre_names = movie['genres'][:3]
    print(', '.join(genre_names))

    # Printing the year of release
    print('Year of Release:', movie['year'])
else:
    print('No movie found with the given title.')


2023-08-24 10:17:36,109 ERROR [imdbpy] /usr/local/lib/python3.10/dist-packages/imdb/__init__.py:844: unknown information set "cast"
ERROR:imdbpy:unknown information set "cast"


Directors:
Sidney Lanfield
First Three Actors:
Richard Greene, Basil Rathbone, Wendy Barrie
First Three Genres:
Crime, Horror, Mystery
Year of Release: 1939


In [None]:
# importing the module
from imdb import Cinemagoer

# creating instance of IMDb
ia = Cinemagoer()

# id
code = "6077448"

# getting information
series = ia.get_movie(code)

# getting cast of the series
cast = series.data['cast']

# printing the object i.e name
print(series)

# print the cast
for i in range(3):
	print(cast[i])


Sacred Games
Saif Ali Khan
Nawazuddin Siddiqui
Neeraj Kabi


In [None]:
merged_movie_data.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 6 columns):
 #   Column       Dtype
---  ------       -----
 0   Movie_Id     int32
 1   User_Id      int32
 2   User_Rating  int32
 3   Date         datetime64[ns]
 4   Year         int64
 5   Movie_Title  object
dtypes: datetime64[ns](1), int32(3), int64(1), object(1)
memory usage: 470.5+ MB


In [None]:
import cudf
import numpy as np
import os

# Suppress all error output
def suppress_errors(func, *args, **kwargs):
    try:
        with open(os.devnull, 'w') as null:
            sys.stderr = null
            result = func(*args, **kwargs)
            sys.stderr = sys.__stderr__  # Reset stderr
            return result
    except:
        sys.stderr = sys.__stderr__  # Reset stderr
        return None

from imdb import IMDb

# Define your 'merged_movie_data' cuDF DataFrame here

# Convert 'Year' column to int32
merged_movie_data['Year'] = merged_movie_data['Year'].astype('int32')

# Create a key column to keep track of the original rows
merged_movie_data['key'] = cudf.Series(np.arange(len(merged_movie_data), dtype='int32'))

# Create a DataFrame to store extracted details
details_data = {'Director1': None, 'Director2': None,
                'Actor1': None, 'Actor2': None, 'Actor3': None,
                'Genre1': None, 'Genre2': None, 'Genre3': None}
movie_details = cudf.DataFrame(details_data)

# Suppress all error output for the IMDb search
def get_movie_details(movie_title, movie_year):
    return suppress_errors(ia.search_movie, movie_title)

ia = IMDb()

# Create empty lists to store extracted details
director1_list = []
director2_list = []
actor1_list = []
actor2_list = []
actor3_list = []
genre1_list = []
genre2_list = []
genre3_list = []

# Iterate through the DataFrame using to_pandas().itertuples()
for row in merged_movie_data.to_pandas().itertuples():
    movie_title = row.Movie_Title
    movie_year = row.Year

    movie_details_data = get_movie_details(movie_title, movie_year)

    if movie_details_data:
        movie_details = movie_details_data[0]  # Get the first movie from the list

        # Extract director names
        directors = movie_details.get('directors', [])
        director_names = [director['name'] for director in directors]
        director1 = director_names[0] if director_names else None
        director2 = director_names[1] if len(director_names) > 1 else None

        # Extract actor names
        actors = movie_details.get('cast', [])
        actor_names = [actor['name'] for actor in actors[:3]]
        actor1 = actor_names[0] if actor_names else None
        actor2 = actor_names[1] if len(actor_names) > 1 else None
        actor3 = actor_names[2] if len(actor_names) > 2 else None

        # Extract genre names
        genres = movie_details.get('genres', [])
        genre1 = genres[0] if genres else None
        genre2 = genres[1] if len(genres) > 1 else None
        genre3 = genres[2] if len(genres) > 2 else None
    else:
        director1, director2, actor1, actor2, actor3, genre1, genre2, genre3 = [None] * 8

    director1_list.append(director1)
    director2_list.append(director2)
    actor1_list.append(actor1)
    actor2_list.append(actor2)
    actor3_list.append(actor3)
    genre1_list.append(genre1)
    genre2_list.append(genre2)
    genre3_list.append(genre3)

# Add extracted details to movie_details DataFrame
movie_details['Director1'] = cudf.Series(director1_list)
movie_details['Director2'] = cudf.Series(director2_list)
movie_details['Actor1'] = cudf.Series(actor1_list)
movie_details['Actor2'] = cudf.Series(actor2_list)
movie_details['Actor3'] = cudf.Series(actor3_list)
movie_details['Genre1'] = cudf.Series(genre1_list)
movie_details['Genre2'] = cudf.Series(genre2_list)
movie_details['Genre3'] = cudf.Series(genre3_list)

# Join the movie details to the original DataFrame
merged_movie_data = cudf.concat([merged_movie_data, movie_details], axis=1)

merged_movie_data.head()


It's working alright let's try to update our dataset based on 'Genre', 'Director' and 'Actor' of the movies.


In [None]:
merged_movie_data_pandas= merged_movie_data_pandas.reset_index(drop=True)

merged_movie_data_pandas.head()

Unnamed: 0,Movie_Id,User_Id,User_Rating,Date,Movie_Title
0,1367,510180,5,1999-11-11,The Piano
1,1798,510180,5,1999-11-11,Lethal Weapon
2,607,122223,4,1999-12-08,Speed
3,1202,122223,4,1999-12-08,National Lampoon's Vacation
4,1367,122223,3,1999-12-08,The Piano


<span style="color: yellow;"> As we have the merged data at this time it would better to separate the dataset into train and test set. We will divide the dataset into time based splitting with 80% data as train data and 20% data as test data. A time based splitting is preferred here as new movies and users get added with time which can be recommended to the existing and new users based on previous ratings on eisting movies.</span>


In [None]:
merged_movie_data_pandas= merged_movie_data_pandas.reset_index(drop=True)

split_index = int(len(merged_movie_data_pandas) * 0.8)

# Split the dataset into train and test sets
train_data = merged_movie_data_pandas[:split_index]
test_data = merged_movie_data_pandas[split_index:]

# Print the shapes of the train and test sets
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

In [None]:
avg_user_rating= round(train_data['User_Rating'].mean(), 6)

print("The average user rating is:", avg_user_rating)

In [None]:
train_data.loc[:, 'day_of_week'] = train_data.loc[:, 'Date'].dt.day_name()

train_data.head()

In [None]:
# Create a count plot using Seaborn
plt.figure(figsize=(6, 4))
ax = sns.countplot(data=train_data, x='day_of_week')

# Format y-axis labels using the readable function
def readable(num, units='M'):
    units = units.lower()
    num = float(num)
    if units == 'k':
        return str(num / 10**3) + " K"
    elif units == 'm':
        return str(num / 10**6) + " M"
    elif units == 'b':
        return str(num / 10**9) + " B"

# Get y-axis ticks
y_ticks = ax.get_yticks()
formatted_y_ticks = [readable(y) for y in y_ticks]

# Set formatted y-axis labels
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")

ax.set_yticklabels(formatted_y_ticks)

# Set grid lines to be rendered below other plot elements
ax.set_axisbelow(True)

# Add grid lines with red color
ax.grid(True, color='red', linestyle='--', linewidth=0.5)

plt.show()

From the above analysis it shows that people watch more movies/ tv shows on weekdays while on weekends the number of ratings are fewer.

In [None]:
plt.figure(figsize=(6, 4))
ax = train_data.resample('M', on='Date')['User_Rating'].count().plot()

ax.set_title('Number of Ratings per Month')
plt.xlabel('Month')
plt.ylabel('Number of Ratings per month')

# Format y-axis labels using the readable function
y_ticks = ax.get_yticks()
y_labels = [readable(value, 'M') for value in y_ticks]

# Set y-axis tick positions and labels
ax.set_yticks(y_ticks)
ax.set_yticklabels(y_labels)

# Customize other plot elements
ax.grid(True, alpha=0.5)
plt.axhline(y=y_ticks[-1], color='red', alpha=0.5, linewidth=0.5, linestyle='--')

plt.show()

Let's analyse the ratings by users, first let's check the most ratings then we will see the average number of ratings per user.

In [None]:
no_of_ratings_by_user= train_data.groupby(by='User_Id')['User_Rating'].count().sort_values(ascending=False)

print("The number of ratings by users are arranged in descending order as follows:")

no_of_ratings_by_user

In [None]:
fig = plt.figure(figsize=plt.figaspect(.25))

ax1 = plt.subplot(121)
sns.kdeplot(no_of_ratings_by_user, fill=True, color='green', ax=ax1)
plt.xlabel('No of ratings by user')
plt.title("PDF")

ax2 = plt.subplot(122)
sns.kdeplot(no_of_ratings_by_user, fill=True, color='green', cumulative=True,ax=ax2)
plt.xlabel('No of ratings by user')
plt.title('CDF')

ax1.grid(True)
ax2.grid(True)

# Set grid line properties
ax1.grid(color='red', alpha=0.5, linewidth=1, linestyle='--')
ax2.grid(color='red', alpha=0.5, linewidth=1, linestyle='--')

plt.show()

Let's analyse the rating given by a single user.

In [None]:
no_of_ratings_per_movie = train_data.groupby(by='Movie_Id')['User_Rating'].count().sort_values(ascending=False)

fig = plt.figure(figsize=plt.figaspect(.5))
ax = plt.gca()
plt.plot(no_of_ratings_per_movie.values)
plt.title('Number of ratings per Movie')
plt.xlabel('Movie_Title')
plt.ylabel('No of ratings per movie')
ax.set_xticklabels([])

ax.grid(True)
ax.grid(color='red', alpha=0.5, linewidth=1, linestyle='--')

plt.show()

<span style="color: yellow;">The graph above shows there are some movies which has received a large number of ratings while there are movies which has received very few ratings. If the number of ratings correlates to number of times the movie has been watched then it dircetly translates to some movies are very popular thus has been watched many times/ by many users while other movies are not that popular.</span>


In [None]:
fig, ax = plt.subplots()
plt.title('Distribution of Ratings over the dataset', fontsize=10)

def readable(num, units='M'):
    units = units.lower()
    num = float(num)
    if units == 'k':
        return "{:.2f} K".format(num / 10**3)
    elif units == 'm':
        return "{:.2f} M".format(num / 10**6)
    elif units == 'b':
        return "{:.2f} B".format(num / 10**9)

# Calculate value counts and sort by index (rating values)
rating_counts = train_data['User_Rating'].value_counts().sort_index()

y_ticks = rating_counts.values
y_labels = [readable(round(value, 2), 'M') for value in y_ticks]

# Create grid lines as horizontal lines
for y_tick in y_ticks:
    ax.axhline(y_tick, color='red', alpha=0.5, linewidth=0.5, linestyle='--', zorder=0)

# Plot bars on top of the grid lines
plt.bar(rating_counts.index, rating_counts.values, color=[plt.cm.viridis(idx / len(rating_counts)) for idx in range(len(rating_counts))], zorder=1)
plt.ylabel('No. of Ratings in Millions')
plt.xlabel('Rating')

# Manually set y-axis tick labels
plt.yticks(y_ticks, y_labels)

plt.show()


In [None]:
fig, ax = plt.subplots()

sns.countplot(x='day_of_week', data=train_data, ax=ax)

plt.title('No of ratings on each day of week')
plt.xlabel('')
plt.ylabel('Total no of ratings')

# Adjust y-axis range to start from 0
ax.set_ylim(bottom=0)

# Get y-ticks and format using the readable function
y_ticks = train_data['User_Rating'].value_counts().sort_index().values
y_labels = [readable(y) for y in y_ticks]
ax.set_yticks(y_ticks)
ax.set_yticklabels(y_labels)

# Add horizontal grid lines and set them behind the bars
for y in y_ticks:
    ax.axhline(y, color='red', alpha=0.5, linewidth=0.5, linestyle='--', zorder=-1)

plt.show()


In [None]:
avg_rating_per_day = train_data.groupby(by=['day_of_week'])['User_Rating'].mean()
print(" AVerage ratings")
print("-"*30)
print(round(avg_rating_per_day, 6))
print("\n")

In [None]:
print("The number of users in the train datset is:", train_data['User_Id'].nunique())
print("The number of movies in the train datset is:", train_data['Movie_Id'].nunique())

In [None]:
print("The total number of ratings by users in the train dataset is:", len(train_data['User_Rating']))

In [None]:
rated_percentage= round(((len(train_data['User_Rating'])/(train_data['User_Id'].nunique()))/(train_data['Movie_Id'].nunique()))*100, 6)

print("So the percentage of rated movies are:", rated_percentage)

In [None]:

# Calculate average rating for each movie
average_ratings_each_movie = train_data.groupby('Movie_Title')['User_Rating'].mean()

# Calculate average rating given by each user
average_ratings_each_user = train_data.groupby('User_Id')['User_Rating'].mean()

# Use .loc to set new columns without triggering the warning
train_data = train_data.assign(average_ratings_each_movie=train_data['Movie_Title'].map(average_ratings_each_movie))
train_data = train_data.assign(average_ratings_each_user=train_data['User_Id'].map(average_ratings_each_user))

train_data.head()

In [None]:
sns.kdeplot(train_data['average_ratings_each_user'], fill=True)
plt.xlabel('Average Ratings')
plt.ylabel('PDF')
plt.title('PDF of Average Ratings Each User')
plt.show()

In [None]:
sns.kdeplot(train_data['average_ratings_each_user'], fill=True)
plt.xlabel('Average Ratings')
plt.ylabel('PDF')
plt.title('PDF of Average Ratings Each Movie')
plt.show()

<span style="color: yellow;"> As the PDF follows a normal distribution we can fill the null values in the test_data for new 'Movie_Title' and 'User_Id' with the mean values of 'average_ratings_each_movie' and 'average_ratings_each_user' respectively.</span>

In [None]:
# Calculate total positive and negative ratings by each user
user_total_ratings = train_data.groupby('User_Id')['User_Rating'].agg(user_total_pos_rating=lambda x: (x >= 4).sum(), user_total_neg_rating=lambda x: (x <= 3).sum())

# Calculate total positive and negative ratings for each movie
movie_total_ratings = train_data.groupby('Movie_Title')['User_Rating'].agg(movie_total_pos_rating=lambda x: (x >= 4).sum(), movie_total_neg_rating=lambda x: (x <= 3).sum())

# Merge the calculated user ratings back to the original DataFrame
train_data = train_data.merge(user_total_ratings, on='User_Id', how='left')

# Merge the calculated movie ratings back to the DataFrame
train_data = train_data.merge(movie_total_ratings, on='Movie_Title', how='left')

# Display the updated train_data DataFrame
train_data.head()


In [None]:
sns.kdeplot(train_data['user_total_pos_rating'], fill=True)
plt.xlabel('Average Ratings')
plt.ylabel('PDF')
plt.title('PDF of Positive Ratings by Each User')
plt.show()

In [None]:
sns.kdeplot(train_data['user_total_neg_rating'], fill=True)
plt.xlabel('Average Ratings')
plt.ylabel('PDF')
plt.title('PDF of Negative Ratings by Each User')
plt.show()

In [None]:
sns.kdeplot(train_data['movie_total_pos_rating'], fill=True)
plt.xlabel('Average Ratings')
plt.ylabel('PDF')
plt.title('PDF of Total Positive Ratings Each Movie')
plt.show()

In [None]:
sns.kdeplot(train_data['movie_total_neg_rating'], fill=True)
plt.xlabel('Average Ratings')
plt.ylabel('PDF')
plt.title('PDF of Total Negative Ratings Each Movie')
plt.show()

In [None]:
train_data= train_data.reset_index(drop=True)
train_data.head()

In [None]:
# Calculate average rating for each movie
average_ratings_each_movie = round(train_data.groupby('Movie_Title')['average_ratings_each_movie'].mean(), 6)
pos_ratings_each_movie = round(train_data.groupby('Movie_Title')['movie_total_pos_rating'].mean(), 6)
neg_ratings_each_movie = round(train_data.groupby('Movie_Title')['movie_total_neg_rating'].mean(), 6)

# Calculate average rating given by each user
average_ratings_each_user = round(train_data.groupby('User_Id')['average_ratings_each_user'].mean(), 6)
pos_ratings_each_user = round(train_data.groupby('User_Id')['user_total_pos_rating'].mean(), 6)
neg_ratings_each_user = round(train_data.groupby('User_Id')['user_total_neg_rating'].mean(), 6)

median_of_average_ratings_each_movie = round(train_data['average_ratings_each_movie'].median(), 6)
median_pos_ratings_each_movie = round(train_data['movie_total_pos_rating'].median(), 6)
median_neg_ratings_each_movie = round(train_data['movie_total_neg_rating'].median(), 6)

median_of_average_ratings_each_user = round(train_data['average_ratings_each_user'].median(), 6)
median_pos_ratings_each_user = round(train_data['user_total_pos_rating'].median(), 6)
median_neg_ratings_each_user = round(train_data['user_total_neg_rating'].median(), 6)

test_data = test_data.assign(average_ratings_each_movie=test_data['Movie_Title'].map(average_ratings_each_movie).fillna(median_of_average_ratings_each_movie))
test_data = test_data.assign(movie_total_pos_rating=test_data['Movie_Title'].map(pos_ratings_each_movie).fillna(median_pos_ratings_each_movie))
test_data = test_data.assign(movie_total_neg_rating=test_data['Movie_Title'].map(neg_ratings_each_movie).fillna(median_neg_ratings_each_movie))

test_data = test_data.assign(average_ratings_each_user=test_data['User_Id'].map(average_ratings_each_user).fillna(median_of_average_ratings_each_user))
test_data = test_data.assign(user_total_pos_rating=test_data['User_Id'].map(pos_ratings_each_user).fillna(median_pos_ratings_each_user))
test_data = test_data.assign(user_total_neg_rating=test_data['User_Id'].map(neg_ratings_each_user).fillna(median_neg_ratings_each_user))

test_data.head()

To check if our mapping was successful let's try out some movie_id and user_id from the train_data and see if they have matched properly.


In [None]:
train_data[train_data['Movie_Id']==1843].head()

In [None]:
train_data[train_data['User_Id']==2506834].head()

Everything looks perfectly mapped, now let's check for the null values.



In [None]:
test_data.reset_index().isnull().mean()

<span style="color: yellow;"> Everything is alright, now we will create a dataframe consisting of 'average_ratings_each_movie', 'average_ratings_each_user' and 'recommended_ratings_each_movie'. The features 'average_ratings_each_movie' and 'average_ratings_each_user' will serve as unique signature for each user which can be used to represent personal biases/ choices from the average population and same also stands for the movies. To make a new dataset let's drop all the unnecessary columns.</span>

In [None]:
train_data_modified= train_data.drop(['Movie_Id', 'Date', 'day_of_week'], axis=1)

train_data_modified.head()

In [None]:
train_data_modified= train_data_modified.reset_index(drop= True)

train_data_modified.head()

In [None]:
train_data_modified.info()

we will convert all int64/ float64 datatype columns to int32/ float32 for faster processing as our data is large.

In [None]:
# Convert int64 columns to int32
int64_columns = ['user_total_pos_rating', 'user_total_neg_rating', 'movie_total_pos_rating', 'movie_total_neg_rating']
train_data_modified[int64_columns] = train_data_modified[int64_columns].astype('int32')

# Convert float64 columns to float32
float64_columns = ['average_ratings_each_movie', 'average_ratings_each_user']
train_data_modified[float64_columns] = train_data_modified[float64_columns].astype('float32')

In [None]:
# Convert columns to string/object data type
train_data_modified['User_Id'] = train_data_modified['User_Id'].astype(str)

In [None]:
train_data_modified.info()

<span style="color: yellow;"> Let's do the same feature enginering on the test data.</span>

In [None]:
test_data_modified= test_data.drop(['Date', 'Movie_Id'], axis=1)

test_data_modified.head()

In [None]:
test_data_modified= test_data_modified.reset_index(drop= True)

test_data_modified.head()

In [None]:
test_data_modified.info()

In [None]:
# Convert int64 columns to int32
int64_columns = ['user_total_pos_rating', 'user_total_neg_rating', 'movie_total_pos_rating', 'movie_total_neg_rating']
test_data_modified[int64_columns] = test_data_modified[int64_columns].astype('int32')

# Convert float64 columns to float32
float64_columns = ['average_ratings_each_movie', 'average_ratings_each_user']
test_data_modified[float64_columns] = test_data_modified[float64_columns].astype('float32')


In [None]:
# Convert columns to string/object data type
test_data_modified['User_Id'] = test_data_modified['User_Id'].astype(str)

In [None]:
test_data_modified.info()

<span style="color: yellow;"> Let's scale the data now</span>

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Assuming 'train_data_modified' is your DataFrame
train_data_scaled = train_data_modified.copy()

# Get the column names before scaling
column_names = train_data_scaled.select_dtypes(include='number').drop(['User_Rating'], axis=1).columns

# Fit the scaler on the entire DataFrame and transform the data
train_data_scaled[column_names] = scaler.fit_transform(train_data_scaled[column_names])

train_data_scaled.head()

In [None]:
train_data_scaled_modified= train_data_scaled.drop(['Movie_Title',	'User_Id'], axis=1)
train_data_scaled_modified.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Assuming 'test_data_modified' is your DataFrame
test_data_scaled = test_data_modified.copy()

# Get the column names before scaling
column_names = test_data_scaled.select_dtypes(include='number').drop(['User_Rating'], axis=1).columns

# Fit the scaler on the entire DataFrame and transform the data
test_data_scaled[column_names] = scaler.fit_transform(test_data_scaled[column_names])

test_data_scaled.head()

In [None]:
test_data_scaled_modified= test_data_scaled.drop(['Movie_Title', 'User_Id'], axis=1)
test_data_scaled_modified.head()

In [None]:
# Get the list of column names from train_data
train_columns = train_data_scaled_modified.columns.tolist()

# Reorder columns in test_data to match the order of train_data columns
test_data_scaled_modified = test_data_scaled_modified[train_columns]

test_data_scaled_modified.head()


<span style="color: green;"> Building Models</span>

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Separate features and target
X = train_data_scaled_modified.drop(['User_Rating'], axis=1) # Features from scaled DataFrame
y = train_data_scaled_modified['User_Rating'] # Target variable

# Initialize the Ridge (L2 regularized) LinearRegression model with regularization parameter alpha
alpha = 0.01  # Adjust this value based on your preference
model_l2 = Ridge(alpha=alpha)

# Fit the model to the training data
model_l2.fit(X, y)

# Predict target values on the test set
y_pred = model_l2.predict(test_data_scaled_modified.drop(['User_Rating'], axis=1))

# Calculate Mean Squared Error
mse = mean_squared_error(test_data_scaled_modified['User_Rating'], y_pred)

rmse = np.sqrt(mse)

# Calculate R-squared value
r2 = r2_score(test_data_scaled_modified['User_Rating'], y_pred)

print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared value: {r2:.2f}")

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Separate features and target
X = train_data_scaled_modified.drop(['User_Rating'], axis=1)  # Features from scaled DataFrame
y = train_data_scaled_modified['User_Rating']  # Target variable

# Initialize the Decision Tree Regressor with min_samples_leaf
min_samples_leaf = 100
model_tree = DecisionTreeRegressor(min_samples_leaf=min_samples_leaf)

# Fit the model to the training data
model_tree.fit(X, y)

# Predict target values on the test set
y_pred = model_tree.predict(test_data_scaled_modified.drop(['User_Rating'], axis=1))

# Calculate Mean Squared Error
mse = mean_squared_error(test_data_scaled_modified['User_Rating'], y_pred)

rmse = np.sqrt(mse)

# Calculate R-squared value
r2 = r2_score(test_data_scaled_modified['User_Rating'], y_pred)

print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared value: {r2:.2f}")

In [None]:
import cuml
import cudf
import numpy as np

# Separate features and target
X = train_data_scaled_modified.drop(['User_Rating'], axis=1)  # Features from scaled DataFrame
y = train_data_scaled_modified['User_Rating']  # Target variable

# Convert pandas dataframes to cuDF dataframes
X_cudf = cudf.DataFrame.from_pandas(X)
y_cudf = cudf.Series(y)

# Initialize the Random Forest Regressor
n_estimators = 100
model_rf = cuml.ensemble.RandomForestRegressor(n_estimators=n_estimators)

# Fit the model to the training data
model_rf.fit(X_cudf, y_cudf)

# Convert test data to cuDF dataframe
test_data_cudf = cudf.DataFrame.from_pandas(test_data_scaled_modified)

# Predict target values on the test set
y_pred_cudf = model_rf.predict(test_data_cudf)

# Convert cuDF prediction to numpy array for metrics calculation
y_pred_np = y_pred_cudf.to_array()

# Calculate Mean Squared Error
mse = mean_squared_error(test_data_scaled_modified['User_Rating'], y_pred_np)

rmse = np.sqrt(mse)

# Calculate R-squared value
r2 = r2_score(test_data_scaled_modified['User_Rating'], y_pred_np)

print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared value: {r2:.2f}")


ModuleNotFoundError: ignored

<span style="color: yellow;"> Both linear regression and DecisionTreeRegressor are throwing good RMSE and R-Squared values. We will use linear regression as it is easy to tune and fast for predicting linear relationship. We will make a pivot table with 100 Movie_Id and 1000 User_Id from test_data to fill up the pivot table with our linear regression model. We will make a sample movie recommendation system based on 'recommended_ratings_each_movie' based on 'user rating'.</span>

In [None]:
num_users_to_select = 1000
num_movies_to_select = 100

# Get unique user IDs and movie IDs from the test_data
unique_user_ids = test_data_scaled['User_Id'].unique()
unique_movie_ids = test_data_scaled['Movie_Id'].unique()

# Randomly select 1000 unique user IDs
selected_user_ids = np.random.choice(unique_user_ids, size=num_users_to_select, replace=False)

# Randomly select 100 movie IDs
selected_movie_ids = np.random.choice(unique_movie_ids, size=num_movies_to_select, replace=False)

# Filter the test_data based on the selected user and movie IDs
filtered_data = test_data_scaled[(test_data_scaled['User_Id'].isin(selected_user_ids)) & (test_data_scaled['Movie_Id'].isin(selected_movie_ids))]

# Create a pivot table using the filtered data
pivot_table = pd.pivot_table(filtered_data, values='recommended_ratings_each_movie', index='User_Id', columns='Movie_Id')

# Display the pivot table
print(pivot_table)

<span style="color: yellow;">As per the above calculation as the percentage of rated that is actual filled rows are very sparse we can create a sparse matrix of User_Id and Movie_Title to fill the corresponding ratings while filling the non-available ratings as zero.</span>

In [None]:
from scipy.sparse import csr_matrix, save_npz, load_npz
import os

npz_file_path = r"C:\Users\sandi\Desktop\My Git\Netflix Recommender System\sparse_ratings_matrix_train.npz"

# Check if the NPZ file exists
if os.path.exists(npz_file_path):
    # Load the sparse matrix from the existing NPZ file
    sparse_ratings_matrix_train = load_npz(npz_file_path)
else:
    num_users = np.max(train_data['User_Id'])
    num_movies = np.max(train_data['Movie_Id'])
    sparse_ratings_matrix_train = csr_matrix((train_data['User_Rating'], (train_data['User_Id'], train_data['Movie_Id'])), shape=(num_users + 1, num_movies + 1))

    # Save the sparse matrix as an NPZ file
    save_npz(npz_file_path, sparse_ratings_matrix_train)

# Now you have the sparse matrix loaded or created and loaded
print(sparse_ratings_matrix_train)

In [None]:
user, movie = sparse_ratings_matrix_train.shape
non_zero_rating = sparse_ratings_matrix_train.count_nonzero()

print(f"Sparsity Of Train matrix : {round(((1-(non_zero_rating/(user*movie))) * 100), 4)}% ")

In [None]:
from scipy.sparse import csr_matrix, save_npz, load_npz
import os

npz_file_path = r"C:\Users\sandi\Desktop\My Git\Netflix Recommender System\sparse_ratings_matrix_test.npz"

# Check if the NPZ file exists
if os.path.exists(npz_file_path):
    # Load the sparse matrix from the existing NPZ file
    sparse_ratings_matrix_test = load_npz(npz_file_path)
else:
    num_users = np.max(test_data['User_Id'])
    num_movies = np.max(test_data['Movie_Id'])
    sparse_ratings_matrix_test = csr_matrix((test_data['User_Rating'], (test_data['User_Id'], test_data['Movie_Id'])), shape=(num_users + 1, num_movies + 1))

    # Save the sparse matrix as an NPZ file
    save_npz(npz_file_path, sparse_ratings_matrix_test)

# Now you have the sparse matrix loaded or created and loaded
print(sparse_ratings_matrix_test)

In [None]:
user, movie = sparse_ratings_matrix_test.shape
non_zero_rating = sparse_ratings_matrix_test.count_nonzero()

print(f"Sparsity Of Test matrix : {round(((1-(non_zero_rating/(user*movie))) * 100), 4)}% ")

In [None]:
train_averages = dict()
# get the global average of ratings in our train set.
train_global_average = sparse_ratings_matrix_train.sum()/sparse_ratings_matrix_train.count_nonzero()
train_averages['global'] = round(train_global_average, 4)
train_averages

In [None]:
def get_average_ratings(sparse_matrix, of_users):
    # Determine the axis for calculation (1 for users, 0 for movies)
    ax = 1 if of_users else 0

    # Sum of ratings for each user/movie
    sum_of_ratings = sparse_matrix.sum(axis=ax).A1

    # Create a boolean matrix indicating if a user rated a movie
    is_rated = sparse_matrix != 0

    # Number of ratings for each user/movie
    no_of_ratings = is_rated.sum(axis=ax).A1

    # Get the shape of the sparse matrix (max_user, max_movie)
    max_user, max_movie = sparse_matrix.shape

    # Create a dictionary of users/movies and their average ratings
    average_ratings = {
        i: sum_of_ratings[i] / no_of_ratings[i]
        for i in range(max_user if of_users else max_movie)
        if no_of_ratings[i] != 0
                      }

    # Return the dictionary of average ratings
    return average_ratings

In [None]:
average_ratings_users = get_average_ratings(sparse_ratings_matrix_train, of_users=True)

average_ratings_movies = get_average_ratings(sparse_ratings_matrix_train, of_users=False)

In [None]:
train_data.head()

In [None]:
train_averages['User_Id'] = get_average_ratings(sparse_ratings_matrix_train, of_users=True)
print('\nAverage rating of user 510180 :',round(train_averages['User_Id'][510180], 4))

In [None]:
train_averages['Movie_Id'] = get_average_ratings(sparse_ratings_matrix_train, of_users=False)
print('\nAverage rating of Movie_Id 10341 :',round(train_averages['Movie_Id'][10341], 4))

In [None]:
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=plt.figaspect(.25))
fig.suptitle('Avg Ratings per User and per Movie', fontsize=15)

ax1.set_title('User Avg Rating')
user_averages = [r for r in train_averages['User_Id'].values()]
sns.kdeplot(user_averages, ax=ax1, cumulative=True, label='CDF')
sns.kdeplot(user_averages, ax=ax1, label='PDF')
ax1.grid(True)  # Adding gridlines to the first subplot

ax2.set_title('Movie Avg Rating')
movie_averages = [r for r in train_averages['Movie_Id'].values()]
sns.kdeplot(movie_averages, ax=ax2, cumulative=True, label='CDF')
sns.kdeplot(movie_averages, ax=ax2, label='PDF')
ax2.grid(True)  # Adding gridlines to the second subplot

plt.show()