In [2]:
#Import all libraries that will be used
import pandas as pd
import numpy as np

In [3]:
import warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [4]:
#load the ratings and movies file into the dataframe
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [7]:
movie_ratings = pd.merge(movies,ratings, on='movieId', how='inner')
movie_ratings.drop('timestamp', axis=1, inplace=True)
movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,3.5
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,3.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0


In [8]:
#movie_rating_df = 
reviews = movie_ratings.groupby(['title'])['rating'].agg(['count','mean']).round(1)
reviews.head()

Unnamed: 0_level_0,count,mean
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"""BLOW THE NIGHT!"" Let's Spend the Night Together (1983)",1,3.0
"""Great Performances"" Cats (1998)",179,2.9
#1 Cheerleader Camp (2010),9,2.1
#Captured (2017),2,3.8
#Female Pleasure (2018),3,3.7


In [9]:
movie_ratings.shape

(25000095, 5)

In [10]:
#movie_ratings.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000095 entries, 0 to 25000094
Data columns (total 5 columns):
 #   Column   Dtype  
---  ------   -----  
 0   movieId  int64  
 1   title    object 
 2   genres   object 
 3   userId   int64  
 4   rating   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.5 GB


In [11]:
#movie_ratings.describe().round()

Unnamed: 0,movieId,userId,rating
count,25000095.0,25000095.0,25000095.0
mean,21388.0,81189.0,4.0
std,39199.0,46792.0,1.0
min,1.0,1.0,0.0
25%,1196.0,40510.0,3.0
50%,2947.0,80914.0,4.0
75%,8623.0,121557.0,4.0
max,209171.0,162541.0,5.0


In [12]:
movie_ratings=movie_ratings.astype({'movieId':'int32','userId':'int32','genres':'category'})

In [13]:
#mov = pd.crosstab(index=movie_ratings['userId'], columns=movie_ratings['title'],values=movie_ratings['rating'], aggfunc='sum')

In [14]:
movie_ratings.shape

(25000095, 5)

In [15]:
ratings.shape

(25000095, 4)

In [16]:
ratings.nunique()

userId         162541
movieId         59047
rating             10
timestamp    20115267
dtype: int64

In [17]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [18]:
#movie_ratings.nunique()

In [19]:
user_counts = movie_ratings['userId'].value_counts()
user_counts

72315     32202
80974      9178
137293     8913
33844      7919
20055      7488
          ...  
127695       20
152765       20
90815        20
70828        20
141949       20
Name: userId, Length: 162541, dtype: int64

In [20]:
valid_user_ids = user_counts[user_counts > 0].index
valid_user_ids

Int64Index([ 72315,  80974, 137293,  33844,  20055, 109731,  92046,  49403,
             30879, 115102,
            ...
            151863, 112112,  75766, 127773, 111735, 127695, 152765,  90815,
             70828, 141949],
           dtype='int64', length=162541)

In [21]:
filtered_ratings = movie_ratings[movie_ratings['userId'].isin(valid_user_ids)]

In [22]:
batch_size = 5000  # Set the batch size, produces best result when batch size is between 4000 - 5000 for my system
total_users = len(valid_user_ids)
num_batches = total_users // batch_size + 1
num_batches

33

##### The error you're encountering, "MemoryError: Unable to allocate 469. KiB for an array with shape (6, 10000) and data type float64," indicates that your code is running out of memory while trying to allocate memory for an array with shape (6, 10000) and data type float64.
##### The issue is most likely caused by the concatenation of DataFrames mov and batch_mov inside the loop. As the loop progresses, the mov DataFrame keeps growing as you concatenate more batches, eventually causing a memory error.

##### To resolve this issue, you can modify your code to append each batch to a list and then concatenate them outside the loop. 
##### Here's an updated version of your code that avoids the memory error:

In [24]:
output_dir = r"batch_files/"  # Specify the directory to store the sets

In [31]:
# create the pivot table in batches of 5000
#When processing a large dataset of 4GB, 
#the pivot_table function is generally more efficient than the crosstab function in Python.

#we can further reduce memory consumption by storing the intermediate results on disk 
#instead of keeping them all in memory. One approach is to save each batch of the pivot table 
#to a separate file and read them later for concatenation.

output_dir = r"batch_files/"  # Specify the directory to store the sets

for i in range(num_batches):
    start_index = i * batch_size
    end_index = start_index + batch_size
    batch_users = valid_user_ids[start_index:end_index]
    print("Running batch no: " + str(i))

    batch_mov = filtered_ratings[filtered_ratings['userId'].isin(batch_users)].pivot_table(
        index='userId',
        columns='title',
        values='rating',
        aggfunc='sum'
    ).astype('float32')  # Convert to float32 to save memory

    print("Writing batch no (" + str(i) + ") to disk")
    batch_mov.to_csv(output_dir + f"batch_{i}.csv", index=False)  # Write each batch to disk as a separate file


Running batch no: 0
Writing batch no (0) to disk
Running batch no: 1
Writing batch no (1) to disk
Running batch no: 2
Writing batch no (2) to disk
Running batch no: 3
Writing batch no (3) to disk
Running batch no: 4
Writing batch no (4) to disk
Running batch no: 5
Writing batch no (5) to disk
Running batch no: 6
Writing batch no (6) to disk
Running batch no: 7
Writing batch no (7) to disk
Running batch no: 8
Writing batch no (8) to disk
Running batch no: 9
Writing batch no (9) to disk
Running batch no: 10
Writing batch no (10) to disk
Running batch no: 11
Writing batch no (11) to disk
Running batch no: 12
Writing batch no (12) to disk
Running batch no: 13
Writing batch no (13) to disk
Running batch no: 14
Writing batch no (14) to disk
Running batch no: 15
Writing batch no (15) to disk
Running batch no: 16
Writing batch no (16) to disk
Running batch no: 17
Writing batch no (17) to disk
Running batch no: 18
Writing batch no (18) to disk
Running batch no: 19
Writing batch no (19) to disk


In [None]:
user_rating_list = []  # Re-initialize the list to store DataFrames

for i in range(num_batches):
    #print("Loading batch no (" + str(i) + ") from disk")
    batch_mov = pd.read_csv(output_dir + f"batch_{i}.csv")
    user_rating_list.append(batch_mov)  # Append each batch DataFrame to the list
    if i % 5==0:
        #print("Appending batch no (" + str(i) + ") to user_rating")
        user_rating.append(user_rating_list)
        user_rating_list = []

#user_rating = pd.concat(user_rating_list, ignore_index=True)  # Concatenate all DataFrames from the list

In [25]:
# we will recommend movies based on below 3 movies
userInput = ["300 Spartans, The (1962)", "2 Fast 2 Furious (2003)", "12 Angry Men (1957)"]

In [55]:
user_rating.corrwith(user_rating[userInput[0]], method='pearson')

title
"Great Performances" Cats (1998)                      NaN
'Round Midnight (1986)                          -0.500000
'Til There Was You (1997)                             NaN
'burbs, The (1989)                               0.866025
'night Mother (1986)                                  NaN
                                                   ...   
Dischord (2001)                                       NaN
Down and Derby (2005)                                 NaN
Jails, Hospitals & Hip-Hop (2000)                     NaN
Fists in the Pocket (I Pugni in tasca) (1965)         NaN
Stacy's Knights (1982)                                NaN
Length: 10676, dtype: float64

In [56]:
similarity = user_rating.corrwith(user_rating[userInput[0]], method = 'pearson') 
+ user_rating.corrwith(user_rating[userInput[1]], method = 'pearson') 
+ user_rating.corrwith(user_rating[userInput[2]], method = 'pearson')

title
"Great Performances" Cats (1998)                 1.000000
'Round Midnight (1986)                           0.429263
'Til There Was You (1997)                        0.051933
'burbs, The (1989)                               0.094722
'night Mother (1986)                             0.258762
                                                   ...   
Dischord (2001)                                       NaN
Down and Derby (2005)                                 NaN
Jails, Hospitals & Hip-Hop (2000)                     NaN
Fists in the Pocket (I Pugni in tasca) (1965)         NaN
Stacy's Knights (1982)                                NaN
Length: 10676, dtype: float64

In [58]:
correlatedMovies = pd.DataFrame(similarity, columns = ['correlation'])
correlatedMovies = pd.merge(correlatedMovies, reviews, on = 'title')
correlatedMovies = pd.merge(correlatedMovies, movies, on = 'title')

In [68]:
# as we want most similar movies first order the dataframe based on correlation values highest to lowest

# for better recommendation consider movies having more than average rating (3.5 via data analysis)
# and have reviews of more than 300  users

# we have only displayed top 10 recommendation
final_recommendation = correlatedMovies.query('mean>3.5 and count>300').sort_values('correlation', ascending=False)
final_recommendation.head(10)

Unnamed: 0,title,correlation,count,mean,movieId,genres
6705,Nausicaä of the Valley of the Winds (Kaze no t...,1.0,958,4.1,7099,Adventure|Animation|Drama|Fantasy|Sci-Fi
2656,"Discreet Charm of the Bourgeoisie, The (Le Cha...",1.0,365,4.0,6666,Comedy|Drama|Fantasy
7826,Red Rock West (1992),1.0,2270,3.8,373,Thriller
8653,Slaughterhouse-Five (1972),1.0,327,3.7,8690,Comedy|Drama|Sci-Fi|War
4984,Jagged Edge (1985),1.0,941,3.6,3102,Thriller
1889,"Christmas Carol, A (Scrooge) (1951)",1.0,368,3.8,8492,Drama|Fantasy
1884,"Chorus, The (Les Choristes) (2004)",1.0,376,4.0,27815,Drama
2248,Croupier (1998),1.0,1186,3.9,3783,Crime|Drama
3323,Finding Neverland (2004),1.0,3043,3.9,8970,Drama
7083,Open Your Eyes (Abre los ojos) (1997),1.0,1160,3.8,2594,Drama|Romance|Sci-Fi|Thriller


In [71]:
# Let's remove the recommendation that user has already watched 
final_recommendation = final_recommendation[np.isin(final_recommendation['title'], userInput, invert=True)]
final_recommendation.head()

Unnamed: 0,title,correlation,count,mean,movieId,genres
6705,Nausicaä of the Valley of the Winds (Kaze no t...,1.0,958,4.1,7099,Adventure|Animation|Drama|Fantasy|Sci-Fi
2656,"Discreet Charm of the Bourgeoisie, The (Le Cha...",1.0,365,4.0,6666,Comedy|Drama|Fantasy
7826,Red Rock West (1992),1.0,2270,3.8,373,Thriller
8653,Slaughterhouse-Five (1972),1.0,327,3.7,8690,Comedy|Drama|Sci-Fi|War
4984,Jagged Edge (1985),1.0,941,3.6,3102,Thriller


In [80]:
#del final_recommendation['movieId']
#del final_recommendation['correlation']
title = ('Movie Suggestion based on '+userInput[0]+ ", "+ userInput[1]+ ", "+userInput[2])
final_recommendation.rename(columns={'title':title,
                                     'count':'Number of Ratings',
                                     'mean':'Ratings',
                                     'genres':'Genres'
                                    },
                           inplace=True)
final_recommendation.head(10)

Unnamed: 0,"Movie Suggestion based on 300 Spartans, The (1962), 2 Fast 2 Furious (2003), 12 Angry Men (1957)",Number of Ratings,Ratings,Genres
6705,Nausicaä of the Valley of the Winds (Kaze no t...,958,4.1,Adventure|Animation|Drama|Fantasy|Sci-Fi
2656,"Discreet Charm of the Bourgeoisie, The (Le Cha...",365,4.0,Comedy|Drama|Fantasy
7826,Red Rock West (1992),2270,3.8,Thriller
8653,Slaughterhouse-Five (1972),327,3.7,Comedy|Drama|Sci-Fi|War
4984,Jagged Edge (1985),941,3.6,Thriller
1889,"Christmas Carol, A (Scrooge) (1951)",368,3.8,Drama|Fantasy
1884,"Chorus, The (Les Choristes) (2004)",376,4.0,Drama
2248,Croupier (1998),1186,3.9,Crime|Drama
3323,Finding Neverland (2004),3043,3.9,Drama
7083,Open Your Eyes (Abre los ojos) (1997),1160,3.8,Drama|Romance|Sci-Fi|Thriller
