In [14]:
#Import all libraries that will be used
import pandas as pd
import numpy as np
import warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [17]:
column_movies = ["movieId", "title", "genres"] 
column_ratings = ["userId", "movieId", "rating", "timestamp"]
try:
    movies = pd.read_table("movies.dat", sep="::", header=None, names=column_movies)
    ratings = pd.read_table("ratings.dat", sep="::", header=None, names=column_ratings)
    
    # Process the DataFrame as needed
except pd.errors.ParserError:
    print("Error while parsing the file. Please check the data format or structure.")

In [19]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [20]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [21]:
movie_ratings = pd.merge(movies,ratings, on='movieId', how='inner')
movie_ratings.drop('timestamp', axis=1, inplace=True)
movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,1.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,3.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,18,3.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,23,5.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,24,5.0


In [22]:
#movie_rating_df = 
reviews = movie_ratings.groupby(['title'])['rating'].agg(['count','mean']).round(1)
reviews.head()

Unnamed: 0_level_0,count,mean
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"""Great Performances"" Cats (1998)",6,3.6
'Round Midnight (1986),50,3.7
'Til There Was You (1997),302,2.8
"'burbs, The (1989)",1504,3.0
'night Mother (1986),211,3.5


In [23]:
movie_ratings.shape

(10000054, 5)

In [24]:
movie_ratings.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000054 entries, 0 to 10000053
Data columns (total 5 columns):
 #   Column   Dtype  
---  ------   -----  
 0   movieId  int64  
 1   title    object 
 2   genres   object 
 3   userId   int64  
 4   rating   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 1.8 GB


In [25]:
movie_ratings.describe().round()

Unnamed: 0,movieId,userId,rating
count,10000054.0,10000054.0,10000054.0
mean,4120.0,35870.0,4.0
std,8938.0,20585.0,1.0
min,1.0,1.0,0.0
25%,648.0,18123.0,3.0
50%,1834.0,35740.0,4.0
75%,3624.0,53608.0,4.0
max,65133.0,71567.0,5.0


In [26]:
movie_ratings=movie_ratings.astype({'movieId':'int32','userId':'int32','genres':'category'})

In [27]:
movie_ratings.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000054 entries, 0 to 10000053
Data columns (total 5 columns):
 #   Column   Dtype   
---  ------   -----   
 0   movieId  int32   
 1   title    object  
 2   genres   category
 3   userId   int32   
 4   rating   float64 
dtypes: category(1), float64(1), int32(2), object(1)
memory usage: 1.0 GB


In [28]:
ratings.nunique()

userId         69878
movieId        10677
rating            10
timestamp    7096905
dtype: int64

In [29]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [30]:
movie_ratings.isna().sum()

movieId    0
title      0
genres     0
userId     0
rating     0
dtype: int64

In [31]:
#movie_ratings.nunique()

movieId    10677
title      10676
genres       797
userId     69878
rating        10
dtype: int64

In [32]:
user_counts = movie_ratings['userId'].value_counts()
user_counts

59269    7359
67385    7047
14463    5169
68259    4483
27468    4449
         ... 
52009      20
52396      20
53572      20
18672      20
17445      20
Name: userId, Length: 69878, dtype: int64

In [33]:
valid_user_ids = user_counts[user_counts > 0].index
valid_user_ids

Int64Index([59269, 67385, 14463, 68259, 27468,  3817, 19635, 63134, 58357,
            27584,
            ...
            40133, 30182, 42967, 48218, 48418, 52009, 52396, 53572, 18672,
            17445],
           dtype='int64', length=69878)

In [35]:
filtered_ratings = movie_ratings[movie_ratings['userId'].isin(valid_user_ids)]
filtered_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,1.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,3.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,18,3.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,23,5.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,24,5.0


In [36]:
batch_size = 5000  # Set the batch size
total_users = len(valid_user_ids)
num_batches = total_users // batch_size + 1
num_batches

14

In [37]:
user_rating_list = []  # Initialize the list to store batches
user_rating = pd.DataFrame()

for i in range(num_batches):
    start_index = i * batch_size
    end_index = start_index + batch_size
    batch_users = valid_user_ids[start_index:end_index]
    print("Running batch no: "+ str(i))

    batch_mov = pd.crosstab(index=filtered_ratings[filtered_ratings['userId'].isin(batch_users)]['userId'],
                            columns=filtered_ratings[filtered_ratings['userId'].isin(batch_users)]['title'],
                            values=filtered_ratings[filtered_ratings['userId'].isin(batch_users)]['rating'],
                            aggfunc='sum')
    print("Appending batch no ("+ str(i) +") to the list")
    user_rating_list.append(batch_mov)  # Append each batch to the list
    
print("Before concatenation")
user_rating = pd.concat(user_rating_list, ignore_index=True)  # Concatenate all batches


Running batch no: 0
Appending batch no (0) to the list
Running batch no: 1
Appending batch no (1) to the list
Running batch no: 2
Appending batch no (2) to the list
Running batch no: 3
Appending batch no (3) to the list
Running batch no: 4
Appending batch no (4) to the list
Running batch no: 5
Appending batch no (5) to the list
Running batch no: 6
Appending batch no (6) to the list
Running batch no: 7
Appending batch no (7) to the list
Running batch no: 8
Appending batch no (8) to the list
Running batch no: 9
Appending batch no (9) to the list
Running batch no: 10
Appending batch no (10) to the list
Running batch no: 11
Appending batch no (11) to the list
Running batch no: 12
Appending batch no (12) to the list
Running batch no: 13
Appending batch no (13) to the list
Before concatenation


In [44]:
user_rating.tail()

title,"""Great Performances"" Cats (1998)",'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),*batteries not included (1987),...All the Marbles (a.k.a. The California Dolls) (1981),...And God Created Woman (Et Dieu... créa la femme) (1956),...And God Spoke (1993),...And Justice for All (1979),...,Besotted (2001),"Child I Never Was, The (Leben lang kurze Hosen Tragen, Ein) (2002)",Emerald Cowboy (2002),"Hi-Line, The (1999)",Love Forbidden (Défense d'aimer) (2002),Dischord (2001),Down and Derby (2005),"Jails, Hospitals & Hip-Hop (2000)",Fists in the Pocket (I Pugni in tasca) (1965),Stacy's Knights (1982)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
71504,,,,,,,,,,,...,,,,,,,,,,
71506,,,,,,,,,,,...,,,,,,,,,,
71510,,,,,,,,,,,...,,,,,,,,,,
71520,,,,,,,,,,,...,,,,,,,,,,
71522,,,,,,,,,,,...,,,,,,,,,,


In [51]:
user_rating.columns[:100]

Index(['"Great Performances" Cats (1998)', ''Round Midnight (1986)',
       ''Til There Was You (1997)', ''burbs, The (1989)',
       ''night Mother (1986)', '*batteries not included (1987)',
       '...All the Marbles (a.k.a. The California Dolls) (1981)',
       '...And God Created Woman (Et Dieu... créa la femme) (1956)',
       '...And God Spoke (1993)', '...And Justice for All (1979)',
       '1, 2, 3, Sun (Un, deuz, trois, soleil) (1993)', '1-900 (06) (1994)',
       '10 (1979)', '10 Items or Less (2006)', '10 Rillington Place (1971)',
       '10 Things I Hate About You (1999)', '10 to Midnight (1983)',
       '10,000 B.C. (2008)', '100 Feet (2008)', '100 Girls (2000)',
       '100 Rifles (1969)',
       '1000 Eyes of Dr. Mabuse, The (Tausend Augen des Dr. Mabuse, Die) (1960)',
       '101 Dalmatians (1996)', '101 Reykjavik (101 Reykjavík) (2000)',
       '102 Dalmatians (2000)', '10th & Wolf (2006)',
       '10th Victim, The (La Decima Vittima) (1965)',
       '11'09"01 - Septem

In [54]:
userInput = ["300 Spartans, The (1962)", "2 Fast 2 Furious (2003)", "12 Angry Men (1957)"]

In [55]:
user_rating.corrwith(user_rating[userInput[0]], method='pearson')

title
"Great Performances" Cats (1998)                      NaN
'Round Midnight (1986)                          -0.500000
'Til There Was You (1997)                             NaN
'burbs, The (1989)                               0.866025
'night Mother (1986)                                  NaN
                                                   ...   
Dischord (2001)                                       NaN
Down and Derby (2005)                                 NaN
Jails, Hospitals & Hip-Hop (2000)                     NaN
Fists in the Pocket (I Pugni in tasca) (1965)         NaN
Stacy's Knights (1982)                                NaN
Length: 10676, dtype: float64

In [56]:
similarity = user_rating.corrwith(user_rating[userInput[0]], method = 'pearson') 
+ user_rating.corrwith(user_rating[userInput[1]], method = 'pearson') 
+ user_rating.corrwith(user_rating[userInput[2]], method = 'pearson')

title
"Great Performances" Cats (1998)                 1.000000
'Round Midnight (1986)                           0.429263
'Til There Was You (1997)                        0.051933
'burbs, The (1989)                               0.094722
'night Mother (1986)                             0.258762
                                                   ...   
Dischord (2001)                                       NaN
Down and Derby (2005)                                 NaN
Jails, Hospitals & Hip-Hop (2000)                     NaN
Fists in the Pocket (I Pugni in tasca) (1965)         NaN
Stacy's Knights (1982)                                NaN
Length: 10676, dtype: float64

In [58]:
correlatedMovies = pd.DataFrame(similarity, columns = ['correlation'])
correlatedMovies = pd.merge(correlatedMovies, reviews, on = 'title')
correlatedMovies = pd.merge(correlatedMovies, movies, on = 'title')

In [59]:
correlatedMovies.head()

Unnamed: 0,title,correlation,count,mean,movieId,genres
0,"""Great Performances"" Cats (1998)",,6,3.6,51372,Musical
1,'Round Midnight (1986),-0.5,50,3.7,26564,Drama|Musical
2,'Til There Was You (1997),,302,2.8,779,Drama|Romance
3,"'burbs, The (1989)",0.866025,1504,3.0,2072,Comedy
4,'night Mother (1986),,211,3.5,3112,Drama


In [68]:
# as we want most similar movies first order the dataframe based on correlation values highest to lowest

# for better recommendation consider movies having more than average rating (3.5 via data analysis)
# and have reviews of more than 300  users

# we have only displayed top 10 recommendation
final_recommendation = correlatedMovies.query('mean>3.5 and count>300').sort_values('correlation', ascending=False)
final_recommendation.head(10)

Unnamed: 0,title,correlation,count,mean,movieId,genres
6705,Nausicaä of the Valley of the Winds (Kaze no t...,1.0,958,4.1,7099,Adventure|Animation|Drama|Fantasy|Sci-Fi
2656,"Discreet Charm of the Bourgeoisie, The (Le Cha...",1.0,365,4.0,6666,Comedy|Drama|Fantasy
7826,Red Rock West (1992),1.0,2270,3.8,373,Thriller
8653,Slaughterhouse-Five (1972),1.0,327,3.7,8690,Comedy|Drama|Sci-Fi|War
4984,Jagged Edge (1985),1.0,941,3.6,3102,Thriller
1889,"Christmas Carol, A (Scrooge) (1951)",1.0,368,3.8,8492,Drama|Fantasy
1884,"Chorus, The (Les Choristes) (2004)",1.0,376,4.0,27815,Drama
2248,Croupier (1998),1.0,1186,3.9,3783,Crime|Drama
3323,Finding Neverland (2004),1.0,3043,3.9,8970,Drama
7083,Open Your Eyes (Abre los ojos) (1997),1.0,1160,3.8,2594,Drama|Romance|Sci-Fi|Thriller


In [71]:
# Let's remove the recommendation that user has already watched 
final_recommendation = final_recommendation[np.isin(final_recommendation['title'], userInput, invert=True)]
final_recommendation.head()

Unnamed: 0,title,correlation,count,mean,movieId,genres
6705,Nausicaä of the Valley of the Winds (Kaze no t...,1.0,958,4.1,7099,Adventure|Animation|Drama|Fantasy|Sci-Fi
2656,"Discreet Charm of the Bourgeoisie, The (Le Cha...",1.0,365,4.0,6666,Comedy|Drama|Fantasy
7826,Red Rock West (1992),1.0,2270,3.8,373,Thriller
8653,Slaughterhouse-Five (1972),1.0,327,3.7,8690,Comedy|Drama|Sci-Fi|War
4984,Jagged Edge (1985),1.0,941,3.6,3102,Thriller


In [80]:
#del final_recommendation['movieId']
#del final_recommendation['correlation']
title = ('Movie Suggestion based on '+userInput[0]+ ", "+ userInput[1]+ ", "+userInput[2])
final_recommendation.rename(columns={'title':title,
                                     'count':'Number of Ratings',
                                     'mean':'Ratings',
                                     'genres':'Genres'
                                    },
                           inplace=True)
final_recommendation.head(10)

Unnamed: 0,"Movie Suggestion based on 300 Spartans, The (1962), 2 Fast 2 Furious (2003), 12 Angry Men (1957)",Number of Ratings,Ratings,Genres
6705,Nausicaä of the Valley of the Winds (Kaze no t...,958,4.1,Adventure|Animation|Drama|Fantasy|Sci-Fi
2656,"Discreet Charm of the Bourgeoisie, The (Le Cha...",365,4.0,Comedy|Drama|Fantasy
7826,Red Rock West (1992),2270,3.8,Thriller
8653,Slaughterhouse-Five (1972),327,3.7,Comedy|Drama|Sci-Fi|War
4984,Jagged Edge (1985),941,3.6,Thriller
1889,"Christmas Carol, A (Scrooge) (1951)",368,3.8,Drama|Fantasy
1884,"Chorus, The (Les Choristes) (2004)",376,4.0,Drama
2248,Croupier (1998),1186,3.9,Crime|Drama
3323,Finding Neverland (2004),3043,3.9,Drama
7083,Open Your Eyes (Abre los ojos) (1997),1160,3.8,Drama|Romance|Sci-Fi|Thriller
