In [2]:
import pandas as pd
import numpy as np
import scipy.stats

from sklearn.metrics.pairwise import cosine_similarity

In [3]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

print(movies.head())
print(ratings.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [5]:
df = pd.merge(ratings, movies, on= 'movieId', how= 'inner')

df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [8]:
aggrate = df.groupby('title').agg(meanrate = ('rating', 'mean'), numberrating = ('rating', 'count')).reset_index()

In [9]:
aggrate200 = aggrate[aggrate['numberrating'] > 200]

aggrate200.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18 entries, 405 to 9119
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         18 non-null     object 
 1   meanrate      18 non-null     float64
 2   numberrating  18 non-null     int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 576.0+ bytes


In [10]:
aggrate200.sort_values(by= 'numberrating', ascending= False).head()

Unnamed: 0,title,meanrate,numberrating
3158,Forrest Gump (1994),4.164134,329
7593,"Shawshank Redemption, The (1994)",4.429022,317
6865,Pulp Fiction (1994),4.197068,307
7680,"Silence of the Lambs, The (1991)",4.16129,279
5512,"Matrix, The (1999)",4.192446,278


In [11]:
df200 = pd.merge(df, aggrate200[['title']], on= 'title', how= 'inner')

df200.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4338 entries, 0 to 4337
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     4338 non-null   int64  
 1   movieId    4338 non-null   int64  
 2   rating     4338 non-null   float64
 3   timestamp  4338 non-null   int64  
 4   title      4338 non-null   object 
 5   genres     4338 non-null   object 
dtypes: float64(1), int64(3), object(2)
memory usage: 203.5+ KB


In [12]:
matrix = df200.pivot_table(index= 'userId', columns= 'title', values= 'rating')

matrix.head()

title,American Beauty (1999),Apollo 13 (1995),Braveheart (1995),Fight Club (1999),Forrest Gump (1994),Independence Day (a.k.a. ID4) (1996),Jurassic Park (1993),"Matrix, The (1999)",Pulp Fiction (1994),Schindler's List (1993),Seven (a.k.a. Se7en) (1995),"Shawshank Redemption, The (1994)","Silence of the Lambs, The (1991)",Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back (1980),Terminator 2: Judgment Day (1991),Toy Story (1995),"Usual Suspects, The (1995)"
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,5.0,,4.0,5.0,4.0,3.0,4.0,5.0,3.0,5.0,5.0,,4.0,5.0,5.0,,4.0,5.0
2,,,,,,,,,,,,3.0,,,,,,
3,,,,,,,,,,0.5,,,,,,,,
4,5.0,,,2.0,,,,1.0,1.0,,2.0,,5.0,5.0,5.0,,,
5,,3.0,4.0,,,,,,5.0,5.0,,3.0,,,,3.0,4.0,4.0


In [14]:
maxtrixnorm = matrix.subtract(matrix.mean(axis= 1), axis = 'rows')

maxtrixnorm.head()

title,American Beauty (1999),Apollo 13 (1995),Braveheart (1995),Fight Club (1999),Forrest Gump (1994),Independence Day (a.k.a. ID4) (1996),Jurassic Park (1993),"Matrix, The (1999)",Pulp Fiction (1994),Schindler's List (1993),Seven (a.k.a. Se7en) (1995),"Shawshank Redemption, The (1994)","Silence of the Lambs, The (1991)",Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back (1980),Terminator 2: Judgment Day (1991),Toy Story (1995),"Usual Suspects, The (1995)"
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0.6,,-0.4,0.6,-0.4,-1.4,-0.4,0.6,-1.4,0.6,0.6,,-0.4,0.6,0.6,,-0.4,0.6
2,,,,,,,,,,,,0.0,,,,,,
3,,,,,,,,,,0.0,,,,,,,,
4,1.75,,,-1.25,,,,-2.25,-2.25,,-1.25,,1.75,1.75,1.75,,,
5,,-0.875,0.125,,,,,,1.125,1.125,,-0.875,,,,-0.875,0.125,0.125


In [16]:
simuser = maxtrixnorm.T.corr()
simuser.head()

userId,1,2,3,4,5,6,7,8,10,11,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,,,0.276648,-0.218218,-0.299837,-0.353553,0.534522,-0.356348,0.612372,...,0.632456,0.36262,-0.010574,-0.57735,0.730297,0.428773,-1.578727e-17,0.401478,-0.612372,0.119741
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,0.276648,,,1.0,,0.693375,,,0.457496,,...,-0.5,,-0.350096,,1.0,-0.114332,-0.25,-0.688847,,-0.490098
5,-0.218218,,,,1.0,-0.459907,0.57735,0.0,,0.57735,...,0.0,0.420084,0.507093,1.0,-0.801784,0.656892,-0.36997,0.511408,0.316228,0.126131


In [17]:
simuserco = cosine_similarity(maxtrixnorm.fillna(0))


In [21]:
pickeduser = 1



In [18]:
numuser = 10


In [19]:
usersim = .3


In [23]:
similaruser = simuser[simuser[pickeduser] > numuser][pickeduser].sort_values(ascending= False)[:numuser]



In [24]:
pickeduserseen = maxtrixnorm[maxtrixnorm.index == pickeduser].dropna(axis= 1, how= 'all')
pickeduserseen

title,American Beauty (1999),Braveheart (1995),Fight Club (1999),Forrest Gump (1994),Independence Day (a.k.a. ID4) (1996),Jurassic Park (1993),"Matrix, The (1999)",Pulp Fiction (1994),Schindler's List (1993),Seven (a.k.a. Se7en) (1995),"Silence of the Lambs, The (1991)",Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back (1980),Toy Story (1995),"Usual Suspects, The (1995)"
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0.6,-0.4,0.6,-0.4,-1.4,-0.4,0.6,-1.4,0.6,0.6,-0.4,0.6,0.6,-0.4,0.6


In [25]:
similarmovies = maxtrixnorm[maxtrixnorm.index.isin(similaruser.index)].dropna(axis= 1, how= 'all')
similarmovies

title
userId


In [26]:
similarmovies.drop(pickeduserseen.columns, axis= 1, inplace= True, errors= 'ignore')

In [27]:
similarmovies

title
userId
