In [1]:
import pandas as pd
import numpy as np

<h3>Creating DataFrames</h3>

In [69]:
col = ['movieId','title']
movies = pd.read_csv('u.item',usecols=[0,1],delimiter='|',names=col)
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [70]:
rating = pd.read_csv("u.data", delimiter="\t",usecols=[0,1,2] ,
                     names=["userId", "movieId", "rating"])
rating.head()

Unnamed: 0,userId,movieId,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


<h1>Merging above two DataFrames</h1>

In [4]:
newdata =  pd.merge(rating,movies,on='movieId')
newdata.sample(5)

Unnamed: 0,userId,movieId,rating,title
7035,896,4,3,Get Shorty (1995)
57161,94,961,4,Orlando (1993)
2252,455,40,3,"To Wong Foo, Thanks for Everything! Julie Newm..."
83466,634,840,2,Last Man Standing (1996)
89256,560,489,3,Notorious (1946)


<h3>Checking for null

In [5]:
newdata = newdata.dropna(axis=0,subset=['title'])
newdata.isnull().sum()

userId     0
movieId    0
rating     0
title      0
dtype: int64

<h3>Counts the number of rating given to each movie</h3>

In [6]:
movies_rating_count = (newdata.groupby('title')['rating'].count().reset_index().rename(columns={'rating':'totalRatingCount'}))

In [7]:
movies_rating_count

Unnamed: 0,title,totalRatingCount
0,'Til There Was You (1997),9
1,1-900 (1994),5
2,101 Dalmatians (1996),109
3,12 Angry Men (1957),125
4,187 (1997),41
...,...,...
1659,Young Guns II (1990),44
1660,"Young Poisoner's Handbook, The (1995)",41
1661,Zeus and Roxanne (1997),6
1662,unknown,9


<h1>Merge 'totalRatingCount' to original dataframe</h1>

In [8]:
newdata_rating = newdata.merge(movies_rating_count,left_on='title',right_on='title',how='left')

In [9]:
newdata_rating.sample(5)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
89394,269,636,3,Escape from New York (1981),91
13959,758,1016,4,Con Air (1997),137
68003,864,472,4,Dragonheart (1996),158
72152,210,708,5,"Sex, Lies, and Videotape (1989)",101
54652,532,824,4,"Great White Hype, The (1996)",49


In [10]:
newdata_rating.shape

(100000, 5)

<h2>use minimun rating count to get popular movies </h2>

In [11]:
threshold = 100
popular_movies = newdata_rating[newdata_rating['totalRatingCount']>=threshold]

In [12]:
popular_movies.shape

(65008, 5)

In [13]:
popular_movies.sample(4)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
29795,613,127,4,"Godfather, The (1972)",413
33395,738,56,4,Pulp Fiction (1994),394
18790,94,195,3,"Terminator, The (1984)",301
38156,49,343,2,Alien: Resurrection (1997),124


<h1>Creating Pivot table based on ratings</h1> 

In [14]:
rating_matrix = popular_movies.pivot_table(index='movieId',columns='userId',values='rating').fillna(0.0)

In [15]:
rating_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
2,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
4,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
7,4.0,0.0,0.0,0.0,0.0,2.0,5.0,3.0,4.0,4.0,...,0.0,0.0,4.0,0.0,4.0,0.0,4.0,4.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0
1012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
1016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
1028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,4.0,2.0


<h3>Standardize the Rating values</h3><br>so that we get know like /
dislike of the user

In [16]:
def standardize(x):
    return (x-x.mean())#/(x.min()-x.max())

In [17]:
rating_matrix=rating_matrix.apply(standardize)
rating_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.049708,3.497076,-0.269006,-0.207602,3.114035,2.438596,-2.546784,-0.587719,-0.178363,2.482456,...,0.80117,2.637427,3.160819,-0.298246,3.219298,-0.385965,-0.923977,4.777778,-0.754386,-1.230994
2,1.049708,-0.502924,-0.269006,-0.207602,2.114035,-1.561404,-2.546784,-0.587719,-0.178363,-1.517544,...,2.80117,-0.362573,-0.839181,-0.298246,-0.780702,-0.385965,-0.923977,-0.222222,-0.754386,3.769006
4,1.049708,-0.502924,-0.269006,-0.207602,-0.885965,-1.561404,2.453216,-0.587719,-0.178363,2.482456,...,3.80117,-0.362573,-0.839181,-0.298246,-0.780702,-0.385965,1.076023,-0.222222,-0.754386,-1.230994
7,2.049708,-0.502924,-0.269006,-0.207602,-0.885965,0.438596,2.453216,2.412281,3.821637,2.482456,...,-1.19883,-0.362573,3.160819,-0.298246,3.219298,-0.385965,3.076023,3.777778,-0.754386,-1.230994
8,-0.950292,-0.502924,-0.269006,-0.207602,-0.885965,2.438596,2.453216,-0.587719,-0.178363,-1.517544,...,-1.19883,-0.362573,-0.839181,-0.298246,-0.780702,-0.385965,4.076023,-0.222222,-0.754386,-1.230994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
928,-1.950292,-0.502924,-0.269006,-0.207602,-0.885965,-1.561404,-2.546784,-0.587719,-0.178363,-1.517544,...,-1.19883,-0.362573,2.160819,-0.298246,4.219298,-0.385965,-0.923977,-0.222222,-0.754386,3.769006
1012,-1.950292,-0.502924,-0.269006,-0.207602,-0.885965,-1.561404,-2.546784,-0.587719,-0.178363,-1.517544,...,-1.19883,-0.362573,-0.839181,-0.298246,4.219298,-0.385965,-0.923977,-0.222222,-0.754386,-1.230994
1016,-1.950292,-0.502924,-0.269006,-0.207602,-0.885965,-1.561404,-2.546784,-0.587719,-0.178363,-1.517544,...,-1.19883,3.637427,2.160819,-0.298246,2.219298,-0.385965,-0.923977,-0.222222,-0.754386,-1.230994
1028,-1.950292,-0.502924,-0.269006,-0.207602,-0.885965,-1.561404,-2.546784,-0.587719,-0.178363,-1.517544,...,-1.19883,-0.362573,-0.839181,-0.298246,4.219298,4.614035,-0.923977,-0.222222,3.245614,0.769006


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity calculate row-wise similarity

In [19]:
#item(movies) based similarity 
cos_sim  = cosine_similarity(rating_matrix.T)

In [20]:
movie_matrix = pd.DataFrame(cos_sim,columns=rating_matrix.columns,index=rating_matrix.columns)

<h1>Item Base Approach </h1>
<br>Here movies are consider as a Items 

In [21]:
movie_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,-0.060006,-0.172405,-0.076475,0.243137,0.220562,0.176823,0.214428,-0.095214,0.133534,...,0.237626,-0.066955,0.010832,0.031555,-0.100938,-0.066589,0.096790,0.050600,-0.063876,0.227067
2,-0.060006,1.000000,0.039397,0.189146,-0.080606,0.115077,-0.177919,-0.007446,0.174843,0.018736,...,0.039737,0.316042,0.391448,0.463542,0.292161,0.217377,0.129159,0.130605,0.091737,-0.070601
3,-0.172405,0.039397,1.000000,0.385675,-0.122575,-0.091023,-0.188004,-0.021637,0.042921,-0.103431,...,-0.128229,-0.025295,0.182762,0.023317,0.059959,-0.042782,0.062498,0.079553,0.086198,-0.145507
4,-0.076475,0.189146,0.385675,1.000000,-0.066759,-0.049142,-0.042515,0.176791,0.118318,-0.047984,...,-0.047953,-0.015340,0.144434,0.224939,0.120968,-0.017875,0.132019,0.160920,0.137180,-0.066449
5,0.243137,-0.080606,-0.122575,-0.066759,1.000000,0.043102,0.165772,0.175771,-0.021439,-0.027172,...,0.268381,-0.038739,-0.080647,-0.025539,-0.025532,-0.050630,0.049583,0.090847,0.006997,0.188905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,-0.066589,0.217377,-0.042782,-0.017875,-0.050630,-0.021313,-0.096034,-0.011509,0.003811,-0.080348,...,-0.086912,0.487496,0.283161,0.256369,0.440033,1.000000,-0.030325,0.147194,-0.084949,0.035597
940,0.096790,0.129159,0.062498,0.132019,0.049583,0.187840,0.089851,0.116887,0.074565,0.175854,...,0.139625,-0.022260,0.063918,0.102370,-0.021716,-0.030325,1.000000,0.066364,0.145925,0.030866
941,0.050600,0.130605,0.079553,0.160920,0.090847,0.057489,-0.137499,0.088854,0.157156,-0.021419,...,-0.076061,0.188677,0.290226,0.193792,0.284409,0.147194,0.066364,1.000000,0.034472,-0.010930
942,-0.063876,0.091737,0.086198,0.137180,0.006997,0.186851,0.089885,0.050954,-0.010160,0.029673,...,0.055403,-0.033326,-0.062610,0.060791,-0.070147,-0.084949,0.145925,0.034472,1.000000,0.012510


<h2>function for getting similar items(movies)

In [22]:
def getMovie(id,k=5):
    
    ''''
        id:movieID;
        k:number of similar moives
    '''
    movies = movie_matrix[id]
    movies = movies.sort_values(ascending=False)
    return movies[:k]

In [23]:
user_id=55
similar_movies =getMovie(id=user_id,k=10)

In [24]:
''''
creating dataframe based on similarities
'''
weighted_movies = pd.DataFrame({'movieId':similar_movies.index,'similarity':similar_movies.values})

# print(weighted_movies)
'''
filter moviesId from original Dataset
'''
filtered_movies =pd.DataFrame(popular_movies[popular_movies['movieId'].isin(similar_movies.index)])
'''
merging 'filtered_movies' and 'weighted_movies'
'''

top_similar = filtered_movies.merge(weighted_movies,on='movieId').sort_values(by='similarity',ascending=False)
top_similar.drop_duplicates(subset=['movieId'],inplace=True)
top_similar

Unnamed: 0,userId,movieId,rating,title,totalRatingCount,similarity
439,214,55,4,"Professional, The (1994)",149,1.0
167,785,137,2,Big Night (1996),171,0.441774
211,380,217,2,Bram Stoker's Dracula (1992),120,0.38195


<h1>User Based Approach</h1><br>
<h3>
Now try to find Similar users and recommend moives based on that 
</h3>

In [25]:
con_sim_user  = cosine_similarity(rating_matrix)
matrix_user = pd.DataFrame(con_sim_user,columns=rating_matrix.index,index=rating_matrix.index)


In [26]:
# movie_matrix_user=movie_matrix_user.apply(standardize)
matrix_user

movieId,1,2,4,7,8,9,11,12,13,14,...,865,866,879,895,926,928,1012,1016,1028,1047
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,-0.145854,-0.008202,0.376580,0.109994,0.194892,0.070428,0.093365,0.002410,-0.017480,...,-0.465280,-0.066433,-0.219444,-0.272488,-0.135553,-0.151108,-0.116023,-0.024489,-0.028819,-0.038110
2,-0.145854,1.000000,0.088021,-0.147112,-0.135736,-0.199585,0.023614,-0.035238,-0.119293,-0.170596,...,0.196447,0.058089,0.073408,0.119918,0.215324,0.090083,0.058994,0.096816,0.085457,0.098143
4,-0.008202,0.088021,1.000000,0.061206,0.079998,0.025085,0.226806,0.219109,0.008178,-0.109239,...,-0.163922,-0.137791,-0.147187,-0.128576,-0.139479,-0.177740,-0.055639,0.009052,-0.134892,-0.103815
7,0.376580,-0.147112,0.061206,1.000000,0.018873,0.247871,0.246592,0.272047,0.097333,-0.025351,...,-0.396550,-0.134388,-0.198861,-0.200606,-0.161939,-0.098845,-0.041617,-0.063109,-0.106094,-0.048509
8,0.109994,-0.135736,0.079998,0.018873,1.000000,0.087317,0.015267,0.141510,-0.027759,-0.018000,...,-0.157680,-0.098274,-0.151588,-0.200761,-0.189114,-0.187469,-0.084665,-0.074010,-0.101876,-0.192314
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
928,-0.151108,0.090083,-0.177740,-0.098845,-0.187469,-0.152130,-0.078379,-0.204887,0.036116,-0.048307,...,0.340495,0.246128,0.159606,0.273476,0.437588,1.000000,0.224382,0.166329,0.330134,0.406808
1012,-0.116023,0.058994,-0.055639,-0.041617,-0.084665,-0.035873,-0.096732,-0.165788,0.105195,0.045521,...,0.350285,0.230360,0.162109,0.223163,0.289393,0.224382,1.000000,0.282225,0.257794,0.254544
1016,-0.024489,0.096816,0.009052,-0.063109,-0.074010,-0.093928,0.037229,-0.091819,-0.059434,-0.089234,...,0.177932,0.215451,0.205709,0.209260,0.300264,0.166329,0.282225,1.000000,0.196972,0.276961
1028,-0.028819,0.085457,-0.134892,-0.106094,-0.101876,-0.056484,-0.116836,-0.227514,-0.000560,-0.069127,...,0.216470,0.443107,0.132375,0.150939,0.424286,0.330134,0.257794,0.196972,1.000000,0.423795


<h1>function to get similar users 

In [27]:
def getUser(userId,k=5):
    sim_user = matrix_user[userId].nlargest(k)
    return sim_user[1:]

In [56]:
user_id=451
top_similar_user = getUser(user_id,k=10)
top_similar_user

movieId
739    0.358464
393    0.355384
88     0.351019
402    0.343760
94     0.294732
66     0.285128
67     0.265442
559    0.237258
732    0.233651
Name: 451, dtype: float64

In [63]:
''''
creating dataframe based on similarities
'''
weighted_users = pd.DataFrame({'userId':top_similar_user.index,'user_similarity':top_similar_user.values})

# print(weighted_users)
'''
filter userId from original Dataset
'''
filtered_users =pd.DataFrame(popular_movies[popular_movies['userId'].isin(top_similar_user.index)])
'''
merging 'filtered_users' and 'weighted_users'
'''
# print(filtered_users)
top_similar = filtered_users.merge(weighted_users,on='userId')#.sort_values(by='user_similarity',ascending=False)
top_similar.drop_duplicates(subset=['movieId'],inplace=True)
top_similar

Unnamed: 0,userId,movieId,rating,title,totalRatingCount,user_similarity
0,88,302,3,L.A. Confidential (1997),297,0.351019
1,88,690,4,Seven Years in Tibet (1997),155,0.351019
2,88,750,2,Amistad (1997),124,0.351019
3,88,321,1,Mother (1996),169,0.351019
4,88,286,5,"English Patient, The (1996)",481,0.351019
...,...,...,...,...,...,...
596,739,498,4,"African Queen, The (1951)",152,0.358464
627,732,305,2,"Ice Storm, The (1997)",108,0.233651
628,732,289,3,Evita (1996),259,0.233651
629,732,269,5,"Full Monty, The (1997)",315,0.233651


In [64]:
user_rated = rating_matrix.loc[:,user_id]
user_rated

movieId
1      -0.403509
2      -0.403509
4      -0.403509
7      -0.403509
8      -0.403509
          ...   
928    -0.403509
1012   -0.403509
1016   -0.403509
1028   -0.403509
1047   -0.403509
Name: 451, Length: 342, dtype: float64

In [65]:
d=pd.DataFrame({'movieId':user_rated.index,'score':user_rated.values})
top_similar=top_similar.merge(d,on='movieId')

<h2>Claculating the user score based on this previous rating with recommended movies using user similarity

In [66]:
top_similar['score'] = (top_similar['rating'] * top_similar['score'] +top_similar['user_similarity']) 

In [67]:
top_similar.sort_values(by='score',ascending=False).head(10)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount,user_similarity,score
230,393,294,4,Liar Liar (1997),485,0.355384,18.741348
253,393,333,4,"Game, The (1997)",251,0.355384,18.741348
302,739,327,5,Cop Land (1997),175,0.358464,18.34092
283,393,270,5,Gattaca (1997),136,0.355384,18.33784
10,88,326,5,G.I. Jane (1997),175,0.351019,18.333475
78,94,258,5,Contact (1997),509,0.294732,18.277188
277,393,243,4,Jungle2Jungle (1997),132,0.355384,14.741348
258,393,259,4,George of the Jungle (1997),162,0.355384,14.741348
223,393,332,4,Kiss the Girls (1997),143,0.355384,14.741348
224,393,322,4,Murder at 1600 (1997),218,0.355384,14.741348
