# Recommend movies by predicting movie ratings based on similar users and others users profile

In [None]:
# Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

In [2]:
# movielens data

In [3]:
users = pd.read_table('movie/data/ml-1m/users.dat',
                      sep='::', header=None, 
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'],engine='python')

In [4]:
movies = pd.read_table('movie/data/ml-1m/movies.dat',
                       sep='::', header=None, 
                       names=['movie_id', 'title', 'genres'], engine='python',encoding='latin-1')

In [5]:
ratings = pd.read_table('movie/data/ml-1m/ratings.dat',
                        sep='::', header=None, 
                        names=['user_id', 'movie_id', 'rating', 'timestamp'],engine='python')

In [6]:
movielens = pd.merge(pd.merge(ratings, users), movies)
movielens.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


In [7]:
movielens.info

<bound method DataFrame.info of          user_id  movie_id  rating  timestamp gender  age  occupation    zip  \
0              1      1193       5  978300760      F    1          10  48067   
1              2      1193       5  978298413      M   56          16  70072   
2             12      1193       4  978220179      M   25          12  32793   
3             15      1193       4  978199279      M   25           7  22903   
4             17      1193       5  978158471      M   50           1  95350   
...          ...       ...     ...        ...    ...  ...         ...    ...   
1000204     5949      2198       5  958846401      M   18          17  47901   
1000205     5675      2703       3  976029116      M   35          14  30030   
1000206     5780      2845       1  958153068      M   18          17  92886   
1000207     5851      3607       5  957756608      F   18          20  55410   
1000208     5938      2909       4  957273353      M   25           1  35401   

       

In [8]:
#dropping columns which are not useful in prediction

In [9]:
movielens.drop('timestamp',axis=1, inplace=True)

In [10]:
#get similar users matrix which contains distance between each user on the basis
#of genres, age and gender: 
#for ex those users who like movies with similar generes probably have small distance between them

In [11]:
similar_user_matrix = movielens.pivot_table(columns='genres',index='user_id', values='rating').fillna(0)

In [12]:
similar_user_matrix = pd.merge(users[['user_id','gender','age',]], similar_user_matrix, on='user_id', how='left')

In [13]:
similar_user_matrix = pd.get_dummies(similar_user_matrix)

In [14]:
similar_user_matrix

Unnamed: 0,user_id,age,Action,Action|Adventure,Action|Adventure|Animation,Action|Adventure|Animation|Children's|Fantasy,Action|Adventure|Animation|Horror|Sci-Fi,Action|Adventure|Children's,Action|Adventure|Children's|Comedy,Action|Adventure|Children's|Fantasy,...,Romance|Western,Sci-Fi,Sci-Fi|Thriller,Sci-Fi|Thriller|War,Sci-Fi|War,Thriller,War,Western,gender_F,gender_M
0,1,1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000,0.0,0.0,4.000000,0.0,0.0,1,0
1,2,56,3.0,4.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000,0.0,0.0,0.000000,0.0,0.0,0,1
2,3,25,0.0,4.250000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000,0.0,0.0,0.000000,0.0,5.0,0,1
3,4,45,5.0,5.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000,0.0,0.0,0.000000,0.0,0.0,0,1
4,5,25,3.0,3.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,4.000,0.0,0.0,2.500000,0.0,4.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,6036,25,3.4,3.166667,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,3.777778,2.375,3.0,4.0,3.304348,0.0,4.2,1,0
6036,6037,45,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.500000,0.000,4.0,5.0,3.266667,0.0,3.0,1,0
6037,6038,56,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000,0.0,0.0,0.000000,0.0,0.0,1,0
6038,6039,45,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000,0.0,4.0,0.000000,0.0,0.0,1,0


In [15]:
similar_user_matrix.set_index('user_id', inplace=True)

In [16]:
similar_user_matrix.head()

Unnamed: 0_level_0,age,Action,Action|Adventure,Action|Adventure|Animation,Action|Adventure|Animation|Children's|Fantasy,Action|Adventure|Animation|Horror|Sci-Fi,Action|Adventure|Children's,Action|Adventure|Children's|Comedy,Action|Adventure|Children's|Fantasy,Action|Adventure|Children's|Sci-Fi,...,Romance|Western,Sci-Fi,Sci-Fi|Thriller,Sci-Fi|Thriller|War,Sci-Fi|War,Thriller,War,Western,gender_F,gender_M
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1,0
2,56,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
3,25,0.0,4.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0,1
4,45,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
5,25,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,2.5,0.0,4.0,0,1


In [17]:
#get similarity matrix using cosine similarity

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
similarity = cosine_similarity(similar_user_matrix)

In [20]:
similarity = pd.DataFrame(similarity, index=similar_user_matrix.index, columns=similar_user_matrix.index)

In [21]:
similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.127319,0.190819,0.135550,0.322843,0.239188,0.105496,0.244932,0.312898,0.378664,...,0.320391,0.221737,0.137994,0.080981,0.248407,0.341123,0.213082,0.097625,0.250994,0.289806
2,0.127319,1.000000,0.802169,0.896430,0.725069,0.876171,0.898152,0.768174,0.798481,0.700134,...,0.702945,0.829956,0.916433,0.775023,0.736724,0.577865,0.832958,0.915634,0.831192,0.653595
3,0.190819,0.802169,1.000000,0.762329,0.633855,0.758870,0.752624,0.629869,0.654674,0.641730,...,0.583404,0.709042,0.769937,0.648449,0.639920,0.555547,0.700764,0.756159,0.724465,0.587208
4,0.135550,0.896430,0.762329,1.000000,0.705286,0.860743,0.882443,0.714843,0.740549,0.634790,...,0.682687,0.800926,0.923932,0.779434,0.688786,0.535636,0.805148,0.918900,0.802782,0.637153
5,0.322843,0.725069,0.633855,0.705286,1.000000,0.721517,0.712996,0.716295,0.789586,0.659929,...,0.646639,0.737978,0.703177,0.669771,0.668448,0.649562,0.744660,0.710719,0.685877,0.739314
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.341123,0.577865,0.555547,0.535636,0.649562,0.575131,0.531219,0.562684,0.618596,0.733496,...,0.513203,0.635060,0.572384,0.531594,0.678154,1.000000,0.683371,0.497879,0.651412,0.724123
6037,0.213082,0.832958,0.700764,0.805148,0.744660,0.797913,0.777713,0.720763,0.769516,0.697531,...,0.658090,0.824956,0.813773,0.745052,0.733817,0.683371,1.000000,0.818357,0.822506,0.804870
6038,0.097625,0.915634,0.756159,0.918900,0.710719,0.900629,0.868911,0.707007,0.754062,0.628685,...,0.679495,0.842685,0.896329,0.827764,0.684269,0.497879,0.818357,1.000000,0.849337,0.600590
6039,0.250994,0.831192,0.724465,0.802782,0.685877,0.854450,0.771450,0.662133,0.691453,0.740538,...,0.664297,0.838230,0.797759,0.738817,0.675661,0.651412,0.822506,0.849337,1.000000,0.694099


In [22]:
# similarity = pd.DataFrame(similarity, index=similar_user_matrix['user_id'], columns=similar_user_matrix['user_id'])

In [23]:
# similarity

# predict movie ratings per user 

In [24]:
class predict_rating(object):
    
    def __init__(self,sim_matrix, movielens):
        self.sim_matrix= sim_matrix
        self.movielens = movielens
        
    def get_ratings(self, user_id):
#       get top 5 similar users as per the similarity score 
        sim_user = self.sim_matrix.loc[user_id].sort_values(ascending=False)[0:5].index.values
#       get movies rated by similar users and get mean of ratings by similar user's per movie
#       this will be the prediction for rating by user_id
        recom_movies = self.movielens[self.movielens['user_id'].isin(sim_user)][['user_id','title','rating']].groupby('title')['rating'].mean()
        recom_movies = pd.DataFrame(recom_movies)
        recom_movies = recom_movies.reset_index()
#       get ratings of all the movies rated by similar user's by user_id 
        flt= ((movielens['user_id']==user_id) & (movielens['title'].isin(recom_movies['title'])))
        user_df = movielens.loc[flt]
        rating_matrix = pd.merge(user_df[['user_id','rating','title']] , recom_movies, on='title', how='left', suffixes=('_original', '_pred'))
        return(rating_matrix)


In [25]:
pr = predict_rating(similarity,movielens)

In [26]:
#predict ratings for user 1

In [27]:
pred_df = pr.get_ratings(1)

In [28]:
pred_df

Unnamed: 0,user_id,rating_original,title,rating_pred
0,1,5,One Flew Over the Cuckoo's Nest (1975),5.0
1,1,3,James and the Giant Peach (1996),3.0
2,1,3,My Fair Lady (1964),3.666667
3,1,4,Erin Brockovich (2000),4.333333
4,1,5,"Bug's Life, A (1998)",4.5
5,1,3,"Princess Bride, The (1987)",4.5
6,1,5,Ben-Hur (1959),5.0
7,1,5,"Christmas Story, A (1983)",5.0
8,1,4,Snow White and the Seven Dwarfs (1937),3.666667
9,1,4,"Wizard of Oz, The (1939)",4.0


In [29]:
# pred_df

In [30]:
y_true = np.array(pred_df['rating_original'].values)

In [31]:
y_pred = np.array(pred_df['rating_pred'].values)

In [32]:
np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

0.44332931383244284

In [33]:
# np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [34]:
#predict ratings by user 2

In [35]:
pred_df = pr.get_ratings(2)

In [36]:
pred_df

Unnamed: 0,user_id,rating_original,title,rating_pred
0,2,5,One Flew Over the Cuckoo's Nest (1975),5.0
1,2,4,Awakenings (1990),4.0
2,2,3,Pleasantville (1998),3.0
3,2,5,Driving Miss Daisy (1989),5.0
4,2,4,To Kill a Mockingbird (1962),4.0
...,...,...,...,...
124,2,3,Manhattan (1979),3.0
125,2,5,Forrest Gump (1994),4.6
126,2,2,Miller's Crossing (1990),2.0
127,2,1,Nurse Betty (2000),1.0


In [37]:
# pred_df

In [38]:
y_true = np.array(pred_df['rating_original'].values),
y_pred = np.array(pred_df['rating_pred'].values)

In [39]:
np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

0.4525884973967885

In [40]:
# predict movie rating user wise 

In [41]:
def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
#     print(s2)
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))

In [42]:
#predict movie rating per user_id and movie_id

In [43]:
# by computing the correlation between other users 
# and get the weighted average of other's rating's as per the correlation score(using pearson correlationg)
# as predicted rating 

In [44]:
class predict_rating_pearson:


    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        all_user_profiles = movielens.pivot_table('rating', index='movie_id', columns='user_id')
        user_condition = movielens.user_id != user_id
        movie_condition = movielens.movie_id == movie_id
        ratings_by_others = movielens.loc[user_condition & movie_condition]
        
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = all_user_profiles[their_ids]
        user_profile = all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: pearson(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)

In [45]:
cp = predict_rating_pearson()

In [46]:
movie_id= 1193
user_id =1

In [47]:
cp.estimate(user_id, movie_id)

4.608110178149255

In [48]:
movielens[(movielens['movie_id']==movie_id) & (movielens['user_id']==user_id)]['rating'][0]

5

In [49]:
movie_id= 20
user_id =6010

In [50]:
cp.estimate(user_id, movie_id)

2.4985828997845596

In [51]:
movielens[(movielens['movie_id']==movie_id) & (movielens['user_id']==user_id)]['rating']

953882    3
Name: rating, dtype: int64

In [52]:
# process followed in this notebook:
# in this notebook i have tried to recommend movie to a particular user based on below 2 methods:
# 1. recommend movies on the basis of movies liked by similar users’s   : 
#     in this process we find similar users on the basis of user features like age, gender and genres of movies they 
#     like   after getting similar users we recommend movies liked by top 5 users method used to get similar users: 
#     using sklearn.cosine-similarity function cosine distance between users is computed  which is based on the fact 
#     that so smaller the distance similar is the user , so cosine distance is sorted in descending order in order 
#     to get the top similar users 
    
# 2. recommend movie on the basis of predicting rating a user can give to a particular movie based on other user 
#    profiles
#     in this process pearson correlation is computed to find the correlation among users 
#     based on ratings given to each movie by each user for ex : one series contains ratings given to movies by a  
#     particular user say ‘user 1’ and other series contains ratings given to movies by other users .
#     now pearson correlation is computed between first series and other series . now we have correlation coefficient 
#     between the user 1 and other users . now next step is calculating mean of other user’s ratings of a particular
#     movie with weightage given as correlation coefficient
