In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import math
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import imshow
from matplotlib.pyplot import figure
from sklearn.decomposition import TruncatedSVD
from scipy.linalg import svd
from scipy.linalg import sqrtm
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
import itertools

In [2]:
rating = pd.read_csv('u.data', sep = '\t', header = None, names = ['user_id', 'movie_id', 'rating', 'timestamp'], encoding = 'utf-8')
user = pd.read_csv('u.user', sep = '|', header = None, names = ['user_id', 'age', 'gender', 'occupation', 'zip_code'], encoding = 'utf-8')
movie = pd.read_csv('u.item', sep = '|', encoding="iso-8859-1", header = None, names = ["movie_id", "movie_title", "release_date", "video_releasedate", "IMDbURL", "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"])

In [3]:
rating.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
movie.head()

Unnamed: 0,movie_id,movie_title,release_date,video_releasedate,IMDbURL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
user.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [6]:
data = rating.merge(movie, on = 'movie_id')
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,release_date,video_releasedate,IMDbURL,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
1,63,242,3,875747190,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
2,226,242,5,883888671,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
3,154,242,3,879138235,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
4,306,242,5,876503793,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
num_ratings = data.groupby('movie_id').agg({'rating':'count'}).reset_index()
num_ratings = num_ratings.rename(columns = {'rating':'num_ratings'})
num_ratings.head()

Unnamed: 0,movie_id,num_ratings
0,1,452
1,2,131
2,3,90
3,4,209
4,5,86


In [8]:
data = data.merge(num_ratings, on = 'movie_id')
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,release_date,video_releasedate,IMDbURL,unknown,Action,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,num_ratings
0,196,242,3,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,117
1,63,242,3,875747190,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,117
2,226,242,5,883888671,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,117
3,154,242,3,879138235,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,117
4,306,242,5,876503793,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,117


# Memory Based Recommendation System

## User Based Collaborative Filtering

### Using most similar user's top rated movies

In [9]:
user_rating_matrix = rating.pivot(index = 'user_id', columns = 'movie_id', values = 'rating').fillna(0)
user_rating_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
scaler = StandardScaler(with_mean = False)
scaler.fit(user_rating_matrix)
scaled_matrix = scaler.transform(user_rating_matrix)

In [11]:
normalize = Normalizer(copy=False) 
normalized_matrix = normalize.fit_transform(scaled_matrix)

In [12]:
user_similarity_matrix = cosine_similarity(normalized_matrix)
user_similarity = pd.DataFrame(user_similarity_matrix)
user_similarity.index = user_rating_matrix.index
user_similarity.columns = user_rating_matrix.index
user_similarity.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.06656,0.016278,0.017677,0.17277,0.197338,0.149129,0.146407,0.076003,0.150385,...,0.147266,0.049227,0.11427,0.124748,0.091067,0.04565,0.134628,0.061272,0.071086,0.180272
2,0.06656,1.0,0.054343,0.048188,0.014567,0.145459,0.027335,0.042207,0.05803,0.047034,...,0.066043,0.143337,0.0952,0.254762,0.121813,0.080277,0.10173,0.076172,0.080731,0.036744
3,0.016278,0.054343,1.0,0.182993,0.00335,0.031566,0.030019,0.102404,0.023455,0.031459,...,0.010128,0.011961,0.043557,0.030612,0.059625,0.007958,0.114572,0.035113,0.059952,0.0056
4,0.017677,0.048188,0.182993,1.0,0.004261,0.020976,0.021003,0.081111,0.021571,0.011863,...,0.014219,0.009976,0.02808,0.0615,0.05162,0.004874,0.092931,0.057219,0.10452,0.018402
5,0.17277,0.014567,0.00335,0.004261,1.0,0.077708,0.153711,0.100931,0.019659,0.064338,...,0.130711,0.020854,0.015995,0.026771,0.057025,0.054179,0.092036,0.057371,0.048328,0.143546


In [13]:
def user_recommendations_same_movie(user_id, num_ratings = 10, num_recom = 5):
    similar_user = user_similarity.loc[user_id, :].sort_values(ascending = False).index[1]
    similar_user_movies = data[(data['user_id'] == similar_user) & (data['num_ratings'] > 100)].sort_values(by = ['rating', 'num_ratings'], ascending = False).drop_duplicates()[['movie_id', 'movie_title', 'rating']]
    main_user_movies = data[(data['user_id'] == user_id) & (data['num_ratings'] > 100)].sort_values(by = ['rating', 'num_ratings'], ascending = False).drop_duplicates()[['movie_id', 'movie_title']]
    final_list = similar_user_movies.merge(main_user_movies, on = ['movie_id', 'movie_title']).set_index('movie_id').sort_values(by = 'rating', ascending = False)
    return final_list['movie_title'].head(num_recom)

In [14]:
user_recommendations_same_movie(4)

movie_id
300        Air Force One (1997)
328    Conspiracy Theory (1997)
294            Liar Liar (1997)
288               Scream (1996)
Name: movie_title, dtype: object

In [15]:
user_recommendations_same_movie(942)

movie_id
215                    Field of Dreams (1989)
357    One Flew Over the Cuckoo's Nest (1975)
520                  Great Escape, The (1963)
705                Singin' in the Rain (1952)
50                           Star Wars (1977)
Name: movie_title, dtype: object

In [16]:
def user_recommendations_not_rated_movie(user_id, num_ratings = 10, num_recom = 5):
    similar_user = user_similarity.loc[user_id, :].sort_values(ascending = False).index[1]
    similar_user_movies = data[(data['user_id'] == similar_user) & (data['num_ratings'] > 100)].sort_values(by = ['rating', 'num_ratings'], ascending = False).drop_duplicates()[['movie_id', 'movie_title', 'rating']]
    main_user_movies = data[(data['user_id'] == user_id) & (data['num_ratings'] > 100)].sort_values(by = ['rating', 'num_ratings'], ascending = False).drop_duplicates()[['movie_id', 'movie_title']]
    movie_list = similar_user_movies.merge(main_user_movies, on = ['movie_id'], how = 'left', suffixes = ('_similar_user', '_main_user'))
    final_list = movie_list[movie_list['movie_title_main_user'].isna()].set_index('movie_id').sort_values(by = 'rating', ascending = False)
    return final_list['movie_title_similar_user'].head(num_recom)

In [17]:
user_recommendations_not_rated_movie(4)

movie_id
748           Saint, The (1997)
333            Game, The (1997)
322       Murder at 1600 (1997)
272    Good Will Hunting (1997)
313              Titanic (1997)
Name: movie_title_similar_user, dtype: object

In [18]:
user_recommendations_not_rated_movie(942)

movie_id
64         Shawshank Redemption, The (1994)
87       Searching for Bobby Fischer (1993)
190                          Henry V (1989)
651                            Glory (1989)
199    Bridge on the River Kwai, The (1957)
Name: movie_title_similar_user, dtype: object

### Using the predicted rating of unrated movies

In [19]:
rating.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [20]:
avg_rating = rating.groupby('user_id').agg({'rating':'mean'}).reset_index().rename(columns = {'rating':'avg_rating'})
avg_rating.head()

Unnamed: 0,user_id,avg_rating
0,1,3.610294
1,2,3.709677
2,3,2.796296
3,4,4.333333
4,5,2.874286


In [21]:
rating_updated = rating.merge(avg_rating, on = 'user_id')
rating_updated['new_rating'] = rating_updated['rating'] - rating_updated['avg_rating']
rating_updated.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,avg_rating,new_rating
0,196,242,3,881250949,3.615385,-0.615385
1,196,393,4,881251863,3.615385,0.384615
2,196,381,4,881251728,3.615385,0.384615
3,196,251,3,881251274,3.615385,-0.615385
4,196,655,5,881251793,3.615385,1.384615


In [22]:
user_rating_matrix_1 = rating_updated.pivot(index = 'user_id', columns = 'movie_id', values = 'new_rating').fillna(0)
user_rating_matrix_1.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.389706,-0.610294,0.389706,-0.610294,-0.610294,1.389706,0.389706,-2.610294,1.389706,-0.610294,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.290323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.709677,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.125714,0.125714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
user_similarity_array_1 = cosine_similarity(user_rating_matrix_1)
np.fill_diagonal(user_similarity_array_1, 0 )
user_similarity_dataframe = pd.DataFrame(user_similarity_array_1)
user_similarity_dataframe.index = user_rating_matrix_1.index
user_similarity_dataframe.columns = user_rating_matrix_1.index
user_similarity_dataframe.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.043411,0.011051,0.059303,0.134514,0.103373,0.110556,0.180891,0.012253,-0.000621,...,0.025835,-0.047952,0.087224,0.007718,0.074378,0.078714,0.067433,0.02879,-0.03127,0.032123
2,0.043411,0.0,0.013658,-0.017016,0.03577,0.094503,0.089408,0.05564,0.027294,0.097846,...,0.012853,-0.028798,0.056659,0.197835,0.090009,0.032505,0.015053,-0.017344,0.012068,0.039173
3,0.011051,0.013658,0.0,-0.059638,0.016037,-0.017158,0.016141,0.041177,-0.010093,0.023856,...,0.001615,0.000658,-0.006888,0.036157,-0.018513,-0.00624,-0.023907,0.034414,-0.009187,0.001489
4,0.059303,-0.017016,-0.059638,0.0,0.007373,-0.053929,-0.025604,0.136046,0.016082,-0.013588,...,0.011895,0.002174,-0.028,-0.025021,0.022882,-0.00596,0.279818,0.258594,0.064504,-0.019222
5,0.134514,0.03577,0.016037,0.007373,0.0,0.038484,0.067874,0.140106,0.010195,0.014335,...,0.070014,-0.070821,0.024278,0.038672,0.093567,0.051782,0.02954,0.036234,0.043318,0.099324


In [24]:
# Finding top k neighbors
def top_k_neighbors(df, k):
    return df.apply(lambda x: pd.Series(x.sort_values(ascending=False).iloc[:k].index, index=['top{}'.format(i) for i in range(1, k+1)]), axis=1)

top_neighbors = top_k_neighbors(user_similarity_dataframe, 30)

In [25]:
# user-item score calculation
def user_item_score(user_id, movie_id):
    similar_users = top_neighbors[top_neighbors.index == user_id].values
    similar_users_list = similar_users.squeeze().tolist()
    movie_user_list = user_rating_matrix_1.loc[:, movie_id]
    filtered_users = movie_user_list[movie_user_list.index.isin(similar_users_list)]
    filtered_users = filtered_users[filtered_users.notnull()]
    mean_value_user = avg_rating[avg_rating['user_id'] == user_id].values[0][1]
    filtered_similar_users = filtered_users.index.tolist()
    similar_user_rating = user_similarity_dataframe.loc[user_id, filtered_similar_users]
    fin = pd.concat([filtered_users, similar_user_rating], axis=1)
    fin.columns = ['score','weights']
    fin['net_score'] = fin['score']*fin['weights']
    rating_sum = fin['net_score'].sum()
    weights_sum = fin['weights'].sum()
    final_score = mean_value_user + (rating_sum/weights_sum)
    return final_score

In [26]:
user_item_score(1,7)

4.010185983524527

In [27]:
def top_k_recommendations(user_id, num_reco = 5):
    user_seen_movies = user_rating_matrix_1.loc[user_id, :]
    user_seen_movies = user_seen_movies[user_seen_movies != 0].index.tolist()
    similar_users = top_neighbors[top_neighbors.index == user_id].values
    similar_users_list = similar_users.squeeze().tolist()
    movie_list = []
    for i in similar_users_list:
        similar_user_seen_movies = user_rating_matrix_1.loc[i, :]
        similar_user_seen_movies = similar_user_seen_movies[similar_user_seen_movies != 0].index.tolist()
        movie_list.append(similar_user_seen_movies)
        final_list = set(list(itertools.chain.from_iterable(movie_list)))
    unseen_movies = set(final_list) - set(user_seen_movies)
    rating = []
    for i in unseen_movies:
        rating.append(user_item_score(user_id, i))
    unseen_movie_rating = pd.DataFrame({'movie_id':list(unseen_movies),'rating':rating})
    top_k_recommendations = unseen_movie_rating.merge(movie, on = 'movie_id').sort_values(by = 'rating', ascending = False).set_index('movie_id')
    return top_k_recommendations['movie_title'].head(num_reco)

In [28]:
top_k_recommendations(50)

movie_id
50             Star Wars (1977)
56          Pulp Fiction (1994)
302    L.A. Confidential (1997)
315            Apt Pupil (1998)
272    Good Will Hunting (1997)
Name: movie_title, dtype: object

In [29]:
user = int(input("Enter the user id to whom you want to recommend : "))
predicted_movies = top_k_recommendations(user)
print(" ")
print("The Recommendations for User Id :", user)
print("   ")
for i in predicted_movies:
    print(i)

Enter the user id to whom you want to recommend : 50
 
The Recommendations for User Id : 50
   
Star Wars (1977)
Pulp Fiction (1994)
L.A. Confidential (1997)
Apt Pupil (1998)
Good Will Hunting (1997)


## Item Based Collaborative Filtering

In [30]:
rating.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [31]:
avg_rating_movie = rating.groupby('movie_id').agg({'rating':'mean'}).reset_index().rename(columns = {'rating':'avg_rating'})
avg_rating_movie.head()

Unnamed: 0,movie_id,avg_rating
0,1,3.878319
1,2,3.206107
2,3,3.033333
3,4,3.550239
4,5,3.302326


In [32]:
movie_rating_updated = rating.merge(avg_rating_movie, on = 'movie_id')
movie_rating_updated['new_rating'] = movie_rating_updated['rating'] - movie_rating_updated['avg_rating']
movie_rating_updated.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,avg_rating,new_rating
0,196,242,3,881250949,3.991453,-0.991453
1,63,242,3,875747190,3.991453,-0.991453
2,226,242,5,883888671,3.991453,1.008547
3,154,242,3,879138235,3.991453,-0.991453
4,306,242,5,876503793,3.991453,1.008547


In [33]:
movie_rating_matrix_1 = rating_updated.pivot(index = 'movie_id', columns = 'user_id', values = 'new_rating').fillna(0)
movie_rating_matrix_1.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.389706,0.290323,0.0,0.0,1.125714,0.364929,0.0,0.0,0.0,-0.206522,...,-1.701149,-0.923077,0.253521,0.0,0.731481,0.0,0.0,0.954545,0.0,0.0
2,-0.610294,0.0,0.0,0.0,0.125714,0.0,0.0,0.0,0.0,0.0,...,0.298851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.589286
3,0.389706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.253521,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.610294,0.0,0.0,0.0,0.0,0.0,1.034739,0.0,0.0,-0.206522,...,1.298851,0.0,0.0,0.0,0.0,0.0,-1.457944,0.0,0.0,0.0
5,-0.610294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
movie_similarity_array_1 = cosine_similarity(movie_rating_matrix_1)
np.fill_diagonal(movie_similarity_array_1, 0 )
movie_similarity_dataframe = pd.DataFrame(movie_similarity_array_1)
movie_similarity_dataframe.index = movie_rating_matrix_1.index
movie_similarity_dataframe.columns = movie_rating_matrix_1.index
movie_similarity_dataframe.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,-0.058367,-0.06556,-0.039649,0.011691,0.027294,0.074627,0.12737,0.038487,0.004889,...,0.060065,0.0,0.0,0.0,0.0111,0.0,0.0,0.0,0.053377,-0.033203
2,-0.058367,0.0,0.033551,0.031268,0.013884,-0.010781,-0.0545,-0.008301,-0.105092,-0.020407,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001926,0.036452
3,-0.06556,0.033551,0.0,-0.117538,0.013721,0.053766,-0.088374,-0.150728,-0.067389,-0.04981,...,0.0,0.0,0.0,0.0,0.20092,0.0,0.0,0.0,0.0,0.033238
4,-0.039649,0.031268,-0.117538,0.0,-0.150014,-0.023626,0.004446,0.110419,0.027818,0.010657,...,0.0,0.0,-0.114405,-0.114405,0.089998,0.0,0.0,0.0,0.001436,-0.047084
5,0.011691,0.013884,0.013721,-0.150014,0.0,-0.041925,-0.045006,0.011713,-0.0624,-0.032335,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04563


In [35]:
# Finding top k movie neighbors
def top_k_neighbors_movie(df, k):
    return df.apply(lambda x: pd.Series(x.sort_values(ascending=False).iloc[:k].index, index=['top{}'.format(i) for i in range(1, k+1)]), axis=1)

top_movie_neighbors = top_k_neighbors_movie(movie_similarity_dataframe, 100)

In [36]:
top_movie_neighbors.head()

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,...,top91,top92,top93,top94,top95,top96,top97,top98,top99,top100
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,174,588,50,28,95,210,172,64,22,181,...,161,223,418,282,936,603,202,705,921,1482
2,1419,233,54,405,1555,562,566,1621,385,926,...,1181,73,942,1422,62,843,148,574,1027,1595
3,409,1230,1052,1231,1677,783,1552,1659,1660,1476,...,1219,722,560,725,1094,1055,577,1016,875,390
4,1554,414,1626,1550,152,85,144,47,1203,1495,...,867,50,1677,204,157,868,420,56,651,42
5,1554,857,830,852,581,911,907,912,397,839,...,1041,314,802,1034,1526,819,417,976,403,629


In [37]:
# user-item score calculation
def user_item_score_1(user_id, movie_id):
    similar_movies = top_movie_neighbors[top_movie_neighbors.index == movie_id].values
    similar_movie_list = similar_movies.squeeze().tolist()
    movie_user_list_1 = movie_rating_matrix_1.loc[ : , user_id]
    filtered_movies_1 = movie_user_list_1[movie_user_list_1.index.isin(similar_movie_list)]
    filtered_movies_1 = filtered_movies_1[filtered_movies_1.notnull()]
    mean_value_movie = avg_rating_movie[avg_rating_movie['movie_id'] == movie_id].values[0][1]
    filtered_similar_movies = filtered_movies_1.index.tolist()
    similar_movie_rating = movie_similarity_dataframe.loc[movie_id, filtered_similar_movies]
    final_movie = pd.concat([filtered_movies_1, similar_movie_rating], axis=1)
    final_movie.columns = ['score','weights']
    final_movie['net_score'] = final_movie['score']*final_movie['weights']
    movie_rating_sum = final_movie['net_score'].sum()
    movie_weights_sum = final_movie['weights'].sum()
    final_score_movie = mean_value_movie + (movie_rating_sum/movie_weights_sum)
    return final_score_movie

In [38]:
user_item_score_1(1, 60)

4.315064198683939

In [39]:
def top_k_item_based(user_id, num_reco = 5):
    user_unseen_movies = movie_rating_matrix_1.loc[ : , user_id]
    user_unseen_movies = user_unseen_movies[user_unseen_movies == 0].index.tolist()
    rating = []
    for i in user_unseen_movies:
        rating.append(user_item_score_1(user_id, i))
    unseen_movie_rating_1 = pd.DataFrame({'movie_id':user_unseen_movies,'rating':rating})
    top_k_recommendations_1 = unseen_movie_rating_1.merge(movie, on = 'movie_id').sort_values(by = 'rating', ascending = False).set_index('movie_id')
    return top_k_recommendations_1['movie_title'].head(num_reco)

In [40]:
top_k_item_based(40)

movie_id
1653    Entertaining Angels: The Dorothy Day Story (1996)
1536                                 Aiqing wansui (1994)
1467                 Saint of Fort Washington, The (1993)
814                         Great Day in Harlem, A (1994)
1189                                   Prefontaine (1997)
Name: movie_title, dtype: object

## Model Based Recommendation System 

### Using Matrix Factorization 

In [41]:
rating.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [42]:
avg_rating_user = rating.groupby('user_id').agg({'rating':'mean'}).reset_index().rename(columns = {'rating':'avg_rating'})
avg_rating_user.head()

Unnamed: 0,user_id,avg_rating
0,1,3.610294
1,2,3.709677
2,3,2.796296
3,4,4.333333
4,5,2.874286


In [43]:
rating_updated_user = rating.merge(avg_rating_user, on = 'user_id')
rating_updated_user['new_rating'] = rating_updated_user['rating'] - rating_updated_user['avg_rating']
rating_updated_user.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,avg_rating,new_rating
0,196,242,3,881250949,3.615385,-0.615385
1,196,393,4,881251863,3.615385,0.384615
2,196,381,4,881251728,3.615385,0.384615
3,196,251,3,881251274,3.615385,-0.615385
4,196,655,5,881251793,3.615385,1.384615


In [44]:
user_rating_matrix_2 = rating_updated_user.pivot(index = 'user_id', columns = 'movie_id', values = 'new_rating').fillna(0)
user_rating_matrix_2.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.389706,-0.610294,0.389706,-0.610294,-0.610294,1.389706,0.389706,-2.610294,1.389706,-0.610294,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.290323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.709677,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.125714,0.125714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
def new_rating_matrix_based(df):
    U, S, Mt = np.linalg.svd(df, full_matrices=False)
    variance_explained = ((S/S.sum())*100).cumsum()
    k = (variance_explained < 90).sum()
    S = np.diag(S)
    S = sqrtm(S)
    U = U[:, :k]
    S = S[:k, :k]
    Mt = Mt[:k, :]
    a = np.dot(U, S)
    b = np.dot(S, Mt)
    user_rating_matrix_pred = np.dot(a, b)
    user_rating_dataframe_pred = pd.DataFrame(user_rating_matrix_pred)
    user_rating_dataframe_pred.index = df.index
    user_rating_dataframe_pred.columns = df.columns
    new_ratings = user_rating_dataframe_pred.apply(lambda x : x + avg_rating_user.set_index('user_id')['avg_rating'], axis = 0)
    return new_ratings

In [46]:
new_ratings = new_rating_matrix_based(user_rating_matrix_2)

In [47]:
def top_k_matrix_based(user_id, num_reco = 5):
    user_unseen_movies = movie_rating_matrix_1.loc[ : , user_id]
    user_unseen_movies = user_unseen_movies[user_unseen_movies == 0].index.tolist()
    rating = []
    for i in user_unseen_movies:
        rating.append(new_ratings.loc[user_id, i])
    unseen_movie_rating_1 = pd.DataFrame({'movie_id':user_unseen_movies,'rating':rating})
    top_k_recommendations_1 = unseen_movie_rating_1.merge(movie, on = 'movie_id').sort_values(by = 'rating', ascending = False).set_index('movie_id')
    return top_k_recommendations_1['movie_title'].head(num_reco)

In [48]:
top_k_matrix_based(23)

movie_id
524              Great Dictator, The (1940)
949    How to Make an American Quilt (1995)
622            Swiss Family Robinson (1960)
975                             Fear (1996)
490                 To Catch a Thief (1955)
Name: movie_title, dtype: object

In [49]:
top_k_matrix_based(69)

movie_id
860                Believers, The (1987)
287                 Marvin's Room (1996)
1132            Feeling Minnesota (1996)
864           My Fellow Americans (1996)
931     Island of Dr. Moreau, The (1996)
Name: movie_title, dtype: object