In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import time

In [2]:
_dir = 'data/movielens/'
movie_file = _dir+'movies.csv'
rating_file = _dir+'ratings.csv'
tag_file = _dir+'tags.csv'

In [3]:
movies = pd.read_csv(movie_file,encoding="Latin1")
ratings = pd.read_csv(rating_file,encoding="Latin1")
tags = pd.read_csv(tag_file,encoding="Latin1")

## User Based CF

In [4]:
# mean rating of each uset
mean = ratings.groupby(by='userId', as_index=False)['rating'].mean()

# merge mean with ratings df
merged = pd.merge(ratings, mean, on='userId')

# normalized rating = rating(user u,item i) - mean_rating_of_user(u)
merged['rating_diff_avg'] = merged['rating_x'] - merged['rating_y']
print (merged.shape)
merged.head()

(264505, 6)


Unnamed: 0,userId,movieId,rating_x,timestamp,rating_y,rating_diff_avg
0,12882,1,4.0,1147195252,4.061321,-0.061321
1,12882,32,3.5,1147195307,4.061321,-0.561321
2,12882,47,5.0,1147195343,4.061321,0.938679
3,12882,50,5.0,1147185499,4.061321,0.938679
4,12882,110,4.5,1147195239,4.061321,0.438679


In [5]:
# matrix for row=users; cols=movies, values=norm_rating
final=pd.pivot_table(merged, values='rating_diff_avg', index='userId',columns='movieId')

# Replacing NaN by Movie Average
final = final.fillna(final.mean(axis=0))

final.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,106487,106489,106782,106920,109374,109487,111362,111759,112556,112852
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,-0.829457,-0.436518,-0.468109,-0.770223,-0.615331,0.320415,-1.329457,-0.690175,-0.829457,-0.094277,...,0.105075,0.006629,0.262314,0.23735,0.429868,0.306567,0.22511,0.234458,0.362468,0.349157
320,0.20022,-0.436518,-0.468109,-0.770223,-0.615331,0.320415,-0.203889,-0.690175,-0.150642,-0.094277,...,0.105075,0.006629,0.262314,0.23735,0.429868,0.306567,0.22511,0.234458,0.362468,0.349157
359,1.314526,-0.436518,-0.468109,-0.770223,-0.615331,1.314526,-0.203889,-0.690175,0.314526,0.314526,...,0.105075,0.006629,0.262314,0.23735,0.429868,0.306567,0.22511,0.234458,0.362468,0.349157
370,0.705596,0.205596,-0.468109,-0.770223,-0.615331,1.205596,-0.203889,-0.690175,-0.150642,-0.094277,...,-1.294404,-0.794404,0.705596,0.205596,0.429868,0.306567,-0.794404,0.705596,-0.294404,-0.794404
910,1.10192,0.10192,-0.39808,-0.770223,-0.39808,-0.39808,-0.203889,-0.690175,-0.150642,0.10192,...,0.105075,0.006629,-0.39808,0.23735,0.429868,0.306567,0.22511,0.60192,0.362468,0.349157


In [6]:
# user similarity
cosine = cosine_similarity(final)

# fill 0 to diagonal elements
np.fill_diagonal(cosine,0)

# convert np array to dataframe
user_user_sim = pd.DataFrame(cosine,index=final.index, columns=final.index)

print (user_user_sim.shape)
user_user_sim.head()

(862, 862)


userId,316,320,359,370,910,975,1015,1387,1447,1588,...,137118,137209,137227,137446,137559,137609,137805,138072,138176,138200
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,0.0,0.921169,0.665659,0.673486,0.694247,0.894969,0.80578,0.851492,0.945224,0.705491,...,0.827564,0.895641,0.87929,0.916856,0.912146,0.922262,0.587738,0.671783,0.949138,0.74022
320,0.921169,0.0,0.687225,0.691158,0.699527,0.91602,0.816931,0.874283,0.970234,0.724147,...,0.861798,0.909376,0.907009,0.938964,0.929049,0.943265,0.612746,0.695382,0.973853,0.768459
359,0.665659,0.687225,0.0,0.534369,0.523475,0.655225,0.602806,0.629143,0.705042,0.542504,...,0.62182,0.65432,0.655839,0.679696,0.6839,0.686193,0.418283,0.489595,0.70737,0.534065
370,0.673486,0.691158,0.534369,0.0,0.54756,0.67181,0.618456,0.628825,0.712683,0.548592,...,0.636688,0.673489,0.651209,0.688647,0.689265,0.692595,0.405881,0.497332,0.714011,0.546637
910,0.694247,0.699527,0.523475,0.54756,0.0,0.680701,0.621463,0.634921,0.723574,0.528281,...,0.638257,0.668887,0.677377,0.701964,0.701245,0.705041,0.408456,0.509008,0.725896,0.554105


In [7]:
def topn_user(user_user_sim_df, user_id, n):
    '''
    given a user, find top N similar users
    '''
    a = user_user_sim_df.loc[:, user_id]
    return a.nlargest(n)

def movies_both_users(df, movies, user_a, user_b):
    '''
    given two users (a & b), check items seen by both and compare their ratings
    '''
    df_a = df[df['userId']==user_a]
    df_b = df[df['userId']==user_b]
    df_m = pd.merge(df_a, df_b, how='inner', on=['movieId'])
    df_m = pd.merge(df_m, movies, on='movieId', how='inner')
    cols = ['title','rating_x_x', 'rating_x_y', 'genres']
    df_m = df_m[cols]
    return df_m

def user_item_score(user_user_sim_df, user_item_rating_df, mean_user_rating_df, user, item, n=30):
    '''
    given user a and item b, calculate score
    '''
    # find top N users to the {given user}
    topn_users_series = topn_user(user_user_sim_df, user, n)
    top_n_users_list = list(topn_users_series.index.values)
    
    # find rating of the {given item}, by top n users
    item_ratings = user_item_rating_df.loc[:,item]
    item_ratings_topn = item_ratings[item_ratings.index.isin(top_n_users_list)]
    
    # {given user} mean rating
    mean_user_rating = (mean_user_rating_df[mean_user_rating_df['userId']==user]['rating']).values[0]
    
    # similarity of {given user} with top n users, ratings given by those top n users to the {given item}
    fin = pd.concat([item_ratings_topn, topn_users_series], axis=1)
    fin.columns = ['norm_rating','user_similarity']
    fin['score'] = fin.apply(lambda x: x['norm_rating'] * x['user_similarity'], axis=1)

    score = mean_user_rating + (fin['score'].sum() / fin['user_similarity'].sum())
    return score

def items_by_closest_users(user_user_sim_df, merged_df, user, n=30):
    topn_users_series = topn_user(user_user_sim_df, user, n)
    closest_users = list(topn_users_series.index.values)

    items_from_closest_users = merged_df[merged_df['userId'].isin(closest_users)]
    items_from_closest_users = items_from_closest_users['movieId'].values.squeeze().tolist()
    return items_from_closest_users

def recommendations(user_user_sim_df, user_item_rating_df, mean_user_rating_df, merged_df, user, topN=50):
    item_score = list()
    items = items_by_closest_users(user_user_sim_df, merged_df, user)
    items = list(set(items))
    for item in items:
        score = user_item_score(user_user_sim_df, user_item_rating_df, mean_user_rating_df, user, item, n=30)
        item_score.append((item, score))
    item_score_sorted = sorted(item_score, key=lambda x:(-x[1],x[0]))[0:topN]
    
    movie_ids = [i[0] for i in item_score_sorted]
    result = movies[movies['movieId'].isin(movie_ids)]
    return result
    

In [8]:
######## Test #######
target_user = 370
topn = 5
topn_users = topn_user(user_user_sim, target_user, topn)
print ('Top 5 users to', target_user)
print ('\t', topn_users)

# user_b = list(topn_users.keys())[0]
# both_user_movies = movies_both_users(merged, movies, target_user, user_b)
# print ('\nMovies watched by both users: [', target_user,user_b ,']')
# both_user_movies.head(5)


# score between a user and an item
user = 320
item = 7371
score = user_item_score(user_user_sim, final, mean, user, item)
print ('\nscore between [user:' + str(user) + ', item:' + str(item) + '] = ' + str(score))


Top 5 users to 370
	 userId
86309     0.715393
44194     0.714642
138176    0.714011
24802     0.713716
129869    0.713009
Name: 370, dtype: float64

score between [user:320, item:7371] = 4.255766437391595


In [9]:
recs = recommendations(user_user_sim, final, mean, merged, user)
recs

Unnamed: 0,movieId,title,genres
42,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
147,246,Hoop Dreams (1994),Documentary
175,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller
176,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
189,318,"Shawshank Redemption, The (1994)",Crime|Drama
302,527,Schindler's List (1993),Drama|War
386,750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War
422,858,"Godfather, The (1972)",Crime|Drama
442,912,Casablanca (1942),Drama|Romance
443,913,"Maltese Falcon, The (1941)",Film-Noir|Mystery
