In this file, we implement three basic recommendation algorithms on 1M movielens dataset:

    1- content-based algorithm that attempts to figure out what a user's favourite aspects of an item is, and then recommends items that present those aspects
    2- user-based algorithm where the recommendation is computed as the weighted sum of the most similar users' ratings to the target items.
    3- users clustering based algorithm when the recommendation is computed based on users in the same cluster
    4- items clustering based algorithm when the recommendation is computed based on 'the user's interest to similar items
    

In [1]:
import pandas as pd
import numpy as np
import sklearn
import random
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
ratings_df = pd.read_excel('C:/Users/sabrine abdrabbah/Jupyter notebook/ratings1M.xlsx')
#Storing the movie information into a pandas dataframe
movies_df = pd.read_excel('C:/Users/sabrine abdrabbah/Jupyter notebook/movies1M.xlsx')


In [2]:
print("total number of ratings",len(ratings_df), "total number of users", len(ratings_df.userId.unique().tolist()), 
      "total number of movies", len(ratings_df.movieId.unique().tolist()) )

total number of ratings 1000209 total number of users 6040 total number of movies 3706


In [3]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
# Create a rating matrix where the columns are movies, and the rows are users. 
movie_matrix = ratings_df.pivot_table(index='userId', columns='movieId', values='rating')
movie_matrix.head()
#the matrix is so sparse (most users have only rated a small selection of the entire set of movies)

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,


In [6]:
#data preprocessing to extract years from titles and differets genres
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
#Removing the years from the 'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')
#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_df['title'] = movies_df.title.str.strip()
##Removing the parentheses
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)
#create list of genres
movies_df['genres'] = movies_df.genres.str.split('|')
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Animation, Children's, Comedy]",1995
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [7]:
#Copying the movie dataframe into a new one since we won't need to use the genre information in our first case.
moviesWithGenres_df = movies_df.copy()

#For every row in the dataframe, iterate through the list of genres and place a 1 into the corresponding column
for index, row in movies_df.iterrows():
    for genre in row['genres']:
        moviesWithGenres_df.at[index, genre] = 1
#Filling in the NaN values with 0 to show that a movie doesn't have that column's genre
moviesWithGenres_df = moviesWithGenres_df.fillna(0)
moviesWithGenres_df.head()

Unnamed: 0,movieId,title,genres,year,Animation,Children's,Comedy,Adventure,Fantasy,Romance,...,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western,Miami Beach (1988)
0,1,Toy Story,"[Animation, Children's, Comedy]",1995,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama]",1995,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
moviesWithYears_df = movies_df.copy()
#Dropping the genres column
moviesWithYears_df = moviesWithYears_df.drop('genres', 1)
moviesWithYears_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [9]:
def split_ratings(user_id):
    active_user_ratings=ratings_df[ratings_df.userId==user_id]
    return train_test_split(active_user_ratings, test_size=0.3, shuffle=False)

In [10]:
# Define Function to get the most rated movies for the group
def get_most_rated_movies(movie_ratings, max_number_of_movies):
    user_movie_ratings=movie_ratings.reset_index()
    # 1- Count: consiste à creer une nouvelle ligne contenant le nombre de ratings given to each movie
    user_movie_ratings = user_movie_ratings.append(user_movie_ratings.count(), ignore_index=True)
    # 2- sort selon ligne count
    user_movie_ratings_sorted = user_movie_ratings.sort_values(len(user_movie_ratings)-1, axis=1, ascending=False)
    # 3- Drop ligne count
    user_movie_ratings_sorted = user_movie_ratings_sorted.drop(user_movie_ratings_sorted.tail(1).index)
    # 4- slice
    most_rated_movies = user_movie_ratings_sorted.iloc[:, :max_number_of_movies]
    return most_rated_movies

In [45]:
def generate_content_based_recom(train, test):
    y_pred=[]
    y_test=[]
    train=train[train.rating>=5] #learn the user profile from the most liked items (already seen)
    input_user=train.sort_values(by='movieId', ascending=True).reset_index(drop=True)
    userMovies = moviesWithGenres_df[moviesWithGenres_df['movieId'].isin(input_user['movieId'].tolist())]
    userMovies = userMovies.reset_index(drop=True)
    userMovies = userMovies.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
    userProfile = userMovies.transpose().dot(input_user['rating'])
    userProfile = (userProfile/(userProfile.sum()))*5 #normalisation
    genreTable = moviesWithGenres_df.set_index(moviesWithGenres_df['movieId'])
    genreTable= genreTable[genreTable.index.isin(test['movieId'].tolist())]
    #And drop the unnecessary information
    genreTable = genreTable.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)
    recommendation_list=genreTable.index.tolist()
    
    y_pred= np.append(y_pred,(genreTable*userProfile).sum(axis=1))
    y_test=np.append(y_test,test.sort_values(by='movieId', ascending=True).set_index('movieId').rating)
    return recommendation_list, y_pred, y_test

In [12]:
from math import sqrt
from scipy.stats import pearsonr
#predictions is computed for items that have been already liked by similar users
def generate_user_based_recom(train, test, active_user):
    y_predCF=[]
    y_testCF=[]
    X=5 #total number of neighbors
    #Select the movies that the active user has watched
    userInput = train.drop('userId', 1).drop('timestamp', 1)
    inputId = moviesWithYears_df[moviesWithYears_df['movieId'].isin(userInput['movieId'].tolist())]
    inputMovies = pd.merge(inputId, userInput)
    inputMovies = inputMovies.drop('year', 1)
    #Filtering out users that have watched movies that the input has watched and storing it
    userSubset = ratings_df[(ratings_df['userId']!=active_user) &
                      (ratings_df['movieId'].isin(inputMovies['movieId'].tolist()))]
    #Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
    userSubsetGroup = userSubset.groupby(['userId'])
    #Sorting it so users with movie most in common with the input will have priority
    userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)
    #calculate the Pearson Correlation between input user and subset group
    pearsonCorrelationDict = {}
        #For every user group in our subset
    for name, group in userSubsetGroup:
        #Let's start by sorting the input and current user group so the values aren't mixed up later on
        group = group.sort_values(by='movieId')
        inputMovies = inputMovies.sort_values(by='movieId')
        #Get the N for the formula (number of co-rated items)
        nRatings = len(group)
        
        #Get the review scores for the movies that they both have in common
        temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
        #And then store them in a temporary buffer variable in a list format to facilitate future calculations
        tempRatingList = temp_df['rating'].tolist()
        #Let's also put the current user group reviews in a list format
        tempGroupList = group['rating'].tolist()
        #Now let's calculate the pearson correlation between two users, so called, x and y
        Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
        Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
        Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
        # calculate Pearson's correlation
           
        #If the denominator is different than zero, then divide, else, 0 correlation.
        if Sxx != 0 and Syy != 0:
            pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
        else:
            pearsonCorrelationDict[name] = 0
    #transform the dictionary into dataframe
    pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
    pearsonDF.columns = ['similarityIndex']
    pearsonDF['userId'] = pearsonDF.index
    pearsonDF.index = range(len(pearsonDF))
    #Now let's get the top X users that are most similar to the input (neighbors).
    topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:X]
    
    #to get all the items rated by similar users to the input (neighbors)
    topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
    topUsersRating=topUsersRating[~topUsersRating.movieId.isin(inputMovies['movieId'].tolist())]
    
    #Multiplies the similarity by the user's ratings
    topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
    
    #Applies a sum to the topUsers after grouping it up by userId
    tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
    tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
    
    #Creates an empty dataframe
    recommendation_df = pd.DataFrame()
    #Now we take the weighted average
    recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
    #recommendation_df['movieId'] = tempTopUsersRating.index
    recommendation_df=recommendation_df.reset_index()
    #recommendation_df.head()
    #Now let's sort it and see the top 20 movies that the algorithm recommended
    recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
    
    test_prediction=recommendation_df.merge(test[['movieId','rating']], left_on='movieId', right_on='movieId', how='inner')
    #test_prediction=test_prediction[test_prediction['weighted average recommendation score'].notnull()]
    y_predCF= np.append(y_predCF,test_prediction['weighted average recommendation score'])
    y_testCF=np.append(y_testCF,test_prediction['rating'])
    
    
    recommendation_list=test.movieId.tolist()
    return recommendation_list, y_predCF, y_testCF
    
    
    

## 1- Content-based collaborative filtering

In [13]:
user_id=5114
train, test= split_ratings(user_id)
print("total train of the active user", len(train))
print("total test of the active user", len(test))

total train of the active user 104
total test of the active user 45


In [46]:
recommendation_list, y_pred, y_test = generate_content_based_recom(train, test)
print(recommendation_list)
print(y_pred)
print(y_test)

[126, 141, 146, 150, 153, 169, 432, 441, 455, 480, 531, 551, 892, 899, 1009, 1012, 1021, 1022, 1023, 1028, 1029, 1032, 1059, 1064, 1073, 1088, 1097, 1301, 1367, 1372, 1376, 1377, 1702, 1721, 2018, 2038, 2048, 2316, 2384, 2396, 2701, 2716, 2788, 3668, 3745]
[1.875      0.75       1.45833333 0.25       1.79166667 1.70833333
 0.75       0.75       1.70833333 1.25       1.125      2.
 1.29166667 0.66666667 1.875      1.125      1.625      1.875
 1.5        2.         1.875      1.875      0.54166667 2.25
 2.625      0.66666667 1.79166667 0.25       1.625      1.25
 1.25       1.79166667 2.04166667 0.54166667 1.5        1.875
 1.5        0.54166667 1.625      1.04166667 0.66666667 0.75
 0.75       0.54166667 1.45833333]
[2. 3. 3. 3. 2. 1. 4. 3. 3. 5. 5. 3. 4. 5. 4. 2. 3. 4. 5. 5. 3. 4. 2. 3.
 5. 4. 5. 5. 4. 5. 5. 4. 3. 3. 4. 5. 4. 2. 2. 5. 2. 4. 5. 3. 5.]


In [47]:
#evaluate the recommendation
print("Root mean squared error (RMSE): %.2f" % np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
#print(metrics.mean_squared_error(y_test, y_pred))
print("Mean squared error (MSE): %.2f" % np.mean((y_pred - y_test) ** 2))


Root mean squared error (RMSE): 2.65
Mean squared error (MSE): 7.00


In [48]:
from sklearn.metrics import accuracy_score, precision_score
average_precision = precision_score([1 if x>3 else 0 for x in y_test], [1 if x>3 else 0 for x in y_pred])
average_accuracy = accuracy_score([1 if x>3 else 0 for x in y_test], [1 if x>3 else 0 for x in y_pred])
print('Average accuracy score: {0:0.2f}'.format(average_accuracy))
print('Average precision score: {0:0.2f}'.format(average_precision))

Average accuracy score: 0.44
Average precision score: 0.00


  _warn_prf(average, modifier, msg_start, len(result))


## 2- User-based collaborative filtering

In [17]:
recommendation_listCF, y_predCF, y_testCF = generate_user_based_recom(train, test, user_id)
print(recommendation_listCF)
print(y_predCF)
print(y_testCF)

[432, 441, 3668, 455, 480, 1301, 1367, 2316, 1372, 1376, 1377, 1702, 892, 899, 1721, 2384, 2396, 126, 141, 146, 2701, 150, 153, 2716, 169, 1009, 1012, 1021, 1022, 1023, 1028, 1029, 1032, 2788, 1059, 3745, 1064, 531, 2018, 1073, 551, 1088, 2038, 1097, 2048]
[5.  4.  3.5 3.  2. ]
[5. 2. 5. 3. 2.]


In [18]:
#evaluate the recommendation
print("Root mean squared error (RMSE): %.2f" % np.sqrt(metrics.mean_squared_error(y_testCF, y_predCF)))
#print(metrics.mean_squared_error(y_test, y_pred))
print("Mean squared error (MSE): %.2f" % np.mean((y_predCF - y_testCF) ** 2))


Root mean squared error (RMSE): 1.12
Mean squared error (MSE): 1.25


In [19]:
from sklearn.metrics import accuracy_score, precision_score
average_precision = precision_score([1 if x>3 else 0 for x in y_testCF], [1 if x>3 else 0 for x in y_predCF])
average_accuracy = accuracy_score([1 if x>3 else 0 for x in y_testCF], [1 if x>3 else 0 for x in y_predCF])
print('Average accuracy score: {0:0.2f}'.format(average_accuracy))
print('Average precision score: {0:0.2f}'.format(average_precision))

Average accuracy score: 0.80
Average precision score: 0.67


## 3- Recommender system based on users clustering

In [20]:
#choose the first most rated 1000 movies
most_rated_movies_1k = get_most_rated_movies(movie_matrix, 1000)
most_rated_movies_1k.head()

movieId,userId,2858,260,1196,1210,480,2028,589,2571,1270,...,3244,170,1672,1438,3528,3494,2318,69,1031,1769
0,1,,4.0,,,,5.0,,,5.0,...,,,,,,,,,,
1,2,4.0,,5.0,4.0,5.0,4.0,4.0,4.0,,...,,,,,,,,,,
2,3,4.0,5.0,4.0,4.0,4.0,,,,3.0,...,,,,,,,,,,
3,4,,5.0,2.0,3.0,4.0,5.0,,,,...,,,,,,,,,,
4,5,4.0,,,,,2.0,,5.0,,...,,,,,,,4.0,,,


In [21]:
#as k-means algorithm does not deal well with sparse datasets, 
#we will need to cast it as the sparse csr matrix type defined in the SciPi library.
# Conversion to sparse csr matrix
from scipy.sparse import csr_matrix
sparse_ratings = pd.SparseDataFrame(most_rated_movies_1k.iloc[:,1:])

sparse_ratings.columns = [str(i) for i in sparse_ratings.columns]
sparse_ratings=csr_matrix(sparse_ratings.to_coo())

sparse_ratings

<6040x999 sparse matrix of type '<class 'numpy.float64'>'
	with 747550 stored elements in Compressed Sparse Row format>

In [22]:
from sklearn.cluster import KMeans 
#choose  k=30
predictions = KMeans(n_clusters=30, algorithm='full').fit_predict(sparse_ratings)
predictions

array([ 1, 15, 20, ..., 20, 13,  7])

In [23]:
clustered = pd.concat([movie_matrix.reset_index(), pd.DataFrame({'group':predictions})], axis=1).set_index('userId')
clustered.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,3944,3945,3946,3947,3948,3949,3950,3951,3952,group
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,1
2,,,,,,,,,,,...,,,,,,,,,,15
3,,,,,,,,,,,...,,,,,,,,,,20
4,,,,,,,,,,,...,,,,,,,,,,20
5,,,,,,2.0,,,,,...,,,,,,,,,,24


In [24]:
def users_kmeans_recom(train, test, active_user):
    y_pred_clus=[]
    y_test_clus=[]
    # Pick a cluster ID of the user_id from the clusters above
    cluster_number = clustered.loc[active_user, 'group']
    #print(cluster_number)
    cluster = clustered[clustered.group == cluster_number].drop( 'group', axis=1).drop(active_user, axis=0)
    avg_ratings = pd.merge(test.set_index('movieId').drop(['userId','timestamp'], axis=1),  pd.DataFrame({'prediction':cluster.mean()}),  how='inner', left_index=True, right_index=True)
    y_pred_clus= np.append(y_pred_clus,avg_ratings['prediction'])
    y_test_clus=np.append(y_test_clus,avg_ratings['rating'])
    #print(avg_ratings)
    recommendation_list_clus=test.movieId.tolist()
    return recommendation_list_clus, y_pred_clus, y_test_clus
   

In [25]:
recommendation_list_clus, y_pred_clus, y_test_clus = users_kmeans_recom(train, test, user_id)
print(recommendation_list_clus)
print(y_pred_clus)
print(y_test_clus)

[432, 441, 3668, 455, 480, 1301, 1367, 2316, 1372, 1376, 1377, 1702, 892, 899, 1721, 2384, 2396, 126, 141, 146, 2701, 150, 153, 2716, 169, 1009, 1012, 1021, 1022, 1023, 1028, 1029, 1032, 2788, 1059, 3745, 1064, 531, 2018, 1073, 551, 1088, 2038, 1097, 2048]
[3.16666667 4.         4.27272727 3.09090909 3.84146341 3.77777778
 3.10810811 4.25       4.09090909 3.89361702 2.77777778 3.12820513
 5.         4.41176471 3.78431373 3.         4.13636364 2.
 3.71428571 2.66666667 2.38095238 4.38235294 3.15       3.96923077
 2.42857143 3.44       3.56521739 2.92307692 4.00925926 3.85294118
 4.24299065 3.61       3.64634146 4.46153846 4.         3.53846154
 2.68965517 3.72727273 3.71264368 3.9787234  3.56896552 3.97777778
 3.8        4.13636364 3.29166667]
[4. 3. 3. 3. 5. 5. 4. 2. 5. 5. 4. 3. 4. 5. 3. 2. 5. 2. 3. 3. 2. 3. 2. 4.
 1. 4. 2. 3. 4. 5. 5. 3. 4. 5. 2. 5. 3. 5. 4. 5. 3. 4. 5. 5. 4.]


In [26]:
import numpy as np
y_pred_clus=np.nan_to_num(y_pred_clus) 

#evaluate the recommendation
print("Root mean squared error (RMSE): %.2f" % np.sqrt(metrics.mean_squared_error(y_test_clus, y_pred_clus)))
#print(metrics.mean_squared_error(y_test, y_pred))
print("Mean squared error (MSE): %.2f" % np.mean((y_pred_clus - y_test_clus) ** 2))


Root mean squared error (RMSE): 0.97
Mean squared error (MSE): 0.95


In [27]:
from sklearn.metrics import accuracy_score, precision_score
average_precision = precision_score([1 if x>3 else 0 for x in y_test_clus], [1 if x>3 else 0 for x in y_pred_clus])
average_accuracy = accuracy_score([1 if x>3 else 0 for x in y_test_clus], [1 if x>3 else 0 for x in y_pred_clus])
print('Average accuracy score: {0:0.2f}'.format(average_accuracy))
print('Average precision score: {0:0.2f}'.format(average_precision))

Average accuracy score: 0.69
Average precision score: 0.65


## 4- Recommender system based on items clustering

In [28]:
#Create a pivot table of movies (on rows) and corresponsing user ratings (on columns). 
user_matrix = ratings_df.pivot_table(index='movieId', columns='userId', values='rating')
user_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,4.0,5.0,5.0,...,,4.0,,,4.0,,,,,3.0
2,,,,,,,,,,5.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,3.0,,,...,,,,,2.0,2.0,,,,
5,,,,,,,,,,,...,,,,,1.0,,,,,


In [29]:
most_users = get_most_rated_movies(user_matrix, 1000)
most_users.head()

userId,movieId,4169,1680,4277,1941,1181,889,3618,2063,1150,...,5426,5087,3158,1937,4430,131,4560,516,5794,2700
0,1,,4.0,5.0,5.0,3.0,4.0,2.0,5.0,2.0,...,,5.0,4.0,,4.0,3.0,3.0,3.0,,
1,2,3.0,4.0,4.0,4.0,1.0,,3.0,3.0,,...,,,3.0,,,,,,5.0,
2,3,2.0,3.0,,3.0,1.0,2.0,,3.0,,...,,,,,2.0,,,,4.0,
3,4,3.0,,,,,,2.0,,2.0,...,,,,,2.0,,,,,
4,5,2.0,3.0,,3.0,,,,,,...,3.0,,,,1.0,,2.0,,,


In [30]:
# Conversion to sparse csr matrix
from scipy.sparse import csr_matrix
sparse_ratings_matrix = pd.SparseDataFrame(most_users.iloc[:,1:])

sparse_ratings_matrix.columns = [str(i) for i in sparse_ratings_matrix.columns]
sparse_ratings_matrix=csr_matrix(sparse_ratings_matrix.to_coo())

sparse_ratings_matrix

<3706x999 sparse matrix of type '<class 'numpy.float64'>'
	with 515036 stored elements in Compressed Sparse Row format>

In [31]:
predictions_items = KMeans(n_clusters=20, algorithm='full').fit_predict(sparse_ratings_matrix)
predictions_items

array([11,  2,  2, ...,  9,  9, 18])

In [32]:
clustered_items = pd.concat([user_matrix.reset_index(), pd.DataFrame({'group':predictions_items})], axis=1).set_index('movieId')
clustered_items.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,6032,6033,6034,6035,6036,6037,6038,6039,6040,group
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,4.0,5.0,5.0,...,4.0,,,4.0,,,,,3.0,11
2,,,,,,,,,,5.0,...,,,,,,,,,,2
3,,,,,,,,,,,...,,,,1.0,,,,,,2
4,,,,,,,,3.0,,,...,,,,2.0,2.0,,,,,13
5,,,,,,,,,,,...,,,,1.0,,,,,,2


In [33]:
def items_kmeans_recom(train, test, active_user):
    y_pred_clus_items=[]
    y_test_clus_items=[]    
    recommendation_list_clus_items=test.movieId.tolist()
    for i, item in enumerate(recommendation_list_clus_items):
        cluster_number3=clustered_items.loc[item, 'group']
        cluster_items = clustered_items[clustered_items.group == cluster_number3].drop(item, axis=0)
        y_pred_clus_items.append(cluster_items[active_user].mean())
                
    y_test_clus_items = np.append(y_test_clus_items, test['rating'])   
    return recommendation_list_clus_items, y_pred_clus_items, y_test_clus_items
   
    

In [34]:
recommendation_list_clus_items, y_pred_clus_items, y_test_clus_items = items_kmeans_recom(train, test, user_id)
print(recommendation_list_clus_items)
print(y_pred_clus_items)
print(y_test_clus_items)

[432, 441, 3668, 455, 480, 1301, 1367, 2316, 1372, 1376, 1377, 1702, 892, 899, 1721, 2384, 2396, 126, 141, 146, 2701, 150, 153, 2716, 169, 1009, 1012, 1021, 1022, 1023, 1028, 1029, 1032, 2788, 1059, 3745, 1064, 531, 2018, 1073, 551, 1088, 2038, 1097, 2048]
[3.6315789473684212, 3.0, 5.0, 3.4545454545454546, 4.3, 3.125, 3.409090909090909, 3.5, 4.0, 4.0, 4.5, 3.4545454545454546, 3.2, 4.0, 4.0, 4.666666666666667, 4.0, 2.857142857142857, 4.5, 2.7142857142857144, 3.75, 4.0, 3.736842105263158, 4.4, 3.0, 3.409090909090909, 3.736842105263158, 3.4545454545454546, 4.1875, 3.3636363636363638, 4.125, 4.25, 4.1875, 4.0, 3.75, 3.0, 3.4545454545454546, 3.0, 4.1875, 4.125, 3.0, 3.6315789473684212, 3.3636363636363638, 4.3, 3.409090909090909]
[4. 3. 3. 3. 5. 5. 4. 2. 5. 5. 4. 3. 4. 5. 3. 2. 5. 2. 3. 3. 2. 3. 2. 4.
 1. 4. 2. 3. 4. 5. 5. 3. 4. 5. 2. 5. 3. 5. 4. 5. 3. 4. 5. 5. 4.]


In [35]:
import numpy as np
y_pred_clus_items=np.nan_to_num(y_pred_clus_items) 

#evaluate the recommendation
print("Root mean squared error (RMSE): %.2f" % np.sqrt(metrics.mean_squared_error(y_test_clus_items, y_pred_clus_items)))
#print(metrics.mean_squared_error(y_test, y_pred))
print("Mean squared error (MSE): %.2f" % np.mean((y_pred_clus_items - y_test_clus_items) ** 2))


Root mean squared error (RMSE): 1.19
Mean squared error (MSE): 1.42


In [36]:
from sklearn.metrics import accuracy_score, precision_score
average_precision = precision_score([1 if x>3 else 0 for x in y_test_clus_items], [1 if x>3 else 0 for x in y_pred_clus_items])
average_accuracy = accuracy_score([1 if x>3 else 0 for x in y_test_clus_items], [1 if x>3 else 0 for x in y_pred_clus_items])
print('Average accuracy score: {0:0.2f}'.format(average_accuracy))
print('Average precision score: {0:0.2f}'.format(average_precision))

Average accuracy score: 0.62
Average precision score: 0.61
