In [None]:
# https://medium.com/analytics-vidhya/recommendation-system-using-collaborative-filtering-cc310e641fde

# Recommendation System using collaborative filtering in Python

In [34]:
#importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import cross_validate
from surprise import SVD
# Column names are supplied via `names=` because the files are read without a header row.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
# u.data is tab-separated: one (user, movie, rating, timestamp) record per line.
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, encoding='latin-1')

i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
          'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
          'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
# u.item is pipe-delimited (like u.user), not tab-delimited. Reading it with
# sep='\t' would place each whole line in the first column and leave every
# other column NaN.
movies = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1')

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols, encoding='latin-1')
# Only the last expression of a cell is rendered, so the intermediate
# ratings.head() / movies.head() calls were removed; preview the users table.
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [37]:
# Preview the first rows of the ratings table (user_id, movie_id, rating, timestamp).
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [50]:
# Features: a copy of the full ratings table. Labels: its user_id column,
# which is only needed so the split can be stratified per user.
X = ratings.copy()
y = X['user_id']
# Hold out 25% of the ratings for testing. Stratifying on user_id keeps each
# user's ratings in roughly the same 75/25 proportion across both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [52]:
# Reshape the training ratings into a user x movie matrix: one row per user_id,
# one column per movie_id, each cell holding the rating. (user, movie) pairs
# that do not occur in X_train come out as NaN.
df_ratings = X_train.pivot(index='user_id', columns='movie_id', values='rating')

In [53]:
# Display the user x movie rating matrix (the rendering is truncated for a frame this large).
df_ratings

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [54]:
# Method 1: Weighted Average approach

In [56]:
# scikit-learn's cosine_similarity rejects NaN input, so treat "not rated" as 0.
df_ratings_dummy = df_ratings.fillna(0)
# Pairwise user-user cosine similarity over the zero-filled rating rows
# (with a single argument the rows are compared against themselves).
similarity_matrix = cosine_similarity(df_ratings_dummy)
# Label both axes with user_id so similarities can be looked up by id.
similarity_matrix_df = pd.DataFrame(
    data=similarity_matrix,
    index=df_ratings.index,
    columns=df_ratings.index,
)
#calculate ratings using weighted sum of cosine similarity
#function to calculate ratings
def calculate_ratings(id_movie, id_user, ratings_matrix=None, similarity_df=None, default_rating=2.5):
    """Predict a user's rating for a movie as a similarity-weighted average.

    The prediction is the mean of the ratings given to ``id_movie`` by the
    users who rated it, weighted by each rater's cosine similarity to
    ``id_user``.

    Parameters
    ----------
    id_movie : hashable
        Column label of the movie in ``ratings_matrix``.
    id_user : hashable
        Column label of the user in ``similarity_df``.
    ratings_matrix : pandas.DataFrame, optional
        User x movie rating matrix with NaN for "not rated". Defaults to the
        notebook-level ``df_ratings``.
    similarity_df : pandas.DataFrame, optional
        User x user similarity matrix sharing ``ratings_matrix``'s index.
        Defaults to the notebook-level ``similarity_matrix_df``.
    default_rating : float, optional
        Fallback returned when no prediction can be computed (default 2.5).

    Returns
    -------
    float
        The weighted-average rating, or ``default_rating`` when the movie or
        user is unknown, nobody rated the movie, or the similarity weights of
        the raters sum to zero.
    """
    if ratings_matrix is None:
        ratings_matrix = df_ratings
    if similarity_df is None:
        similarity_df = similarity_matrix_df

    # Cold start: a movie or user that never appeared in the training matrix.
    if id_movie not in ratings_matrix.columns or id_user not in similarity_df.columns:
        return default_rating

    # Keep only the users who actually rated the movie ...
    ratings_scores = ratings_matrix[id_movie].dropna()
    # ... and the target user's similarity to exactly those raters.
    cosine_scores = similarity_df[id_user].loc[ratings_scores.index]

    weight_total = cosine_scores.sum()
    # With no raters, or raters that share nothing with the target user, the
    # weighted mean would be 0/0 = NaN (which later breaks the RMSE
    # computation), so fall back to the default instead.
    if ratings_scores.empty or weight_total == 0:
        return default_rating

    return np.dot(ratings_scores, cosine_scores) / weight_total

In [57]:
# Predict the rating user_id 150 would give movie_id 3 (arguments are movie first, then user).
calculate_ratings(3,150)

3.201080685364519

In [93]:
# X_test['movie_id']

In [94]:
# X_test['user_id']

In [82]:
#evaluates on test set
def score_on_test_set():
    """Return the RMSE of ``calculate_ratings`` over the held-out ratings.

    Every (movie_id, user_id) pair in ``X_test`` is predicted with
    ``calculate_ratings`` and compared against the true ``rating`` column.
    """
    predictions = []
    for movie, user in zip(X_test['movie_id'], X_test['user_id']):
        predictions.append(calculate_ratings(movie, user))
    actual = X_test['rating'].to_numpy()
    mse = mean_squared_error(actual, np.array(predictions))
    # RMSE is the square root of the mean squared error.
    return np.sqrt(mse)

In [84]:
# test_set_score = score_on_test_set()
# print(test_set_score)

In [67]:
# Method 2: Model-based approaches

In [92]:
# installing surprise library
# !pip install surprise

In [87]:
# Surprise's Dataset.load_from_df expects exactly three columns, in the order
# user id, item id, rating. Selecting them by name (rather than dropping
# 'timestamp') keeps this cell re-runnable: a second run no longer raises
# KeyError because 'timestamp' has already been removed.
ratings = ratings[['user_id', 'movie_id', 'rating']]
# The Reader tells Surprise how to parse the ratings; (1, 5) is also its
# default scale and matches the rating values seen in this dataset.
reader = Reader(rating_scale=(1, 5))
# Dataset creation
data = Dataset.load_from_df(ratings, reader)
# Neighbourhood model with Surprise's default settings.
knn = KNNBasic()
# 3-fold cross-validation reporting RMSE and MAE.
cross_validate(knn, data, measures=['RMSE', 'MAE'], cv=3)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.990281  , 0.98430763, 0.99084004]),
 'test_mae': array([0.78076817, 0.77905808, 0.78315822]),
 'fit_time': (0.3130679130554199, 0.42780518531799316, 0.3385312557220459),
 'test_time': (5.038460969924927, 8.01596188545227, 4.750699996948242)}

In [89]:
# Matrix-factorisation model (Surprise's SVD) with default hyper-parameters.
# NOTE(review): no random_state is set here, so scores presumably vary from run to run.
svd = SVD()
# 3-fold cross-validated RMSE on the same `data` object used for the KNN model.
cross_validate(algo=svd, data=data, measures=['RMSE'], cv=3)

{'test_rmse': array([0.93978617, 0.95148184, 0.94986871]),
 'fit_time': (0.8694379329681396, 0.8447649478912354, 0.8222129344940186),
 'test_time': (0.24219393730163574, 0.28964710235595703, 0.4984891414642334)}

In [90]:
# Refit SVD on every available rating (no held-out fold) before predicting.
trainset = data.build_full_trainset()
svd.fit(trainset)
# Show the ratings recorded for user_id 5.
ratings.loc[ratings['user_id'] == 5]

Unnamed: 0,user_id,movie_id,rating
172,5,2,3
439,5,17,4
673,5,439,1
679,5,225,2
922,5,110,1
...,...,...,...
93172,5,419,3
94436,5,375,3
95021,5,373,3
96918,5,368,1


In [None]:
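# The SVD model's prediction for user_id 1 and movie_id 110 is about 2.1 (see the output below;
# the exact value changes between runs), and the actual rating was 2, which is kind of amazing.
# Note: the table above lists the ratings of user_id 5, not user_id 1, so it does not show that actual rating.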

In [91]:
# Estimate the rating user_id 1 would give movie_id 110 with the fitted SVD model.
svd.predict(uid=1, iid=110)

Prediction(uid=1, iid=110, r_ui=None, est=2.113772185276542, details={'was_impossible': False})