# Collaborative Filtering

In [52]:
import pandas as pd
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [53]:
#We have .data file. This is not .csv file.
columns = ["userid", "movieid", "rating", "timestamp"]
userMovieData= pd.read_csv('C:/Users/Lenovo/Desktop/IPY/MoviesDataSet/u.data', sep='\t',names= columns)
userMovieData.head(10)

Unnamed: 0,userid,movieid,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [54]:
userMovieData.drop( "timestamp", inplace = True, axis = 1 )

In [55]:
print(len(userMovieData))
print(len( userMovieData['userid'].unique()))
print(len( userMovieData['movieid'].unique()))

100000
943
1682


#### Let's assume in this world there exists 943 people and 1682 movies. 943 people have rated different 1682 movies, so generated lot of data -  100000 rows

In [56]:
userMovieData.head(10)

Unnamed: 0,userid,movieid,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
5,298,474,4
6,115,265,2
7,253,465,5
8,305,451,3
9,6,86,3


#### Creating a "Similarity Martix" means creating a "Pivot". Pivots can be implemented in multiple ways, we are choosing obviously Python way! NOTE: We are creating pivot using pivot() method. pivot_table() is somthing used to implement pivot tables. 

##### Refer:  https://github.com/rahulvaish/MachineLearningPython/blob/DataWrangling/UnderstandingPivotTables-GBToilets.ipynb

In [57]:
userMovieSimilarityMartix = userMovieData.pivot( index='userid', columns='movieid', values = "rating" )

In [58]:
userMovieSimilarityMartix

movieid,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
6,4.0,,,,,,2.0,4.0,4.0,,...,,,,,,,,,,
7,,,,5.0,,,5.0,5.0,5.0,4.0,...,,,,,,,,,,
8,,,,,,,3.0,,,,...,,,,,,,,,,
9,,,,,,5.0,4.0,,,,...,,,,,,,,,,
10,4.0,,,4.0,,,4.0,,4.0,,...,,,,,,,,,,


In [59]:
# 943 X 1682 Crosschecked !!

In [60]:
# We see a lot of 'NAN's Missing values. Replacing 'NAN's with 0.
# Why 0 makes sense ?
# If user A has not seen the movie B, for B the rating is missing or 0. Same is the situation where movie B has been watched but not rated.

In [61]:
userMovieSimilarityMartix.fillna( 0, inplace = True )

In [62]:
userMovieSimilarityMartix

movieid,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,5.0,0.0,0.0,5.0,5.0,5.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
#Checking a part/section of DataFrame-
#userMovieSimilarityMartix.iloc[10:20, 20:30]

#### For calculating distances, many similarity coefficients can be calculated. Most widely used similarity coefficients are Euclidean, Cosine, Pearson Correlation etc.
#### We will use cosine distance here. Here we are insterested in similarity. That means higher the value more similar they are. 


## NOTE: The function gives us the distance, we have to deduct it from 1. Also the function does not returns DataFrame, we have to perform conversion!

In [64]:
user_sim = 1 - pairwise_distances( userMovieSimilarityMartix.as_matrix(), metric="cosine" )

In [68]:
user_sim_df = pd.DataFrame(user_sim )

In [69]:
user_sim_df.loc[0:10,0:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,0.311124
1,0.166931,1.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,0.129257
2,0.04746,0.110591,1.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,0.075912
3,0.064358,0.178121,0.344151,1.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,0.104182
4,0.378475,0.072979,0.021245,0.031804,1.0,0.237286,0.3736,0.24893,0.056847,0.201427,0.32171
5,0.430239,0.245843,0.072415,0.068044,0.237286,1.0,0.489255,0.201369,0.183951,0.551713,0.260765
6,0.440367,0.107328,0.066137,0.09123,0.3736,0.489255,1.0,0.284951,0.14565,0.487024,0.374882
7,0.319072,0.103344,0.08306,0.18806,0.24893,0.201369,0.284951,1.0,0.085942,0.233289,0.160853
8,0.078138,0.161048,0.06104,0.101284,0.056847,0.183951,0.14565,0.085942,1.0,0.198223,0.065721
9,0.376544,0.159862,0.065151,0.060859,0.201427,0.551713,0.487024,0.233289,0.198223,1.0,0.223393


In [None]:
# Hopefully you have noticed the '1' in diagnols..

#### Now we have a huge matrix with numbers everywhere, how to determine the similarty - Which users are similar to each other ? To answer this question and understand, we will be using an inbuild method idxmax()

## Model Based Approaches

In [231]:
#Import the required classes and methods from the surprise library
from surprise import Reader, Dataset, KNNBasic, evaluate

#Define a Reader object
#The Reader object helps in parsing the file or dataframe containing ratings
reader = Reader()

#Create the dataset to be used for building the filter
data = Dataset.load_from_df(ratings, reader)

#Define the algorithm object; in this case kNN
knn = KNNBasic()

#Evaluate the performance in terms of RMSE
evaluate(knn, data, measures=['RMSE'])

Evaluating RMSE of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9776
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9789
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9695
------------
Fold 4
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9810
------------
Fold 5
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9849
------------
------------
Mean RMSE: 0.9784
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.97764007686097709,
                             0.97889035204999741,
                             0.9694859699934969,
                             0.98099811511904433,
                             0.98488926832497381]})

In [232]:
#Import SVD
from surprise import SVD

#Define the SVD algorithm object
svd = SVD()

#Evaluate the performance in terms of RMSE
evaluate(svd, data, measures=['RMSE'])

Evaluating RMSE of algorithm SVD.

------------
Fold 1
RMSE: 0.9371
------------
Fold 2
RMSE: 0.9417
------------
Fold 3
RMSE: 0.9289
------------
Fold 4
RMSE: 0.9379
------------
Fold 5
RMSE: 0.9379
------------
------------
Mean RMSE: 0.9367
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.93714337825960081,
                             0.9417378198331483,
                             0.92893737314257874,
                             0.93793761103739881,
                             0.93789928866069328]})