# Recommendation System

In this notebook, I implement user-based collaborative filtering on the MovieLens 100K dataset to predict the rating a user would give to a movie they haven't seen yet.
<br/>
Unlike the other notebook, this one implements the algorithm with lower-level code (pandas and NumPy) instead of relying on a dedicated recommender library.


In [47]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

## Data Import

In [48]:
#Load the user demographics table (pipe-delimited; column names are supplied here)
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'data/ml-100k/u.user',
    sep='|',
    header=None,
    names=u_cols,
)
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [49]:
#Load the movie catalogue: id, title, dates, URL and one 0/1 column per genre
#NOTE(review): "Chilren's" is misspelled; it is kept as-is because the label is
#part of the displayed frame and the column is discarded in the next cell.
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'action', 'adventure', 'animation', "Chilren's",
    'comedy', 'crime', 'documentary', 'drama', 'fantasy',
    'film-noir', 'horror', 'musical', 'mystery', 'romance',
    'sci-fi', 'thriller', 'war', 'western',
]

movies = pd.read_csv(
    'data/ml-100k/u.item',
    sep='|',
    header=None,
    names=i_cols,
    encoding='latin-1',
)
movies.head()

Unnamed: 0,movie_id,title,release date,video release date,IMDb URL,unknown,action,adventure,animation,Chilren's,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [50]:
#Only the movie id and title are kept from here on
movies = movies.loc[:, ['movie_id', 'title']]

In [51]:
#Load the tab-separated ratings file; one row per (user, movie) rating
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    'data/ml-100k/u.data',
    sep='\t',
    header=None,
    names=r_cols,
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [52]:
#Drop the timestamp column; nothing below uses it.
#Rebinding (rather than inplace=True) together with errors='ignore' keeps this
#cell idempotent: re-running it no longer raises KeyError on the missing column.
ratings = ratings.drop(columns='timestamp', errors='ignore')

In [53]:
#Hold out 33% of the ratings for evaluation; the fixed seed makes the split reproducible
X = ratings.copy()
y = ratings['user_id']

#NOTE(review): y is only carried through the split here; it is not passed as
#stratify=, so per-user proportions are not guaranteed to match between the sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Define Functions

In [54]:
#Function that returns rmse
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [55]:
#Baseline model: predicts the same rating for every user/movie pair
def baseline(user_id, movie_id):
    """Return a constant rating of 3.0, ignoring both arguments.

    Shares the ``(user_id, movie_id)`` call signature of the other models so
    it can be passed to ``score``.
    """
    constant_rating = 3.0
    return constant_rating

In [56]:
#Function to compute rmse score obtained by testing
def score(cf_model):
    """Return the RMSE of ``cf_model`` over the held-out ratings in ``X_test``.

    ``cf_model`` is called as ``cf_model(user_id, movie_id)`` once per test
    row; its return values are collected into an array and compared with the
    ``rating`` column of ``X_test``.
    """
    #Actual ratings given by users in the test data
    actual = np.array(X_test['rating'])

    #One prediction per (user, movie) pair, in the same row order as ``actual``
    predicted = np.array([
        cf_model(uid, mid)
        for uid, mid in zip(X_test['user_id'], X_test['movie_id'])
    ])

    return rmse(actual, predicted)

In [57]:
#RMSE of the constant baseline on the held-out ratings
score(baseline)

1.2421145216317555

## User based collaborative Filtering

### Ratings matrix

In [58]:
#Peek at the training rows that will be pivoted into the ratings matrix
X_train.head()

Unnamed: 0,user_id,movie_id,rating
59428,642,765,3
34957,583,209,4
4264,115,762,4
53791,130,827,4
82114,713,752,2


In [59]:
#Pivot the training ratings into a users x movies matrix (NaN where a user has not rated a movie)
r_matrix = X_train.pivot_table(
    index='user_id',
    columns='movie_id',
    values='rating',
)

In [60]:
#Inspect the first few users of the (sparse) ratings matrix
r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1667,1668,1670,1671,1672,1673,1676,1678,1679,1680
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.0,,,3.0,,4.0,,5.0,,...,,,,,,,,,,
2,4.0,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


## Weighted Mean

In [61]:
#Zero-filled copy of the ratings matrix, used only for the similarity computation;
#fillna returns a new frame, so r_matrix itself keeps its NaNs
r_matrix_dummy = r_matrix.fillna(0)

In [62]:
#Pairwise cosine similarity between user rows (with a single argument, rows are compared with each other)
cosine_sim = cosine_similarity(r_matrix_dummy)

In [63]:
#Wrap the similarity array in a DataFrame labelled by user_id on both axes
user_labels = r_matrix.index
cosine_sim = pd.DataFrame(cosine_sim, index=user_labels, columns=user_labels)
cosine_sim.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.138038,0.032441,0.024844,0.215803,0.315804,0.263965,0.267712,0.091371,0.243662,...,0.245828,0.085952,0.19577,0.132171,0.128758,0.107208,0.163399,0.071207,0.077925,0.323557
2,0.138038,1.0,0.098297,0.139852,0.081774,0.131897,0.083066,0.080662,0.1602,0.095648,...,0.11289,0.22371,0.331184,0.220784,0.229125,0.11986,0.145608,0.091798,0.143345,0.064351
3,0.032441,0.098297,1.0,0.280093,0.0,0.089131,0.025222,0.066608,0.070481,0.064173,...,0.010918,0.021442,0.125124,0.031336,0.099417,0.0,0.103745,0.027374,0.091023,0.0
4,0.024844,0.139852,0.280093,1.0,0.013261,0.040693,0.066119,0.12164,0.064357,0.039065,...,0.014954,0.048947,0.098492,0.065029,0.112632,0.0,0.134201,0.124974,0.094091,0.015769
5,0.215803,0.081774,0.0,0.013261,1.0,0.144684,0.254663,0.104024,0.037077,0.166295,...,0.226865,0.076139,0.080198,0.077926,0.077481,0.035146,0.213758,0.117001,0.11745,0.225614


In [64]:
#User based collab filtering using weighted mean ratings
def cf_user_wmean(user_id, movie_id, ratings_matrix=None, similarity=None, default_rating=3.0):
    """Predict a rating as the similarity-weighted mean of other users' ratings.

    Parameters
    ----------
    user_id
        Label of the user to predict for (a column label of ``similarity``).
    movie_id
        Label of the movie to predict (a column label of ``ratings_matrix``).
    ratings_matrix : pandas.DataFrame, optional
        Users x movies ratings with NaN for unrated entries. Defaults to the
        notebook-level ``r_matrix``.
    similarity : pandas.DataFrame, optional
        Users x users similarity scores. Defaults to the notebook-level
        ``cosine_sim``.
    default_rating : float, optional
        Value returned when no prediction can be made: the movie or the user
        is unknown, or nobody has rated the movie.

    Returns
    -------
    float
        ``sum(sim * rating) / sum(sim)`` over the users who rated the movie.
        If every one of those similarities is 0 the plain mean of their
        ratings is returned instead of dividing by zero.
    """
    if ratings_matrix is None:
        ratings_matrix = r_matrix
    if similarity is None:
        similarity = cosine_sim

    #Cold start: a movie or user that never appeared in the training data
    if movie_id not in ratings_matrix.columns or user_id not in similarity.columns:
        return default_rating

    #Ratings for this movie from the users who actually rated it.
    #dropna() returns a new Series, so the ratings matrix is never mutated.
    m_ratings = ratings_matrix[movie_id].dropna()
    if m_ratings.empty:
        return default_rating

    #Similarity of the target user to exactly those raters, aligned by label
    #(a rater missing from the similarity table contributes weight 0)
    sim_scores = similarity[user_id].reindex(m_ratings.index, fill_value=0.0)

    sim_total = sim_scores.sum()
    if sim_total == 0:
        #No rater is similar to this user: fall back to the unweighted mean
        return float(m_ratings.mean())

    return float(np.dot(sim_scores.to_numpy(), m_ratings.to_numpy()) / sim_total)

## Prediction

We will predict the rating for user_id = 196 and movie_id = 302.
We will also look at the neighbouring users' ratings to understand why the model produces that prediction.

In [65]:
#Target (user, movie) pair for the worked example below
user_id = 196
movie_id = 302

### Get all ratings for user 196

In [96]:
#First few ratings given by user 196
is_user_196 = ratings['user_id'] == 196
ratings.loc[is_user_196].head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
940,196,393,4
1133,196,381,4
1812,196,251,3
1896,196,655,5


Since the user hasn't seen movie 302 yet, we will base the rating on users similar to this one.

### Get people similar to user 196

In [74]:
#Similarity of every user to user 196 as a two-column frame (user_id, Similarity index)
user_sim_196 = cosine_sim[196].rename('Similarity index').reset_index()

#Six most similar rows; the first is user 196 itself with similarity 1.0
user_sim_196.sort_values('Similarity index', ascending=False).head(6)

   user_id  Similarity index
0        1          0.082716
1        2          0.146676
2        3          0.042787
3        4          0.000000
4        5          0.094536


Unnamed: 0,user_id,Similarity index
195,196,1.0
590,591,0.29844
538,539,0.295008
305,306,0.254277
650,651,0.245621
859,860,0.244663


In [91]:
#Take the 20 most similar rows, then remove user 196 itself.
#NOTE: despite its name, top5_sim_users therefore holds the 19 nearest neighbours.
top5_sim_users = user_sim_196.sort_values(by='Similarity index', ascending=False).head(20)['user_id']

#Filter on the user_id value instead of dropping the hard-coded row label 195,
#which only worked because of how reset_index happened to number the rows
top5_sim_users = top5_sim_users[top5_sim_users != 196]
top5_sim_users

590    591
538    539
305    306
650    651
859    860
557    558
474    475
877    878
691    692
302    303
17      18
62      63
727    728
578    579
674    675
936    937
344    345
206    207
128    129
Name: user_id, dtype: int64

So the five users most similar to 196 are 591, 539, 306, 651 and 860 (the list above contains the 19 nearest neighbours in total).

### What did those users rate this movie?

In [92]:
#Combine both conditions into a single boolean mask. Chaining two [] filters
#applied a full-length mask to an already-filtered frame, which makes pandas
#emit a "Boolean Series key will be reindexed" UserWarning.
neighbour_mask = ratings['user_id'].isin(top5_sim_users) & (ratings['movie_id'] == 302)
ratings[neighbour_mask]

  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,movie_id,rating
2452,303,302,4
13481,345,302,5
27143,129,302,4
38547,63,302,3
54091,207,302,4
82839,860,302,4
92651,651,302,5
98153,475,302,3


The eight neighbours who rated this movie gave it between 3 and 5, averaging 4. Now we will use our model to make the prediction.

In [94]:
#Similarity-weighted prediction for the target (user, movie) pair defined above
cf_user_wmean(user_id, movie_id)

4.181081972024287

As expected, the prediction is around 4: the similarity-weighted mean over every training user who rated the movie comes to 4.18.