In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the pipe-delimited user table; column names are supplied via u_cols
u_cols = ["user_id", "age", "sex", "occupation", "zip_code"]
users = pd.read_csv(
    "data/ml-100k/u.user",
    names=u_cols,
    sep="|",
)
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [3]:
# Load the pipe-delimited, latin-1 encoded movie table; column names come from i_cols.
# NOTE(review): "Chilren's" looks like a typo for "Children's"; the label is kept
# unchanged here so the resulting column name stays the same.
i_cols = [
    "movie_id", "title", "release date", "video release date", "IMDb URL",
    "unknown", "action", "adventure", "animation", "Chilren's",
    "comedy", "crime", "documentary", "drama", "fantasy",
    "film-noir", "horror", "musical", "mystery", "romance",
    "sci-fi", "thriller", "war", "western",
]

movies = pd.read_csv(
    "data/ml-100k/u.item",
    names=i_cols,
    sep="|",
    encoding="latin-1",
)
movies.head()

Unnamed: 0,movie_id,title,release date,video release date,IMDb URL,unknown,action,adventure,animation,Chilren's,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# Keep only the id and title columns of the movie table
movies = movies.loc[:, ["movie_id", "title"]]

In [119]:
# Load the ratings CSV, replacing the file's own header row with r_cols.
# NOTE(review): users and movies above are read from data/ml-100k, while ratings
# come from data/ml-latest-small -- confirm the ids are meant to correspond.
# (The ml-100k ratings would be 'data/ml-100k/u.data', tab-separated.)
r_cols = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv(
    "data/ml-latest-small/ratings.csv",
    header=0,
    names=r_cols,
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [120]:
# Drop the timestamp column; none of the models in the cells shown use it.
# Rebinding the name (rather than inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second execution is a no-op instead of a KeyError.
ratings = ratings.drop(columns='timestamp', errors='ignore')

In [121]:
# X is a copy of the full ratings table; y holds the user id of each row.
X = ratings.copy()
y = ratings['user_id']

# Hold out 33% of the rows for testing, with a fixed seed so the split is repeatable.
# NOTE(review): y is not passed as `stratify`, so the split is not balanced per user
# and a user could end up only in the test set -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,  random_state=42)

In [122]:
# Root-mean-squared error between true and predicted values
def rmse(y_true, y_pred):
    """Return the square root of the mean squared error of ``y_pred`` against ``y_true``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [123]:
# Baseline model: ignores its inputs and always predicts the same rating
def baseline(user_id, movie_id):
    """Return a constant rating of 3.0 for any (user_id, movie_id) pair."""
    constant_rating = 3.0
    return constant_rating

In [124]:
# Evaluate a rating model on the held-out test split
def score(cf_model):
    """Return the RMSE of ``cf_model`` over every (user, movie) pair in ``X_test``.

    ``cf_model`` is called as ``cf_model(user_id, movie_id)`` once per test row,
    in row order, and must return the predicted rating for that pair.
    """
    # Collect one prediction per test row
    predicted = []
    for user, movie in zip(X_test['user_id'], X_test['movie_id']):
        predicted.append(cf_model(user, movie))

    # Ratings actually recorded in the test data
    actual = np.array(X_test['rating'])

    return rmse(actual, np.array(predicted))

In [125]:
# RMSE of the constant-3.0 baseline on the test split
score(baseline)

1.1568716287386662

## User-Based Collaborative Filtering

### Ratings matrix

In [126]:
# Peek at the first rows of the training split
X_train.head()

Unnamed: 0,user_id,movie_id,rating
15959,104,436,1.5
94262,599,7373,3.5
71511,460,76093,4.5
27761,187,33138,5.0
36262,247,8950,5.0


In [127]:
# Build the user x movie ratings matrix from the training rows
# (one row per user_id, one column per movie_id, cell = rating)
r_matrix = X_train.pivot_table(
    index='user_id',
    columns='movie_id',
    values='rating',
)

In [128]:
# Preview the first rows of the ratings matrix
r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,190221,191005,193565,193571,193573,193579,193581,193583,193585,193609
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [129]:
#User based collaborative filter using mean ratings
def cf_user_mean(user_id, movie_id):
    """Predict a rating as the mean of all ratings for ``movie_id`` in ``r_matrix``.

    ``user_id`` is accepted so the function matches the ``(user_id, movie_id)``
    model signature, but it is not used. Returns 3.0 when ``movie_id`` has no
    column in ``r_matrix``.
    """
    # Fall back to the default rating for movies absent from the ratings matrix
    if movie_id not in r_matrix:
        return 3.0

    return r_matrix[movie_id].mean()

In [130]:
# RMSE of the per-movie mean model on the test split
score(cf_user_mean)

0.9863836616948367

## Weighted Mean

In [131]:
# Zero-filled version of the ratings matrix (missing ratings become 0);
# fillna returns a new frame, so r_matrix itself is left untouched
r_matrix_dummy = r_matrix.fillna(0)

In [132]:
# Pairwise cosine similarity between the rows (users) of the zero-filled matrix;
# with a single argument the rows are compared against themselves
cosine_sim = cosine_similarity(r_matrix_dummy)

In [133]:
# Wrap the similarity array in a DataFrame labelled by user id on both axes
cosine_sim = pd.DataFrame(
    data=cosine_sim,
    columns=r_matrix.index,
    index=r_matrix.index,
)
cosine_sim.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.019206,0.030037,0.148182,0.13344,0.09453,0.108223,0.111181,0.055935,0.015435,...,0.071786,0.147947,0.17767,0.033734,0.154419,0.119841,0.144922,0.225067,0.125572,0.122788
2,0.019206,1.0,0.0,0.005694,0.0,0.0,0.0,0.042132,0.0,0.060658,...,0.166331,0.023773,0.01703,0.0,0.0,0.016321,0.0,0.045315,0.03597,0.066024
3,0.030037,0.0,1.0,0.0,0.0,0.00357,0.0,0.0,0.0,0.0,...,0.003473,0.007082,0.0,0.0,0.015825,0.010969,0.005658,0.00385,0.0,0.024271
4,0.148182,0.005694,0.0,1.0,0.105275,0.065295,0.079346,0.051279,0.0,0.011677,...,0.062164,0.08818,0.171895,0.061427,0.053951,0.137275,0.072647,0.074596,0.031271,0.059707
5,0.13344,0.0,0.0,0.105275,1.0,0.214268,0.04852,0.382606,0.0,0.007922,...,0.028422,0.314411,0.092888,0.146191,0.102985,0.072785,0.033568,0.08443,0.243891,0.049435


In [134]:
#User based collab filtering using weighted mean ratings
def cf_user_wmean(user_id, movie_id):
    """Predict a rating as the similarity-weighted mean of the ratings for a movie.

    Reads two module-level frames:
      * ``r_matrix``   -- ratings, one row per user id, one column per movie id,
                          NaN where a user has not rated a movie.
      * ``cosine_sim`` -- user-by-user similarity, labelled by user id on both axes.

    Each existing rating of ``movie_id`` is weighted by the similarity between
    its author and ``user_id``.

    Returns 3.0 (the default rating) when ``movie_id`` has no column in
    ``r_matrix``, when ``user_id`` has no column in ``cosine_sim``, or when the
    similarities of all raters sum to zero (the weighted mean is undefined).
    """
    default_rating = 3.0

    # Unknown movie or unknown user: nothing to base a prediction on
    if movie_id not in r_matrix or user_id not in cosine_sim:
        return default_rating

    # Ratings for this movie from the users who actually rated it.
    # dropna() returns a new Series, so r_matrix is never modified.
    m_ratings = r_matrix[movie_id].dropna()

    # Similarities between user_id and exactly those raters, selected by label
    # so it works for any set of user ids (not only a contiguous 1..N range)
    # and stays aligned with m_ratings.
    sim_scores = cosine_sim[user_id].loc[m_ratings.index]

    total_sim = sim_scores.sum()
    if total_sim == 0:
        # Avoid 0/0 -> NaN when no rater has any similarity to this user
        return default_rating

    return np.dot(sim_scores.to_numpy(), m_ratings.to_numpy()) / total_sim

In [135]:
# Spot-check the weighted-mean model for user 253 and movie 465.
# NOTE(review): the recorded result equals the function's default return value (3.0) -- confirm.
cf_user_wmean(253,465)

3.0