# Exercises for "Hands-on with Pydata: How to Build a Minimal Recommendation Engine"

# Systems check: imports and files

In [1]:
import numpy as np
import pandas as pd

# Pandas questions: Series and DataFrames
## 1. Adding a column in a DataFrame

In [3]:
# given the following DataFrame, add a new column to it
df = pd.DataFrame({'col1': [1,2,3,4]})
df['col2']=['A','B','C','D']
df

Unnamed: 0,col1,col2
0,1,A
1,2,B
2,3,C
3,4,D


## 2. Deleting a row in a DataFrame

In [8]:
# given the following DataFrame, delete row 'd' from it
df = pd.DataFrame({'col1': [1,2,3,4]}, index = ['a','b','c','d'])
df.drop('d',axis=0)

Unnamed: 0,col1
a,1
b,2
c,3


## 3. Creating a DataFrame from a few Series

In [11]:
# given the following three Series, create a DataFrame such that it holds them as its columns
ser_1 = pd.Series(np.random.randn(6))
ser_2 = pd.Series(np.random.randn(6))
ser_3 = pd.Series(np.random.randn(6))
pd.DataFrame(data=({'col1':ser_1,'col2':ser_2,'col3':ser_3}))

Unnamed: 0,col1,col2,col3
0,0.993821,0.515246,0.366261
1,1.726516,-0.934401,-0.939673
2,1.325029,-0.668072,-2.811176
3,1.018035,-0.371509,2.247626
4,-0.571155,-0.920005,-0.803278
5,0.394787,1.615187,0.930393


# Pandas questions: Indexing

## 1. Indexing into a specific column

In [16]:
# given the following DataFrame, try to index into the 'col_2' column
df = pd.DataFrame(data={'col_1': [0.12, 7, 45, 10], 'col_2': [0.9, 9, 34, 11]},
                  columns=['col_1', 'col_2', 'col_3'],
                  index=['obs1', 'obs2', 'obs3', 'obs4'])
df.col_2

obs1     0.9
obs2     9.0
obs3    34.0
obs4    11.0
Name: col_2, dtype: float64

## 2. Label-based indexing

In [19]:
# using the same DataFrame, index into the row whose index is 'obs3'
df.loc['obs3']

col_1     45
col_2     34
col_3    NaN
Name: obs3, dtype: object

## 2. Position-based indexing

In [20]:
# using the same DataFrame, index into into its first row
df.iloc[0]

col_1    0.12
col_2     0.9
col_3     NaN
Name: obs1, dtype: object

# Mini-Challenge prep: data loading

## 1. How to load the `users` and `movies` portions of MovieLens

In [110]:
import pandas as pd

users = pd.read_table('data/ml-1m/users.dat',
                      sep='::', header=None,
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'])

movies = pd.read_table('data/ml-1m/movies.dat',
                       sep='::', header=None,
                       names=['movie_id', 'title', 'genres'])

  """
  if __name__ == '__main__':


In [111]:
users = users.set_index('user_id',drop=False)
movies = movies.set_index('movie_id',drop=False)

In [113]:
users.head(3)

Unnamed: 0_level_0,user_id,gender,age,occupation,zip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,F,1,10,48067
2,2,M,56,16,70072
3,3,M,25,15,55117


## 2. How to load the training and testing subsets

In [31]:
# subset version (hosted notebook)
movielens_train = pd.read_csv('data/movielens_train.csv',index_col=0,encoding='latin-1')
movielens_test = pd.read_csv('data/movielens_test.csv', index_col=0,encoding='latin-1')

In [42]:
movielens_train.loc[106838]

user_id                          2000
movie_id                         1873
rating                              4
timestamp                   983844733
gender                              M
age                                18
occupation                          4
zip                             44685
title          Misérables, Les (1998)
genres                          Drama
for_testing                     False
Name: 106838, dtype: object

In [107]:
movielens_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
235597,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),Action|Sci-Fi,False
219003,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),Action|Adventure|Horror,False
685090,4666,1094,3,963843918,M,35,1,53704,"Crying Game, The (1992)",Drama|Romance|War,False
312377,3261,1095,4,968251750,M,45,20,87505,Glengarry Glen Ross (1992),Drama,False


In [33]:
movielens_test.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693323,4653,2648,4,975532459,M,35,12,95051,Frankenstein (1931),Horror,False
24177,2259,1270,4,974591524,F,56,16,70503,Back to the Future (1985),Comedy|Sci-Fi,False
202202,3032,1378,5,970343147,M,25,0,47303,Young Guns (1988),Action|Comedy|Western,False
262003,3029,2289,4,972846393,M,18,4,92037,"Player, The (1992)",Comedy|Drama,False
777848,4186,2403,3,1017931262,M,25,7,33308,First Blood (1982),Action,False


# Mini-Challenge prep: evaluation functions

These are the two functions that you will need to test your `estimate` method.

In [34]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [35]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)

Test a dummy solution!

In [36]:
def my_estimate_func(user_id, movie_id):
    return 3.0

You can test for performance with the following line, which assumes that your function is called `my_estimate_func`:

In [41]:
evaluate(my_estimate_func)

1.2323719526527521

In [43]:
print ('RMSE for my estimate function: %s' %evaluate(my_estimate_func))

RMSE for my estimate function: 1.23237195265


# Reco systems questions: Minimal reco engine v1.0

## 1. Simple collaborative filtering using mean ratings

In [45]:
# write an 'estimate' function that computes the mean rating of a particular user
def collab_mean(user_id, movie_id):
    # first, index into all ratings of this movie
    # second, compute the mean of those ratings
    # 
    movie_condition = movielens_train.movie_id==movie_id
    return movielens_train.loc[movie_condition, 'rating'].mean()

    
# try it out for a user_id, movie_id pair
collab_mean(4653, 2648)

4.0

# Mini-Challenge: first round
Implement an `estimate` function of your own using other similarity notions, eg.:

- collaborative filter based on age similarities
- collaborative filter based on zip code similarities
- collaborative filter based on occupation similarities
- content filter based on movie genre

In [81]:
import math

In [114]:
# write an 'estimate' function that computes the mean rating based on age similarities
def collab_age_mean(user_id, movie_id):
    #get all users with same age as the passed User
    user_list = users[users.age==users.loc[user_id]['age']]['user_id']
    #get all the rating for this movie by these users
    movie_condition = (movielens_train.movie_id==movie_id & movielens_train.user_id.isin(user_list)
                      & movielens_train.rating.notnull())
    #rating
    calc_rating = movielens_train.loc[movie_condition, 'rating'].mean()
    if math.isnan(calc_rating):
        # no similar users found, so return a default
        return 3
    else:
        return calc_rating
    
    
# try it out for a user_id, movie_id pair
collab_age_mean(4653, 2648)

3

In [115]:
evaluate(collab_age_mean)

1.2611571041624079

In [116]:
# write an 'estimate' function that computes the mean rating based on zip code similarities
def collab_zip_code_mean(user_id, movie_id):
    #get all users with same age as the passed User
    user_list = users[users.zip==users.loc[user_id]['zip']]['user_id']
    #get all the rating for this movie by these users
    movie_condition = (movielens_train.movie_id==movie_id & movielens_train.user_id.isin(user_list)
                      & movielens_train.rating.notnull())
    #rating
    calc_rating = movielens_train.loc[movie_condition, 'rating'].mean()
    if math.isnan(calc_rating):
        # no similar users found, so return a default
        return 3
    else:
        return calc_rating
    
    
# try it out for a user_id, movie_id pair
collab_zip_code_mean(4653, 2648)

3

In [117]:
evaluate(collab_zip_code_mean)

1.2402543628456708

In [118]:
# write an 'estimate' function that computes the mean rating based on occupation  similarities
def collab_occ_mean(user_id, movie_id):
    #get all users with same age as the passed User
    user_list = users[users.occupation==users.loc[user_id]['occupation']]['user_id']
    #get all the rating for this movie by these users
    movie_condition = (movielens_train.movie_id==movie_id & movielens_train.user_id.isin(user_list)
                      & movielens_train.rating.notnull())
    #rating
    calc_rating = movielens_train.loc[movie_condition, 'rating'].mean()
    if math.isnan(calc_rating):
        # no similar users found, so return a default
        return 3
    else:
        return calc_rating
    
    
# try it out for a user_id, movie_id pair
collab_occ_mean(4653, 2648)

3

In [119]:
evaluate(collab_occ_mean)

1.3306092550051076

In [120]:
# write an 'estimate' function that computes the mean rating based on movie genre
def content_genre_mean(user_id, movie_id):
    #get all movies of the same genre
    movie_list = movies[movies.genres==movies.loc[movie_id]['genres']]['movie_id']
    #get all the rating for movies of the same genre and is by this user
    movie_condition = (movielens_train.user_id==user_id & movielens_train.movie_id.isin(movie_list)
                      & movielens_train.rating.notnull())
    #rating
    calc_rating = movielens_train.loc[movie_condition, 'rating'].mean()
    if math.isnan(calc_rating):
        # no similar users found, so return a default
        return 3
    else:
        return calc_rating
    
    
# try it out for a user_id, movie_id pair
content_genre_mean(4653, 3948)

3

In [121]:
evaluate(content_genre_mean)

1.2323719526527521

# Mini-Challenge: second round
Implement an `estimate` function of your own using other custom similarity notions, eg.:

- euclidean
- cosine

In [123]:
from scipy.spatial import distance

In [127]:
class CollabEuclideanReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens_train.pivot_table('rating', index='movie_id', columns='user_id', fill_value=0)

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: distance.euclidean(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
reco = CollabEuclideanReco()
reco.learn()
print ('RMSE for CollabEuclideanReco: %s' %evaluate(reco.estimate))

RMSE for CollabEuclideanReco: 1.12537198265


In [128]:
class CollabCosineReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens_train.pivot_table('rating', index='movie_id', columns='user_id', fill_value=0)

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: distance.cosine(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
reco = CollabCosineReco()
reco.learn()
print ('RMSE for CollabCosineReco: %s' %evaluate(reco.estimate))

RMSE for CollabCosineReco: 1.12337750761
