# Exercises for "Hands-on with Pydata: How to Build a Minimal Recommendation Engine"

# Systems check: imports and files

In [1]:
import numpy as np
import pandas as pd

# Pandas questions: Series and DataFrames
## 1. Adding a column in a DataFrame

In [3]:
# given the following DataFrame, add a new column to it
df = pd.DataFrame({'col1': [1, 2, 3, 4]})
# assign() returns a new frame with the extra column appended on the right
df = df.assign(col2=[5, 6, 7, 8])
df

Unnamed: 0,col1,col2
0,1,5
1,2,6
2,3,7
3,4,8


## 2. Deleting a row in a DataFrame

In [20]:
# given the following DataFrame, delete row 'd' from it
row_labels = list('abcd')
df = pd.DataFrame({'col1': list(range(1, 5))}, index=row_labels)
df

Unnamed: 0,col1
a,1
b,2
c,3
d,4


In [21]:
# Drop by label rather than by position: the exercise asks for row 'd', and a
# positional drop would remove a different row if this cell were run twice.
# errors='ignore' keeps the cell idempotent once 'd' is already gone.
df = df.drop(index='d', errors='ignore')


In [22]:
# display the frame again to confirm which rows remain after the drop
df

Unnamed: 0,col1
a,1
b,2
c,3


## 3. Creating a DataFrame from a few Series

In [23]:
# given the following three Series, create a DataFrame such that it holds them as its columns
# A locally seeded generator makes the exercise data identical on every
# Restart & Run All without touching numpy's global random state.
rng = np.random.RandomState(42)
ser_1 = pd.Series(rng.randn(6))
ser_2 = pd.Series(rng.randn(6))
ser_3 = pd.Series(rng.randn(6))


In [6]:
# small integer Series (1..6) used as a scratch example
a = pd.Series(list(range(1, 7)))
a

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64

In [25]:
# inspect the first of the three random Series
ser_1

0   -1.322410
1   -2.405790
2   -0.544094
3    0.153279
4    0.467040
5   -0.090587
dtype: float64

In [30]:
# pair each column label with its Series, then build the frame from the mapping
column_names = ['col_1', 'col_2', 'col_3']
df1 = pd.DataFrame(dict(zip(column_names, [ser_1, ser_2, ser_3])))
df1

Unnamed: 0,col_1,col_2,col_3
0,-1.32241,0.747793,0.296182
1,-2.40579,0.334234,0.92839
2,-0.544094,0.372444,-1.108817
3,0.153279,-2.09936,-0.358042
4,0.46704,0.252609,-0.69691
5,-0.090587,0.508084,-0.918022


In [38]:
# NOTE(review): leftover scratch cell — presumably IPython's `clear` automagic;
# the recorded output is just a raw terminal escape sequence. Safe to delete.
clear

[H[2J

# Pandas questions: Indexing

## 1. Indexing into a specific column

In [39]:
# given the following DataFrame, try to index into the 'col_2' column
# 'col_3' is requested in `columns` but absent from the data, so it is all-NaN
df = pd.DataFrame(
    data={'col_1': [0.12, 7, 45, 10], 'col_2': [0.9, 9, 34, 11]},
    columns=['col_%d' % i for i in range(1, 4)],
    index=['obs%d' % i for i in range(1, 5)],
)
df

Unnamed: 0,col_1,col_2,col_3
obs1,0.12,0.9,
obs2,7.0,9.0,
obs3,45.0,34.0,
obs4,10.0,11.0,


## 2. Label-based indexing

In [41]:
# using the same DataFrame, index into the row whose index is 'obs3'

# .loc selects by label; the list form ['obs3'] keeps the result a one-row
# DataFrame (a bare 'obs3' would return a Series instead)
df.loc[['obs3'], :]

Unnamed: 0,col_1,col_2,col_3
obs3,45.0,34.0,


## 3. Position-based indexing

In [46]:
# using the same DataFrame, index into its first row
# .iloc is the purely positional indexer; the old .ix indexer mixed label and
# position semantics and no longer exists in current pandas
df.iloc[0]

col_1    0.12
col_2     0.9
col_3     NaN
Name: obs1, dtype: object

# Mini-Challenge prep: data loading

## 1. How to load the `users` and `movies` portions of MovieLens

In [39]:
import pandas as pd

# Shared parser options for the three '::'-delimited .dat files:
# - a multi-character separator is only supported by the python parsing engine,
#   so request it explicitly instead of relying on a warning + fallback;
# - latin-1 matches the encoding used for the CSV subsets in this notebook
#   (presumably needed for non-ASCII movie titles — harmless for ASCII files).
DAT_OPTIONS = dict(sep='::', header=None, engine='python', encoding='latin-1')

users = pd.read_table('users.dat',
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'],
                      **DAT_OPTIONS)

movies = pd.read_table('movies.dat',
                       names=['movie_id', 'title', 'genres'],
                       **DAT_OPTIONS)

ratings = pd.read_table('ratings.dat',
                        names=['user_id', 'movie_id', 'rating', 'timestamp'],
                        **DAT_OPTIONS)




In [40]:
# join ratings with user attributes, then with movie attributes
# (each merge uses the columns the two frames have in common)
ratings_with_users = pd.merge(ratings, users)
movielens = pd.merge(ratings_with_users, movies)

## 2. How to load the training and testing subsets

In [4]:
# subset version (hosted notebook)
# both files share the same layout: first column is the row index, latin-1 text
movielens_train, movielens_test = (
    pd.read_csv(csv_path, index_col=0, encoding='latin-1')
    for csv_path in ('movielens_train.csv', 'movielens_test.csv')
)

In [5]:
# peek at the first rows of the training subset
movielens_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
235597,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),Action|Sci-Fi,False
219003,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),Action|Adventure|Horror,False
685090,4666,1094,3,963843918,M,35,1,53704,"Crying Game, The (1992)",Drama|Romance|War,False
312377,3261,1095,4,968251750,M,45,20,87505,Glengarry Glen Ross (1992),Drama,False


In [16]:
# peek at the first rows of the testing subset
movielens_test.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693323,4653,2648,4,975532459,M,35,12,95051,Frankenstein (1931),Horror,False
24177,2259,1270,4,974591524,F,56,16,70503,Back to the Future (1985),Comedy|Sci-Fi,False
202202,3032,1378,5,970343147,M,25,0,47303,Young Guns (1988),Action|Comedy|Western,False
262003,3029,2289,4,972846393,M,18,4,92037,"Player, The (1992)",Comedy|Drama,False
777848,4186,2403,3,1017931262,M,25,7,33308,First Blood (1982),Action,False


# Mini-Challenge prep: evaluation functions

These are the two functions that you will need to test your `estimate` method.

In [6]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error.

    Parameters
    ----------
    y_pred, y_true : array-like of numbers
        Predicted and true values, compared position by position. Lists,
        tuples, numpy arrays and pandas Series are all accepted.

    Returns
    -------
    float
        sqrt(mean((y_pred - y_true) ** 2)).
    """
    # Convert to plain float arrays first: this makes list inputs work and
    # stops pandas from aligning two Series on their index labels (which would
    # silently produce NaN when the labels differ).
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.asarray(y_true, dtype=float)
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [7]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas.

    Calls `estimate_f(user_id, movie_id)` once per row of `movielens_test`
    and returns the RMSE between those estimates and the recorded ratings.
    """
    predictions = []
    for user, movie in zip(movielens_test.user_id, movielens_test.movie_id):
        predictions.append(estimate_f(user, movie))

    actual_ratings = movielens_test.rating.values
    return compute_rmse(np.array(predictions), actual_ratings)

Test a dummy solution!

In [10]:
def my_estimate_func(user_id, movie_id):
    """Dummy estimator: ignores both ids and always predicts 3.0."""
    constant_rating = 3.0
    return constant_rating

You can test for performance with the following line, which assumes that your function is called `my_estimate_func`:

In [11]:
# score the dummy estimator on the test subset and report it
rmse_dummy = evaluate(my_estimate_func)
print('RMSE for my estimate function: %s' % rmse_dummy)

RMSE for my estimate function: 1.23237195265


# Reco systems questions: Minimal reco engine v1.0

## 1. Simple collaborative filtering using mean ratings

In [13]:
# write an 'estimate' function that computes the mean rating of a particular user
def collab_mean(user_id, movie_id):
    """Estimate a rating as the mean of all training ratings given by `user_id`.

    Parameters
    ----------
    user_id : the user whose rating is being estimated.
    movie_id : accepted so the function matches the `estimate(user_id, movie_id)`
        signature expected by `evaluate`; it does not influence the result.

    Returns
    -------
    float
        Mean of the user's ratings in `movielens_train`, or 3.0 when the user
        has no ratings there (the same default the other estimators in this
        notebook use, instead of propagating NaN into the RMSE).
    """
    # select every training rating made by this user
    user_condition = movielens_train.user_id == user_id
    user_ratings = movielens_train.loc[user_condition, 'rating']
    if user_ratings.empty:
        return 3.0
    return user_ratings.mean()

    
# try it out for a user_id, movie_id pair
collab_mean(user_id=4653, movie_id=2648)
# then score it over the whole test subset
rmse_collab_mean = evaluate(collab_mean)
print('RMSE for my estimate function: %s' % rmse_collab_mean)

RMSE for my estimate function: 1.23078247597


In [15]:
# mean training rating per gender
movielens_train.groupby(by='gender')['rating'].agg('mean')

gender
F    3.594928
M    3.530507
Name: rating, dtype: float64

In [17]:
# mean training rating for every (gender, age) combination
grouping_keys = ['gender', 'age']
movielens_train.groupby(by=grouping_keys)['rating'].agg('mean')

gender  age
F       1      3.500000
        18     3.528958
        25     3.548507
        35     3.730104
        45     3.581818
        50     3.617978
        56     3.725490
M       1      3.305556
        18     3.507712
        25     3.489764
        35     3.569591
        45     3.565574
        50     3.728125
        56     3.611111
Name: rating, dtype: float64

In [18]:
# re-display the first training rows as a reminder of the available columns
movielens_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
235597,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),Action|Sci-Fi,False
219003,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),Action|Adventure|Horror,False
685090,4666,1094,3,963843918,M,35,1,53704,"Crying Game, The (1992)",Drama|Romance|War,False
312377,3261,1095,4,968251750,M,45,20,87505,Glengarry Glen Ross (1992),Drama,False


In [20]:
# user x movie matrix of training ratings (one row per user, one column per movie)
movie_matrix = movielens_train.pivot_table(
    index='user_id',
    columns='movie_id',
    values='rating',
)

In [21]:
# first rows of the user x movie matrix; NaN marks pairs without a training rating
movie_matrix.head()

movie_id,1,2,4,5,6,7,10,11,12,13,...,3928,3929,3930,3932,3943,3945,3947,3948,3949,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
13,,,,,,,,,,,...,,,,,,,,,,
15,,,,,,,,,,,...,,,,,,,,,,


In [22]:
# label-based slice (both endpoints inclusive): user ids 11..16, movie ids 1196..1200;
# only labels that actually exist in the matrix show up in the result
movie_matrix.loc[11:16, 1196:1200]

movie_id,1196,1197,1198,1199,1200
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13,,,,,
15,,,,,


In [24]:
# same (gender, age) means as computed earlier, shown here for comparison with
# the pivot_table form
movielens_train.groupby(by=['gender', 'age'])['rating'].agg('mean')

gender  age
F       1      3.500000
        18     3.528958
        25     3.548507
        35     3.730104
        45     3.581818
        50     3.617978
        56     3.725490
M       1      3.305556
        18     3.507712
        25     3.489764
        35     3.569591
        45     3.565574
        50     3.728125
        56     3.611111
Name: rating, dtype: float64

In [26]:
# mean rating laid out as an age (rows) x gender (columns) table
movielens_train.pivot_table(
    index='age',
    columns='gender',
    values='rating',
    aggfunc='mean',
)

gender,F,M
age,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.5,3.305556
18,3.528958,3.507712
25,3.548507,3.489764
35,3.730104,3.569591
45,3.581818,3.565574
50,3.617978,3.728125
56,3.72549,3.611111


In [27]:
# mean and standard deviation of ratings per age (rows) x gender (columns).
# Aggregations are named by string so pandas uses its own implementations;
# passing numpy callables here is deprecated in recent pandas releases.
movielens_train.pivot_table(values='rating', index='age', columns='gender',
                            aggfunc=['mean', 'std'])

Unnamed: 0_level_0,mean,mean,std,std
gender,F,M,F,M
age,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,3.5,3.305556,1.242757,1.335765
18,3.528958,3.507712,1.162283,1.151606
25,3.548507,3.489764,1.146094,1.101005
35,3.730104,3.569591,0.984159,1.112843
45,3.581818,3.565574,1.18385,1.082775
50,3.617978,3.728125,1.049953,1.009899
56,3.72549,3.611111,0.981396,1.073106


## Minimal reco engine v1.1: implicit sim functions

In [30]:
# user attributes keyed by user_id, for direct lookups by id
users_info = users.set_index(keys='user_id')
users_info.head()

Unnamed: 0_level_0,gender,age,occupation,zip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455


# Mini-Challenge: first round
Implement an `estimate` function of your own using other similarity notions, e.g.:

- collaborative filter based on age similarities
- collaborative filter based on zip code similarities
- collaborative filter based on occupation similarities
- content filter based on movie genre

# Mini-Challenge: second round
Implement an `estimate` function of your own using other custom similarity notions, e.g.:

- euclidean
- cosine

In [None]:
#collaborative filter based on age similarities

In [32]:
def collab_age(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on age.

    Estimates the rating as the mean training rating given to `movie_id` by
    other users whose `age` value equals that of `user_id`.

    Returns
    -------
    float
        - 3.0 if no other user rated the movie in `movielens_train`;
        - the mean rating among other users with the same age, if any;
        - otherwise the unweighted mean of the per-age means.
    """

    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty:
        return 3.0

    # one row (this movie), one column per age value seen among the raters
    means_by_age = ratings_by_others.pivot_table(values='rating', index='movie_id', columns='age')
    # .loc replaces the removed .ix indexer; both lookups here are label-based
    user_age = users_info.loc[user_id, 'age']
    if user_age in means_by_age.columns:
        return means_by_age.loc[movie_id, user_age]
    return means_by_age.loc[movie_id].mean()

# score the age-based estimator on the test subset and report it
rmse_collab_age = evaluate(collab_age)
print('RMSE for collab_age: %s' % rmse_collab_age)

RMSE for collab_age: 1.20520133441


In [None]:
#collaborative filter based on zip code similarities

In [33]:
def collab_zip(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on zip code.

    Estimates the rating as the mean training rating given to `movie_id` by
    other users whose `zip` value equals that of `user_id`.

    Returns
    -------
    float
        - 3.0 if no other user rated the movie in `movielens_train`;
        - the mean rating among other users with the same zip, if any;
        - otherwise the unweighted mean of the per-zip means.
    """

    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty:
        return 3.0

    # one row (this movie), one column per zip value seen among the raters
    means_by_zip = ratings_by_others.pivot_table(values='rating', index='movie_id', columns='zip')
    # .loc replaces the removed .ix indexer; both lookups here are label-based
    user_zip = users_info.loc[user_id, 'zip']
    if user_zip in means_by_zip.columns:
        return means_by_zip.loc[movie_id, user_zip]
    return means_by_zip.loc[movie_id].mean()

# score the zip-based estimator on the test subset and report it
rmse_collab_zip = evaluate(collab_zip)
print('RMSE for collab_zip: %s' % rmse_collab_zip)

RMSE for collab_zip: 1.12566403192


In [None]:
#collaborative filter based on occupation similarities

In [34]:
def collab_occupation(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on occupation.

    Estimates the rating as the mean training rating given to `movie_id` by
    other users whose `occupation` value equals that of `user_id`.

    Returns
    -------
    float
        - 3.0 if no other user rated the movie in `movielens_train`;
        - the mean rating among other users with the same occupation, if any;
        - otherwise the unweighted mean of the per-occupation means.
    """

    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty:
        return 3.0

    # one row (this movie), one column per occupation value seen among the raters
    means_by_occupation = ratings_by_others.pivot_table(values='rating', index='movie_id', columns='occupation')
    # .loc replaces the removed .ix indexer; both lookups here are label-based
    user_occupation = users_info.loc[user_id, 'occupation']
    if user_occupation in means_by_occupation.columns:
        return means_by_occupation.loc[movie_id, user_occupation]
    return means_by_occupation.loc[movie_id].mean()

# score the occupation-based estimator on the test subset and report it
rmse_collab_occupation = evaluate(collab_occupation)
print('RMSE for collab_occupation: %s' % rmse_collab_occupation)

RMSE for collab_occupation: 1.20287696436


# Mini-Challenge: second round
Implement an `estimate` function of your own using other custom similarity notions, e.g.:

- euclidean
- cosine

# Custom similarity function

In [35]:
def euclidean(s1, s2):
    """Take two pd.Series objects and return their euclidean 'similarity'.

    The Series are aligned on their index and only positions where both have
    a value contribute to the distance. The similarity is 1 / (1 + distance),
    so identical overlapping values give 1.0 and larger distances approach 0.

    Returns 0.0 when the two Series share no non-missing position: without
    any common evidence the distance would otherwise be computed as 0 and the
    pair would wrongly receive the maximum similarity of 1.0.
    """
    # subtraction aligns on index; entries missing on either side become NaN
    diff = (s1 - s2).dropna()
    if diff.empty:
        return 0.0
    return 1 / (1 + np.sqrt(np.sum(diff ** 2)))

In [41]:
class Collabeuclidean:
    """ Collaborative filtering using a custom sim(u,u') (euclidean).

    Call `learn()` once before `estimate()`; `estimate` has the
    `(user_id, movie_id)` signature expected by `evaluate`.
    """

    def learn(self):
        """ Prepare datastructures for estimation.

        Builds `self.all_user_profiles`: a movie_id x user_id table of ratings,
        i.e. one rating-profile column per user.
        """
        # NOTE(review): profiles come from the full `movielens` frame, not from
        # `movielens_train`. If the test rows are part of `movielens`, the
        # ratings being predicted leak into the similarities — confirm intent.
        self.all_user_profiles = movielens.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by euclidean similarity.

        Returns 3.0 when no other user rated `movie_id` in the training set;
        otherwise the average of the other users' ratings for that movie,
        weighted by the similarity between each rater's profile and the
        profile of `user_id` (plain mean if no similarity is positive).
        """

        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty:
            return 3.0

        # Re-index on a new frame instead of mutating the selection in place:
        # in-place changes on a slice of movielens_train trigger pandas'
        # SettingWithCopyWarning and make the cell depend on hidden state.
        ratings_by_others = ratings_by_others.set_index('user_id')
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        # one similarity per rater: compare each rater's column with the user's
        sims = their_profiles.apply(lambda profile: euclidean(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
# build the recommender, precompute the user profiles, then score it
reco = Collabeuclidean()
reco.learn()
rmse_collabeuclidean = evaluate(reco.estimate)
print('RMSE for Collabeuclidean: %s' % rmse_collabeuclidean)

RMSE for Collabeuclidean: 1.11215994339
