# Exercises for "Hands-on with Pydata: How to Build a Minimal Recommendation Engine"

# Systems check: imports and files

In [40]:
import numpy as np
import pandas as pd

# Pandas questions: Series and DataFrames
## 1. Adding a column in a DataFrame

In [41]:
# given the following DataFrame, add a new column to it
df = pd.DataFrame({'col1': [1,2,3,4]})
df['col2']=[5,6,7,8]
df

Unnamed: 0,col1,col2
0,1,5
1,2,6
2,3,7
3,4,8


## 2. Deleting a row in a DataFrame

In [42]:
# given the following DataFrame, delete row 'd' from it
df = pd.DataFrame({'col1': [1,2,3,4]}, index = ['a','b','c','d'])
df.drop('a',axis=0)

Unnamed: 0,col1
b,2
c,3
d,4


## 3. Creating a DataFrame from a few Series

In [43]:
# given the following three Series, create a DataFrame such that it holds them as its columns
ser_1 = pd.Series(np.random.randn(6))
ser_2 = pd.Series(np.random.randn(6))
ser_3 = pd.Series(np.random.randn(6))
df=pd.DataFrame({'1':ser_1,'2':ser_2, '3':ser_3})
df

Unnamed: 0,1,2,3
0,2.331766,-0.671638,-1.220701
1,0.668223,-0.503286,0.041535
2,0.059036,0.037603,-1.14125
3,-1.292859,-1.51756,-0.17125
4,-3.433791,0.442741,-1.076131
5,1.53349,-0.572064,-1.716788


# Pandas questions: Indexing

## 1. Indexing into a specific column

In [44]:
# given the following DataFrame, try to index into the 'col_2' column
df = pd.DataFrame(data={'col_1': [0.12, 7, 45, 10], 'col_2': [0.9, 9, 34, 11]},
                  columns=['col_1', 'col_2', 'col_3'],
                  index=['obs1', 'obs2', 'obs3', 'obs4'])
df.col_1

obs1     0.12
obs2     7.00
obs3    45.00
obs4    10.00
Name: col_1, dtype: float64

## 2. Label-based indexing

In [45]:
# using the same DataFrame, index into the row whose index is 'obs3'
df.loc['obs3']

col_1     45
col_2     34
col_3    NaN
Name: obs3, dtype: object

## 2. Position-based indexing

In [46]:
# using the same DataFrame, index into into its first row
df.iloc[0,0]

0.12

# Mini-Challenge prep: data loading

## 1. How to load the `users` and `movies` portions of MovieLens

In [47]:
import pandas as pd

users = pd.read_table(r'C:\Users\prati\Downloads\recommendation system\ml-1m\users.dat',
                      sep='::', header=None,
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'], engine='python')

movies = pd.read_table( r'C:\Users\prati\Downloads\recommendation system\ml-1m\movies.dat',
                       sep='::', header=None,
                       names=['movie_id', 'title', 'genres'], engine='python') 
    

In [48]:
# subset version (hosted notebook)
movielens_train = pd.read_csv(r'C:\Users\prati\Downloads\recommendation system\movielens_train.csv', index_col=0, encoding='latin')
movielens_test = pd.read_csv(r'C:\Users\prati\Downloads\recommendation system\movielens_test.csv', index_col=0, encoding='latin')

In [49]:
movielens_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
372194,1327,73,2,989689722,M,1,10,12159,"Misérables, Les (1995)",Drama|Musical,False
905143,4663,1228,4,963866233,M,25,4,92037,Raging Bull (1980),Drama,False
369635,3097,1639,3,969910209,F,25,5,10021,Chasing Amy (1997),Drama|Romance,False
999679,1470,3212,3,974838528,M,18,4,94118,Born to Win (1971),Drama,False
745667,3940,2409,3,965748252,M,35,20,32708,Rocky II (1979),Action|Drama,False


In [50]:
movielens_test.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
563690,2637,3101,4,973574903,F,35,20,44303,Fatal Attraction (1987),Thriller,False
358410,3836,3500,3,966413822,F,50,8,84770,Mr. Saturday Night (1992),Comedy|Drama,False
249435,4950,3006,4,962638458,M,25,16,55421,"Insider, The (1999)",Drama,False
957528,1117,781,4,975693004,M,18,14,10017,Stealing Beauty (1996),Drama,False
694570,3304,2160,5,968007128,F,45,5,92649,Rosemary's Baby (1968),Horror|Thriller,False


# Mini-Challenge prep: evaluation functions

These are the two functions that you will need to test your `estimate` method.

In [51]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [52]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)

Test a dummy solution!

In [53]:
def my_estimate_func(user_id, movie_id):
    return 3.0

You can test for performance with the following line, which assumes that your function is called `my_estimate_func`:

In [54]:
print ('RMSE for my estimate function: %s' % evaluate(my_estimate_func))

RMSE for my estimate function: 1.283038170580714


# Reco systems questions: Minimal reco engine v1.0

## 1. Simple collaborative filtering using mean ratings

In [55]:
# write an 'estimate' function that computes the mean rating of a particular user
def collab_mean(user_id, movie_id):
    # first, index into all ratings of this movie
    # second, compute the mean of those ratings
        
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    if movielens_train.loc[user_condition & movie_condition].empty:
        return(3)
    else:
        a= movielens_train.loc[user_condition & movie_condition]
        b= a.rating
        return b.mean()

    
# try it out for a user_id, movie_id pair
collab_mean(4653, 2648)

4.0

# Mini-Challenge: first round
Implement an `estimate` function of your own using other similarity notions, eg.:

- collaborative filter based on age similarities
- collaborative filter based on zip code similarities
- collaborative filter based on occupation similarities
- content filter based on movie genre

In [56]:
user_info = users.set_index('user_id')
user_info.head(5)
user_id = 3
user_info.loc[user_id, 'age']

25

In [57]:
#pd.__version__ == '1.0.0'
class CollabAgeReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_age = movielens_train.pivot_table('rating', index='movie_id', columns='age')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if movie_id not in self.means_by_age.index: 
            return 3.0
        
        user_age = user_info.loc[user_id, 'age']
        if ~np.isnan(self.means_by_age.loc[movie_id, user_age]):
            return self.means_by_age.loc[movie_id, user_age]
        else:
            return self.means_by_age.loc[movie_id].mean()

#try .ix in place of .loc if it supports 
reco = CollabAgeReco()
reco.learn()
print('RMSE for CollabGenderReco: %s' % evaluate(reco.estimate))

RMSE for CollabGenderReco: 1.2297001526594797


# Mini-Challenge: second round
Implement an `estimate` function of your own using other custom similarity notions, eg.:

- euclidean
- cosine

In [58]:
def euclidean(s1, s2):
    """Take two pd.Series objects and return their euclidean 'similarity'."""
    diff = s1 - s2
    return 1 / (1 + np.sqrt(np.sum(diff ** 2)))

In [59]:
users = pd.read_table(r'C:\Users\prati\Downloads\recommendation system\ml-1m\users.dat',
                      sep='::', header=None, 
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'], engine='python')

ratings = pd.read_table(r'C:\Users\prati\Downloads\recommendation system\ml-1m\ratings.dat',
                        sep='::', header=None, 
                        names=['user_id', 'movie_id', 'rating', 'timestamp'], engine='python')

movies = pd.read_table(r'C:\Users\prati\Downloads\recommendation system\ml-1m\movies.dat',
                       sep='::', header=None, 
                       names=['movie_id', 'title', 'genres'], engine='python')

In [60]:
movielens = pd.merge(pd.merge(ratings, users), movies)
class CollabeuclideanReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: euclidean(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
reco = CollabeuclideanReco()
reco.learn()
print('RMSE for CollabPearsonReco: %s' % evaluate(reco.estimate))

RMSE for CollabPearsonReco: 1.1243627596220371
