Preprocessing of Yelp Dataset to Reviews for Restaurants in Toronto with User, Item, Rating Columns

In [1]:
import os 
os.chdir("farhaan/yel_data/")

# Import libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from __future__ import division
import time

# Read review dataset
review = pd.read_csv("yelp_academic_dataset_review.csv")
# review.shape

# Count number of unqiue users and items for review dataset
n_users_review = review.user_id.unique().shape[0]
n_items_review = review.business_id.unique().shape[0]


# Read business dataset
business = pd.read_csv("yelp_academic_dataset_business.csv")

# Group businesses by 'city'
city = business.groupby('city')['city'].count()

# Subset business to category 'Restaurants'
restaurant = business[business['categories'].str.contains("Restaurants",na=False)]

# Group restaurant businesses by city
city2 = restaurant.groupby('city')['city'].count()

# Filter restaurant businesses to city 'Toronto'
restaurant_toronto = restaurant.loc[restaurant['city'] == 'Toronto']

# Left join restaruants in toronto table with review table
review_rest_tor = pd.merge(restaurant_toronto, review, on='business_id', how='left')

# Subset to user, item, rating columns
uir = review_rest_tor[['user_id','business_id','stars_y']]
user_index = uir.user_id.unique()
item_index = uir.business_id.unique()

# Count number of unique users and items
n_users = uir.user_id.unique().shape[0]
n_items = uir.business_id.unique().shape[0]

Create User, Item Matrix and Filter down the dataset to 5 Core with every user and item with at least 5 ratings

In [2]:
# Split User, Item, Rating dataset to train and test sets of 70% & 30%
from sklearn.model_selection import train_test_split
train, test = train_test_split(uir, test_size=0.30, random_state=42)

# Create table for train data with list of users as index & items as columns
train_matrix = pd.DataFrame(index=user_index, columns=item_index)

# Fill in train_matrix table with ratings
for row in train.itertuples():
    user = row[1]
    item = row[2]
    train_matrix.loc[user][item] = row[3]  

# Create table for test data with list of users as index & items as columns    
test_matrix = pd.DataFrame(index=user_index, columns=item_index)

# Fill in test_matrix table with ratings
for row in test.itertuples():
    user = row[1]
    item = row[2]
    test_matrix.loc[user][item] = row[3]

# Begin filtering process to create 5 Core Subset

# Count number of rated items for each user
item_1 = train_matrix.apply(lambda x: x > 0, raw=True).sum(axis=1)
# item_1.value_counts()

# Filter down to the users with greater than or equal to 5 ratings
train1 = train_matrix
train1['item_1'] = item_1
train2 = train1.loc[train1['item_1'] >= 5]
# train2.shape

# Count number of rated users for each item
train2 = train2.drop('item_1',axis=1)
train3 = train2.transpose()
user_1 = train3.apply(lambda x: x > 0, raw=True).sum(axis=1)
# user_1.value_counts()

# Filter down to the items with greater than or equal to 5 ratings
train3['user_1'] = user_1
train4 = train3.loc[train3['user_1'] >= 5]
train4 = train4.drop('user_1',axis=1)
train5 = train4.transpose()

# Repeat the process for both user and item
item_2 = train5.apply(lambda x: x > 0, raw=True).sum(axis=1)
train5['item_2'] = item_2
train6 = train5.loc[train5['item_2'] >= 5]
train6 = train6.drop('item_2',axis=1)
train7 = train6.transpose()
user_2 = train7.apply(lambda x: x > 0, raw=True).sum(axis=1)
train7['user_2'] = user_2
train8 = train7.loc[train7['user_2'] >= 5]
train8 = train8.drop('user_2',axis=1)
train9 = train8.transpose()

# Check every user and item has at least 5 ratings
item_3 = train9.apply(lambda x: x > 0, raw=True).sum(axis=1)
user_3 = train9.apply(lambda x: x > 0, raw=True).sum(axis=1)

# Filter down the test matrix to filtered user and item in train matrix
test9 = test_matrix.loc[train9.index,train9.columns]


Repeat the same process for Restaurants in Phoenix

In [3]:
# Filter restaurant businesses to city 'Phoenix'
restaurant_phoenix = restaurant.loc[restaurant['city'] == 'Phoenix']


# Left join restaruants in phoenix table with review table
review_rest_pho = pd.merge(restaurant_phoenix, review, on='business_id', how='left')

# Subset to user, item, rating columns
uir_ph = review_rest_pho[['user_id','business_id','stars_y']]

# Assign index for user and item
user_index_ph = uir_ph.user_id.unique()
item_index_ph = uir_ph.business_id.unique()

# Count number of unique users and items
n_users_ph = uir_ph.user_id.unique().shape[0]
n_items_ph = uir_ph.business_id.unique().shape[0]

# Split User, Item, Rating dataset to train and test sets of 70% & 30%
from sklearn.model_selection import train_test_split
train_ph, test_ph = train_test_split(uir_ph, test_size=0.30, random_state=42)

# Create table for train data with list of users as index & items as columns
train_matrix_ph = pd.DataFrame(index=user_index_ph, columns=item_index_ph)

# Fill in train_matrix table with ratings
for row in train_ph.itertuples():
    user = row[1]
    item = row[2]
    train_matrix_ph.loc[user][item] = row[3]  

# Create table for test data with list of users as index & items as columns    
test_matrix_ph = pd.DataFrame(index=user_index_ph, columns=item_index_ph)

# Fill in test_matrix table with ratings
for row in test_ph.itertuples():
    user = row[1]
    item = row[2]
    test_matrix_ph.loc[user][item] = row[3]


# Count number of rated items for each user
item_1_ph = train_matrix_ph.apply(lambda x: x > 0, raw=True).sum(axis=1)

# Filter down to the users with greater than or equal to 5 ratings
train1_ph = train_matrix_ph
train1_ph['item_1'] = item_1_ph
train2_ph = train1_ph.loc[train1_ph['item_1'] >= 5]

# Count number of rated users for each item
train2_ph = train2_ph.drop('item_1',axis=1)
train3_ph = train2_ph.transpose()
user_1_ph = train3_ph.apply(lambda x: x > 0, raw=True).sum(axis=1)

# Filter down to the items with greater than or equal to 5 ratings
train3_ph['user_1'] = user_1_ph
train4_ph = train3_ph.loc[train3_ph['user_1'] >= 5]
train4_ph = train4_ph.drop('user_1',axis=1)
train5_ph = train4_ph.transpose()

# Repeat the process for both user and item
item_2_ph = train5_ph.apply(lambda x: x > 0, raw=True).sum(axis=1)

train5_ph['item_2'] = item_2_ph
train6_ph = train5_ph.loc[train5_ph['item_2'] >= 5]
train6_ph = train6_ph.drop('item_2',axis=1)
train7_ph = train6_ph.transpose()
user_2_ph = train7_ph.apply(lambda x: x > 0, raw=True).sum(axis=1)
train7_ph['user_2'] = user_2_ph
train8_ph = train7_ph.loc[train7_ph['user_2'] >= 5]
train8_ph = train8_ph.drop('user_2',axis=1)
train9_ph = train8_ph.transpose()
# train9_ph.shape

item_3_ph = train9_ph.apply(lambda x: x > 0, raw=True).sum(axis=1)

user_3_ph = train9_ph.apply(lambda x: x > 0, raw=True).sum(axis=1)

# Filter down the test matrix to filtered user and item in train matrix
test9_ph = test_matrix_ph.loc[train9_ph.index,train9_ph.columns]

Original Yelp dataset & User, Item, Rating dataset Summary

In [4]:
print '\nYelp Review Dataset'
print 'Size of original Yelp dataset, Review is %s ' % str(review.shape)
print 'Number of users is %s, number of items is %s ' % (str(n_users_review),str(n_items_review))
print '\nYelp Business Dataset'
print 'Size of original Yelp dataset, Business is %s ' % str(business.shape)
print 'Size of Restaurants subset of Business dataset is %s ' % str(restaurant.shape)
print '\nTop 10 Cities with the most number of restaurants are as follows \n%s ' % str(city2.sort_values(ascending=False).head(10))
print '\nRestaurants in Toronto Subset'
print 'Size of Restaurants in Toronto subset of Business dataset is %s ' % str(restaurant_toronto.shape)
print 'User, Item, Rating dataset for restuarnats in Toronto contain %s rows ' % str(len(uir))
print 'Number of users is %s, number of items is %s ' % (str(n_users),str(n_items))
print '\nRestaurants in Phoenix Subset'
print 'Size of Restaurants in Phoenix subset of Business dataset is %s ' % str(restaurant_phoenix.shape)
print 'User, Item, Rating dataset for restuarnats in Toronto contain %s rows ' % str(len(uir_ph))
print 'Number of users is %s, number of items is %s ' % (str(n_users_ph),str(n_items_ph))


Yelp Review Dataset
Size of original Yelp dataset, Review is (4153150, 10) 
Number of users is 1029432, number of items is 144072 

Yelp Business Dataset
Size of original Yelp dataset, Business is (144072, 16) 
Size of Restaurants subset of Business dataset is (48485, 16) 

Top 10 Cities with the most number of restaurants are as follows 
city
Toronto        6347
Las Vegas      5431
Phoenix        3353
Montréal       2852
Charlotte      2201
Pittsburgh     1990
Edinburgh      1412
Scottsdale     1356
Cleveland      1235
Mississauga    1128
Name: city, dtype: int64 

Restaurants in Toronto Subset
Size of Restaurants in Toronto subset of Business dataset is (6347, 16) 
User, Item, Rating dataset for restuarnats in Toronto contain 245127 rows 
Number of users is 58355, number of items is 6347 

Restaurants in Phoenix Subset
Size of Restaurants in Phoenix subset of Business dataset is (3353, 16) 
User, Item, Rating dataset for restuarnats in Toronto contain 266766 rows 
Number of users is

Train, Test Matrices of Users, Items & 5 Core Subset Summary

In [5]:
print '\nRestaurants in Toronto'
print '\nTrain dataset contains %s rows, Test dataset contains %s rows ' % (str(len(train)),str(len(test)))
print 'Size of train & test matrices with users in rows & items in columns is %s ' % (str(train_matrix.shape))
print '\nUsers of filtered 5 core subset has rated at least 5 items as follows  \n%s' % (str(item_3.value_counts().head(5)))
print '\nItems of filtered 5 core subset has been rated by at least 5 users as follows  \n%s' % (str(user_3.value_counts().head(5)))
print '\nSize of 5 core subset is %s ' % (str(train9.shape))
print '\nRestaurants in Phoenix'
print '\nTrain dataset contains %s rows, Test dataset contains %s rows ' % (str(len(train_ph)),str(len(test_ph)))
print 'Size of train & test matrices with users in rows & items in columns is %s ' % (str(train_matrix_ph.shape))
print '\nSize of 5 core subset is %s ' % (str(train9_ph.shape))


Restaurants in Toronto

Train dataset contains 171588 rows, Test dataset contains 73539 rows 
Size of train & test matrices with users in rows & items in columns is (58355, 6348) 

Users of filtered 5 core subset has rated at least 5 items as follows  
5    1419
6     946
7     663
8     568
9     436
dtype: int64

Items of filtered 5 core subset has been rated by at least 5 users as follows  
5    1419
6     946
7     663
8     568
9     436
dtype: int64

Size of 5 core subset is (7016, 3948) 

Restaurants in Phoenix

Train dataset contains 186736 rows, Test dataset contains 80030 rows 
Size of train & test matrices with users in rows & items in columns is (97476, 3354) 

Size of 5 core subset is (7032, 2153) 


Content-Based Recommender Systems

Preprocess features dataset for Restaurants in Toronto

In [17]:
feature = restaurant_toronto[['business_id','attributes','categories']]

# Extract features from column, categories & attributes for items(restaurants in Toronto)
feature1 = pd.DataFrame(feature['categories'].astype(str).str.split(',').tolist())
feature1 = feature1.apply(lambda x: x.str.lower())
feature1 = feature1.apply(lambda x: x.str.replace('u\'',''))
feature1 = feature1.apply(lambda x: x.str.replace('u\"',''))
feature1 = feature1.apply(lambda x: x.str.replace('\'',''))
feature1 = feature1.apply(lambda x: x.str.replace('[',''))
feature1 = feature1.apply(lambda x: x.str.replace(']',''))

# Extract features from column, attributes for items(restaurants in Toronto)
feature2 = pd.DataFrame(feature['attributes'].astype(str).str.split(',').tolist())
feature2 = feature2.apply(lambda x: x.str.lower())
feature2 = feature2.apply(lambda x: x.str.replace('u\'',''))
feature2 = feature2.apply(lambda x: x.str.replace('u\"',''))
feature2 = feature2.apply(lambda x: x.str.replace(r'.*: {',''))
feature2 = feature2.apply(lambda x: x.str.replace('}\"',''))
feature2 = feature2.apply(lambda x: x.str.replace('\'',''))
feature2 = feature2.apply(lambda x: x.str.replace(':',''))
feature2 = feature2.apply(lambda x: x.str.replace('[',''))
feature2 = feature2.apply(lambda x: x.str.replace(']',''))

feature1 = feature1.as_matrix()
feature2 = feature2.as_matrix()
features = np.concatenate((feature1, feature2),axis=1)
features = pd.DataFrame(features)

features.index = feature['business_id']
filtered = list(train9.columns.values)
features_filt = features[features.index.isin(filtered)]
number_of_features = pd.unique(features_filt.values.ravel())

Preprocess features dataset for Restaurants in Phoenix

In [18]:
feature_ph = restaurant_phoenix[['business_id','attributes','categories']]

# Extract features from column, categories & attributes for items(restaurants in Phoenix)
feature1_ph = pd.DataFrame(feature_ph['categories'].astype(str).str.split(',').tolist())
feature1_ph = feature1_ph.apply(lambda x: x.str.lower())
feature1_ph = feature1_ph.apply(lambda x: x.str.replace('u\'',''))
feature1_ph = feature1_ph.apply(lambda x: x.str.replace('u\"',''))
feature1_ph = feature1_ph.apply(lambda x: x.str.replace('\'',''))
feature1_ph = feature1_ph.apply(lambda x: x.str.replace('[',''))
feature1_ph = feature1_ph.apply(lambda x: x.str.replace(']',''))

# Extract features from column, attributes for items(restaurants in Phoenix)
feature2_ph = pd.DataFrame(feature_ph['attributes'].astype(str).str.split(',').tolist())
feature2_ph = feature2_ph.apply(lambda x: x.str.lower())
feature2_ph = feature2_ph.apply(lambda x: x.str.replace('u\'',''))
feature2_ph = feature2_ph.apply(lambda x: x.str.replace('u\"',''))
feature2_ph = feature2_ph.apply(lambda x: x.str.replace(r'.*: {',''))
feature2_ph = feature2_ph.apply(lambda x: x.str.replace('}\"',''))
feature2_ph = feature2_ph.apply(lambda x: x.str.replace('\'',''))
feature2_ph = feature2_ph.apply(lambda x: x.str.replace(':',''))
feature2_ph = feature2_ph.apply(lambda x: x.str.replace('[',''))
feature2_ph = feature2_ph.apply(lambda x: x.str.replace(']',''))

feature1_ph = feature1_ph.as_matrix()
feature2_ph = feature2_ph.as_matrix()
features_ph = np.concatenate((feature1_ph, feature2_ph),axis=1)
features_ph = pd.DataFrame(features_ph)

features_ph.index = feature_ph['business_id']
filtered_ph = list(train9_ph.columns.values)
features_filt_ph = features_ph[features_ph.index.isin(filtered_ph)]
number_of_features_ph = pd.unique(features_filt_ph.values.ravel())

Features Summary

In [19]:
print 'Restaurants in Toronto'
print '\nThere are %s features' % str(len(number_of_features))
print 'Examples of features are as follows \n%s ' % str(number_of_features[50:70])
print '\nRestaurants in Phoenix'
print '\nThere are %s features' % str(len(number_of_features_ph))
print 'Examples of features are as follows \n%s ' % str(number_of_features_ph[350:370])

Restaurants in Toronto

There are 536 features
Examples of features are as follows 
[' lunch false' ' dinner false' ' breakfast true' ' brunch true'
 ' hastv true' ' noiselevel average' ' restaurantspricerange2 1'
 ' restaurantsreservations true' ' restaurantstableservice true'
 ' wheelchairaccessible true' ' wifi free' 'restaurants' ' japanese'
 ' sushi bars' 'alcohol beer_and_wine' ' restaurantsdelivery true' 'korean'
 ' street true' ' caters false' ' goodforkids false'] 

Restaurants in Phoenix

There are 476 features
Examples of features are as follows 
[' kosher true' ' halal true' ' soy-free true' 'karaoke' 'latin american'
 ' cambodian' 'tapas/small plates' ' szechuan' ' local flavor'
 ' fruits & veggies' 'drugstores' 'bikeparking false' 'sports clubs'
 ' food stands' ' jazz & blues' 'caribbean' ' cuban' 'arts & entertainment'
 ' cupcakes' 'fondue'] 


Content-Based Naive Bayes Approach

In [20]:
def content_based_recommender(train, test, features):

    # For each user, retrieve item id's of rated items
    col = train.columns
    pref_known_train = train.apply(lambda x: x > 0, raw=True).apply(lambda x: list(col[x.values]), axis=1)

    cols = test.columns
    pref_known_test = test.apply(lambda x: x > 0, raw=True).apply(lambda x: list(cols[x.values]), axis=1)

    mean_rating = train.mean(axis=1)
    mean_user_rating = pd.concat([mean_rating] * len(train.columns), axis=1)
    mean_user_rating.columns = train.columns
    
    test_pred = pd.DataFrame(index=test.index, columns=test.columns)

    for m in range(len(train)):
        like_m = pd.DataFrame(data=None, columns=['feature','like','dislike'])
        like = 0
        dislike = 0
        total = 0
        for x1 in pref_known_train[m]:
            if train.loc[train.index[m]][x1] > 2:
                like = like + 1
                for z1 in np.array(features.loc[x1]):
                    if z1 is not None:
                        like_new1 = pd.DataFrame([[z1,1,0]],columns=['feature','like','dislike'])
                        like_m = like_m.append(like_new1)
                    
            else:
                dislike = dislike + 1
                for z1 in np.array(features.loc[x1]):
                    if z1 is not None:
                        like_new1 = pd.DataFrame([[z1,0,1]],columns=['feature','like','dislike'])
                        like_m = like_m.append(like_new1)
            total = total + 1
    
        prob_like =  like / total
        prob_dislike = dislike / total
        like_m_f = like_m.groupby("feature").sum()

        like_m_f['pL'] = (like_m_f['like'] + 0.1) / (like + 2*(0.1))
        like_m_f['pDL'] = (like_m_f['dislike'] + 0.1) / (dislike + 2*(0.1))

        for x2 in pref_known_test[m]:
            like_m_l = pd.DataFrame(data=None, columns=['feature'])
            for z2 in np.array(features.loc[x2]):
                if z2 is not None:
                    like_new2 = pd.DataFrame([[z2]],columns=['feature'])
                    like_m_l = like_m_l.append(like_new2)
        
            like_m_l_f = like_m_l.feature.unique()

            for w in like_m_l_f:
                if w not in like_m_f.index:
                    like_new = pd.DataFrame([[0,0,0.1/(like+(2*0.1)),0.1/(dislike+(2*0.1))]],index=[w],columns=['like','dislike','pL','pDL'])
                    like_m_f = like_m_f.append(like_new) 
        
            pLiked = prob_like
            for v in np.array(features.loc[x2]):
                if v is not None:
                    pLiked = pLiked * like_m_f.loc[str(v)]['pL']
                else:
                    pass
            
            pDisliked = prob_dislike
            for v2 in np.array(features.loc[x2]):
                if v2 is not None:
                    pDisliked = pDisliked * like_m_f.loc[str(v2)]['pDL']
                else:
                    pass
    
            P1 = pLiked / (pLiked+pDisliked)
            P2 = pDisliked / (pLiked+pDisliked)
            val = np.log(P1/P2)
            rat = 1/(1+np.exp(-val))
            rating = rat*4 +1 
            rating

            test_pred.loc[train.index[m]][x2] = rating

    return test_pred

In [21]:
start_time = time.time()
test_pred = content_based_recommender(train9, test9, features_filt)
print 'Content Based Recommender for Restaurant in Toronto took %s seconds' % (time.time() - start_time)



Content Based Recommender for Restaurant in Toronto took 8280.69850421 seconds


In [22]:
start_time = time.time()
test_pred_ph = content_based_recommender(train9_ph, test9_ph, features_filt_ph)
print 'Content Based Recommender for Restaurant in Phoenix took %s seconds' % (time.time() - start_time)



Content Based Recommender for Restaurant in Phoenix took 7056.17153001 seconds


In [23]:
test_pred_0 = test_pred.fillna(0)
test_pred_0_mat = test_pred_0.as_matrix()
pred_list = list(test_pred.values.ravel())
actual_list = list(test9.values.ravel())
test_pred_0_ph = test_pred_ph.fillna(0)
test_pred_0_mat_ph = test_pred_0_ph.as_matrix()
pred_list_ph = list(test_pred_ph.values.ravel())
actual_list_ph = list(test9_ph.values.ravel())
print 'Content-Based Recommenders Sytems for Toronto Mean Absolute Error: ' + str(round(mae(test_pred_0_mat,test9_0_mat),4))
print 'Content-Based Recommenders Sytems for Phoenix Mean Absolute Error: ' + str(round(mae(test_pred_0_mat_ph,test9_0_mat_ph),4))

Content-Based Recommenders Sytems for Toronto Mean Absolute Error: 1.5151
Content-Based Recommenders Sytems for Phoenix Mean Absolute Error: 1.4289
