In [2]:
import pandas as pd
import numpy as np

from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn import base
from ast import literal_eval
from tqdm import tqdm_notebook as tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler


%matplotlib inline

In [16]:
train_df = pd.read_csv("/Users/saranggrover/Desktop/yelp-dataset/train.csv")
test_df = pd.read_csv("/Users/saranggrover/Desktop/yelp-dataset/test.csv")
val_df = pd.read_csv("/Users/saranggrover/Desktop/yelp-dataset/validation.csv")


In [17]:
train_df.shape

(1076577, 12)

In [18]:
train_df.head()

Unnamed: 0,business_id,date,review_id,text,user_id,attributes,categories,city,name,restaurant_rating,average_stars,stars
0,e41TP5cXZqSrz50xCBJqZw,2018-08-27 13:22:18,7MDCINszzBuYDUQbMU585g,"Wowowow I really like Insomnia, and the eggs b...",O3q-nwYZykMmacxjru01Zg,"{'BestNights': ""{'monday': False, 'tuesday': F...","French, Bars, Caterers, Canadian (New), Breakf...",Toronto,Insomnia Restaurant & Lounge,4.0,3.45,4.0
1,OPcGFLP_mRt9yvgP3ALXOQ,2016-10-02 21:36:35,l5E3lkEI2S6_-JyCD4iiKA,This place was just ok as I wasn't too impress...,gmCK2ARSx22S_4_RQF9Jwg,"{'Smoking': ""u'no'"", 'CoatCheck': 'False', 'Ha...","Bars, Restaurants, Latin American, Nightlife, ...",Pittsburgh,Totopo Mexican Kitchen and Bar,4.5,3.77,3.0
2,2zmsElyBgX5MT7dWoxsQlA,2017-12-29 18:13:17,RPw1fCfDUwpZE4-qKbXikg,"If you have the option, go elsewhere \n\nThis ...",AMcup3kbJXK2ufixQdxHVg,"{'RestaurantsPriceRange2': '2', 'OutdoorSeatin...","Food, Chinese, Ethnic Food, Imported Food, Spe...",Pickering,Paul Wong's Fine Chinese Cuisine,2.5,3.42,2.0
3,pxv5jBUOgsg6Rf6Z88e4mg,2014-10-03 15:38:54,ud9gtP6EH-qpB3mN9db5xg,Located upstairs this restaurant bar is a neig...,Qu8a6nCM_0-TJejCSsh2xA,"{'BusinessParking': ""{'garage': False, 'street...","Restaurants, Nightlife, American (New), Pizza,...",Phoenix,The Attic Ale House,4.0,4.66,5.0
4,AIasveRX0245WeAH0C46Uw,2016-08-01 22:15:33,wBpaUqbXDDJCXmkK1I34uA,I came to Chick Fiesta expecting decent Portug...,tU94-C1zpBsfGFvpsJJr2w,"{'WheelchairAccessible': 'True', 'Alcohol': ""u...","South African, Halal, Portuguese, Steakhouses,...",Toronto,Chick Fiesta,4.0,3.42,2.0


In [19]:
val_df.head()

Unnamed: 0,business_id,date,review_id,text,user_id,attributes,categories,city,name,restaurant_rating,average_stars,stars
0,b_cb87bZ4W87RqXxbJeuLg,2015-03-23 04:50:17,dqy7akXNNwzuAb7GHr4rvQ,"I've passed by this place hundreds of times, n...",iqQBsjqOF9xgIAbsZ2sAZA,"{'RestaurantsPriceRange2': '1', 'Caters': 'Fal...","Restaurants, Local Flavor, Vegan, Shopping, Ca...",Montréal,Eva B,4.5,3.44,3.0
1,YzTafk0tQ-nAu1oICuuNFw,2017-03-17 02:36:40,mhj0ORMTPspY9MdowNfqig,If you're looking for decent sushi without dre...,spX1hYXpJzc9apJYaHiMJQ,"{'HasTV': 'True', 'NoiseLevel': ""u'average'"", ...","Restaurants, Sushi Bars, Live/Raw Food, Japanese",Charlotte,Sushi 101,3.5,3.41,3.0
2,Xny0n0s98TpP82sQxfgIMQ,2018-02-13 17:29:42,RSJnaE-P93-5jTPBhNOVDQ,"I'm polish, Hungarian, and german so I have mi...",RCxBgx6ti8AYsCSEsHRBPg,"{'RestaurantsDelivery': 'False', 'RestaurantsG...","American (Traditional), Nightlife, Bars, Polis...",Cleveland,Sokolowski's University Inn,4.5,4.37,3.0
3,HjXI6lKGPVNQ8dYJPtaOCg,2016-09-06 20:40:31,GihEyvZaSB_YHeYO1E1Qfw,The pizza with jerk chicken on it is all you n...,tPZMwjHZNAx_HuRzTpMXmA,"{'RestaurantsDelivery': 'False', 'RestaurantsP...","Sandwiches, Pizza, Bars, Food, Nightlife, Beer...",Tempe,Mellow Mushroom,3.5,4.27,4.0
4,vL9rvPWFWd7FsdJL330b7Q,2008-11-06 15:25:20,w--i84bOL7qwN_fFdc70pw,I've been out to Save Your Dairy a few times a...,NeJLklFT4Ohig0uW9719bw,"{'BusinessAcceptsCreditCards': 'False', 'Busin...","Specialty Food, Health Markets, Food",Queen Creek,Save Your Dairy,4.5,3.76,5.0


In [20]:
#Converting the columm strings into dictionary and lists using custom functions
def convert_str_to_dict(row):
    return literal_eval(row)

In [21]:
def convert_str_to_list(row):
    l = row.strip().split(",")
    for s in l:
        s = s.strip()
    return l

In [22]:
train_df['categories'] = train_df['categories'].apply(convert_str_to_list)
val_df['categories'] = val_df['categories'].apply(convert_str_to_list)


In [23]:
#Binarizing the ratings
def convert_ratings(row):
    x = int(row)
    if x in [1, 2, 3]:
        new_rating = 0
    else:
        new_rating = 1
    return new_rating

In [24]:
train_df['likes'] = train_df['stars'].apply(convert_ratings)
val_df['likes'] = val_df['stars'].apply(convert_ratings)

In [25]:
train_labels = train_df['likes']

In [26]:
val_labels = val_df['likes']

In [27]:
#Dropping unnecessary columms
train_df.drop(columns=['date', 'review_id', 'text', 'stars'], inplace=True)
val_df.drop(columns=['date', 'review_id', 'text', 'stars'], inplace=True)

In [28]:
train_df.columns


Index(['business_id', 'user_id', 'attributes', 'categories', 'city', 'name',
       'restaurant_rating', 'average_stars', 'likes'],
      dtype='object')

In [29]:
val_df.columns

Index(['business_id', 'user_id', 'attributes', 'categories', 'city', 'name',
       'restaurant_rating', 'average_stars', 'likes'],
      dtype='object')

In [21]:
def Value_To_Dict(val):
    return {val:1}

def List_To_Dict(the_list):
    return {category:1 for category in the_list}
    
def Flatten_Dict(d, prekey = ''):
    flat_dict = {}
    for key in d:
        if isinstance(d[key], bool) and d[key]:
            flat_dict.update({prekey+'_'+key:1})
        elif isinstance(d[key], str):
            flat_dict.update({prekey+'_'+key+'_'+d[key]:1})
        elif isinstance(d[key], dict):
            flat_dict.update(Flatten_Dict(d[key], prekey=prekey+'_'+key))
    return flat_dict


In [22]:
class One_Hot_Encoder(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, colnames, value_type = 'value', sparse = True):
        if value_type == 'value':
            self.apply_function_ = Value_To_Dict
        elif value_type == 'list':
            self.apply_function_ = List_To_Dict
        elif value_type == 'dict':
            self.apply_function_ = Flatten_Dict
        self.colnames_ = colnames
        self.dv_ = DictVectorizer(sparse = sparse)

    def fit(self, X, y = None):
        self.dv_.fit(X[self.colnames_].apply(self.apply_function_))
        return self

    def transform(self, X):
        return self.dv_.transform(X[self.colnames_].apply(self.apply_function_))

In [66]:
class Column_Selector(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, colnames):
        self.colnames_ = colnames

    def fit(self, X, y = None):
        return self

    def transform(self, X):
        return pd.DataFrame(X[self.colnames_])

In [67]:
# Create features for each restaurant - Categories, Attributes, City and Average rating

encoding_category = One_Hot_Encoder('categories', 'list', sparse=False)
encoding_attribute = One_Hot_Encoder('attributes', 'dict', sparse=False)
encoding_city= One_Hot_Encoder('city', 'value', sparse=False)
business_rating = Column_Selector(['restaurant_rating'])

encoding_union = FeatureUnion([ ('cat', encoding_category), 
                               ('attr', encoding_attribute),
                               ('city', encoding_city), 
                               ('restaurant_rating', business_rating),
                              ])

In [None]:
train_features = encoding_union.fit_transform(train_df)

In [25]:
train_features.shape

(1078332, 2612)

In [None]:
val_features = encoding_union.transform(val_df)

In [26]:
val_features.shape

(269584, 2612)

In [27]:
sc = StandardScaler()

In [28]:
train_features[:, -1] = sc.fit_transform(train_features[:, -1].reshape(-1, 1))[:, 0]

In [29]:
val_features[:, -1] = sc.transform(val_features[:, -1].reshape(-1, 1))[:, 0]

In [31]:
#Custom methods to run the classification algorithms
def naive_bayes(train_feats, train_labels):
    clf = GaussianNB()
    clf.fit(train_feats, train_labels)
    return clf

def logistic_regression(train_feats, train_labels):
    clf = LogisticRegression(solver='liblinear')
    clf.fit(train_feats, train_labels)
    return clf

def random_forest(train_feats, train_labels):
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(train_feats, train_labels)
    return clf

In [32]:
def test_algo(clf, test_feats, test_labels):
    predictions = clf.predict(test_feats)
    f1 = f1_score(test_labels, predictions, average='binary')
    acc = balanced_accuracy_score(test_labels, predictions)
    return f1, acc

In [33]:
unique_users = train_df['user_id'].unique()

In [34]:
unique_users.shape

(64658,)

In [35]:
import csv
import warnings
warnings.filterwarnings('ignore') 

In [37]:
#Creating user profile using the Classification algorithms, training the models on it and selecting and saving the best algorithm 
total_rf_acc = 0.
total_rf_f1 = 0.
total_lr_acc = 0.
total_lr_f1 = 0.
total_gb_acc = 0.
total_gb_f1 = 0.
final_clf_list = []
bad_count = 0.
val_bad = 0.
with open("/Users/saranggrover/Desktop/yelp-dataset/user_profile_standard_scaled.csv", "w") as output:
    writer = csv.writer(output, delimiter=',')
    writer.writerow(('user_id', 'Random Forest', 'Logistic Regression', 'Naive Bayes', 'MAX', 'Best Model'))
    for index, user in enumerate(tqdm(unique_users)):
        train_inds = train_df[train_df['user_id'].values == user].index.values
        val_inds = val_df[val_df['user_id'].values == user].index.values
        
        train_labels = train_df['likes'].values[train_inds]
        val_labels = val_df['likes'].values[val_inds]
        
        if (np.all(train_labels)) or (not np.any(train_labels)):
            bad_count += 1
            if (np.all(val_labels)) or (not np.any(val_labels)):
                val_bad += 1
            writer.writerow((user, None, None, None, None, "most_popular"))
            final_clf_list.append((user, "most_popular_{}".format(train_labels[0])))
            
            continue
        else:
            train_feats = train_features[train_inds]
            val_feats = val_features[val_inds]
            rf = random_forest(train_feats, train_labels)
            lr = logistic_regression(train_feats, train_labels)
            gb = naive_bayes(train_feats, train_labels)

            rf_f1, rf_acc = test_algo(rf, val_feats, val_labels)
            lr_f1, lr_acc = test_algo(lr, val_feats, val_labels)
            gb_f1, gb_acc = test_algo(gb, val_feats, val_labels)

            
        total_rf_acc += rf_f1
        total_rf_f1 += rf_acc
        total_lr_acc += lr_acc
        total_lr_f1 += lr_f1
        total_gb_acc += gb_acc
        total_gb_f1 += gb_f1
        max_acc = max(rf_acc, lr_acc, gb_acc)
        final_clf = None
        if rf_acc == max_acc:
            final_clf = rf
        elif lr_acc == max_acc:
            final_clf = lr
        elif gb_acc == max_acc:
            final_clf = gb
        else:
            raise Exception("could not find final classifier....")
      
        writer.writerow((user, rf_acc, lr_acc, gb_acc, max(rf_acc, lr_acc, gb_acc), type(final_clf).__name__))
       
        final_clf_list.append((user, type(final_clf).__name__))
    
        
total_rf_acc /= (len(unique_users) - bad_count)
total_rf_f1 /= (len(unique_users) - bad_count)
total_lr_f1 /= (len(unique_users) - bad_count)
total_lr_acc /= (len(unique_users) - bad_count)
total_gb_f1 /= (len(unique_users) - bad_count)
total_gb_acc /= (len(unique_users) - bad_count)




In [38]:
print(val_bad)

4236.0


In [39]:
print("Total accuracy: RF: {:.2%}, LR: {:.2%}, GB: {:.2%}".format(float(total_rf_acc), float(total_lr_acc), float(total_gb_acc)))
print("Total f1 score: RF: {:.2%}, LR: {:.2%}, GB: {:.2%}".format(float(total_rf_f1), float(total_lr_f1), float(total_gb_f1)))


Total accuracy: RF: 57.92%, LR: 63.71%, GB: 60.83%
Total f1 score: RF: 59.91%, LR: 63.67%, GB: 60.81%


In [40]:
with open('/Users/saranggrover/Desktop/yelp-dataset/user_model_standard_scaled.csv','w') as out:
    csv_out=csv.writer(out)
    csv_out.writerow(['user_id', 'best_model'])
    csv_out.writerows(final_clf_list)

### Experiments

- Scaling:
    - StandardScaler
        - **Total accuracy: RF: 57.92%, LR: 63.71%, GB: 60.83%**
        - **Total f1 score: RF: 59.91%, LR: 63.67%, GB: 60.81%**

In [42]:
user_profile_models = pd.read_csv("/Users/saranggrover/Desktop/yelp-dataset/user_model_standard_scaled.csv")

In [43]:
user_profile_models.head()

Unnamed: 0,user_id,best_model
0,iGtInQDTZ89mKnkhFWdlfA,RandomForestClassifier
1,O3cItff0mKAfXtl5VmbW2w,LogisticRegression
2,U5YQX_vMl_xQy8EQDqlNQQ,GaussianNB
3,JrgMipJRhagq42ROTzC_CQ,RandomForestClassifier
4,8lofUN7rFkwT2bw4b5SM4g,RandomForestClassifier


In [44]:
test_df['attributes'] = test_df['attributes'].apply(convert_str_to_dict)
test_df['categories'] = test_df['categories'].apply(convert_str_to_list)

In [45]:
test_df.drop(columns=['date', 'review_id', 'text'], inplace=True)

In [46]:
test_df['likes'] = test_df['stars'].apply(convert_ratings)

In [47]:
test_features = encoding_union.transform(test_df)

In [48]:
test_features[:, -1] = sc.transform(test_features[:, -1].reshape(-1, 1))[:, 0]

In [49]:
test_unique_users = test_df['user_id'].values

In [55]:
total_test_f1 = 0.
total_test_acc = 0.

for index, user in enumerate(tqdm(test_unique_users)):
    train_inds = train_df[train_df['user_id'].values == user].index.values
    test_inds = test_df[test_df['user_id'].values == user].index.values
    
    train_labels = train_df['likes'].values[train_inds]
    test_labels = test_df['likes'].values[test_inds]
    
    train_feats = train_features[train_inds]
    test_feats = test_features[test_inds]
    
    active_user_model = user_profile_models[user_profile_models['user_id'].values == user]['best_model'].values[0]
#     print(active_user_model)
    user_clf = None
    if active_user_model == "LogisticRegression":
        user_clf = logistic_regression(train_feats, train_labels)
    elif active_user_model == "RandomForestClassifier":
        user_clf = random_forest(train_feats, train_labels)
    else:
        user_clf = naive_bayes(train_feats, train_labels)
    
    u_f1, u_acc = test_algo(user_clf, test_feats, test_labels)
    total_test_f1 += u_f1
    total_test_acc += u_acc

total_test_f1 /= len(test_unique_users)
total_test_acc /= len(test_unique_users)  




Exception in thread Thread-6:
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.6/site-packages/tqdm/_tqdm.py", line 148, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration






In [56]:
print("Test F1: {:.4f}, Test Acc: {:.4f}".format(total_test_f1, total_test_acc))

Test F1: 0.6599, Test Acc: 0.5923


## Test Results (average F1 and Accuracy per user)

- Scaled with standard scaler
    - Test F1: 0.6599, Test Acc: 0.5923