In [51]:
import os
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from numpy.linalg import norm
from sklearn.pipeline import FeatureUnion
from scipy.sparse import coo_matrix
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn import base
from ast import literal_eval
from tqdm import tqdm_notebook as tqdm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.gaussian_process import GaussianProcessClassifier


%matplotlib inline

## Load Data

In [52]:
train_df = pd.read_csv("/Users/saranggrover/Desktop/yelp-dataset/train.csv")
val_df = pd.read_csv("/Users/saranggrover/Desktop/yelp-dataset/validation.csv")
test_df = pd.read_csv("/Users/saranggrover/Desktop/yelp-dataset/test.csv")

In [54]:
# Count number of users
train_df['user_id'].unique().shape

(64658,)

In [55]:
val_df['user_id'].unique().shape

(64658,)

In [56]:
test_df['user_id'].unique().shape

(64658,)

In [57]:
# Count number of businesses
train_df['business_id'].unique().shape

(49160,)

In [58]:
train_df.head()

Unnamed: 0,review_id,user_id,business_id,text,date,attributes,name,city,categories,avg_business_stars,avg_user_star,stars
0,0HWPTEnVT7L5BCKj33erLQ,iGtInQDTZ89mKnkhFWdlfA,HMSDOjt_KCyknzjQ9aI5Jw,"I would like to start with the classic line ""I...",2018-11-04 23:44:27,"{'OutdoorSeating': 'True', 'RestaurantsGoodFor...",Tomo Noodles and Dumplings,Las Vegas,"Noodles, Restaurants, Japanese, Ramen",4.0,3.04,1.0
1,Y6xCQlzc9YTXD3IKMGN-oQ,O3cItff0mKAfXtl5VmbW2w,yXiDD18UV49-7UhA6dWjAw,We come here for once at least once a month. I...,2018-09-07 20:41:42,"{'GoodForMeal': ""{'dessert': False, 'latenight...",Giacomo's Pizzeria and Italian Restaurant,Charlotte,"Sandwiches, Pizza, Italian, Restaurants, Event...",4.0,4.15,4.0
2,9j_uLXkEgpN52Lz5VJ8sMQ,U5YQX_vMl_xQy8EQDqlNQQ,VSNUFYBQ_wOFmRXZ8SeQ4w,First meal in Pittsburgh was courtesy of S&D! ...,2015-09-30 02:07:55,"{'Alcohol': ""u'none'"", 'DogsAllowed': 'False',...",S&D Polish Deli,Pittsburgh,"Polish, Ethnic Food, Specialty Food, Food, Res...",4.5,3.48,4.0
3,xilO0UqpI_EYJg0nzTBqzw,JrgMipJRhagq42ROTzC_CQ,lVVJMvqu4LXL5rBqjO6eqg,The Pork bone soup is delicious! The broth loo...,2015-05-30 07:40:17,"{'HasTV': 'False', 'WiFi': ""u'no'"", 'BusinessP...",Tofu Village - House of Soon Tofu,Toronto,"Restaurants, Korean",4.5,2.53,4.0
4,P5LDWTH6cxQK-_IWtvyOWw,8lofUN7rFkwT2bw4b5SM4g,EEIz44ewHhOKmfTloAK13g,Words cannot express how delectably divine a d...,2017-11-19 16:47:26,"{'RestaurantsPriceRange2': '4', 'RestaurantsTa...",Alo Restaurant,Toronto,"Bars, Nightlife, Restaurants, French",4.5,3.97,5.0


## Feature Extraction

- This part converts certain columns to a workable format (string to dict, string to list)

In [59]:
def convert_str_to_dict(row):
    return literal_eval(row)

In [60]:
def convert_str_to_list(row):
    l = row.strip().split(",")
    for s in l:
        s = s.strip()
    return l

In [61]:
train_df['attributes'] = train_df['attributes'].apply(convert_str_to_dict)
val_df['attributes'] = val_df['attributes'].apply(convert_str_to_dict)

In [62]:
train_df['categories'] = train_df['categories'].apply(convert_str_to_list)
val_df['categories'] = val_df['categories'].apply(convert_str_to_list)


In [63]:
train_df['stars'].unique()

array([1., 4., 5., 3., 2.])

In [64]:
def convert_ratings(row):
    x = int(row)
    if x in [1, 2, 3]:
        new_rating = 0
    else:
        new_rating = 1
    return new_rating

In [65]:
train_df['likes'] = train_df['stars'].apply(convert_ratings)
val_df['likes'] = val_df['stars'].apply(convert_ratings)

In [16]:
# train_df.columns

In [17]:
train_labels = train_df['likes']

In [18]:
val_labels = val_df['likes']

In [19]:
train_df.drop(columns=['date', 'review_id', 'text', 'stars'], inplace=True)
val_df.drop(columns=['date', 'review_id', 'text', 'stars'], inplace=True)

In [20]:
print(train_df.columns)
print(val_df.columns)

Index(['business_id', 'user_id', 'attributes', 'name', 'city', 'categories',
       'avg_business_stars', 'avg_user_star', 'likes'],
      dtype='object')
Index(['business_id', 'user_id', 'attributes', 'name', 'city', 'categories',
       'avg_business_stars', 'avg_user_star', 'likes'],
      dtype='object')


In [21]:
def Value_To_Dict(val):
    return {val:1}

def List_To_Dict(the_list):
    return {category:1 for category in the_list}
    
def Flatten_Dict(d, prekey = ''):
    flat_dict = {}
    for key in d:
        if isinstance(d[key], bool) and d[key]:
            flat_dict.update({prekey+'_'+key:1})
        elif isinstance(d[key], str):
            flat_dict.update({prekey+'_'+key+'_'+d[key]:1})
        elif isinstance(d[key], dict):
            flat_dict.update(Flatten_Dict(d[key], prekey=prekey+'_'+key))
    return flat_dict


In [22]:
class One_Hot_Encoder(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, colnames, value_type = 'value', sparse = True):
        if value_type == 'value':
            self.apply_function_ = Value_To_Dict
        elif value_type == 'list':
            self.apply_function_ = List_To_Dict
        elif value_type == 'dict':
            self.apply_function_ = Flatten_Dict
        self.colnames_ = colnames
        self.dv_ = DictVectorizer(sparse = sparse)

    def fit(self, X, y = None):
        self.dv_.fit(X[self.colnames_].apply(self.apply_function_))
        return self

    def transform(self, X):
        return self.dv_.transform(X[self.colnames_].apply(self.apply_function_))

In [66]:
class Column_Selector(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, colnames):
        self.colnames_ = colnames

    def fit(self, X, y = None):
        return self

    def transform(self, X):
        return pd.DataFrame(X[self.colnames_])

## Feature Extraction

- Categories
- Attributes
- City
- Average business rating

In [67]:
# Create features for each restaurant

encoding_category = One_Hot_Encoder('categories', 'list', sparse=False)
encoding_attribute = One_Hot_Encoder('attributes', 'dict', sparse=False)
encoding_city= One_Hot_Encoder('city', 'value', sparse=False)
business_rating = Column_Selector(['avg_business_stars'])

encoding_union = FeatureUnion([ ('cat', encoding_category), 
                               ('attr', encoding_attribute),
                               ('city', encoding_city), 
                               ('avg_business_rating', business_rating),
                              ])


train_features = encoding_union.fit_transform(train_df)
val_features = encoding_union.transform(val_df)

In [25]:
train_features.shape

(1078332, 2612)

In [26]:
val_features.shape

(269584, 2612)

## Create User Profiles using ML classification models

- Maybe we don't scale the data to weight it more heavily on the average business ratings based on popularity?
- Other wise:
    - Scale the numeric features and one-hot encode the categorical ones.
- Also, some users like all the restaurants or none of the restaurants which causes a problem with sklearn's classifiers, so we will default to most popular restaurant in a given area that a user has not been to

In [27]:
sc = StandardScaler()

In [28]:
train_features[:, -1] = sc.fit_transform(train_features[:, -1].reshape(-1, 1))[:, 0]

In [29]:
val_features[:, -1] = sc.transform(val_features[:, -1].reshape(-1, 1))[:, 0]

In [31]:
def decision_tree(train_feats, train_labels):
    clf = DecisionTreeClassifier()
    clf.fit(train_feats, train_labels)
    return clf

def naive_bayes(train_feats, train_labels):
    clf = GaussianNB()
    clf.fit(train_feats, train_labels)
    return clf

def logistic_regression(train_feats, train_labels):
    clf = LogisticRegression(solver='liblinear')
    clf.fit(train_feats, train_labels)
    return clf

def gaussian_process(train_feats, train_labels):
    clf = GaussianProcessClassifier()
    clf.fit(train_feats, train_labels)
    return clf

def random_forest(train_feats, train_labels):
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(train_feats, train_labels)
    return clf

In [32]:
def test_algo(clf, test_feats, test_labels):
    predictions = clf.predict(test_feats)
#     print(predictions)
    f1 = f1_score(test_labels, predictions, average='binary')
    acc = balanced_accuracy_score(test_labels, predictions)
    return f1, acc

In [33]:
unique_users = train_df['user_id'].unique()

In [34]:
unique_users.shape

(64658,)

In [35]:
import csv
import warnings
warnings.filterwarnings('ignore') 

## Training and validating models for each user

- Train each model on the user review ratings in train set, validate on validation set, and select best algorithm based on the algorithm that performed the best in terms of accuracy and save that model. 

In [37]:
total_rf_acc = 0.
total_rf_f1 = 0.
total_lr_acc = 0.
total_lr_f1 = 0.
total_gb_acc = 0.
total_gb_f1 = 0.
final_clf_list = []
bad_count = 0.
val_bad = 0.
with open("/Users/saranggrover/Desktop/yelp-dataset/user_profile_standard_scaled.csv", "w") as output:
    writer = csv.writer(output, delimiter=',')
    writer.writerow(('user_id', 'Random Forest', 'Logistic Regression', 'Naive Bayes', 'MAX', 'Best Model'))
    for index, user in enumerate(tqdm(unique_users)):
        train_inds = train_df[train_df['user_id'].values == user].index.values
        val_inds = val_df[val_df['user_id'].values == user].index.values
        
        train_labels = train_df['likes'].values[train_inds]
        val_labels = val_df['likes'].values[val_inds]
        
        if (np.all(train_labels)) or (not np.any(train_labels)):
            bad_count += 1
            if (np.all(val_labels)) or (not np.any(val_labels)):
                val_bad += 1
            writer.writerow((user, None, None, None, None, "most_popular"))
            final_clf_list.append((user, "most_popular_{}".format(train_labels[0])))
            
            continue
        else:
            train_feats = train_features[train_inds]
            val_feats = val_features[val_inds]
            rf = random_forest(train_feats, train_labels)
            lr = logistic_regression(train_feats, train_labels)
            gb = naive_bayes(train_feats, train_labels)

            rf_f1, rf_acc = test_algo(rf, val_feats, val_labels)
            lr_f1, lr_acc = test_algo(lr, val_feats, val_labels)
            gb_f1, gb_acc = test_algo(gb, val_feats, val_labels)

            
        total_rf_acc += rf_f1
        total_rf_f1 += rf_acc
        total_lr_acc += lr_acc
        total_lr_f1 += lr_f1
        total_gb_acc += gb_acc
        total_gb_f1 += gb_f1
        max_acc = max(rf_acc, lr_acc, gb_acc)
        final_clf = None
        if rf_acc == max_acc:
            final_clf = rf
        elif lr_acc == max_acc:
            final_clf = lr
        elif gb_acc == max_acc:
            final_clf = gb
        else:
            raise Exception("could not find final classifier....")
      
        writer.writerow((user, rf_acc, lr_acc, gb_acc, max(rf_acc, lr_acc, gb_acc), type(final_clf).__name__))
        # Append user_id and best model
        final_clf_list.append((user, type(final_clf).__name__))
    
        
total_rf_acc /= (len(unique_users) - bad_count)
total_rf_f1 /= (len(unique_users) - bad_count)
total_lr_f1 /= (len(unique_users) - bad_count)
total_lr_acc /= (len(unique_users) - bad_count)
total_gb_f1 /= (len(unique_users) - bad_count)
total_gb_acc /= (len(unique_users) - bad_count)




In [38]:
print(val_bad)

4236.0


In [39]:
print("Total accuracy: RF: {:.2%}, LR: {:.2%}, GB: {:.2%}".format(float(total_rf_acc), float(total_lr_acc), float(total_gb_acc)))
print("Total f1 score: RF: {:.2%}, LR: {:.2%}, GB: {:.2%}".format(float(total_rf_f1), float(total_lr_f1), float(total_gb_f1)))


Total accuracy: RF: 57.92%, LR: 63.71%, GB: 60.83%
Total f1 score: RF: 59.91%, LR: 63.67%, GB: 60.81%


In [40]:
with open('/Users/saranggrover/Desktop/yelp-dataset/user_model_standard_scaled.csv','w') as out:
    csv_out=csv.writer(out)
    csv_out.writerow(['user_id', 'best_model'])
    csv_out.writerows(final_clf_list)

### Experiments

- No scaling:
    - Total accuracy: RF: 57.94%, **LR: 62.32%**, GB: 60.81%
    - Total f1 score: RF: 59.92%, **LR: 63.34%**, GB: 60.79%
- Scaling:
    - StandardScaler
        - Total accuracy: RF: 57.92%, **LR: 63.71%**, GB: 60.83%
        - Total f1 score: RF: 59.91%, **LR: 63.67%**, GB: 60.81%

## Final Evaluation on Test Set

- Using the chosen models for each user

In [42]:
user_profile_models = pd.read_csv("/Users/saranggrover/Desktop/yelp-dataset/user_model_standard_scaled.csv")

In [43]:
user_profile_models.head()

Unnamed: 0,user_id,best_model
0,iGtInQDTZ89mKnkhFWdlfA,RandomForestClassifier
1,O3cItff0mKAfXtl5VmbW2w,LogisticRegression
2,U5YQX_vMl_xQy8EQDqlNQQ,GaussianNB
3,JrgMipJRhagq42ROTzC_CQ,RandomForestClassifier
4,8lofUN7rFkwT2bw4b5SM4g,RandomForestClassifier


In [44]:
test_df['attributes'] = test_df['attributes'].apply(convert_str_to_dict)
test_df['categories'] = test_df['categories'].apply(convert_str_to_list)

In [45]:
test_df.drop(columns=['date', 'review_id', 'text'], inplace=True)

In [46]:
test_df['likes'] = test_df['stars'].apply(convert_ratings)

In [47]:
test_features = encoding_union.transform(test_df)

In [48]:
test_features[:, -1] = sc.transform(test_features[:, -1].reshape(-1, 1))[:, 0]

In [49]:
test_unique_users = test_df['user_id'].values

In [55]:
total_test_f1 = 0.
total_test_acc = 0.

for index, user in enumerate(tqdm(test_unique_users)):
    train_inds = train_df[train_df['user_id'].values == user].index.values
    test_inds = test_df[test_df['user_id'].values == user].index.values
    
    train_labels = train_df['likes'].values[train_inds]
    test_labels = test_df['likes'].values[test_inds]
    
    train_feats = train_features[train_inds]
    test_feats = test_features[test_inds]
    
    active_user_model = user_profile_models[user_profile_models['user_id'].values == user]['best_model'].values[0]
#     print(active_user_model)
    user_clf = None
    if active_user_model == "LogisticRegression":
        user_clf = logistic_regression(train_feats, train_labels)
    elif active_user_model == "RandomForestClassifier":
        user_clf = random_forest(train_feats, train_labels)
    else:
        user_clf = naive_bayes(train_feats, train_labels)
    
    u_f1, u_acc = test_algo(user_clf, test_feats, test_labels)
    total_test_f1 += u_f1
    total_test_acc += u_acc

total_test_f1 /= len(test_unique_users)
total_test_acc /= len(test_unique_users)  




Exception in thread Thread-6:
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.6/site-packages/tqdm/_tqdm.py", line 148, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration






In [56]:
print("Test F1: {:.4f}, Test Acc: {:.4f}".format(total_test_f1, total_test_acc))

Test F1: 0.6599, Test Acc: 0.5923


## Test Results (average F1 and Accuracy per user)

- Scaled with standard scaler
    - Test F1: 0.6599, Test Acc: 0.5923