# Content-Based Filtering using ML Regression Models

- Kevin Chuang
- Yelp Academic Dataset (01.2019)

In [1]:
import os
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from numpy.linalg import norm
from sklearn.pipeline import FeatureUnion
from scipy.sparse import coo_matrix
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn import base
from ast import literal_eval
from tqdm import tqdm_notebook as tqdm
from sklearn.linear_model import Lasso, ElasticNet, Ridge, LinearRegression

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# %matplotlib inline

## Load Data

In [2]:
train_df = pd.read_csv("yelp_restaurant_train.csv")
val_df = pd.read_csv("yelp_restaurant_validation.csv")
test_df = pd.read_csv("yelp_restaurant_test.csv")

In [3]:
# rest_df = pd.read_csv("restaurant_business.csv")

In [4]:
# Count number of users
train_df['user_id'].unique().shape

(64658,)

In [5]:
val_df['user_id'].unique().shape

(64658,)

In [6]:
test_df['user_id'].unique().shape

(64658,)

In [7]:
# Count number of businesses
train_df['business_id'].unique().shape

(49160,)

In [8]:
train_df.head()

Unnamed: 0,business_id,date,review_id,text,user_id,attributes,name,city,categories,avg_business_stars,avg_user_star,stars
0,HMSDOjt_KCyknzjQ9aI5Jw,2018-11-04 23:44:27,0HWPTEnVT7L5BCKj33erLQ,"I would like to start with the classic line ""I...",iGtInQDTZ89mKnkhFWdlfA,"{'OutdoorSeating': 'True', 'RestaurantsGoodFor...",Tomo Noodles and Dumplings,Las Vegas,"Noodles, Restaurants, Japanese, Ramen",4.0,3.04,1.0
1,yXiDD18UV49-7UhA6dWjAw,2018-09-07 20:41:42,Y6xCQlzc9YTXD3IKMGN-oQ,We come here for once at least once a month. I...,O3cItff0mKAfXtl5VmbW2w,"{'GoodForMeal': ""{'dessert': False, 'latenight...",Giacomo's Pizzeria and Italian Restaurant,Charlotte,"Sandwiches, Pizza, Italian, Restaurants, Event...",4.0,4.15,4.0
2,VSNUFYBQ_wOFmRXZ8SeQ4w,2015-09-30 02:07:55,9j_uLXkEgpN52Lz5VJ8sMQ,First meal in Pittsburgh was courtesy of S&D! ...,U5YQX_vMl_xQy8EQDqlNQQ,"{'Alcohol': ""u'none'"", 'DogsAllowed': 'False',...",S&D Polish Deli,Pittsburgh,"Polish, Ethnic Food, Specialty Food, Food, Res...",4.5,3.48,4.0
3,lVVJMvqu4LXL5rBqjO6eqg,2015-05-30 07:40:17,xilO0UqpI_EYJg0nzTBqzw,The Pork bone soup is delicious! The broth loo...,JrgMipJRhagq42ROTzC_CQ,"{'HasTV': 'False', 'WiFi': ""u'no'"", 'BusinessP...",Tofu Village - House of Soon Tofu,Toronto,"Restaurants, Korean",4.5,2.53,4.0
4,EEIz44ewHhOKmfTloAK13g,2017-11-19 16:47:26,P5LDWTH6cxQK-_IWtvyOWw,Words cannot express how delectably divine a d...,8lofUN7rFkwT2bw4b5SM4g,"{'RestaurantsPriceRange2': '4', 'RestaurantsTa...",Alo Restaurant,Toronto,"Bars, Nightlife, Restaurants, French",4.5,3.97,5.0


## Feature Extraction

- This part converts certain columns to a workable format (string to dict, string to list)

In [9]:
def convert_str_to_dict(row):
    return literal_eval(row)

In [10]:
def convert_str_to_list(row):
    l = row.strip().split(",")
    for s in l:
        s = s.strip()
    return l

In [11]:
train_df['attributes'] = train_df['attributes'].apply(convert_str_to_dict)
val_df['attributes'] = val_df['attributes'].apply(convert_str_to_dict)

train_df['categories'] = train_df['categories'].apply(convert_str_to_list)
val_df['categories'] = val_df['categories'].apply(convert_str_to_list)


In [12]:
train_df['stars'].unique()

array([1., 4., 5., 3., 2.])

In [13]:
# def convert_ratings(row):
#     x = int(row)
#     if x in [1, 2, 3]:
#         new_rating = 0
#     else:
#         new_rating = 1
#     return new_rating

In [14]:
# train_df['likes'] = train_df['stars'].apply(convert_ratings)
# val_df['likes'] = val_df['stars'].apply(convert_ratings)

In [15]:
# train_df.columns

In [16]:
# train_labels = train_df['stars']

In [17]:
# val_labels = val_df['stars']

In [18]:
train_df.drop(columns=['date', 'review_id', 'text'], inplace=True)
val_df.drop(columns=['date', 'review_id', 'text'], inplace=True)

In [19]:
print(train_df.columns)
print(val_df.columns)

Index(['business_id', 'user_id', 'attributes', 'name', 'city', 'categories',
       'avg_business_stars', 'avg_user_star', 'stars'],
      dtype='object')
Index(['business_id', 'user_id', 'attributes', 'name', 'city', 'categories',
       'avg_business_stars', 'avg_user_star', 'stars'],
      dtype='object')


In [20]:
def Value_To_Dict(val):
    return {val:1}

def List_To_Dict(the_list):
    return {category:1 for category in the_list}
    
def Flatten_Dict(d, prekey = ''):
    flat_dict = {}
    for key in d:
        if isinstance(d[key], bool) and d[key]:
            flat_dict.update({prekey+'_'+key:1})
        elif isinstance(d[key], str):
            flat_dict.update({prekey+'_'+key+'_'+d[key]:1})
        elif isinstance(d[key], dict):
            flat_dict.update(Flatten_Dict(d[key], prekey=prekey+'_'+key))
    return flat_dict


In [21]:
class One_Hot_Encoder(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, colnames, value_type = 'value', sparse = True):
        if value_type == 'value':
            self.apply_function_ = Value_To_Dict
        elif value_type == 'list':
            self.apply_function_ = List_To_Dict
        elif value_type == 'dict':
            self.apply_function_ = Flatten_Dict
        self.colnames_ = colnames
        self.dv_ = DictVectorizer(sparse = sparse)

    def fit(self, X, y = None):
        self.dv_.fit(X[self.colnames_].apply(self.apply_function_))
        return self

    def transform(self, X):
        return self.dv_.transform(X[self.colnames_].apply(self.apply_function_))

In [22]:
class Column_Selector(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, colnames):
        self.colnames_ = colnames

    def fit(self, X, y = None):
        return self

    def transform(self, X):
        return pd.DataFrame(X[self.colnames_])

## Feature Extraction

- Categories
- Attributes
- City
- Average business rating

In [23]:
# Create features for each restaurant

encoding_category = One_Hot_Encoder('categories', 'list', sparse=False)
encoding_attribute = One_Hot_Encoder('attributes', 'dict', sparse=False)
encoding_city= One_Hot_Encoder('city', 'value', sparse=False)
business_rating = Column_Selector(['avg_business_stars'])
# user_rating = Column_Selector(['avg_user_star'])

encoding_union = FeatureUnion([ ('cat', encoding_category), 
                               ('attr', encoding_attribute),
                               ('city', encoding_city), 
                               ('avg_business_rating', business_rating),
#                                ('avg_user_rating', user_rating)
                              ])

# Fit on all restaurants maybe?
# encoding_union.fit(rest_df)

train_features = encoding_union.fit_transform(train_df)
val_features = encoding_union.transform(val_df)

In [24]:
train_features.shape

(1078332, 2612)

In [25]:
val_features.shape

(269584, 2612)

## Create User Profiles using ML regression models

- Maybe we don't scale the data to weight it more heavily on the average business ratings based on popularity?
- Other wise:
    - Scale the numeric features and one-hot encode the categorical ones.

In [26]:
# sc = MinMaxScaler()

In [26]:
sc = StandardScaler()

In [27]:
train_features[:, -1] = sc.fit_transform(train_features[:, -1].reshape(-1, 1))[:, 0]

In [28]:
val_features[:, -1] = sc.transform(val_features[:, -1].reshape(-1, 1))[:, 0]

In [29]:
train_features[:, -1]

array([0.41442058, 0.41442058, 1.24652585, ..., 0.41442058, 0.41442058,
       1.24652585])

In [30]:
def lasso(train_feats, train_labels):
    clf = Lasso()
    clf.fit(train_feats, train_labels)
    return clf

def ridge(train_feats, train_labels):
    clf = Ridge()
    clf.fit(train_feats, train_labels)
    return clf

def elastic_net(train_feats, train_labels):
    clf = ElasticNet()
    clf.fit(train_feats, train_labels)
    return clf

def linear_regression(train_feats, train_labels):
    clf = LinearRegression()
    clf.fit(train_feats, train_labels)
    return clf

In [31]:
def test_algo(clf, test_feats, test_labels):
    predictions = clf.predict(test_feats)
#     print(predictions)
#     f1 = f1_score(test_labels, predictions, average='binary')
#     acc = balanced_accuracy_score(test_labels, predictions)
    mse = mean_squared_error(test_labels, predictions)
    mae = mean_absolute_error(test_labels, predictions)
    return mse, mae

In [32]:
unique_users = train_df['user_id'].unique()

In [33]:
unique_users.shape

(64658,)

In [34]:
import csv
import warnings
warnings.filterwarnings('ignore') 

## Training and validating models for each user

- Train each model on the user review ratings in train set, validate on validation set, and select best algorithm based on the algorithm that performed the best in terms of MSE and save that model. 

In [37]:
total_r_mse = 0.
total_r_mae = 0.
total_l_mse = 0.
total_l_mae = 0.
total_en_mse = 0.
total_en_mae = 0.
final_clf_list = []

with open("content_based/regression/user_profile_minmax_scaled.csv", "w") as output:
    writer = csv.writer(output, delimiter=',')
    writer.writerow(('user_id', 'Ridge', 'Lasso', 'ElasticNet', 'MIN', 'Best Model'))
    for index, user in enumerate(tqdm(unique_users)):
        train_inds = train_df[train_df['user_id'].values == user].index.values
        val_inds = val_df[val_df['user_id'].values == user].index.values
        
    #     print(train_inds)
    #     print(val_inds)
        train_labels = train_df['stars'].values[train_inds]
        val_labels = val_df['stars'].values[val_inds]
    #     print(train_labels)
    #     print(val_labls)
        train_feats = train_features[train_inds]
        val_feats = val_features[val_inds]

        r_clf = ridge(train_feats, train_labels)
        l_clf = lasso(train_feats, train_labels)
        en_clf = elastic_net(train_feats, train_labels)

        r_mse, r_mae = test_algo(r_clf, val_feats, val_labels)
        l_mse, l_mae = test_algo(l_clf, val_feats, val_labels)
        en_mse, en_mae = test_algo(en_clf, val_feats, val_labels)
        
#         writer.writerow((user, r_mse, l_mse, en_mse, min(r_mse, l_mse, en_mse)))
        total_r_mse += r_mse
        total_r_mae += r_mae
        total_l_mse += l_mse
        total_l_mae += l_mae
        total_en_mse += en_mse
        total_en_mae += en_mae
        min_mse = min(r_mse, l_mse, en_mse)
        final_clf = None
        if r_mse == min_mse:
            final_clf = r_clf
        elif l_mse == min_mse:
            final_clf = l_clf
        else:
            final_clf = en_clf
        
        # Append user_id and best model
        final_clf_list.append((user, type(final_clf).__name__))
        writer.writerow((user, r_mse, l_mse, en_mse, min(r_mse, l_mse, en_mse), type(final_clf).__name__))

        
total_r_mse /= len(unique_users)
total_r_mae /= len(unique_users)
total_l_mse /= len(unique_users)
total_l_mae /= len(unique_users)
total_en_mse /= len(unique_users)
total_en_mae /= len(unique_users)




In [38]:
print("Total MSE: Ridge: {:.4f}, Lasso: {:.4f}, ElasticNet: {:.4f}".format(total_r_mse, total_l_mse, total_en_mse))
print("Total MAE: Ridge: {:.4f}, Lasso: {:.4f}, ElasticNet: {:.4f}".format(total_r_mae, total_l_mae, total_en_mae))

Total MSE: Ridge: 1.7431, Lasso: 1.5242, ElasticNet: 1.5252
Total MAE: Ridge: 1.0195, Lasso: 0.9697, ElasticNet: 0.9695


In [39]:
# with open('content_based/regression/user_model_minmax_scaler.csv','w') as out:
#     csv_out=csv.writer(out)
#     csv_out.writerow(['user_id', 'best_model'])
#     csv_out.writerows(final_clf_list)

In [51]:
np.sqrt(1.4617)

1.2090078577081291

## Regression Experiments

- Unscaled
    - Total MSE: Ridge: 1.6542, Lasso: 1.5203, **ElasticNet: 1.5034**
    - Total MAE: Ridge: 0.9913, Lasso: 0.9684, ElasticNet: 0.9625
- Scaled (scale only average business ratings (numeric column))
    - **StandardScaler**
        - Total RMSE: Ridge: 1.2672, Lasso: 1.22479, **ElasticNet: 1.20901**
        - Total MSE: Ridge: 1.6058, Lasso: 1.5001, **ElasticNet: 1.4617**
        - Total MAE: Ridge: 0.9734, Lasso: 0.9611, ElasticNet: 0.9466
    - **MinMaxScaler**
        - Total MSE: Ridge: 1.7431, Lasso: 1.5242, ElasticNet: 1.5252
        - Total MAE: Ridge: 1.0195, Lasso: 0.9697, ElasticNet: 0.9695
        
- [WRONG] Scaled (all columns) 
    - Total MSE: Ridge: 1.7691, Lasso: 1.7171, ElasticNet: 1.7868
    - Total MAE: Ridge: 1.0242, Lasso: 0.9770, ElasticNet: 0.9838

## Final Evaluation on Test Set

- Using the chosen models for each user

In [35]:
user_profile_models = pd.read_csv("content_based/regression/user_model_standard_scaler.csv")

In [36]:
user_profile_models.head()

Unnamed: 0,user_id,best_model
0,iGtInQDTZ89mKnkhFWdlfA,Lasso
1,O3cItff0mKAfXtl5VmbW2w,Lasso
2,U5YQX_vMl_xQy8EQDqlNQQ,Lasso
3,JrgMipJRhagq42ROTzC_CQ,Lasso
4,8lofUN7rFkwT2bw4b5SM4g,Lasso


In [37]:
test_df['attributes'] = test_df['attributes'].apply(convert_str_to_dict)
test_df['categories'] = test_df['categories'].apply(convert_str_to_list)

In [38]:
test_df.drop(columns=['date', 'review_id', 'text'], inplace=True)

In [39]:
test_features = encoding_union.transform(test_df)

In [40]:
test_features[:, -1] = sc.transform(test_features[:, -1].reshape(-1, 1))[:, 0]

In [41]:
test_unique_users = test_df['user_id'].values

In [46]:
total_test_mse = 0.
total_test_mae = 0.

for index, user in enumerate(tqdm(test_unique_users)):
    train_inds = train_df[train_df['user_id'].values == user].index.values
    test_inds = test_df[test_df['user_id'].values == user].index.values
    
    train_labels = train_df['stars'].values[train_inds]
    test_labels = test_df['stars'].values[test_inds]
    
    train_feats = train_features[train_inds]
    test_feats = test_features[test_inds]
    
    active_user_model = user_profile_models[user_profile_models['user_id'].values == user]['best_model'].values[0]
#     print(active_user_model)
    user_clf = None
    if active_user_model == "Ridge":
        user_clf = ridge(train_feats, train_labels)
    elif active_user_model == "Lasso":
        user_clf = lasso(train_feats, train_labels)
    else:
        user_clf = elastic_net(train_feats, train_labels)
    
    u_mse, u_mae = test_algo(user_clf, test_feats, test_labels)
    total_test_mse += u_mse
    total_test_mae += u_mae

total_test_mse /= len(test_unique_users)
total_test_mae /= len(test_unique_users)  




In [48]:
print("Test MSE: {:.4f}, Test MAE: {:.4f}".format(total_test_mse, total_test_mae))

Test set results, Test MSE: 1.3339, Test MAE: 0.8984


## Test Results (average MSE and MAE per user)

- Scaled with standard scaler
    - Test RMSE: 1.15495
    - Test MSE: 1.3339, Test MAE: 0.8984