In [2]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import copy
from sklearn.dummy import DummyRegressor
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.estimator_checks import check_estimator
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.utils.multiclass import unique_labels
from YelpDataset import *

## Random Forest parameters

In [3]:
regressor = RandomForestRegressor()
regressor.get_params()

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## Random Forest Regressor

In [4]:
class RFRegressor(RandomForestRegressor, ClassifierMixin):
    def fit(self, X, y=None):
        self._classes, y = np.unique(y, return_inverse=True)
        users = X[:, -2]
        self.unique_users = np.unique(X[:, -2])
        self.regressors = {}
        for i in self.unique_users:
            regressor = RandomForestRegressor(**self.get_params())
            userX = X[users==i, 0:-3]
            userY = y[users==i]
            regressor.fit(userX, userY)
            self.regressors[i] = (regressor)
            if i % 1000 == 0:
                print(f'i = {i}')
        return self
    
    def predict(self, X):
        Y = np.zeros(X.shape[0], int)
        for i in range(0, X.shape[0]):
            x = X[i, 0:-3]
            user = X[i, -2]
            if (self.regressors.get(user, None) is None):
                y = [3]
            else:
                y = self.regressors[user].predict([x])
            y = y[0]
            Y[i] = y
            if i % 1000 == 0:
                print(f'i = {i}')
        Y = self._classes[Y]
        return Y

## Testing regressor on dataset

In [5]:
# load data
businesses = load_businesses_json_array(yelp_pittsburgh_business_path)

try:
    data = pd.read_pickle('businesses_features.pkl')
except:
    print('data is not found')
    data = extract_features(businesses)
    data.to_pickle('businesses_features.pkl')

try:
    matrix = np.load('review_features.pkl')
except:
    print('data is not found')
    matrix = extract_review_feature_data_table('businesses_features.pkl', 'data/reviews_pittsburgh_contentbased.json')
    matrix.dump('review_features.pkl')

In [6]:
# split data
dataX = matrix[:, :]
dataY = matrix[:, -1]

trainX, testX, trainY, testY = train_test_split(dataX, dataY)

In [8]:
# fit
max_depth = 3
n_estimators = 5

regressor = RFRegressor(max_depth=max_depth, n_estimators=n_estimators)
regressor.fit(trainX, trainY)

i = 0.0


i = 2000.0


i = 3000.0


i = 4000.0


i = 5000.0


i = 6000.0


i = 8000.0


i = 9000.0


i = 10000.0


i = 11000.0


i = 12000.0


i = 13000.0


i = 15000.0


i = 16000.0


i = 17000.0


i = 18000.0


i = 19000.0


i = 20000.0


i = 21000.0


i = 22000.0


i = 23000.0


i = 24000.0


i = 26000.0


i = 27000.0


i = 28000.0


i = 32000.0


i = 34000.0


i = 35000.0


i = 37000.0


i = 38000.0


i = 39000.0


i = 40000.0


i = 42000.0


i = 43000.0


RFRegressor(bootstrap=True, criterion='mse', max_depth=3, max_features='auto',
      max_leaf_nodes=None, min_impurity_split=1e-07, min_samples_leaf=1,
      min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=5,
      n_jobs=1, oob_score=False, random_state=None, verbose=0,
      warm_start=False)

In [8]:
# predict
predictY = regressor.predict(testX)
mean_absolute_error(testY, predictY)

i = 0


i = 1000


i = 2000


i = 3000


i = 4000


i = 5000


i = 6000


i = 7000


i = 8000


i = 9000


i = 10000


i = 11000


i = 12000


i = 13000


i = 14000


i = 15000


i = 16000


i = 17000


i = 18000


i = 19000


i = 20000


i = 21000


i = 22000


i = 23000


i = 24000


i = 25000


i = 26000


i = 27000


i = 28000


i = 29000


i = 30000


i = 31000


i = 32000


i = 33000


i = 34000


i = 35000


i = 36000


1.1362591084143741