In [1]:
from tqdm import tqdm

from ngboost import NGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import numpy as np

In [2]:
# Load the feature table from the local parquet file; later cells derive
# `data` and the targets from this frame.
features = pd.read_parquet(path='features.parquet')

In [3]:
# Columns excluded from the model inputs: row identifiers, assessed values,
# geometry, and the two price columns the targets below are computed from.
drop_clmns = [
    'property_location',
    'parcel_number',
    'lot',
    'current_sales_date',
    'assessed_fixtures_value',
    'assessed_improvement_value',
    'assessed_land_value',
    'assessed_personal_property_value',
    'the_geom',
    'sqft_price',
    'home_price',
]

# Row UID = property_location + parcel_number + block + lot (noted as 'object'
# columns in the original schema comments), suffixed with the week number
# rendered as text.
uid_prefix = features.property_location + features.parcel_number + features.block + features.lot
features['idx'] = uid_prefix + features.week_number.astype('str')
features.set_index('idx', inplace=True)

# Keep only the first 50 feature names listed in the `cols` column of main_feats.csv.
main_feats = pd.read_csv('main_feats.csv')['cols'].iloc[:50]
data = features.drop(columns=drop_clmns).loc[:, main_feats]

# Each target is a price divided by its lagged rolling-median column.
target = features.home_price / features.home_price_lag1_roll26_rolling_median
target_sq = features.sqft_price / features.sqft_price_lag1_roll26_rolling_median

In [4]:
# Keep targets strictly between their 5% and 95% quantiles.
lower_cut = target_sq.quantile(0.05)
upper_cut = target_sq.quantile(0.95)
target_mask = (target_sq > lower_cut) & (target_sq < upper_cut)

# Keep only rows whose week number is above 51.
week_mask = data.week_number > 51

# Apply the same combined mask to features and target so they stay aligned.
keep_rows = target_mask & week_mask
data = data.loc[keep_rows]
target_sq = target_sq.loc[keep_rows]

In [5]:
# Encode text columns as integer category codes.
# `dtypes.iteritems()` was removed in pandas 2.0, so columns are selected by
# dtype with `select_dtypes`, which works on both old and new pandas.
object_cols = list(data.select_dtypes(include=['object']).columns)

# `astype` returns a new frame, so the filtered slice produced by the previous
# cell is never written into (avoids SettingWithCopyWarning).
data = data.astype({name: 'category' for name in object_cols})

# Pick up every categorical column, including any that were already
# categorical before the conversion above.
cats = list(data.select_dtypes(include=['category']).columns)
for cat in cats:
    # Missing categories become code -1 here, so fillna below does not touch them.
    data[cat] = data[cat].cat.codes

# Remaining missing values get a sentinel.
data = data.fillna(-999)

In [6]:
# Walk-forward schedule: each training window spans `years` years of weeks and
# the window advances by `week_step` weeks per step.
years, week_step = 5, 4

start = 51                  # lower bound (inclusive) of the first training window
end = start + 52*years      # upper bound (exclusive) of the first training window

# Number of `week_step`-sized advances that fit between the end of the first
# training window and the last observed week. Uses `week_step` rather than a
# literal 4 so the two cannot drift apart; int() keeps the value usable by
# range() even if week_number is stored as a float.
number_of_steps = int((data.week_number.max() - end) // week_step)

In [7]:
# Set the week numbers aside (they drive the train/validation masks later),
# then remove `week_number` and `block` from the frame passed to the model.
week_number = data['week_number']
data = data.drop(['week_number', 'block'], axis=1)

In [10]:
class NGBOOST:
    """
    Thin wrapper around ``NGBRegressor`` with a ``DecisionTreeRegressor`` base learner.

    NGBoost regressor predicts mean and standard deviation for given Xi.
    Any quantile could be calculated using these,
    or standard deviation could be used as a feature directly.
    https://github.com/stanfordmlgroup/ngboost/blob/master/ngboost/ngboost.py

    Attributes:
        best_params: dict with keys ``ngb_n_estimators``, ``base_max_depth`` and
            ``base_min_samples_leaf`` describing the current model.
        model: the ``NGBRegressor`` built from ``best_params``.
        study: the Optuna study (set by ``finetune`` only).
        cv_score: result of the most recent ``cross_validate`` call
            (set by ``finetune`` only).
    """

    # Hyper-parameters used until finetune() replaces them.
    DEFAULT_PARAMS = {'ngb_n_estimators': 40, 'base_max_depth': 14, 'base_min_samples_leaf': 16}

    def __init__(self):
        """Build the regressor from ``DEFAULT_PARAMS`` (no fitting happens here)."""
        # Copy so that per-instance changes never leak into the class-level default.
        self.best_params = dict(self.DEFAULT_PARAMS)
        self.model = self._build_model(self.best_params)

        print("NGBoostRegressor initialized.")

    @staticmethod
    def _build_model(params):
        """Return an unfitted NGBRegressor configured from a ``best_params``-style dict.

        Keys prefixed ``ngb_`` configure NGBoost itself; keys prefixed ``base_``
        configure the decision-tree base learner.
        """
        base_learner = DecisionTreeRegressor(
            criterion='friedman_mse',
            max_depth=params["base_max_depth"],
            min_samples_leaf=params["base_min_samples_leaf"],
        )
        return NGBRegressor(
            verbose=False,
            n_estimators=params["ngb_n_estimators"],
            Base=base_learner,
        )

    def finetune(self, X, y):
        """Search hyper-parameters with Optuna, then refit the best model on (X, y).

        Runs 20 Optuna trials; each trial scores a candidate with 3-fold
        ``cross_validate``. Afterwards ``best_params``, ``study``, ``cv_score``
        and ``model`` are updated, and ``model`` is fitted on the full (X, y).

        Note! For NGBoost you can tune both NGBoost
        parameters and Base regressor parameters:

        NGBoost Parameters:
            Dist              : assumed distributional form of Y|X=x.
                                A distribution from ngboost.distns, e.g. Normal
            Score             : rule to compare probabilistic predictions P̂ to the observed data y.
                                A score from ngboost.scores, e.g. LogScore
            Base              : base learner to use in the boosting algorithm.
                                Any instantiated sklearn regressor, e.g. DecisionTreeRegressor()
            natural_gradient  : logical flag indicating whether the natural gradient should be used
            n_estimators      : the number of boosting iterations to fit
            learning_rate     : the learning rate
            minibatch_frac    : the percent subsample of rows to use in each boosting iteration
            verbose           : flag indicating whether output should be printed during fitting
            verbose_eval      : increment (in boosting iterations) at which output should be printed
            tol               : numerical tolerance to be used in optimization
            random_state      : seed for reproducibility.
                                See https://stackoverflow.com/questions/28064634/random-state-pseudo-random-number-in-scikit-learn
            validation_fraction: Proportion of training data to set aside as validation data for early stopping.
            early_stopping_rounds: The number of consecutive boosting iterations during which the
                                        loss has to increase before the algorithm stops early.
                                        Set to None to disable early stopping and validation.
                                        None enables running over the full data set.

        Base learner: DecisionTreeRegressor:
            {'Base': DecisionTreeRegressor(criterion='friedman_mse', max_depth=4, max_features=None,
                max_leaf_nodes=None, min_impurity_decrease=0.0,
                min_impurity_split=None, min_samples_leaf=1,
                min_samples_split=2, min_weight_fraction_leaf=0.0,
                presort=False, random_state=None, splitter='best'), 'minibatch_frac': 1.0}
        """
        # Imported here because the notebook's import cell does not provide them;
        # without these the method fails with NameError. Keeping them local also
        # means optuna is only required when tuning is actually requested.
        import optuna
        from sklearn.model_selection import cross_validate

        def objective(trial):
            """
            Score one hyper-parameter candidate proposed by Optuna.

            Optuna methods for suggestions:
            suggest_categorical(name, choices)
                Suggest a value for the categorical parameter.
            suggest_discrete_uniform(name, low, high, q)
                Suggest a value for the discrete parameter.
            suggest_float(name, low, high, *[, step, log])
                Suggest a value for the floating point parameter.
            suggest_int(name, low, high[, step, log])
                Suggest a value for the integer parameter.
            suggest_loguniform(name, low, high)
                Suggest a value for the continuous parameter.
            suggest_uniform(name, low, high)
                Suggest a value for the continuous parameter.
            """
            trial_params = {
                # 'ngb_' prefix: NGBoost params.
                "ngb_n_estimators": trial.suggest_int("ngb_n_estimators", 2, 200),
                # 'base_' prefix: base learner params.
                "base_max_depth": trial.suggest_int("base_max_depth", 2, 20),
                "base_min_samples_leaf": trial.suggest_int("base_min_samples_leaf", 2, 40),
            }
            candidate = self._build_model(trial_params)
            # NOTE(review): no `scoring` is passed, so cross_validate uses the
            # estimator's own score(); "minimize" below is only right if that
            # score is a loss (lower is better) - confirm for the installed ngboost.
            self.cv_score = cross_validate(candidate, X, y, cv=3)
            return self.cv_score["test_score"].mean()

        self.study = optuna.create_study(direction="minimize")
        self.study.optimize(objective, n_trials=20)

        # Single-objective study: best_params is the direct accessor for the winner.
        self.best_params = self.study.best_params
        self.model = self._build_model(self.best_params)
        self.model.fit(X, y)

    def fit(self, X, y):
        """Fit the current model on (X, y) and print a confirmation line."""
        self.model.fit(X, y)
        print("NGBoostRegressor trained.")

    def predict(self, X):
        """Return the wrapped model's point predictions for X."""
        return self.model.predict(X)

    def pred_dist(self, X):
        """
        Return the wrapped model's predicted distribution object for X.

        To access dictionary use 'params' property: ngbr.pred_dist(X).params
        {'loc': array([15.71909047, 19.51384116, 19.24509285, 17.8645122 , 24.31325397]),
        'scale': array([1.48748154, 1.37673424, 1.67090687, 1.63854999, 1.52513887])}
        """
        return self.model.pred_dist(X)

In [11]:
ngb = NGBOOST()

# Walk-forward evaluation: train on a window of 52*years weeks, predict the
# following week_step weeks, then slide everything forward by week_step.
for i in tqdm(range(number_of_steps)):

    start = 51 + i*week_step
    # `start` already carries the i*week_step offset. Adding it to `end` a second
    # time made the validation window jump 2*week_step per step, skipping weeks
    # and running past the last observed week before the loop finished.
    end = start + 52*years

    train_week_mask = (week_number >= start) & (week_number < end)
    val_week_mask = (week_number >= end) & (week_number < (end + week_step))

    train_x, val_x = data.loc[train_week_mask], data.loc[val_week_mask]
    train_y, val_y = target_sq.loc[train_week_mask], target_sq.loc[val_week_mask]

    # The earlier filtering can leave a window without rows; skip such a step
    # rather than fitting or predicting on nothing.
    if train_x.empty or val_x.empty:
        print(f"step {i} skipped: empty train or validation window")
        continue

    # NOTE(review): the same wrapper is refit every step - confirm that
    # NGBRegressor.fit discards previously learned base models in the installed
    # ngboost version.
    ngb.fit(train_x, train_y)

    pred_proba = ngb.pred_dist(val_x)

    # Persist the predicted mean (loc) and standard deviation (scale) per validation row.
    preds_df = pd.DataFrame(index=val_x.index, data=pred_proba.loc, columns=['ngb_pred'])
    preds_df['ngb_std'] = pred_proba.scale
    preds_df.to_csv(f'ngb_{i}.csv')
    print(f"step {i} mape {np.mean(np.abs(((pred_proba.loc - val_y) / val_y)))}")

NGBoostRegressor initialized.


  0%|                                                                                           | 0/91 [01:30<?, ?it/s]

NGBoostRegressor trained.
step 0 mape 0.18141146423953516



