In [31]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd

from scipy import stats

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.pipeline import make_pipeline, Pipeline

# Model-selection and nearest-neighbour utilities from scikit-learn:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold

from catboost import CatBoostRegressor, Pool, cv

from hyperopt import fmin, hp, tpe, STATUS_OK, STATUS_FAIL, Trials
import pickle

from sklearn.metrics import r2_score


In [32]:
# Load the tax records (with their `geometry` column) from the feather file.
tax = gpd.read_feather('../data/tax.geofeather')

In [33]:
# Regression target: the assessed land value column.
y = tax['assessed_land_value']

In [34]:
# Add the x coordinate of each geometry's centroid as a numeric feature column.
tax['x'] = tax.geometry.centroid.x

In [35]:
# Add the y coordinate of each geometry's centroid as a numeric feature column.
# (This is the column tax['y'], unrelated to the target variable `y`.)
tax['y'] = tax.geometry.centroid.y

In [36]:
# Feature matrix: every column of `tax` except the target, the raw geometry,
# and the identifier / location / district label columns listed here.
columns_to_drop = [
    'geometry',
    'property_location',
    'block',
    'lot',
    'assessed_land_value',
    'parcel_number',
    'assessor_neighborhood',
    'assessor_neighborhood_district',
    'analysis_neighborhood',
    'supervisor_district_2012',
    'supervisor_district',
]
X = tax.drop(columns=columns_to_drop)

In [37]:
# Turn the last sale date into a numeric feature: elapsed time, in years of
# 365.25 days, between that date and a fixed reference date. The raw date
# column is removed once the derived feature exists.
current_date = pd.to_datetime('2023-01-01')
time_difference = current_date - X['current_sales_date']
X = (
    X
    .assign(years_since_last_sale=time_difference / pd.Timedelta(days=365.25))
    .drop(columns='current_sales_date')
)

In [38]:
# Remove a previously written `imputed_land_value` column if one is present, so
# the model never sees its own earlier output; a missing column is not an error.
X = X.drop(columns=['imputed_land_value'], errors='ignore')

In [39]:
# Training rows are those with a strictly positive assessed land value; every
# other row (zero, and also negative or missing values should any occur) is
# left for imputation. `> 0` rather than `!= 0` keeps the mask inside the
# domain of the log10 transform applied to the target, so no NaN labels can
# reach the model.
trainMask = y > 0

In [40]:
# Map the missing-value spellings 'None', None and 'nan' onto the single token
# 'NaN', modifying X in place.
# NOTE(review): the replacement value is the *string* 'NaN', not a float NaN --
# presumably so categorical columns carry an explicit "missing" level; confirm
# what this does to numeric columns on the pandas version in use.
X.replace({'None': 'NaN', None: 'NaN', 'nan': 'NaN'}, inplace=True)

In [55]:
# Partition the features: rows selected by trainMask are used for fitting,
# the remaining rows are the ones whose value will be predicted.
trainX = X.loc[trainMask]
testX = X.loc[~trainMask]

In [56]:
# Target values for the training rows only.
trainY = y[trainMask]

In [57]:
# Work with the target on a log10 scale; predictions are mapped back with 10 ** later on.
trainY = np.log10(trainY)

In [65]:
# Positional indices of the columns to be treated as categorical: any column
# whose dtype compares equal to 'object', 'category' or 'string'.
column_dtypes = trainX.dtypes
is_categorical = (
    (column_dtypes == 'object')
    | (column_dtypes == 'category')
    | (column_dtypes == 'string')
)
cat_indices = np.flatnonzero(is_categorical)

In [67]:
# Bundle the training features and the log10-scale target into a CatBoost Pool,
# flagging the categorical columns by position.
train_dataset = Pool(trainX, label=trainY, cat_features=cat_indices)

In [76]:
def hyperopt_objective(params):
    """Score one hyperparameter configuration for the hyperopt search.

    Runs 5-fold CatBoost cross-validation on the module-level ``train_dataset``
    using ``params`` and returns the smallest value in the resulting
    ``test-RMSE-mean`` column (lower is better).
    """
    # Echo the sampled configuration so the search progress is visible.
    print(params)
    fold_metrics = cv(
        pool=train_dataset,
        params=params,
        fold_count=5,
        shuffle=True,
        partition_random_seed=1,
        stratified=False,
        early_stopping_rounds=10,
        logging_level='Silent',
    )
    return fold_metrics['test-RMSE-mean'].min()


# We initiate a Trials object to keep track of search results and we set a hyperparameter space.
trials = Trials()
# Plain values below are held constant for every trial; hp.* entries are sampled by hyperopt.
space = {
    "iterations": 100,
    # NOTE(review): the learning rate is confined to [0.999, 1], far above typical
    # boosting step sizes -- confirm this narrow, aggressive range is intentional.
    'learning_rate': hp.uniform('learning_rate', .999, 1),
    'depth': hp.randint('depth', 2, 8),
    # Log-uniform between exp(-5) and exp(0), i.e. roughly 0.0067 to 1.
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', -5, 0),
    'border_count': hp.randint('border_count', 2, 50),
    'random_strength': hp.uniform('random_strength', 0, 2),
    'boosting_type': 'Plain',
    'loss_function': 'RMSE',
    # NOTE(review): at most 0.5% of the rows are sampled per tree -- presumably
    # chosen for speed on a large dataset; confirm.
    'subsample': hp.uniform('subsample', 0.0001, 0.005),
    'one_hot_max_size': hp.randint('one_hot_max_size', 10, 100)
}

# We finally use fmin to perform the search: 10 TPE evaluations with a fixed
# random state, every evaluation being recorded in `trials`.
best = fmin(hyperopt_objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10,
            rstate=np.random.default_rng(0),
            trials=trials)

{'boosting_type': 'Plain', 'border_count': 49, 'depth': 3, 'iterations': 100, 'l2_leaf_reg': 0.6230796831865983, 'learning_rate': 0.9998350599742374, 'loss_function': 'RMSE', 'one_hot_max_size': 37, 'random_strength': 0.37866685630985386, 'subsample': 0.0034430917416565007}
{'boosting_type': 'Plain', 'border_count': 26, 'depth': 3, 'iterations': 100, 'l2_leaf_reg': 0.2745707529802382, 'learning_rate': 0.9995688168224243, 'loss_function': 'RMSE', 'one_hot_max_size': 43, 'random_strength': 0.5514888968076841, 'subsample': 0.004152867627059053}
{'boosting_type': 'Plain', 'border_count': 29, 'depth': 4, 'iterations': 100, 'l2_leaf_reg': 0.03633899160945536, 'learning_rate': 0.9993738457286979, 'loss_function': 'RMSE', 'one_hot_max_size': 34, 'random_strength': 1.4056042615673574, 'subsample': 0.00226655591060802}
{'boosting_type': 'Plain', 'border_count': 12, 'depth': 7, 'iterations': 100, 'l2_leaf_reg': 0.006919942400049253, 'learning_rate': 0.999453998800395, 'loss_function': 'RMSE', 'on

In [77]:
# Show the winning configuration. Only the sampled (hp.*) parameters are present;
# the constants from `space` (iterations, boosting_type, loss_function) are not.
best

{'border_count': 39,
 'depth': 6,
 'l2_leaf_reg': 0.6784225761147133,
 'learning_rate': 0.9996952973484385,
 'one_hot_max_size': 99,
 'random_strength': 1.2455487454176062,
 'subsample': 0.004991921744024715}

In [78]:
# Independent 5-fold check of the tuned configuration: refit on each training
# split and record the R^2 on the held-out split (log10-scale target).
r2_scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in kf.split(trainX):
    # Features and targets for this fold.
    X_train_r = trainX.iloc[train_index]
    X_test_r = trainX.iloc[test_index]
    y_train_r = trainY.iloc[train_index]
    y_test_r = trainY.iloc[test_index]

    train_dataset_r = Pool(X_train_r, label=y_train_r, cat_features=cat_indices)
    test_dataset_r = Pool(X_test_r, label=y_test_r, cat_features=cat_indices)

    # A fresh model per fold, using the fixed settings plus the tuned values in `best`.
    model_r = CatBoostRegressor(
        iterations=100,
        loss_function='RMSE',
        random_seed=0,
        verbose=1,
        **best,
    )
    model_r.fit(train_dataset_r, verbose=0)

    predictions_r = model_r.predict(test_dataset_r)
    r2 = r2_score(y_test_r, predictions_r)
    r2_scores.append(r2)
r2_scores

[0.9007638934158005,
 0.8974217162265156,
 0.8970358835944835,
 0.8981508654629815,
 0.8988909527223472]

In [79]:
# Mean held-out R^2 across folds. Dividing by the number of collected scores
# (rather than a literal 5) keeps the average correct if the fold count changes.
sum(r2_scores) / len(r2_scores)

0.8984526622844257

In [80]:
# Persist the hyperopt search history. The file is opened in a `with` block so
# the handle is flushed and closed before the file is read back.
with open("my_trials_impute_land_value.pkl", "wb") as f:
    pickle.dump(trials, f)

# Read the history back from disk. Only unpickle files produced by this
# notebook: pickle.load can execute arbitrary code from an untrusted file.
# The result gets a descriptive name so the builtin `object` is not shadowed.
with open("my_trials_impute_land_value.pkl", "rb") as f:
    loaded_trials = pickle.load(f)

# Export the recorded trials as a CSV table.
df_trials = pd.DataFrame(loaded_trials)
df_trials.to_csv(r'catboost_results_impute_land_value.csv')

In [69]:
# Winning hyperparameters from the hyperopt search, pinned here so the final
# model can be built without re-running the search.
best = dict(
    border_count=39,
    depth=6,
    l2_leaf_reg=0.6784225761147133,
    learning_rate=0.9996952973484385,
    one_hot_max_size=99,
    random_strength=1.2455487454176062,
    subsample=0.004991921744024715,
)

In [70]:
# Final imputation model: 100 iterations, RMSE loss and seed 0, combined with
# the tuned hyperparameters held in `best`.
catboostFinal = CatBoostRegressor(
    iterations=100,
    loss_function='RMSE',
    random_seed=0,
    **best,
)

In [71]:
# Fit the final model on the full training Pool (all rows selected by trainMask).
catboostFinal.fit(train_dataset)

0:	learn: 0.3340890	total: 243ms	remaining: 24.1s
1:	learn: 0.3030293	total: 359ms	remaining: 17.6s
2:	learn: 0.2880295	total: 470ms	remaining: 15.2s
3:	learn: 0.2749327	total: 528ms	remaining: 12.7s
4:	learn: 0.2703208	total: 583ms	remaining: 11.1s
5:	learn: 0.2678376	total: 640ms	remaining: 10s
6:	learn: 0.2644806	total: 700ms	remaining: 9.3s
7:	learn: 0.2621350	total: 763ms	remaining: 8.78s
8:	learn: 0.2599772	total: 820ms	remaining: 8.29s
9:	learn: 0.2562543	total: 874ms	remaining: 7.86s
10:	learn: 0.2543390	total: 931ms	remaining: 7.53s
11:	learn: 0.2500850	total: 984ms	remaining: 7.21s
12:	learn: 0.2469682	total: 1.03s	remaining: 6.93s
13:	learn: 0.2451688	total: 1.09s	remaining: 6.69s
14:	learn: 0.2438369	total: 1.14s	remaining: 6.48s
15:	learn: 0.2420533	total: 1.2s	remaining: 6.31s
16:	learn: 0.2410386	total: 1.26s	remaining: 6.14s
17:	learn: 0.2397366	total: 1.32s	remaining: 6.01s
18:	learn: 0.2391014	total: 1.37s	remaining: 5.85s
19:	learn: 0.2387519	total: 1.43s	remaining: 

<catboost.core.CatBoostRegressor at 0x13d262750>

In [72]:
# Predict the log10-scale target for the rows excluded from training.
predictions = catboostFinal.predict(testX)

In [73]:
# Undo the log10 transform applied to trainY to get values in the original units.
original_scale = 10 ** predictions

In [74]:
# Start the imputed column as a copy of the assessed values; the rows outside
# the training mask are overwritten with predictions afterwards.
tax['imputed_land_value'] = tax['assessed_land_value']

In [75]:
# Sanity check: fraction of back-transformed predictions that are negative.
# (10 ** x is positive for any finite x, so this is expected to be 0.)
(original_scale < 0).mean()

0.0

In [76]:
# Write the predictions into the rows outside the training mask. This relies on
# original_scale being in the same row order as tax[~trainMask], which holds
# because testX was taken as X[~trainMask] and X was derived from tax by
# dropping columns only.
tax.loc[~trainMask, 'imputed_land_value'] = original_scale

In [77]:
# Summary statistics of the imputed column.
# NOTE(review): in the recorded output the maximum (~1.24e11) is roughly 200 times the
# largest assessed value (~6.26e8) and the minimum is ~7e-3, so some predictions are
# extreme on the original scale -- worth inspecting before using the column downstream.
(tax['imputed_land_value']).describe()

count    3.086855e+06
mean     1.359076e+06
std      1.790217e+08
min      7.238988e-03
25%      6.727400e+04
50%      2.350000e+05
75%      4.755210e+05
max      1.241680e+11
Name: imputed_land_value, dtype: float64

In [78]:
# Summary statistics of the raw assessed values, for comparison with the imputed column.
(tax['assessed_land_value']).describe()

count    3.086855e+06
mean     4.709204e+05
std      2.867748e+06
min      0.000000e+00
25%      6.003500e+04
50%      2.246580e+05
75%      4.625970e+05
max      6.257076e+08
Name: assessed_land_value, dtype: float64

In [79]:
# Fraction of negative values in the final imputed column.
(tax['imputed_land_value'] < 0).mean()

0.0

In [80]:
# Persist the augmented frame (now carrying x, y and imputed_land_value).
# NOTE(review): this writes to the same path the notebook loaded `tax` from, so the
# input file is overwritten and a re-run starts from the already-augmented data.
tax.to_feather('../data/tax.geofeather')

## Assessed Value (extra-trees baseline)

In [92]:
# One-hot encode X for the tree baseline: the first level of each encoded
# column is dropped and an extra indicator column is added for missing values.
X_dummy = pd.get_dummies(X, drop_first=True, dummy_na=True)

In [93]:
# Replace any remaining missing values with the sentinel -9999.
X_dummy = X_dummy.fillna(-9999)

In [94]:
# Same train / to-impute partition as before, applied to the one-hot encoded features.
trainX_dummy = X_dummy.loc[trainMask]
testX_dummy = X_dummy.loc[~trainMask]

In [95]:
# Baseline estimator wrapped in a one-step pipeline; the step name 'rf' is the
# prefix used by the search-space keys below.
extra_trees = ExtraTreesRegressor(
    n_estimators=100,
    random_state=0,
    max_samples=1000,
    bootstrap=True,
    n_jobs=-1,
)
regression = Pipeline(steps=[('rf', extra_trees)])

# Distributions sampled by the randomized search.
rf_param_grid = [
    {
        # Integer tree depth drawn from 1..9 (upper bound exclusive).
        'rf__max_depth': stats.randint(1, 10),
        # Cost-complexity pruning strength, log-uniform over 1e-10..1e-1.
        'rf__ccp_alpha': stats.loguniform(1e-10, 1e-1),
        # scipy's uniform takes (loc, scale): fraction of features in [0.05, 1.0].
        'rf__max_features': stats.uniform(.05, .95),
        # Bootstrap sample size per tree, integer in 500..999.
        'rf__max_samples': stats.randint(500, 1000),
    }
]

In [97]:
# Randomized search: 10 sampled configurations, each scored with 5-fold
# cross-validation against the log10-scale target.
rs = RandomizedSearchCV(
    estimator=regression,
    param_distributions=rf_param_grid,
    n_iter=10,
    cv=5,
    random_state=0,
)
rs.fit(trainX_dummy, trainY)

In [98]:
# One row per sampled configuration with its mean cross-validated score,
# best configuration first.
param_search_results = (
    pd.DataFrame(rs.cv_results_['params'])
    .assign(score=rs.cv_results_['mean_test_score'])
    .sort_values('score', ascending=False)
)
param_search_results

Unnamed: 0,rf__ccp_alpha,rf__max_depth,rf__max_features,rf__max_samples,score
2,4.762242e-08,9,0.96548,814,0.825683
3,1.990609e-06,8,0.55245,588,0.789555
0,8.69604e-06,6,0.852052,751,0.764714
5,2.061455e-07,6,0.789249,931,0.757329
4,3.428176e-07,9,0.132773,615,0.700053
1,8.015833e-06,4,0.663599,792,0.666153
6,0.006773066,4,0.809201,743,0.663868
9,1.829801e-06,3,0.483343,790,0.609365
7,4.833896e-06,4,0.162361,788,0.534285
8,1.730526e-05,1,0.545756,744,0.408146
