In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor, Pool, cv

from hyperopt import fmin, hp, tpe, STATUS_OK, STATUS_FAIL, Trials
import pickle
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

In [2]:
# Load the tax GeoDataFrame from its feather file; the later frames in this
# notebook (X, fmv) are all derived from it.
tax = gpd.read_feather('../data/tax.geofeather')

In [3]:
# Derive the centroid series once and reuse it for both coordinates, rather
# than recomputing the centroid of every geometry for each axis.
centroids = tax.geometry.centroid
tax['x'] = centroids.x
tax['y'] = centroids.y

In [4]:
# Working frame: a copy of tax without the columns listed below.
X = tax.drop(
    columns=[
        'property_location',
        'block',
        'lot',
        'parcel_number',
        'assessor_neighborhood',
        'assessor_neighborhood_district',
        'analysis_neighborhood',
        'supervisor_district_2012',
        'supervisor_district',
    ]
)

In [5]:
# Target column: imputed land value plus the assessed fixtures and improvement
# values. Element-wise addition without a fill value, so a missing addend
# leaves the sum missing.
X['assessed_sales_value'] = (
    X['imputed_land_value']
    .add(X['assessed_fixtures_value'])
    .add(X['assessed_improvement_value'])
)


In [6]:
# Training subset: rows whose build year or last sale year is no earlier than
# the year before the closed roll year. Every term of the mask is taken from X
# (the original mixed X and tax, which only worked through index alignment),
# and the cutoff is computed once.
recent_cutoff = X.closed_roll_year - 1
built_recently = X.year_property_built >= recent_cutoff
sold_recently = X.current_sales_date.dt.year >= recent_cutoff
fmv = X[built_recently | sold_recently].copy()

# Keep strictly positive targets only (log10 is applied to them later).
fmv = fmv[fmv['assessed_sales_value'] > 0]

# Where several rows share build year, sale date and geometry, keep the one
# that sorts first, i.e. the one with the largest assessed_sales_value.
fmv = (
    fmv
    .sort_values(['assessed_sales_value'], ascending=False)
    .drop_duplicates(['year_property_built', 'current_sales_date', 'geometry'])
)

In [7]:
# The sale date and geometry were only needed for filtering and de-duplication
# above; remove them before modelling.
# NOTE: rebinds fmv, so re-running this cell alone fails once the columns are gone.
fmv = fmv.drop(columns=['current_sales_date', 'geometry'])

In [8]:
# Mean of the target column over every row of X, for comparison with the fmv subset.
X['assessed_sales_value'].mean()

1035241.3938629171

In [9]:
# Mean of the target column over the fmv training subset only.
fmv['assessed_sales_value'].mean()

2688731.003061709

In [10]:
# Smallest target value left in fmv after the `> 0` filter applied when fmv was built.
fmv['assessed_sales_value'].min()

1.0

In [11]:
# Remove the assessed_* value columns so they are not offered to the model as
# features.
# NOTE(review): imputed_land_value, one of the three addends of
# assessed_sales_value, is not removed here and therefore remains a feature;
# confirm this is intended.
fmv = fmv.drop(
    columns=[
        'assessed_land_value',
        'assessed_fixtures_value',
        'assessed_improvement_value',
        'assessed_personal_property_value',
    ]
)
fmv.shape

(99496, 30)

In [12]:
# Split fmv into the regression target (on a log10 scale) and the feature
# frame, which is every remaining column.
y_fmv = np.log10(fmv['assessed_sales_value'])
X_fmv = fmv.drop(columns=['assessed_sales_value'])

In [13]:
# Positional indices of the object-dtype columns of X_fmv; these are passed to
# CatBoost as the categorical features.
cat_indices = np.flatnonzero(X_fmv.dtypes == 'object')

In [14]:
# Bundle the features, the log10 target and the categorical column positions
# into one CatBoost Pool; it is used by the hyperparameter search and by the
# final fit.
train_pool = Pool(data=X_fmv, label=y_fmv, cat_features=cat_indices)

def hyperopt_objective(params):
    """Score one hyperparameter candidate with 5-fold CatBoost cross-validation.

    Parameters
    ----------
    params : dict
        CatBoost training parameters for this evaluation. They are echoed to
        stdout and forwarded unchanged to ``cv``.

    Returns
    -------
    The minimum of the ``'test-RMSE-mean'`` entry of the cross-validation
    result, evaluated on the module-level ``train_pool``.
    """
    print(params)
    cv_settings = {
        'params': params,
        'pool': train_pool,
        'fold_count': 5,
        'shuffle': True,
        'partition_random_seed': 0,
        'stratified': False,
        'logging_level': 'Silent',
        'early_stopping_rounds': 10,
    }
    cv_results = cv(**cv_settings)
    return cv_results['test-RMSE-mean'].min()


# Trials keeps a record of every evaluation performed during the search.
trials = Trials()

# Parameters that are identical for every evaluation.
fixed_params = {
    "iterations": 1000,
    'loss_function': 'RMSE',
}

# Parameters explored by the optimiser.
# The order of the boosting_type options matters: fmin reports the chosen
# option by its position in this list, and the decoding cell further down
# maps position 0 to 'Ordered' and anything else to 'Plain'.
searched_params = {
    'learning_rate': hp.uniform('learning_rate', 0.25, 0.5),
    'depth': hp.randint('depth', 2, 10),
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', -5, 5),
    'border_count': hp.randint('border_count', 10, 255),
    'subsample': hp.uniform('subsample', 0.25, .75),
    'boosting_type': hp.choice('boosting_type', ['Ordered', 'Plain']),
}

space = {**fixed_params, **searched_params}

# Run the TPE search: 10 evaluations with a seeded random state.
best = fmin(
    hyperopt_objective,
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    rstate=np.random.default_rng(42),
    trials=trials,
)

{'boosting_type': 'Ordered', 'border_count': 169, 'depth': 4, 'iterations': 1000, 'l2_leaf_reg': 46.491349113984924, 'learning_rate': 0.40502974941916026, 'loss_function': 'RMSE', 'subsample': 0.6568277038201439}
{'boosting_type': 'Plain', 'border_count': 24, 'depth': 9, 'iterations': 1000, 'l2_leaf_reg': 36.94036365256856, 'learning_rate': 0.40206229054278414, 'loss_function': 'RMSE', 'subsample': 0.4002106773952542}
{'boosting_type': 'Plain', 'border_count': 48, 'depth': 6, 'iterations': 1000, 'l2_leaf_reg': 3.2433557292767263, 'learning_rate': 0.268415256362631, 'loss_function': 'RMSE', 'subsample': 0.3234813936425139}
{'boosting_type': 'Plain', 'border_count': 182, 'depth': 3, 'iterations': 1000, 'l2_leaf_reg': 2.5206936794284682, 'learning_rate': 0.30305427924833384, 'loss_function': 'RMSE', 'subsample': 0.4518888351309896}
{'boosting_type': 'Ordered', 'border_count': 94, 'depth': 5, 'iterations': 1000, 'l2_leaf_reg': 0.11926149002222346, 'learning_rate': 0.2741611614192523, 'loss

In [15]:
# Result returned by fmin. Note that boosting_type is reported as a position in
# the hp.choice option list rather than as the option string itself.
best

{'boosting_type': 1,
 'border_count': 182,
 'depth': 3,
 'l2_leaf_reg': 2.5206936794284682,
 'learning_rate': 0.30305427924833384,
 'subsample': 0.4518888351309896}

In [17]:
# fmin reports the hp.choice result as a position in ['Ordered', 'Plain'];
# translate it to the string CatBoost expects. The isinstance guard makes the
# cell safe to re-run: without it, an already-decoded 'Ordered' compares unequal
# to 0 and would silently be rewritten to 'Plain'.
_boosting_choice = best.get('boosting_type')
if not isinstance(_boosting_choice, str):
    best['boosting_type'] = 'Ordered' if _boosting_choice == 0 else 'Plain'

In [18]:
# Check of the tuned parameters: refit on each of 5 shuffled training folds and
# record R^2 (on the log10 target) for the held-out fold.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
r2_scores = []

for train_index, test_index in kf.split(X_fmv):
    # Features and target for the training part of this fold ...
    X_train_r = X_fmv.iloc[train_index]
    y_train_r = y_fmv.iloc[train_index]
    # ... and for the held-out part.
    X_test_r = X_fmv.iloc[test_index]
    y_test_r = y_fmv.iloc[test_index]

    train_dataset_r = Pool(data=X_train_r, label=y_train_r, cat_features=cat_indices)
    test_dataset_r = Pool(data=X_test_r, label=y_test_r, cat_features=cat_indices)

    # NOTE(review): verbose is 1 in the constructor and 0 in fit(); presumably
    # the fit() value takes effect — confirm and keep only one of them.
    model_r = CatBoostRegressor(
        iterations=1000,
        loss_function='RMSE',
        random_seed=0,
        verbose=1,
        **best
    )
    model_r.fit(train_dataset_r, verbose=0)

    predictions_r = model_r.predict(test_dataset_r)
    r2_scores.append(r2_score(y_test_r, predictions_r))
r2_scores

[0.9784461401138946,
 0.9733501054538772,
 0.9760272356838665,
 0.9766343203615577,
 0.976591335175723]

In [19]:
# Average R^2 across folds; divide by the number of scores actually collected
# so the result stays correct if the number of splits is changed.
sum(r2_scores) / len(r2_scores)

0.9762098273577837

In [20]:
# Persist the search history. The context manager guarantees the handle is
# flushed and closed before the file is reopened below (the original passed a
# bare open(...) to pickle.dump and never closed it).
with open("my_trials_fmv.pkl", "wb") as f:
    pickle.dump(trials, f)

# Read the pickle back. The file was written just above by this notebook;
# never unpickle a file obtained from an untrusted source.
with open("my_trials_fmv.pkl", "rb") as f:
    loaded_trials = pickle.load(f)  # not named `object`, which would shadow the builtin

# Tabulate the loaded trials and export them as CSV.
df_trials = pd.DataFrame(loaded_trials)
df_trials.to_csv(r'catboost_results_fmv.csv')

In [21]:
# Final regressor: the tuned parameters in best plus the same loss function,
# seed and iteration count that the K-fold check used.
catboostFinal = CatBoostRegressor(**best, loss_function='RMSE', random_seed=0, iterations=1000)

In [22]:
# Fit the final model on train_pool, i.e. on all of X_fmv / y_fmv. No verbosity
# is set here, so per-iteration progress is printed to the cell output.
catboostFinal.fit(train_pool)

0:	learn: 0.3717565	total: 26.2ms	remaining: 26.1s
1:	learn: 0.2921253	total: 52.4ms	remaining: 26.2s
2:	learn: 0.2378911	total: 79.3ms	remaining: 26.3s
3:	learn: 0.1979664	total: 106ms	remaining: 26.3s
4:	learn: 0.1706814	total: 131ms	remaining: 26.1s
5:	learn: 0.1516278	total: 155ms	remaining: 25.7s
6:	learn: 0.1382901	total: 181ms	remaining: 25.6s
7:	learn: 0.1293655	total: 207ms	remaining: 25.7s
8:	learn: 0.1228840	total: 231ms	remaining: 25.4s
9:	learn: 0.1184185	total: 255ms	remaining: 25.3s
10:	learn: 0.1154582	total: 280ms	remaining: 25.2s
11:	learn: 0.1128225	total: 304ms	remaining: 25s
12:	learn: 0.1107489	total: 327ms	remaining: 24.9s
13:	learn: 0.1092006	total: 352ms	remaining: 24.8s
14:	learn: 0.1078854	total: 375ms	remaining: 24.6s
15:	learn: 0.1063007	total: 412ms	remaining: 25.3s
16:	learn: 0.1054062	total: 434ms	remaining: 25.1s
17:	learn: 0.1045612	total: 459ms	remaining: 25.1s
18:	learn: 0.1038270	total: 483ms	remaining: 25s
19:	learn: 0.1030233	total: 509ms	remainin

<catboost.core.CatBoostRegressor at 0x147b81a90a0>

In [23]:
# Rebuild X as the scoring frame for every row of tax. The list combines the
# columns removed when X was first built with the columns later removed from
# fmv (geometry, sale date and the assessed_* values).
# NOTE: this rebinds X; the earlier X that carried assessed_sales_value is gone.
X = tax.drop(columns = 
             ['property_location', 'block', 'lot', 'parcel_number', 
              'assessor_neighborhood', 'assessor_neighborhood_district', 
              'analysis_neighborhood',
              'supervisor_district_2012', 'supervisor_district', 'geometry', 
              'current_sales_date', 
              'assessed_land_value',
              'assessed_fixtures_value',
              'assessed_improvement_value', 
              'assessed_personal_property_value'])


In [24]:
# The model was fitted on X_fmv, so the frame being scored must expose exactly
# the same columns in the same order. Fail loudly instead of predicting on
# misaligned features (e.g. if tax has gained a column since X was built).
assert list(X.columns) == list(X_fmv.columns), "prediction features do not match training features"

# Predictions are on the log10 scale of the training target.
predictions = catboostFinal.predict(X)

In [25]:
# Sanity check: one prediction per row of X.
predictions.shape, X.shape

((3086855,), (3086855, 29))

In [26]:
# Row count of tax, to compare against the number of predictions.
tax.shape

(3086855, 44)

In [39]:
# Smallest prediction, converted back from the log10 scale to the original value scale.
10 ** predictions.min()

0.8851891094350547

In [28]:
# Index labels of the fmv rows; they are used below to write observed values back into tax.
fmv.index

Index([ 465702, 1142808, 1448137, 1727496, 1956833, 1510364, 1531132, 1247134,
       1851763, 1319556,
       ...
       2873268, 2957517, 2830798, 2828012, 2829837, 1791195, 1106797, 1215313,
       1421060, 1564480],
      dtype='int64', length=99496)

In [30]:
# Undo the log10 transform of the target and store the result for every row of tax.
tax['imputed_fair_market_acquisition'] = np.power(10, predictions)

In [35]:
# For the rows that belong to fmv, replace the model prediction with the
# assessed_sales_value held in fmv. The assignment aligns on index labels —
# assumes tax's index labels are unique; TODO confirm.
tax.loc[fmv.index, 'imputed_fair_market_acquisition'] = fmv['assessed_sales_value']

In [36]:
# Display tax ordered from the smallest to the largest imputed value (returns a
# sorted copy; tax itself is not reordered).
tax.sort_values('imputed_fair_market_acquisition', ascending=True)

Unnamed: 0,property_class_code_definition,lot_code,property_area,volume_number,percent_of_ownership,misc_exemption_value,zoning_code,year_property_built,analysis_neighborhood,number_of_units,...,assessed_land_value,basement_area,assessed_improvement_value,geometry,zillow_neighborhood_name,x,y,imputed_land_value,years_since_last_sale,imputed_fair_market_acquisition
2701519,Vacant Lot - Restrictions,,0.0,40.0,,0.0,RH1,,Outer Mission,0.0,...,0.0,0.0,0.000000e+00,POINT (-214255.424 -29068.547),Outer Mission,-214255.423595,-29068.546512,5.135027e+01,,8.851891e-01
2704190,Vacant Lot - Restrictions,,0.0,40.0,,0.0,RH2,,West of Twin Peaks,0.0,...,0.0,0.0,0.000000e+00,POINT (-214775.592 -28863.972),Outer Mission,-214775.591801,-28863.972164,3.241978e+01,,9.148448e-01
2713235,Vacant Lot - Restrictions,,0.0,42.0,,0.0,RH1,,Outer Mission,0.0,...,0.0,0.0,0.000000e+00,POINT (-215617.435 -30710.843),Outer Mission,-215617.434835,-30710.842658,3.143088e+01,,9.242261e-01
2709598,Vacant Lot - Restrictions,,0.0,42.0,,0.0,RH1,,Outer Mission,0.0,...,0.0,0.0,0.000000e+00,POINT (-215559.212 -30663.382),Outer Mission,-215559.211853,-30663.381947,3.143088e+01,,9.242261e-01
2711132,Vacant Lot - Restrictions,,0.0,42.0,,0.0,RH1,,Outer Mission,0.0,...,0.0,0.0,0.000000e+00,POINT (-215080.653 -30319.423),Outer Mission,-215080.653333,-30319.423029,3.242856e+01,,9.251364e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1448137,Office,R,1420430.0,25.0,1.0,0.0,C3OSD,2018.0,Financial District/South Beach,0.0,...,211747199.0,0.0,1.479998e+09,POINT (-210762.748 -22516.193),Financial District,-210762.747553,-22516.192853,2.117472e+08,9.768652,1.691745e+09
2638896,Office,R,1420430.0,25.0,1.0,0.0,C3OSD,2018.0,Financial District/South Beach,0.0,...,218219716.0,0.0,1.583330e+09,POINT (-210762.748 -22516.193),Financial District,-210762.747553,-22516.192853,2.182197e+08,9.768652,1.700999e+09
1238901,Office,R,1420430.0,25.0,1.0,0.0,C3OSD,2018.0,Financial District/South Beach,0.0,...,215982142.0,0.0,1.567095e+09,POINT (-210762.748 -22516.193),Financial District,-210762.747553,-22516.192853,2.159821e+08,9.768652,1.700999e+09
1142808,Hospitals,R,1015000.0,5.0,1.0,691737757.0,RC4,2019.0,Western Addition,0.0,...,73531026.0,0.0,1.663416e+09,POINT (-213019.407 -22883.233),Western Addition,-213019.406713,-22883.232908,7.353103e+07,,2.606239e+09


In [37]:
# Write tax, now including the imputed_fair_market_acquisition column, to a new feather file.
tax.to_feather('../data/fair_market_acquisition.geofeather')