In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd

from scipy import stats

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.pipeline import make_pipeline, Pipeline

# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline

In [9]:
# Load the raw tax-roll export.
tax = gpd.read_file('../data/tax.geojson')

# Reduce file size by casting to numeric.
# Values that cannot be parsed are coerced to NaN instead of raising.
to_float = [
    'percent_of_ownership', 'year_property_built', 'supervisor_district_2012',
    'supervisor_district', 'number_of_rooms', 'number_of_stories', 'misc_exemption_value',
    'number_of_units', 'property_area', 'basement_area',
]
tax[to_float] = tax[to_float].apply(pd.to_numeric, errors='coerce')

# Same coercion, additionally asking pandas to downcast to the smallest integer dtype it can.
to_int = [
    'assessed_improvement_value', 'assessed_land_value', 'assessed_personal_property_value',
    'assessed_fixtures_value', 'homeowner_exemption_value',
    'lot_frontage', 'number_of_bathrooms', 'lot_depth', 'lot_area',
    'number_of_bedrooms', 'closed_roll_year', 'volume_number',
]
tax[to_int] = tax[to_int].apply(pd.to_numeric, errors='coerce', downcast='integer')

# Remove redundant columns (These columns use codes described in plain english by other columns)
# tax = tax.drop(['assessor_neighborhood_code', 'property_class_code', 'exemption_code', 'use_code'], axis=1)

# Remove other irrelevant columns
# tax = tax.drop(['row_id', 'data_as_of', 'data_loaded_at'], axis=1)

# Save in efficient format
tax.to_feather('../data/tax.geofeather')

In [2]:
# Reload the table from the feather file written by the conversion cell.
# NOTE(review): the last cell of this notebook overwrites this same file with an extra
# 'imputed_land_value' column, so on later runs that column is already present here.
tax = gpd.read_feather('../data/tax.geofeather')

In [3]:
# Regression target: the assessed land value of each row.
y = tax['assessed_land_value']

In [4]:
# Feature frame: everything except the target, the geometry and the identifier-like columns.
X = tax.drop(
    columns=[
        'geometry',
        'property_location',
        'block',
        'lot',
        'assessed_land_value',
        'parcel_number',
    ]
)

In [7]:
# A previous run of this notebook may have written 'imputed_land_value' into the feather file;
# drop it when present so it is not used as a feature (no-op when the column is absent).
X = X.drop(columns=['imputed_land_value'], errors='ignore')

In [5]:
# Replace the raw sale date with the elapsed time, in fractional years (365.25-day years),
# between that date and a fixed reference date.
current_date = pd.to_datetime('2023-01-01')
time_difference = current_date - X['current_sales_date']
X['years_since_last_sale'] = time_difference / pd.Timedelta(days=365.25)

# The raw date column is no longer needed once the numeric feature exists.
X = X.drop(columns=['current_sales_date'])

In [8]:
# Display the feature columns that remain before one-hot encoding.
X.columns

Index(['property_class_code_definition', 'lot_code', 'property_area',
       'volume_number', 'percent_of_ownership', 'misc_exemption_value',
       'zoning_code', 'year_property_built', 'analysis_neighborhood',
       'number_of_units', 'use_definition', 'closed_roll_year', 'status_code',
       'number_of_bedrooms', 'assessor_neighborhood', 'number_of_stories',
       'assessor_neighborhood_district', 'exemption_code_definition',
       'lot_area', 'number_of_rooms', 'lot_depth',
       'assessed_personal_property_value', 'supervisor_district_2012',
       'number_of_bathrooms', 'construction_type', 'lot_frontage',
       'homeowner_exemption_value', 'tax_rate_area_code',
       'assessed_fixtures_value', 'supervisor_district', 'basement_area',
       'assessed_improvement_value', 'years_since_last_sale'],
      dtype='object')

In [9]:
# One-hot encode the non-numeric columns. Missing values get their own indicator column
# (dummy_na) and the first level of each column is dropped (drop_first).
X = pd.get_dummies(
    data=X,
    dummy_na=True,
    drop_first=True,
)

In [12]:
# Replace every remaining missing value with the sentinel -9999.
X = X.fillna(value=-9999)

In [15]:
# Rows with a usable target are used for training; the complement (~trainMask) is imputed.
# A target of 0 is treated as "not assessed". A missing target must be excluded explicitly:
# `NaN != 0` evaluates to True, so without the notna() guard such rows would be placed in the
# training set and the regressor's fit would fail on NaN targets.
trainMask = y.notna() & (y != 0)

In [16]:
# Split the features with the mask: masked-in rows train the model, the rest get predictions.
trainX = X.loc[trainMask]
testX = X.loc[~trainMask]

In [17]:
# Split the target with the same mask so rows stay aligned with trainX / testX.
trainY = y.loc[trainMask]
testY = y.loc[~trainMask]

In [18]:
# Display (rows, columns) of the training feature matrix.
trainX.shape

(3003214, 506)

In [19]:
# Display the training target.
trainY

0            13294
2          3207694
4            21280
6            21643
7            21643
            ...   
3086505     510170
3086506     857541
3086507     744017
3086508     135200
3086509    1081200
Name: assessed_land_value, Length: 3003214, dtype: int32

In [20]:
# Single-step pipeline around an extra-trees regressor. The step is named 'rf', which is why
# every search parameter below carries the 'rf__' prefix.
regression = Pipeline(
    steps=[
        (
            'rf',
            ExtraTreesRegressor(
                n_estimators=100,
                bootstrap=True,
                max_samples=1000,  # starting value; the search below samples its own
                n_jobs=-1,
                random_state=0,
            ),
        ),
    ]
)

# Distributions sampled by the randomized search.
rf_param_grid = [
    dict(
        rf__max_depth=stats.randint(5, 35),
        rf__ccp_alpha=stats.loguniform(1e-14, 1e-7),
        # scipy's uniform(loc, scale) spans [loc, loc + scale], i.e. [0.05, 1.0] here.
        rf__max_features=stats.uniform(0.05, 0.95),
        # Upper bound is 80% of the training rows.
        rf__max_samples=stats.randint(900, int(len(trainY) * 0.8)),
    )
]

In [None]:
# Randomized hyper-parameter search: 10 sampled configurations, each scored with 5-fold CV.
rs = RandomizedSearchCV(
    estimator=regression,
    param_distributions=rf_param_grid,
    n_iter=10,
    cv=5,
    random_state=0,
)

rs.fit(trainX, trainY)


In [155]:
# One row per sampled configuration with its mean CV test score, best first.
param_search_results = (
    pd.DataFrame(rs.cv_results_['params'])
    .assign(score=rs.cv_results_['mean_test_score'])
    .sort_values('score', ascending=False)
)

In [138]:
# Persist the search results next to the notebook (row index is not written).
param_search_results.to_csv('./impute_land_value_rf_hyperparams.csv', index=False)

In [139]:
# Display the ranked search results.
# NOTE(review): the saved output below lists an 'rf__min_samples_split' column and row indices
# up to 19, which the search defined above (n_iter=10, no min_samples_split) cannot produce;
# the output looks stale — re-run before relying on it.
param_search_results

Unnamed: 0,rf__ccp_alpha,rf__max_depth,rf__max_features,rf__min_samples_split,score
5,4.665955e-08,15,0.989309,2,0.605876
6,6.389465e-08,18,0.83944,2,0.603155
3,2.560162e-09,20,0.918039,2,0.602815
19,1.065785e-07,17,0.985973,4,0.602014
11,4.597148e-07,24,0.724975,2,0.600676
7,3.105409e-09,24,0.972334,4,0.599031
2,2.189162e-12,16,0.895863,4,0.598013
4,3.332543e-12,18,0.684121,3,0.596088
17,1.118186e-08,15,0.569091,3,0.595667
16,3.098147e-09,19,0.622213,3,0.594887


In [198]:
# Final model: a larger forest with hand-set hyper-parameters.
etr = ExtraTreesRegressor(
    n_estimators=2500,
    max_depth=17,
    max_features=0.99,
    ccp_alpha=4.665e-08,
    bootstrap=True,
    max_samples=10000,
    n_jobs=-1,
    random_state=0,
)

In [199]:
# Fit the final model on every row that has a usable target.
etr.fit(X=trainX, y=trainY)

ExtraTreesRegressor(bootstrap=True, ccp_alpha=4.665e-08, max_depth=17,
                    max_features=0.99, max_samples=10000, n_estimators=2500,
                    n_jobs=-1, random_state=0)

In [200]:
# Score the fitted model on the same rows it was trained on (in-sample, not a held-out estimate).
etr.score(X=trainX, y=trainY)

0.582542502994855

In [201]:
# NOTE(review): exact duplicate of the previous cell (same call, same result); it re-scores
# the full training set for no new information and can likely be removed.
etr.score(trainX, trainY)

0.582542502994855

In [203]:
# Predict a land value for every row that was excluded from training.
predY = etr.predict(X=testX)

In [204]:
# Display the predicted values.
predY

array([ 873344.69947623,  656759.74407583,  662886.21558916, ...,
         58989.96341101,  538492.9247818 , 3336440.07412276])

In [205]:
# Start from the assessed values; rows excluded from training are overwritten with model
# predictions afterwards. Cast to float64 up front: the source column is an integer dtype and
# the predictions are floats, and writing floats into an integer column relies on silent
# upcasting, which newer pandas versions deprecate (FutureWarning, later an error).
tax['imputed_land_value'] = tax['assessed_land_value'].astype('float64')

In [213]:
# Overwrite the rows that were excluded from training with the model's predictions.
# predY was produced from X[~trainMask], so it is in the same row order as this selection
# (X was derived from tax without reordering or dropping rows).
tax.loc[~trainMask, 'imputed_land_value'] = predY

In [219]:
# Write the table, now including 'imputed_land_value', back to the feather file.
# NOTE(review): this overwrites the file the notebook reads as its input, so later runs load
# a frame that already contains 'imputed_land_value'.
tax.to_feather('../data/tax.geofeather')