In [1]:
# Importing libraries and packages
import geopandas as gpd
import pandas as pd
import numpy as np
import json

# Visualization
import matplotlib.pyplot as plt

# Modeling
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn import metrics

# Save models
import pickle

In [2]:
# Display options: render every row and every column instead of truncating
pd.options.display.max_rows = None
pd.options.display.max_columns = None

### Load the Test Dataset - NC

In [3]:
# Read the cleaned NC file that serves as the test set and report its row count
femaTestData = pd.read_csv(
    '../../data/open-fema/FEMA-Large-NC-clean.csv'
)
print(f'There are {len(femaTestData)} records in our test dataset.')

There are 132384 records in our test dataset.


  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# List the columns available in the loaded test frame
femaTestData.columns

Index(['disasterNumber', 'damagedCity', 'damagedStateAbbreviation',
       'damagedZipCode', 'householdComposition', 'grossIncome', 'specialNeeds',
       'homeOwnersInsurance', 'floodInsurance', 'inspected', 'rpfvl',
       'habitabilityRepairsRequired', 'destroyed', 'waterLevel', 'floodDamage',
       'foundationDamage', 'foundationDamageAmount', 'roofDamage',
       'roofDamageAmount', 'tsaEligible', 'tsaCheckedIn',
       'rentalAssistanceEligible', 'rentalAssistanceAmount',
       'repairAssistanceEligible', 'repairAmount',
       'replacementAssistanceEligible', 'replacementAmount', 'sbaEligible',
       'renterDamageLevel', 'rentalAssistanceEndDate', 'rentalResourceCity',
       'rentalResourceStateAbbreviation', 'rentalResourceZipCode',
       'primaryResidence', 'personalPropertyEligible', 'ppfvl',
       'censusBlockId', 'censusYear', 'id', 'censusTractId', 'tractid',
       'haAmount', 'ownRent_Owner', 'ownRent_Renter', 'ownRent_Unknown',
       'residenceType_Apartment', 'r

### Define a subset of columns

IHP: https://docs.google.com/document/d/1nu0yENGAWnoiMcTufxYnH7xwdh8NfFum9ni9IYiSIdk/edit#

Demographics: https://docs.google.com/document/d/1cpznnaIb5CE21I2RO8y2xRvZKS8StcP_JeXDW2mUIis/edit?ts=60319d34#heading=h.j8u0tgugtaw

In [5]:
# Column groups that are filtered out of the raw frame when the model input
# frame (femaTestDf) is built. Order within each list is not significant for
# the filter, but is kept as originally written.

# IHP columns that are not used as model inputs
ihp_exclude_cols = [
    'disasterNumber', 'damagedCity', 'damagedStateAbbreviation', 'damagedZipCode',
    'grossIncome',
    'foundationDamageAmount', 'roofDamageAmount',
    'tsaCheckedIn',
    'rentalAssistanceAmount', 'repairAmount', 'replacementAmount',
    'renterDamageLevel', 'rentalAssistanceEndDate',
    'rentalResourceCity', 'rentalResourceStateAbbreviation', 'rentalResourceZipCode',
    'personalPropertyEligible', 'ppfvl',
    'censusBlockId', 'censusYear', 'id',
]

# Demographic columns that are not used as model inputs
demo_exclude_cols = [
    'censusTractId', 'censusid', 'tractid', 'tractname',
    'county', 'state',
    'median_earnings_total',
]

# Demographic index column, kept in its own list so it can be toggled separately
demo_dvi_col = ['dvi']

# Demographic rate columns, kept in their own list so they can be toggled separately
demo_rate_cols = [
    'below_poverty_rate', 'unemployed_labor_rate',
    'built_1979_or_earlier_rate', 'owner_occupied_rate',
]

### RandomForest

https://machinelearningmastery.com/random-forest-ensemble-in-python/

- The “max_samples” argument can be set to a float between 0 and 1 to control the percentage of the size of the training dataset to make the bootstrap sample used to train each decision tree.
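- The number of features considered at each split is set via the “max_features” argument. For RandomForestRegressor the default (shown as 'auto' in the `get_params()` output below) uses all input features; the square-root default applies to RandomForestClassifier.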
- The number of trees can be set via the “n_estimators” argument and defaults to 100.
- The maximum tree depth can be specified via the max_depth argument and is set to None (no maximum depth) by default.

In [6]:
def run_rf(df, frac, max_depth, max_samples, n_estimators, min_samples_leaf, random_state=42):
    """Fit a RandomForestRegressor for ``haAmount`` and score it on a hold-out split.

    Parameters
    ----------
    df : pandas.DataFrame
        Modeling frame. Must contain the target column ``haAmount``; every
        other column is used as a feature.
    frac : float
        Fraction of ``df`` to use. Values below 1.0 draw a random sample of
        rows; otherwise the full frame is used unchanged.
    max_depth, max_samples, n_estimators, min_samples_leaf
        Passed straight through to ``RandomForestRegressor``.
    random_state : int, default 42
        Seed used for the row sample, the train/test split and the forest,
        so that repeated calls give the same result.

    Returns
    -------
    tuple
        ``(fitted_model, predictions_for_hold_out_rows, y_test)``.
    """
    # Sample the dataset. The sample is seeded as well: previously only the
    # split and the model were seeded, so runs with frac < 1.0 were not reproducible.
    df_train = df.sample(frac=frac, random_state=random_state) if frac < 1.0 else df

    # Create test/train split (80/20, shuffled)
    X = df_train.loc[:, df_train.columns != 'haAmount']
    y = df_train.loc[:, 'haAmount']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=random_state)

    print('Shape of Training and Test inputs')
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)

    # Define the model
    model_rf = RandomForestRegressor(max_depth=max_depth, max_samples=max_samples,
                                     n_estimators=n_estimators,
                                     min_samples_leaf=min_samples_leaf,
                                     random_state=random_state)

    # Fit the model
    model_rf.fit(X_train, y_train)

    return (model_rf, model_rf.predict(X_test), y_test)

In [7]:
def run_rf_grid_search(df, frac, random_state=42):
    """Grid-search RandomForestRegressor hyperparameters for ``haAmount``.

    Parameters
    ----------
    df : pandas.DataFrame
        Modeling frame. Must contain the target column ``haAmount``; every
        other column is used as a feature.
    frac : float
        Fraction of ``df`` to use. Values below 1.0 draw a random sample of
        rows; otherwise the full frame is used unchanged.
    random_state : int, default 42
        Seed used for the row sample, the train/test split and the forest,
        so that repeated calls give the same result.

    Returns
    -------
    tuple
        ``(best_estimator, predictions_for_hold_out_rows, y_test)``.
    """
    # Sample the dataset. The sample is seeded as well: previously only the
    # split and the model were seeded, so runs with frac < 1.0 were not reproducible.
    df_train = df.sample(frac=frac, random_state=random_state) if frac < 1.0 else df

    # Create test/train split (80/20, shuffled)
    X = df_train.loc[:, df_train.columns != 'haAmount']
    y = df_train.loc[:, 'haAmount']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=random_state)

    print('Shape of Training and Test inputs')
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)

    # RandomForestRegressor default model; the grid below overrides the tuned parameters
    model_rf = RandomForestRegressor(random_state=random_state)

    # Create the parameter grid
    param_grid_rf = {
        'bootstrap': [True],
        'max_samples': [0.8, 0.9, None],
        'max_depth': [8, 9, 10],
        'n_estimators': [75, 100, 125],
        'min_samples_leaf': [1, 5, 10]
    }

    # Instantiate the grid search model: 3-fold CV, scored by negative MSE, using all cores
    grid_search_rf = GridSearchCV(estimator=model_rf, param_grid=param_grid_rf,
                                  scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=2)

    # Fit the grid search to the data
    grid_search_rf.fit(X_train, y_train)

    print('Best params:\n', grid_search_rf.best_params_)

    # Predict using best model (the estimator refit by GridSearchCV on the full training split)
    model_rf_best = grid_search_rf.best_estimator_

    return (model_rf_best, model_rf_best.predict(X_test), y_test)

In [8]:
def evaluate(y_test, y_pred):
    """Print regression metrics and summary totals for a set of predictions.

    Prints MAE, MSE, RMSE and R-squared, the summed actual and predicted
    amounts, and the share of rows whose prediction deviates from the actual
    value by more than 20%. Rows whose actual value is not greater than zero
    are given a percentage difference of 0 and therefore never count as "off".

    Parameters
    ----------
    y_test : actual target values (must support ``.sum()``)
    y_pred : predicted values (must support ``.sum()``)
    """
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    mse = metrics.mean_squared_error(y_test, y_pred)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print('R-squared:', metrics.r2_score(y_test, y_pred))
    print('\n')
    print('Total HA Amount actual:    ${:,.2f}'.format(y_test.sum()))
    print('Total HA Amount predicted: ${:,.2f}'.format(y_pred.sum()))
    print('\n')

    # Calculate the number of observations that are off by more than 20%
    comparison = pd.DataFrame({'actual': y_test, 'predicted': y_pred})
    comparison['diff'] = comparison['predicted'] - comparison['actual']
    ratio = (comparison['predicted'] / comparison['actual']).abs()
    # Division by a zero actual yields inf/NaN; those rows are masked to 0 here
    comparison['percent_diff'] = ((ratio - 1).abs() * 100).where(comparison['actual'] > 0, 0)
    off_by_more_than_20 = comparison['percent_diff'] > 20
    print('Percentage of predictions that are off by more than 20%: {:.2f}'.format(
          int(off_by_more_than_20.sum()) / len(comparison) * 100))

In [9]:
def predict(df, model):
    """Score a frame with an already-fitted model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target column ``haAmount``; all remaining
        columns are passed to the model as features.
    model : object
        Any object exposing ``predict(X)``.

    Returns
    -------
    tuple
        ``(predictions, y_test)`` where ``y_test`` is the ``haAmount`` column of ``df``.
    """
    target = df['haAmount']
    features = df.drop(columns=['haAmount'])

    print('Shape of Training and Test inputs')
    print(features.shape, target.shape)

    predictions = model.predict(features)
    return (predictions, target)

### Create IHP-only Test Dataset

In [12]:
# Keep only the model inputs plus the haAmount target by dropping the excluded
# IHP and demographic columns. 'haAmount_predicted' is excluded as well so that
# this cell stays idempotent: re-running it after predictions have been appended
# to femaTestData must not put the prediction column into the feature set.
femaTestDf = femaTestData.loc[
    :,
    ~femaTestData.columns.isin(
        ihp_exclude_cols + demo_exclude_cols + demo_dvi_col + demo_rate_cols
        + ['haAmount_predicted']
    ),
]

### RandomForest Model2 - Hyperparameters based on current grid search
    max_depth = 10, max_samples = 0.9, n_estimators = 125, 
    min_samples_leaf = 5, random_state = 42

In [23]:
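# NOTE: training is disabled in this notebook. `femaDf` (the training frame) is not
# defined here, so the call below cannot be re-run as-is; the shapes printed
# underneath come from an earlier run.
# model_rf2, y_pred, y_test = run_rf(femaDf, frac = 1.0, max_depth = 10, max_samples = 0.9, 
#                                   n_estimators = 125, min_samples_leaf = 5)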

Shape of Training and Test inputs
(2549828, 34) (2549828,)
(637457, 34) (637457,)


In [None]:
# FIXME(review): `model_rf_sav` is not assigned anywhere in this notebook, this cell
# has never been executed (In [None]) and its return value is discarded. The same
# prediction is made further down with `model_rf2` once the model has been
# unpickled - confirm whether this cell is still needed.
predict(femaTestDf, model_rf_sav)

In [24]:
# Metrics for the hold-out split returned by run_rf. NOTE(review): y_test / y_pred are
# only assigned by the training call above, which is currently commented out, so on a
# fresh top-to-bottom run these names do not exist yet at this point.
evaluate(y_test, y_pred)

Mean Absolute Error: 62.8185168269257
Mean Squared Error: 333776.8924875107
Root Mean Squared Error: 577.7342749807308
R-squared: 0.9518735085349349


Total HA Amount actual:    $306,323,367.07
Total HA Amount predicted: $305,457,100.07


Percentage of predictions that are off by more than 20%: 2.74


In [25]:
# Actual vs. predicted values with the signed error (predicted - actual), summarised per column
results_df = (
    pd.DataFrame({'actual': y_test, 'predicted': y_pred})
    .assign(errors=lambda frame: frame['predicted'] - frame['actual'])
)
results_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actual,637457.0,480.539655,2633.518498,0.0,0.0,0.0,0.0,33300.0
predicted,637457.0,479.180713,2561.189489,0.0,0.0,0.0,0.0,32649.353177
errors,637457.0,-1.358942,577.73313,-33300.0,0.0,0.0,0.0,30262.173749


In [26]:
# Show the hyperparameters model_rf2 was configured with
model_rf2.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': 0.9,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 125,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

### Predict on Test (NC) using RandomForest Model2

In [2]:
# Load model
# The file is opened in a context manager so the handle is closed once the model
# has been read (the previous bare open() call never closed it).
# NOTE: unpickling executes code embedded in the file - only load model files
# that were produced by a trusted source.
with open('./models/random_forest.sav', 'rb') as model_file:
    model_rf2 = pickle.load(model_file)

# Show the hyperparameters of the loaded model
model_rf2.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': 0.9,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 125,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [15]:
# Score the NC test frame with the loaded model; y_test is the haAmount column of femaTestDf
y_pred, y_test = predict(femaTestDf, model_rf2)

Shape of Training and Test inputs
(132384, 34) (132384,)


In [16]:
# Print error metrics for the NC test predictions
evaluate(y_test, y_pred)

Mean Absolute Error: 151.08004639575432
Mean Squared Error: 641588.6873988011
Root Mean Squared Error: 800.9923141945877
R-squared: 0.8985953064009858


Total HA Amount actual:    $84,945,334.78
Total HA Amount predicted: $100,564,941.41


Percentage of predictions that are off by more than 20%: 10.13


In [17]:
# Actual vs. predicted values with the signed error (predicted - actual), summarised per column
results_df = (
    pd.DataFrame({'actual': y_test, 'predicted': y_pred})
    .assign(errors=lambda frame: frame['predicted'] - frame['actual'])
)
results_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actual,132384.0,641.65862,2515.364693,0.0,0.0,0.0,0.0,33267.0
predicted,132384.0,759.645738,2684.142262,0.0,0.0,0.0,0.0,32547.1247
errors,132384.0,117.987118,792.25783,-23798.135914,0.0,0.0,0.0,31235.6247


### Generate Prediction Files using RandomForest Model2 predictions

In [18]:
# Append predictions to femaTestData (adds the column to the loaded frame in place).
# y_pred was produced from femaTestDf, which is a column subset of femaTestData,
# so there is one prediction per row of femaTestData.
# NOTE(review): presumably y_pred is a positional array, so rows line up by order - confirm.
femaTestData['haAmount_predicted'] = y_pred

In [19]:
# Re-list the columns after adding haAmount_predicted
femaTestData.columns

Index(['disasterNumber', 'damagedCity', 'damagedStateAbbreviation',
       'damagedZipCode', 'householdComposition', 'grossIncome', 'specialNeeds',
       'homeOwnersInsurance', 'floodInsurance', 'inspected', 'rpfvl',
       'habitabilityRepairsRequired', 'destroyed', 'waterLevel', 'floodDamage',
       'foundationDamage', 'foundationDamageAmount', 'roofDamage',
       'roofDamageAmount', 'tsaEligible', 'tsaCheckedIn',
       'rentalAssistanceEligible', 'rentalAssistanceAmount',
       'repairAssistanceEligible', 'repairAmount',
       'replacementAssistanceEligible', 'replacementAmount', 'sbaEligible',
       'renterDamageLevel', 'rentalAssistanceEndDate', 'rentalResourceCity',
       'rentalResourceStateAbbreviation', 'rentalResourceZipCode',
       'primaryResidence', 'personalPropertyEligible', 'ppfvl',
       'censusBlockId', 'censusYear', 'id', 'censusTractId', 'tractid',
       'haAmount', 'ownRent_Owner', 'ownRent_Renter', 'ownRent_Unknown',
       'residenceType_Apartment', 'r

### Create NC Prediction Files

In [20]:
# Write the row-level frame (inputs plus haAmount_predicted) to CSV and echo the row count
femaTestData.to_csv(
    "./predictions/FEMA-Large-NC-clean-predictions.csv",
    encoding='utf-8',
    index=False,
)
print(len(femaTestData))

132384


In [70]:
# Write predictions - zipped
#femaTestData.to_csv("./predictions/FEMA-Large-NC-clean-predictions.csv.gz", index=False, encoding='utf-8', compression='gzip')

In [21]:
# Rollup aggregations to censusTractId.
# Each (column, statistic) pair below becomes one aggregated output column; the
# statistic is wrapped in a list so the result keeps two-level column labels.
femaTestData_CensusTract = (
    femaTestData
    .groupby(['censusTractId'])
    .agg({
        column: [statistic]
        for column, statistic in (
            ('id', 'count'),
            ('grossIncome', 'mean'),
            ('householdComposition', 'mean'),
            ('specialNeeds', 'sum'),
            ('homeOwnersInsurance', 'sum'),
            ('floodInsurance', 'sum'),
            ('inspected', 'sum'),
            ('rpfvl', 'sum'),
            ('habitabilityRepairsRequired', 'sum'),
            ('destroyed', 'sum'),
            ('waterLevel', 'mean'),
            ('floodDamage', 'sum'),
            ('foundationDamage', 'sum'),
            ('foundationDamageAmount', 'sum'),
            ('roofDamage', 'sum'),
            ('roofDamageAmount', 'sum'),
            ('tsaEligible', 'sum'),
            ('tsaCheckedIn', 'sum'),
            ('rentalAssistanceEligible', 'sum'),
            ('rentalAssistanceAmount', 'sum'),
            ('repairAssistanceEligible', 'sum'),
            ('repairAmount', 'sum'),
            ('replacementAssistanceEligible', 'sum'),
            ('replacementAmount', 'sum'),
            ('sbaEligible', 'sum'),
            ('primaryResidence', 'sum'),
            ('personalPropertyEligible', 'sum'),
            ('ppfvl', 'sum'),
            ('haAmount', 'sum'),
            ('haAmount_predicted', 'sum'),
        )
    })
    .round(2)
)

In [22]:
# Render floats with plain str() so large values are not shown in scientific notation,
# then preview the first ten census tracts
pd.options.display.float_format = str
femaTestData_CensusTract.head(n=10)

Unnamed: 0_level_0,id,grossIncome,householdComposition,specialNeeds,homeOwnersInsurance,floodInsurance,inspected,rpfvl,habitabilityRepairsRequired,destroyed,waterLevel,floodDamage,foundationDamage,foundationDamageAmount,roofDamage,roofDamageAmount,tsaEligible,tsaCheckedIn,rentalAssistanceEligible,rentalAssistanceAmount,repairAssistanceEligible,repairAmount,replacementAssistanceEligible,replacementAmount,sbaEligible,primaryResidence,personalPropertyEligible,ppfvl,haAmount,haAmount_predicted
Unnamed: 0_level_1,count,mean,mean,sum,sum,sum,sum,sum,sum,sum,mean,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum
censusTractId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2
34031236602.0,1,130000.0,1.0,0,1,1,0,0.0,1,0,0.0,0,0,0.0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0,0.0,0.0,0.0
37001020100.0,2,,1.5,1,2,0,0,0.0,2,0,0.0,0,0,0.0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,2,0,0.0,0.0,0.0
37001020200.0,2,13500.0,2.5,0,0,0,0,0.0,2,0,0.0,0,0,0.0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,2,0,0.0,0.0,0.0
37001020300.0,1,,1.0,0,0,0,0,0.0,1,0,0.0,0,0,0.0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,1,0,0.0,0.0,0.0
37001020400.0,1,,1.0,0,1,0,0,0.0,1,0,0.0,0,0,0.0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,1,0,0.0,0.0,0.0
37001020502.0,2,1.0,1.5,0,0,0,0,0.0,2,0,0.0,0,0,0.0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,2,0,0.0,0.0,0.0
37001020802.0,1,18768.0,1.0,0,0,0,0,0.0,1,0,0.0,0,0,0.0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,1,0,0.0,0.0,0.0
37001021000.0,2,15744.0,3.0,0,0,0,0,0.0,2,0,0.0,0,0,0.0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,2,0,0.0,0.0,0.0
37001021101.0,1,1553.0,2.0,0,0,0,0,0.0,1,0,0.0,0,0,0.0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,1,0,0.0,0.0,0.0
37001021201.0,1,51948.0,2.0,0,0,0,0,0.0,1,0,0.0,0,0,0.0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,1,0,0.0,0.0,0.0


In [23]:
# Write the census-tract rollup to CSV; the index (censusTractId) is kept as a column
femaTestData_CensusTract.to_csv(
    "./predictions/FEMA-Large-NC-clean-predictions-censusTract.csv",
    encoding='utf-8',
    index=True,
)