In [2]:
# Importing libraries and packages
import geopandas as gpd
import pandas as pd
import numpy as np
import json

# Visualization
import matplotlib.pyplot as plt

# Modeling
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn import metrics

# Save models
import pickle

In [3]:
# Display Options: remove pandas' row and column truncation limits for rendered frames
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

## Random Forest Model trained on FEMA Large without Outliers

### Load the Training and Test Datasets - FL/TX and PR

In [5]:
# Load the FL/TX training frame.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk; the recorded output of this cell shows a truncated warning that looks
# like pandas' mixed-dtype DtypeWarning (NOTE(review): confirm which columns triggered it).
femaTrainData = pd.read_csv('../data/open-fema/FEMA-Large-Demographics-FL-TX-clean.csv',
                            low_memory=False)
print('There are {} records in our training dataset.'.format(len(femaTrainData)))

  interactivity=interactivity, compiler=compiler, result=result)


There are 3187285 records in our training dataset.


In [6]:
# Inspect the column names of the loaded training frame
femaTrainData.columns

Index(['disasterNumber', 'damagedCity', 'damagedStateAbbreviation',
       'damagedZipCode', 'householdComposition', 'grossIncome', 'specialNeeds',
       'homeOwnersInsurance', 'floodInsurance', 'inspected', 'rpfvl',
       'habitabilityRepairsRequired', 'destroyed', 'waterLevel', 'floodDamage',
       'foundationDamage', 'foundationDamageAmount', 'roofDamage',
       'roofDamageAmount', 'tsaEligible', 'tsaCheckedIn',
       'rentalAssistanceEligible', 'rentalAssistanceAmount',
       'repairAssistanceEligible', 'repairAmount',
       'replacementAssistanceEligible', 'replacementAmount', 'sbaEligible',
       'renterDamageLevel', 'rentalAssistanceEndDate', 'rentalResourceCity',
       'rentalResourceStateAbbreviation', 'rentalResourceZipCode',
       'primaryResidence', 'personalPropertyEligible', 'ppfvl',
       'censusBlockId', 'censusYear', 'id', 'censusTractId', 'censusid',
       'tractid', 'tractname', 'county', 'state', 'below_poverty_rate',
       'median_earnings_total', 'une

In [9]:
# Load the PR frame that serves as the out-of-region test set and report its row count
femaTestData = pd.read_csv('../data/open-fema/FEMA-Large-Demographics-PR-clean.csv')
n_test_records = len(femaTestData)
print('There are {} records in our test dataset.'.format(n_test_records))

There are 1041745 records in our test dataset.


In [10]:
# Inspect the column names of the loaded test frame
femaTestData.columns

Index(['disasterNumber', 'damagedCity', 'damagedStateAbbreviation',
       'damagedZipCode', 'householdComposition', 'grossIncome', 'specialNeeds',
       'homeOwnersInsurance', 'floodInsurance', 'inspected', 'rpfvl',
       'habitabilityRepairsRequired', 'destroyed', 'waterLevel', 'floodDamage',
       'foundationDamage', 'foundationDamageAmount', 'roofDamage',
       'roofDamageAmount', 'tsaEligible', 'tsaCheckedIn',
       'rentalAssistanceEligible', 'rentalAssistanceAmount',
       'repairAssistanceEligible', 'repairAmount',
       'replacementAssistanceEligible', 'replacementAmount', 'sbaEligible',
       'renterDamageLevel', 'rentalAssistanceEndDate', 'rentalResourceCity',
       'rentalResourceStateAbbreviation', 'rentalResourceZipCode',
       'primaryResidence', 'personalPropertyEligible', 'ppfvl',
       'censusBlockId', 'censusYear', 'id', 'censusTractId', 'censusid',
       'tractid', 'tractname', 'county', 'state', 'below_poverty_rate',
       'median_earnings_total', 'une

### Define a subset of columns

IHP: https://docs.google.com/document/d/1nu0yENGAWnoiMcTufxYnH7xwdh8NfFum9ni9IYiSIdk/edit#

Demographics: https://docs.google.com/document/d/1cpznnaIb5CE21I2RO8y2xRvZKS8StcP_JeXDW2mUIis/edit?ts=60319d34#heading=h.j8u0tgugtaw

In [9]:
# IHP columns that are masked out when the modelling frames are built
ihp_exclude_cols = [
    'disasterNumber', 'damagedCity', 'damagedStateAbbreviation', 'damagedZipCode',
    'grossIncome',
    'foundationDamageAmount', 'roofDamageAmount',
    'tsaCheckedIn',
    'rentalAssistanceAmount', 'repairAmount', 'replacementAmount',
    'renterDamageLevel', 'rentalAssistanceEndDate',
    'rentalResourceCity', 'rentalResourceStateAbbreviation', 'rentalResourceZipCode',
    'personalPropertyEligible', 'ppfvl',
    'censusBlockId', 'censusYear', 'id',
]

# Census-tract identifier / descriptive columns that are masked out as well
demo_exclude_cols = [
    'censusTractId', 'censusid', 'tractid', 'tractname',
    'county', 'state',
    'median_earnings_total',
]

# Demographic columns, kept in separate lists so each group can be excluded on its own
demo_dvi_col = ['dvi']
demo_rate_cols = [
    'below_poverty_rate',
    'unemployed_labor_rate',
    'built_1979_or_earlier_rate',
    'owner_occupied_rate',
]

### RandomForest

https://machinelearningmastery.com/random-forest-ensemble-in-python/

- The “max_samples” argument can be set to a float between 0 and 1 to control the percentage of the size of the training dataset to make the bootstrap sample used to train each decision tree.
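- The number of features considered at each split is set via the “max_features” argument. For `RandomForestRegressor` it defaults to all input features (shown as `'auto'` in the fitted models below); the square-root default applies to `RandomForestClassifier`.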
- The number of trees can be set via the “n_estimators” argument and defaults to 100.
- The maximum tree depth can be specified via the max_depth argument and is set to None (no maximum depth) by default.

In [10]:
def run_rf(df, frac, max_depth, max_samples, n_estimators, min_samples_leaf):
    """Fit a RandomForestRegressor on `df` and score it on an internal 20% hold-out.

    Parameters
    ----------
    df : DataFrame
        Feature columns plus the 'haAmount' target column.
    frac : float
        Fraction of `df` to sample before splitting; values >= 1.0 use `df` unchanged.
    max_depth, max_samples, n_estimators, min_samples_leaf
        Passed straight through to RandomForestRegressor.

    Returns
    -------
    tuple
        (fitted model, predictions for the hold-out rows, hold-out 'haAmount' values)
    """
    # Sample the dataset. The sample is seeded so that runs with frac < 1.0 are
    # repeatable, in line with the seeded split and seeded forest below.
    df_train = df.sample(frac = frac, random_state = 42) if frac < 1.0 else df

    # Create test/train split: every column except the target is a feature
    X = df_train.loc[:, df_train.columns != 'haAmount']
    y = df_train.loc[:, 'haAmount']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 42)

    print('Shape of Training and Test inputs')
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)

    # Define the model
    model_rf = RandomForestRegressor(max_depth = max_depth, max_samples = max_samples, n_estimators = n_estimators,
                                     min_samples_leaf = min_samples_leaf, random_state = 42)

    # Fit the model
    model_rf.fit(X_train, y_train)

    return (model_rf, model_rf.predict(X_test), y_test)

In [11]:
def run_rf_grid_search(df, frac, scoring):
    """Grid-search RandomForestRegressor hyperparameters and score the winner on a hold-out.

    Parameters
    ----------
    df : DataFrame
        Feature columns plus the 'haAmount' target column.
    frac : float
        Fraction of `df` to sample before splitting; values >= 1.0 use `df` unchanged.
    scoring : str
        Scoring name handed to GridSearchCV (e.g. 'neg_mean_squared_error').

    Returns
    -------
    tuple
        (best estimator, its predictions for the hold-out rows, hold-out 'haAmount' values)
    """
    # Sample the dataset. The sample is seeded so that runs with frac < 1.0 are
    # repeatable, in line with the seeded split and seeded forest below.
    df_train = df.sample(frac = frac, random_state = 42) if frac < 1.0 else df

    # Create test/train split: every column except the target is a feature.
    # The 20% hold-out is never seen by the grid search; it is only used for the
    # predictions returned at the end.
    X = df_train.loc[:, df_train.columns != 'haAmount']
    y = df_train.loc[:, 'haAmount']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 42)

    print('Shape of Training and Test inputs')
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)

    # RandomForestRegressor default model
    model_rf = RandomForestRegressor(random_state = 42)

    # Create the parameter grid (3 * 3 * 3 * 3 = 81 candidates)
    param_grid_rf = {
        'bootstrap': [True],
        'max_samples': [0.8, 0.9, None],
        'max_depth': [8, 9, 10],
        'n_estimators': [75, 100, 125],
        'min_samples_leaf': [1, 5, 10]
    }

    # Instantiate the grid search model: 3-fold CV on the training split, all cores
    grid_search_rf = GridSearchCV(estimator = model_rf, param_grid = param_grid_rf,
                                  scoring = scoring, cv = 3, n_jobs = -1, verbose = 2)

    # Fit the grid search to the data
    grid_search_rf.fit(X_train, y_train)

    print('Best params:\n', grid_search_rf.best_params_)

    # Predict using best model (GridSearchCV refits it on all of X_train by default)
    model_rf_best = grid_search_rf.best_estimator_

    return (model_rf_best, model_rf_best.predict(X_test), y_test)

In [12]:
def evaluate(y_test, y_pred, threshold = 20):
    """Print regression metrics, dollar totals and the share of large relative misses.

    Parameters
    ----------
    y_test : actual target values; must support `.sum()` (e.g. a pandas Series).
    y_pred : predicted values of the same length; must support `.sum()`.
    threshold : percent difference above which a prediction counts as "off"
        (default 20, the value this notebook reports).

    NOTE(review): rows whose actual value is not > 0 are assigned a 0% difference, so a
    non-zero prediction for a zero actual is never counted as "off", while those rows
    still count in the denominator. Read the printed percentage with that in mind.
    """
    mae = metrics.mean_absolute_error(y_test, y_pred)
    # Compute MSE once and reuse it for RMSE instead of scanning the arrays twice
    mse = metrics.mean_squared_error(y_test, y_pred)

    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print('R-squared:', metrics.r2_score(y_test, y_pred))
    print('\n')
    print('Total HA Amount actual:    ${:,.2f}'.format(y_test.sum()))
    print('Total HA Amount predicted: ${:,.2f}'.format(y_pred.sum()))
    print('\n')

    # Calculate the number of observations that are off by more than `threshold` percent
    results_df = pd.DataFrame({'actual': y_test, 'predicted': y_pred})
    relative_miss = ((results_df['predicted'] / results_df['actual']).abs() - 1).abs() * 100
    # Mask rows without a positive actual: their ratio is undefined (division by zero)
    results_df['percent_diff'] = relative_miss.where(results_df['actual'] > 0, 0)
    n_off = (results_df['percent_diff'] > threshold).sum()
    print('Percentage of predictions that are off by more than {:g}%: {:.2f}'.format(
          threshold, n_off / len(results_df) * 100))

In [13]:
def predict(df, model):
    """Score `df` with an already-fitted model.

    Parameters
    ----------
    df : DataFrame with the feature columns plus the 'haAmount' target column.
    model : any fitted object exposing `predict(X)`.

    Returns
    -------
    tuple
        (model predictions for every row of `df`, the 'haAmount' column of `df`)
    """
    X_test = df.loc[:, df.columns != 'haAmount']
    y_test = df.loc[:, 'haAmount']

    # If the model recorded the column names it was fitted on, feed it exactly those
    # columns in that order, so a frame with a different column order is not scored
    # positionally. Models without the attribute are used as before.
    trained_columns = getattr(model, 'feature_names_in_', None)
    if trained_columns is not None:
        X_test = X_test.loc[:, list(trained_columns)]

    # Only test inputs exist in this function, so the label names just those
    print('Shape of Test inputs')
    print(X_test.shape, y_test.shape)

    return (model.predict(X_test), y_test)

### Create IHP-only Train and Test Datasets

In [106]:
# Training frame for the IHP-only model: mask out every excluded IHP and demographic
# column; whatever remains (including the 'haAmount' target) is kept.
femaDf = femaTrainData.loc[:, ~femaTrainData.columns.isin(
    ihp_exclude_cols + demo_exclude_cols + demo_dvi_col + demo_rate_cols)]

In [107]:
# Test frame for the IHP-only model, built with the same column mask as the training frame
femaTestDf = femaTestData.loc[:, ~femaTestData.columns.isin(
    ihp_exclude_cols + demo_exclude_cols + demo_dvi_col + demo_rate_cols)]

### Grid Search
Parameter Grid:

    param_grid_rf = {
        'bootstrap': [True],
        'max_samples': [0.8, 0.9, None],
        'max_depth': [8, 9, 10],
        'n_estimators': [75, 100, 125],
        'min_samples_leaf': [1, 5, 10]
    }
    
Grid Search Params:

    scoring='neg_mean_squared_error'
    cv = 3    

In [19]:
# Grid search over the full training frame, ranking candidates by negated mean squared error.
# y_pred / y_test are the predictions and actuals for the function's internal 20% hold-out.
model_rf_best, y_pred, y_test = run_rf_grid_search(femaDf, frac = 1.0, scoring = 'neg_mean_squared_error')

Shape of Training and Test inputs
(2549828, 34) (2549828,)
(637457, 34) (637457,)
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best params:
 {'bootstrap': True, 'max_depth': 10, 'max_samples': 0.9, 'min_samples_leaf': 5, 'n_estimators': 125}


In [59]:
# Print metrics for whatever y_test / y_pred currently hold; run right after the cell that produced them
evaluate(y_test, y_pred)

Mean Absolute Error: 62.8185168269257
Mean Squared Error: 333776.8924875107
Root Mean Squared Error: 577.7342749807308
R-squared: 0.9518735085349349


Total HA Amount actual:    $306,323,367.07
Total HA Amount predicted: $305,457,100.07


Percentage of predictions that are off by more than 20%: 2.74


In [20]:
# Summary statistics for actuals, predictions and signed errors (predicted - actual)
results_df = pd.DataFrame({'actual': y_test, 'predicted': y_pred}).assign(
    errors = lambda frame: frame['predicted'] - frame['actual'])
results_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actual,637457.0,480.539655,2633.518498,0.0,0.0,0.0,0.0,33300.0
predicted,637457.0,479.180713,2561.189489,0.0,0.0,0.0,0.0,32649.353177
errors,637457.0,-1.358942,577.73313,-33300.0,0.0,0.0,0.0,30262.173749


In [36]:
# Save the model; the context manager closes (and flushes) the file even if pickling fails
with open('./models/random_forest.sav', 'wb') as model_file:
    pickle.dump(model_rf_best, model_file)

In [15]:
# Load saved model; the context manager closes the file handle once unpickling is done.
# Unpickling executes code stored in the file, so only load .sav files you created yourself.
with open('./models/random_forest.sav', 'rb') as model_file:
    model_rf_sav = pickle.load(model_file)
model_rf_sav.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': 0.9,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 125,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

### Grid Search
Parameter Grid:

    param_grid_rf = {
        'bootstrap': [True],
        'max_samples': [0.8, 0.9, None],
        'max_depth': [8, 9, 10],
        'n_estimators': [75, 100, 125],
        'min_samples_leaf': [1, 5, 10]
    }
    
Grid Search Params:

    scoring='neg_mean_absolute_error'
    cv = 3  

In [109]:
# Same grid search, ranking candidates by negated mean absolute error instead of MSE.
# Note that y_pred / y_test are overwritten with this run's hold-out results.
model_rf_best_mae, y_pred, y_test = run_rf_grid_search(femaDf, frac = 1.0, scoring = 'neg_mean_absolute_error')

Shape of Training and Test inputs
(2549828, 34) (2549828,)
(637457, 34) (637457,)
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best params:
 {'bootstrap': True, 'max_depth': 10, 'max_samples': 0.9, 'min_samples_leaf': 5, 'n_estimators': 125}


In [110]:
# Print metrics for the hold-out results currently in y_test / y_pred
evaluate(y_test, y_pred)

Mean Absolute Error: 62.8185168269257
Mean Squared Error: 333776.8924875107
Root Mean Squared Error: 577.7342749807308
R-squared: 0.9518735085349349


Total HA Amount actual:    $306,323,367.07
Total HA Amount predicted: $305,457,100.07


Percentage of predictions that are off by more than 20%: 2.74


In [111]:
# Summary statistics for actuals, predictions and signed errors (predicted - actual)
results_df = pd.DataFrame({'actual': y_test, 'predicted': y_pred}).assign(
    errors = lambda frame: frame['predicted'] - frame['actual'])
results_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actual,637457.0,480.5396553336153,2633.518498364936,0.0,0.0,0.0,0.0,33300.0
predicted,637457.0,479.18071346972505,2561.189488723821,0.0,0.0,0.0,0.0,32649.35317740866
errors,637457.0,-1.3589418639326545,577.7331298846236,-33300.0,0.0,0.0,0.0,30262.173749418565


### RandomForest Model1 - Hyperparameters based on prior grid search
    max_depth = 10, max_samples = 0.8, n_estimators = 75, 
    min_samples_leaf = 10, random_state = 42

In [None]:
# Train Model1 on the full training frame with a fixed hyperparameter set.
# y_pred / y_test are overwritten with this model's internal hold-out results.
model_rf1, y_pred, y_test = run_rf(femaDf, frac = 1.0, max_depth = 10, max_samples = 0.8, 
                                  n_estimators = 75, min_samples_leaf = 10)

In [49]:
# Print metrics for the hold-out results currently in y_test / y_pred
evaluate(y_test, y_pred)

Mean Absolute Error: 62.89334403477637
Mean Squared Error: 335313.7561781083
Root Mean Squared Error: 579.0628257608222
R-squared: 0.9516519118368013


Total HA Amount actual:    $306,323,367.07
Total HA Amount predicted: $305,455,969.57


Percentage of predictions that are off by more than 20%: 2.73


In [50]:
# Summary statistics for actuals, predictions and signed errors (predicted - actual)
results_df = pd.DataFrame({'actual': y_test, 'predicted': y_pred}).assign(
    errors = lambda frame: frame['predicted'] - frame['actual'])
results_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actual,637457.0,480.539655,2633.518498,0.0,0.0,0.0,0.0,33300.0
predicted,637457.0,479.17894,2560.828884,0.0,0.0,0.0,0.0,32628.580871
errors,637457.0,-1.360715,579.061681,-33300.0,0.0,0.0,0.0,30344.002017


In [51]:
# Show the full hyperparameter set Model1 was built with
model_rf1.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': 0.8,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 75,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

### RandomForest Model2 - Hyperparameters based on current grid search
    max_depth = 10, max_samples = 0.9, n_estimators = 125, 
    min_samples_leaf = 5, random_state = 42

In [17]:
# Train Model2 on the full training frame using the best parameters printed by the grid search.
# y_pred / y_test are overwritten with this model's internal hold-out results.
model_rf2, y_pred, y_test = run_rf(femaDf, frac = 1.0, max_depth = 10, max_samples = 0.9, 
                                  n_estimators = 125, min_samples_leaf = 5)

Shape of Training and Test inputs
(2549828, 34) (2549828,)
(637457, 34) (637457,)


In [18]:
# Print metrics for the hold-out results currently in y_test / y_pred
evaluate(y_test, y_pred)

Mean Absolute Error: 62.8185168269257
Mean Squared Error: 333776.8924875107
Root Mean Squared Error: 577.7342749807308
R-squared: 0.9518735085349349


Total HA Amount actual:    $306,323,367.07
Total HA Amount predicted: $305,457,100.07


Percentage of predictions that are off by more than 20%: 2.74


In [19]:
# Summary statistics for actuals, predictions and signed errors (predicted - actual)
results_df = pd.DataFrame({'actual': y_test, 'predicted': y_pred}).assign(
    errors = lambda frame: frame['predicted'] - frame['actual'])
results_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actual,637457.0,480.539655,2633.518498,0.0,0.0,0.0,0.0,33300.0
predicted,637457.0,479.180713,2561.189489,0.0,0.0,0.0,0.0,32649.353177
errors,637457.0,-1.358942,577.73313,-33300.0,0.0,0.0,0.0,30262.173749


In [20]:
# Show the full hyperparameter set Model2 was built with
model_rf2.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': 0.9,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 125,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

### Predict on Test (PR) using RandomForest Model1

In [60]:
# Score the PR test frame with Model1; y_test becomes the PR frame's actual 'haAmount' column
y_pred, y_test = predict(femaTestDf, model_rf1)

Shape of Training and Test inputs
(1041745, 34) (1041745,)


In [61]:
# Print metrics for the PR predictions currently in y_test / y_pred
evaluate(y_test, y_pred)

Mean Absolute Error: 148.39029073886576
Mean Squared Error: 340624.6058108677
Root Mean Squared Error: 583.6305387921949
R-squared: 0.9498066132658147


Total HA Amount actual:    $635,239,074.74
Total HA Amount predicted: $747,983,257.74


Percentage of predictions that are off by more than 20%: 9.09


In [62]:
# Summary statistics for actuals, predictions and signed errors (predicted - actual)
results_df = pd.DataFrame({'actual': y_test, 'predicted': y_pred}).assign(
    errors = lambda frame: frame['predicted'] - frame['actual'])
results_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actual,1041745.0,609.783656,2605.043423,0.0,0.0,0.0,0.0,33300.0
predicted,1041745.0,718.009933,2746.260793,0.0,0.0,0.0,0.0,32764.975792
errors,1041745.0,108.226277,573.508496,-33300.0,0.0,0.0,0.0,30383.500813


### Predict on Test (PR) using RandomForest Model2

In [21]:
# Score the PR test frame with Model2; y_test becomes the PR frame's actual 'haAmount' column
y_pred, y_test = predict(femaTestDf, model_rf2)

Shape of Training and Test inputs
(1041745, 34) (1041745,)


In [42]:
# Print metrics for the PR predictions currently in y_test / y_pred
evaluate(y_test, y_pred)

Mean Absolute Error: 148.33306720584585
Mean Squared Error: 341074.8684423491
Root Mean Squared Error: 584.0161542648876
R-squared: 0.9497402639592516


Total HA Amount actual:    $635,239,074.74
Total HA Amount predicted: $748,574,488.17


Percentage of predictions that are off by more than 20%: 9.16


In [43]:
# Summary statistics for actuals, predictions and signed errors (predicted - actual)
results_df = pd.DataFrame({'actual': y_test, 'predicted': y_pred}).assign(
    errors = lambda frame: frame['predicted'] - frame['actual'])
results_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actual,1041745.0,609.783656,2605.043423,0.0,0.0,0.0,0.0,33300.0
predicted,1041745.0,718.577472,2751.919321,0.0,0.0,0.0,0.0,32834.874317
errors,1041745.0,108.793816,573.793595,-33300.0,0.0,0.0,0.0,30016.005418


### Generate Prediction Files using RandomForest Model2 predictions

In [45]:
# Append Model2's predictions to femaTestData.
# Predict explicitly instead of reusing the shared `y_pred` global: that name is
# reassigned by every run_rf / predict cell, so its contents depend on execution order.
# femaTestDf is a column subset of femaTestData, so rows line up positionally.
femaTestData['haAmount_predicted'] = model_rf2.predict(
    femaTestDf.loc[:, femaTestDf.columns != 'haAmount'])

In [54]:
# Inspect the test frame's columns after the prediction column was appended
femaTestData.columns

Index(['disasterNumber', 'damagedCity', 'damagedStateAbbreviation',
       'damagedZipCode', 'householdComposition', 'grossIncome', 'specialNeeds',
       'homeOwnersInsurance', 'floodInsurance', 'inspected', 'rpfvl',
       'habitabilityRepairsRequired', 'destroyed', 'waterLevel', 'floodDamage',
       'foundationDamage', 'foundationDamageAmount', 'roofDamage',
       'roofDamageAmount', 'tsaEligible', 'tsaCheckedIn',
       'rentalAssistanceEligible', 'rentalAssistanceAmount',
       'repairAssistanceEligible', 'repairAmount',
       'replacementAssistanceEligible', 'replacementAmount', 'sbaEligible',
       'renterDamageLevel', 'rentalAssistanceEndDate', 'rentalResourceCity',
       'rentalResourceStateAbbreviation', 'rentalResourceZipCode',
       'primaryResidence', 'personalPropertyEligible', 'ppfvl',
       'censusBlockId', 'censusYear', 'id', 'censusTractId', 'censusid',
       'tractid', 'tractname', 'county', 'state', 'below_poverty_rate',
       'median_earnings_total', 'une

In [55]:
# Drop demographics from FEMA file
# NOTE(review): the later cells visible in this notebook (merge, CSV exports, tract roll-up)
# read femaTestData, not femaTestData2 - confirm whether this frame was meant to be used.
demographic_cols = ['below_poverty_rate', 'unemployed_labor_rate',
                    'built_1979_or_earlier_rate', 'owner_occupied_rate', 'dvi']
femaTestData2 = femaTestData.drop(columns = demographic_cols)

femaTestData2.columns

Index(['disasterNumber', 'damagedCity', 'damagedStateAbbreviation',
       'damagedZipCode', 'householdComposition', 'grossIncome', 'specialNeeds',
       'homeOwnersInsurance', 'floodInsurance', 'inspected', 'rpfvl',
       'habitabilityRepairsRequired', 'destroyed', 'waterLevel', 'floodDamage',
       'foundationDamage', 'foundationDamageAmount', 'roofDamage',
       'roofDamageAmount', 'tsaEligible', 'tsaCheckedIn',
       'rentalAssistanceEligible', 'rentalAssistanceAmount',
       'repairAssistanceEligible', 'repairAmount',
       'replacementAssistanceEligible', 'replacementAmount', 'sbaEligible',
       'renterDamageLevel', 'rentalAssistanceEndDate', 'rentalResourceCity',
       'rentalResourceStateAbbreviation', 'rentalResourceZipCode',
       'primaryResidence', 'personalPropertyEligible', 'ppfvl',
       'censusBlockId', 'censusYear', 'id', 'censusTractId', 'censusid',
       'tractid', 'tractname', 'county', 'state', 'median_earnings_total',
       'haAmount', 'ownRent_Owner

### Combine Demographics with PR Predictions (GisJoin)

In [56]:
# Load demographics data (census-tract level); it supplies the tractid -> gisjoin lookup
demoDf = pd.read_csv("../data/census-tract/census-tract-demographics.csv")

In [57]:
# Keep only the join key and the GISJOIN code, then preview the first rows
demoDf = demoDf.loc[:, ['tractid', 'gisjoin']]

demoDf.head(n = 3)

Unnamed: 0,tractid,gisjoin
0,12001000200,G1200010000200
1,12001000301,G1200010000301
2,12001000302,G1200010000302


In [68]:
# Merge predictions with demographics - need GISJOIN for Tableau visualizations.
# Both frames carry a 'tractid' column, so pandas suffixes them with _x / _y; the
# right-hand copy duplicates the join key and is dropped. The default inner join keeps
# only rows whose censusTractId has a match in demoDf.
femaTestData = (
    femaTestData
    .merge(demoDf, left_on = 'censusTractId', right_on = 'tractid')
    .drop(columns = ['tractid_y'])
)

### Create Prediction Files

In [69]:
# Write row-level predictions (without the index) and print the row count,
# which can be compared with the count reported when the test file was loaded
femaTestData.to_csv("./predictions/FEMA-Large-PR-predictions.csv", index=False, encoding='utf-8')
print(len(femaTestData))

1041745


In [70]:
# Write predictions - zipped (same frame as the plain CSV, gzip-compressed)
femaTestData.to_csv("./predictions/FEMA-Large-PR-predictions.csv.gz", index=False, encoding='utf-8', compression='gzip')

In [71]:
# Rollup aggregations to censusTractId
# Columns are listed in output order; each one is summed unless an override says otherwise.
tract_rollup_cols = [
    'id', 'grossIncome', 'householdComposition', 'specialNeeds', 'homeOwnersInsurance',
    'floodInsurance', 'inspected', 'rpfvl', 'habitabilityRepairsRequired', 'destroyed',
    'waterLevel', 'floodDamage', 'foundationDamage', 'foundationDamageAmount', 'roofDamage',
    'roofDamageAmount', 'tsaEligible', 'tsaCheckedIn', 'rentalAssistanceEligible',
    'rentalAssistanceAmount', 'repairAssistanceEligible', 'repairAmount',
    'replacementAssistanceEligible', 'replacementAmount', 'sbaEligible', 'primaryResidence',
    'personalPropertyEligible', 'ppfvl', 'haAmount', 'haAmount_predicted',
]
# 'id' is counted (number of records per tract); these three are averaged
tract_rollup_overrides = {'id': 'count', 'grossIncome': 'mean',
                          'householdComposition': 'mean', 'waterLevel': 'mean'}

femaTestData_CensusTract = (
    femaTestData
    .groupby(['censusTractId', 'gisjoin'])
    .agg({col: [tract_rollup_overrides.get(col, 'sum')] for col in tract_rollup_cols})
    .round(2)
)

In [72]:
# Render floats through str() and preview the first ten tract-level rows.
# This changes a global pandas option, so it also affects frames displayed by later cells.
pd.options.display.float_format = str
femaTestData_CensusTract.head(n = 10)

Unnamed: 0_level_0,Unnamed: 1_level_0,id,grossIncome,householdComposition,specialNeeds,homeOwnersInsurance,floodInsurance,inspected,rpfvl,habitabilityRepairsRequired,destroyed,waterLevel,floodDamage,foundationDamage,foundationDamageAmount,roofDamage,roofDamageAmount,tsaEligible,tsaCheckedIn,rentalAssistanceEligible,rentalAssistanceAmount,repairAssistanceEligible,repairAmount,replacementAssistanceEligible,replacementAmount,sbaEligible,primaryResidence,personalPropertyEligible,ppfvl,haAmount,haAmount_predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,mean,sum,sum,sum,sum,sum,sum,sum,mean,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum
censusTractId,gisjoin,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2
12099007201,G1200990007201,1,2000.0,4.0,0,0,0,1,21523.41,1,0,0.0,0,0,0.0,1,1920.0,1,1,1,1080.0,1,21523.41,0,0.0,0,1,1,14819.02,22603.41,24479.32
12103026601,G1201030026601,1,69800.0,1.0,0,0,0,1,0.0,1,0,0.0,0,0,0.0,0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,1,0,0.0,0.0,0.0
12105012503,G1201050012503,1,20000.0,1.0,0,0,0,1,0.0,0,0,0.0,0,0,0.0,0,0.0,1,0,0,0.0,0,0.0,0,0.0,0,0,0,0.0,0.0,0.0
72001956300,G7200010956300,3034,23879.69,2.34,73,124,14,2202,6297242.05,2089,27,0.12,44,52,86689.77,741,876777.02,2771,28,531,441056.0,689,2788698.19,18,545495.04,101,2924,878,2604425.11,3775249.23,4393409.96
72001956400,G7200010956400,35,36590.36,2.14,0,0,1,27,44463.88,22,0,0.4,2,2,5011.97,10,11940.6,32,0,7,5592.0,11,39423.41,0,0.0,0,32,18,37541.08,45015.41,55982.52
72001956500,G7200010956500,122,13109.69,2.37,4,2,1,87,264345.62,88,1,0.87,4,1,1117.03,36,38054.12,104,1,29,26251.0,36,118017.55,1,21964.16,3,115,41,122966.6,166232.71,202030.18
72001956600,G7200010956600,1040,39354.27,2.21,17,54,7,716,1329886.54,695,5,0.16,16,15,49252.91,211,215598.25,926,8,146,138175.0,182,660180.1,3,91487.55,26,1009,258,625858.67,889842.65,1044062.92
72001956700,G7200010956700,506,15960.97,2.14,9,29,8,374,704028.45,304,2,0.13,12,11,4612.48,136,123651.62,414,3,101,105317.0,121,401896.18,2,64988.0,17,490,163,320987.41,572201.18,634597.24
72001956800,G7200010956800,955,22377.89,2.5,22,51,8,734,2183525.6,643,8,0.2,17,18,7815.43,280,350929.77,864,8,208,189456.0,271,1127337.69,6,188202.56,52,920,312,951804.55,1504996.25,1749909.58
72003430100,G7200030430100,385,45395.84,2.27,12,31,9,242,428094.69,302,1,3.22,41,3,111.39,41,42803.37,349,4,51,48212.0,66,227772.94,0,0.0,14,376,77,373226.73,275984.94,351118.18


In [73]:
# Write aggregated predictions; index=True keeps the (censusTractId, gisjoin) group keys in the file
femaTestData_CensusTract.to_csv("./predictions/FEMA-Large-PR-predictions-censusTract.csv", index=True, encoding='utf-8')

## Random Forest Model trained on FEMA Large with Outliers

### Load the Training and Test Datasets - FL/TX and PR

In [5]:
# Load the FL/TX training file for the "with outliers" run and report its row count
femaFullTrainData = pd.read_csv('../data/open-fema/FEMA-Large-Demographics-FL-TX.csv')
n_full_train_records = len(femaFullTrainData)
print('There are {} records in our training dataset.'.format(n_full_train_records))

There are 3535319 records in our training dataset.


In [6]:
# Inspect the column names of the full training frame
femaFullTrainData.columns

Index(['disasterNumber', 'damagedCity', 'damagedStateAbbreviation',
       'damagedZipCode', 'householdComposition', 'grossIncome', 'specialNeeds',
       'homeOwnersInsurance', 'floodInsurance', 'inspected', 'rpfvl',
       'habitabilityRepairsRequired', 'destroyed', 'waterLevel', 'floodDamage',
       'foundationDamage', 'foundationDamageAmount', 'roofDamage',
       'roofDamageAmount', 'tsaEligible', 'tsaCheckedIn',
       'rentalAssistanceEligible', 'rentalAssistanceAmount',
       'repairAssistanceEligible', 'repairAmount',
       'replacementAssistanceEligible', 'replacementAmount', 'sbaEligible',
       'renterDamageLevel', 'rentalAssistanceEndDate', 'rentalResourceCity',
       'rentalResourceStateAbbreviation', 'rentalResourceZipCode',
       'primaryResidence', 'personalPropertyEligible', 'ppfvl',
       'censusBlockId', 'censusYear', 'id', 'censusTractId', 'censusid',
       'tractid', 'tractname', 'county', 'state', 'below_poverty_rate',
       'median_earnings_total', 'une

In [22]:
# Load the PR test file for the "with outliers" run.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk; the recorded output of this cell shows a truncated warning that looks
# like pandas' mixed-dtype DtypeWarning (NOTE(review): confirm which columns triggered it).
femaFullTestData = pd.read_csv('../data/open-fema/FEMA-Large-Demographics-PR.csv',
                               low_memory=False)
print('There are {} records in our test dataset.'.format(len(femaFullTestData)))

  interactivity=interactivity, compiler=compiler, result=result)


There are 1067075 records in our test dataset.


In [23]:
# Inspect the column names of the full test frame
femaFullTestData.columns

Index(['disasterNumber', 'damagedCity', 'damagedStateAbbreviation',
       'damagedZipCode', 'householdComposition', 'grossIncome', 'specialNeeds',
       'homeOwnersInsurance', 'floodInsurance', 'inspected', 'rpfvl',
       'habitabilityRepairsRequired', 'destroyed', 'waterLevel', 'floodDamage',
       'foundationDamage', 'foundationDamageAmount', 'roofDamage',
       'roofDamageAmount', 'tsaEligible', 'tsaCheckedIn',
       'rentalAssistanceEligible', 'rentalAssistanceAmount',
       'repairAssistanceEligible', 'repairAmount',
       'replacementAssistanceEligible', 'replacementAmount', 'sbaEligible',
       'renterDamageLevel', 'rentalAssistanceEndDate', 'rentalResourceCity',
       'rentalResourceStateAbbreviation', 'rentalResourceZipCode',
       'primaryResidence', 'personalPropertyEligible', 'ppfvl',
       'censusBlockId', 'censusYear', 'id', 'censusTractId', 'censusid',
       'tractid', 'tractname', 'county', 'state', 'below_poverty_rate',
       'median_earnings_total', 'une

### Create IHP-only Train and Test Datasets

In [14]:
# Training frame for the IHP-only model: mask out every excluded IHP and demographic
# column; whatever remains (including the 'haAmount' target) is kept.
femaFullTrainDf = femaFullTrainData.loc[:, ~femaFullTrainData.columns.isin(
    ihp_exclude_cols + demo_exclude_cols + demo_dvi_col + demo_rate_cols)]

In [24]:
# Test frame for the IHP-only model, built with the same column mask as the training frame
femaFullTestDf = femaFullTestData.loc[:, ~femaFullTestData.columns.isin(
    ihp_exclude_cols + demo_exclude_cols + demo_dvi_col + demo_rate_cols)]

### Grid Search
Parameter Grid:

    param_grid_rf = {
        'bootstrap': [True],
        'max_samples': [0.8, 0.9, None],
        'max_depth': [8, 9, 10],
        'n_estimators': [75, 100, 125],
        'min_samples_leaf': [1, 5, 10]
    }

Grid Search Params:

    scoring='neg_mean_squared_error'
    cv = 3 

In [19]:
# Grid search over the full (with outliers) training frame, ranking candidates by negated MSE.
# y_pred / y_test are the predictions and actuals for the function's internal 20% hold-out.
model_rf_full_best, y_pred, y_test = run_rf_grid_search(femaFullTrainDf, frac = 1.0, scoring = 'neg_mean_squared_error')

Shape of Training and Test inputs
(2828255, 34) (2828255,)
(707064, 34) (707064,)
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best params:
 {'bootstrap': True, 'max_depth': 10, 'max_samples': 0.9, 'min_samples_leaf': 5, 'n_estimators': 125}


In [20]:
# Print metrics for the hold-out results currently in y_test / y_pred
evaluate(y_test, y_pred)

Mean Absolute Error: 72.7897017595783
Mean Squared Error: 292746.5688606674
Root Mean Squared Error: 541.0605962927511
R-squared: 0.9542814819392489


Total HA Amount actual:    $387,597,186.13
Total HA Amount predicted: $387,213,676.22


Percentage of predictions that are off by more than 20%: 4.77


In [21]:
# Summary statistics for actuals, predictions and signed errors (predicted - actual)
results_df = pd.DataFrame({'actual': y_test, 'predicted': y_pred}).assign(
    errors = lambda frame: frame['predicted'] - frame['actual'])
results_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actual,707064.0,548.178363,2530.463875,0.0,0.0,0.0,0.0,34788.83
predicted,707064.0,547.635965,2465.395835,0.0,0.0,0.0,0.0,33045.69661
errors,707064.0,-0.542398,541.060707,-30265.376578,0.0,0.0,0.0,30102.962259


### Predict on PR Full Test data

In [26]:
# Score the full PR test frame with the grid-search winner; y_test becomes PR's actual 'haAmount'
y_pred, y_test = predict(femaFullTestDf, model_rf_full_best)

Shape of Training and Test inputs
(1067075, 34) (1067075,)


In [27]:
# Print metrics for the PR predictions currently in y_test / y_pred
evaluate(y_test, y_pred)

Mean Absolute Error: 159.46991700098732
Mean Squared Error: 347641.02627830085
Root Mean Squared Error: 589.6109109220256
R-squared: 0.9476730525338654


Total HA Amount actual:    $649,009,278.07
Total HA Amount predicted: $778,337,781.40


Percentage of predictions that are off by more than 20%: 11.10


In [28]:
# Summary statistics for actuals, predictions and signed errors (predicted - actual)
results_df = pd.DataFrame({'actual': y_test, 'predicted': y_pred}).assign(
    errors = lambda frame: frame['predicted'] - frame['actual'])
results_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actual,1067075.0,608.213367,2577.525763,0.0,0.0,0.0,0.0,66600.0
predicted,1067075.0,729.412442,2726.728367,0.0,0.0,0.0,0.0,33059.200534
errors,1067075.0,121.199075,577.020036,-35777.615721,0.0,0.0,0.0,30286.007326
