In [322]:
# Importing libraries and packages
import geopandas as gpd
import pandas as pd
import numpy as np
import json

# Visualization
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.preprocessing import MinMaxScaler

# Modeling
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn import metrics

import pickle

In [217]:
# Read the FEMA training extract from disk and report how many rows it holds.
fema_csv = '../data/open-fema/FEMA-Large-Demographics-FL-TX.csv'
femaTrainData = pd.read_csv(fema_csv)
print('There are {} records combined.'.format(femaTrainData.shape[0]))

  interactivity=interactivity, compiler=compiler, result=result)


There are 3533663 records combined.


In [218]:
# Show every column name in the loaded frame (the exclusion lists below refer to these names).
femaTrainData.columns

Index(['disasterNumber', 'damagedCity', 'damagedStateAbbreviation',
       'damagedZipCode', 'householdComposition', 'grossIncome', 'specialNeeds',
       'homeOwnersInsurance', 'floodInsurance', 'inspected', 'rpfvl',
       'habitabilityRepairsRequired', 'destroyed', 'waterLevel', 'floodDamage',
       'foundationDamage', 'foundationDamageAmount', 'roofDamage',
       'roofDamageAmount', 'tsaEligible', 'tsaCheckedIn',
       'rentalAssistanceEligible', 'rentalAssistanceAmount',
       'repairAssistanceEligible', 'repairAmount',
       'replacementAssistanceEligible', 'replacementAmount', 'sbaEligible',
       'renterDamageLevel', 'rentalAssistanceEndDate', 'rentalResourceCity',
       'rentalResourceStateAbbreviation', 'rentalResourceZipCode',
       'primaryResidence', 'personalPropertyEligible', 'ppfvl',
       'censusBlockId', 'censusYear', 'id', 'censusTractId', 'censusid',
       'tractid', 'tractname', 'county', 'state', 'below_poverty_rate',
       'median_earnings_total', 'une

### Select a subset of columns
https://docs.google.com/document/d/1nu0yENGAWnoiMcTufxYnH7xwdh8NfFum9ni9IYiSIdk/edit#
https://docs.google.com/document/d/1cpznnaIb5CE21I2RO8y2xRvZKS8StcP_JeXDW2mUIis/edit?ts=60319d34#heading=h.j8u0tgugtaw

In [285]:
# Column groups used to build each experiment's feature set.
# Names listed here are removed from the frame; a group is left out of the
# exclusion list when an experiment wants to keep those columns as features.

# IHP columns excluded in every experiment.
ihp_exclude_cols = [
    'disasterNumber', 'damagedCity', 'damagedStateAbbreviation', 'damagedZipCode',
    'grossIncome',
    'foundationDamageAmount', 'roofDamageAmount',
    'tsaCheckedIn',
    'rentalAssistanceAmount', 'repairAmount', 'replacementAmount',
    'renterDamageLevel', 'rentalAssistanceEndDate',
    'rentalResourceCity', 'rentalResourceStateAbbreviation', 'rentalResourceZipCode',
    'personalPropertyEligible', 'ppfvl',
    'censusBlockId', 'censusYear', 'id',
]

# Tract identifiers and other demographic columns excluded in every experiment.
demo_exclude_cols = [
    'censusTractId', 'censusid', 'tractid', 'tractname',
    'county', 'state',
    'median_earnings_total',
]

# DVI column; excluded unless the experiment is "with DVI".
demo_dvi_col = ['dvi']

# Demographic rate columns; excluded unless the experiment is "with rate demographics".
demo_rate_cols = [
    'below_poverty_rate',
    'unemployed_labor_rate',
    'built_1979_or_earlier_rate',
    'owner_occupied_rate',
]

In [325]:
def run_mlp(df, frac, random_state=42, learning_rate_init=0.008):
    """Train a fixed-architecture MLP regressor on `df` and predict its hold-out split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column 'haAmount'. Every other column is used
        as a feature and is passed through MinMaxScaler
        (presumably all numeric - verify against the caller's column selection).
    frac : float
        Fraction of rows to train on. Values below 1.0 draw a random sample;
        anything else uses `df` unchanged.
    random_state : int, default 42
        Seed used for the row sample, the train/test split and the network
        initialisation, so that a run with frac < 1.0 is repeatable.
    learning_rate_init : float, default 0.008
        Initial learning rate handed to MLPRegressor.

    Returns
    -------
    tuple
        (fitted MLPRegressor, predictions for the 20% test split, y_test).

    Notes
    -----
    The MinMaxScaler fitted here is local and is NOT returned. A caller that
    persists the model must rebuild the same scaling before predicting on new,
    unscaled data.
    """
    # Sample the dataset (seeded: an unseeded sample would make frac < 1.0 runs irreproducible)
    df_train = df.sample(frac=frac, random_state=random_state) if frac < 1.0 else df

    # Create test/train split
    X = df_train.loc[:, df_train.columns != 'haAmount']
    y = df_train.loc[:, 'haAmount']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=random_state)

    # Scale the features; the scaler is fit on the training rows only so the
    # test rows do not influence the scaling range.
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Define the model
    model_mlp = MLPRegressor(hidden_layer_sizes=(15, 10, 5), activation='relu',
                             learning_rate='constant', learning_rate_init=learning_rate_init,
                             random_state=random_state, solver='adam', max_iter=300)

    # Fit the model
    model_mlp.fit(X_train, y_train)

    return (model_mlp, model_mlp.predict(X_test), y_test)

In [314]:
def run_mlp_grid_search(df, frac, random_state=42):
    """Grid-search MLPRegressor hyper-parameters on `df` and predict its hold-out split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column 'haAmount'; every other column is used
        as a feature and is passed through MinMaxScaler.
    frac : float
        Fraction of rows to use. Values below 1.0 draw a random sample;
        anything else uses `df` unchanged.
    random_state : int, default 42
        Seed used for the row sample, the train/test split and the network
        initialisation, so that a run with frac < 1.0 is repeatable.

    Returns
    -------
    tuple
        (best estimator found by the search, its predictions for the 20% test
        split, y_test).

    Side effects
    ------------
    Prints the best parameter combination.
    """
    # Sample the dataset (seeded: an unseeded sample would make frac < 1.0 runs irreproducible)
    df_train = df.sample(frac=frac, random_state=random_state) if frac < 1.0 else df

    # Create test/train split
    X = df_train.loc[:, df_train.columns != 'haAmount']
    y = df_train.loc[:, 'haAmount']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=random_state)

    # Scale the numeric fields.
    # NOTE(review): the scaler is fit on all of X_train before cross-validation,
    # so each CV validation fold has already contributed to the scaling range.
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # MLPRegressor default model
    model_mlp = MLPRegressor(random_state=random_state)

    # Create the parameter grid (1 * 1 * 2 * 2 * 3 * 3 = 36 combinations)
    param_grid_mlp = {
        'solver': ['adam'],
        'activation': ['relu'],
        'learning_rate_init': [0.008, 0.01],
        'learning_rate': ['constant', 'adaptive'],
        'max_iter': [300, 400, 500],
        'hidden_layer_sizes': [(15, 10, 5), (15, 5, 5), (10, 5, 5)]
    }

    # Instantiate the grid search model; candidates are ranked by (negated) MAE
    grid_search_mlp = GridSearchCV(estimator=model_mlp, param_grid=param_grid_mlp,
                                   scoring='neg_mean_absolute_error', cv=3, n_jobs=-1, verbose=2)

    # Fit the grid search to the data
    grid_search_mlp.fit(X_train, y_train)

    print('Best params:\n', grid_search_mlp.best_params_)

    # Predict using best model
    model_mlp_best = grid_search_mlp.best_estimator_

    return (model_mlp_best, model_mlp_best.predict(X_test), y_test)

In [315]:
def evaluate(y_test, y_pred, threshold=20.0):
    """Print regression metrics and the share of predictions with a large relative error.

    Parameters
    ----------
    y_test : pandas.Series
        Actual target values.
    y_pred : numpy.ndarray
        Predicted values, in the same row order as `y_test`.
    threshold : float, default 20.0
        Relative error, in percent, above which a prediction counts as "off".

    Prints MAE, MSE, RMSE, R-squared, the actual and predicted totals, and the
    percentage of rows whose relative error exceeds `threshold`. Rows whose
    actual value is not positive have no defined relative error; they are
    given 0% and therefore never count as "off".
    """
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print('R-squared:', metrics.r2_score(y_test, y_pred))
    print('\n')
    print('Total HA Amount actual:    ${:,.2f}'.format(y_test.sum()))
    print('Total HA Amount predicted: ${:,.2f}'.format(y_pred.sum()))
    print('\n')

    # Calculate the number of observations that are off by more than `threshold` percent
    results_df = pd.DataFrame({'actual': y_test, 'predicted': y_pred})
    results_df['diff'] = results_df['predicted'] - results_df['actual']
    # Relative error is taken from the signed difference. Taking abs() of the
    # ratio first would score a negative prediction of the same magnitude as
    # the actual value as a perfect match.
    results_df['percent_diff'] = (
        results_df['diff'].abs() / results_df['actual'] * 100
    ).where(results_df['actual'] > 0, 0)
    n_off = (results_df['percent_diff'] > threshold).sum()
    print('Percentage of predictions that are off by more than {:g}%: {:.2f}'.format(
          threshold, n_off / len(results_df) * 100))

### IHP - No Demographics - Learning Rate 0.01

In [305]:
# Feature set: drop the IHP exclusions, the tract identifiers, DVI and the rate columns.
excluded_cols = ihp_exclude_cols + demo_exclude_cols + demo_dvi_col + demo_rate_cols
kept_cols = femaTrainData.columns[~femaTrainData.columns.isin(excluded_cols)]
femaDf = femaTrainData[kept_cols]

In [306]:
# Train on the whole frame (frac=1.0 skips the sampling step) and keep the hold-out predictions.
# NOTE(review): this section is labelled "Learning Rate 0.01", but run_mlp as
# defined in this notebook hard-codes learning_rate_init=0.008. Re-running this
# cell will therefore not reproduce the 0.01 figures shown below.
model_mlp, y_pred, y_test = run_mlp(femaDf, 1.0)

In [307]:
# Print the hold-out metrics for the model trained in the previous cell.
evaluate(y_test, y_pred)

Mean Absolute Error: 76.67466922814468
Mean Squared Error: 312161.2357454662
Root Mean Squared Error: 558.7139122533698
R-squared: 0.9503545384394623


Total HA Amount actual:    $385,808,834.81
Total HA Amount predicted: $380,428,831.67


Percentage of predictions that are off by more than 20%: 4.87


### IHP - No Demographics - Learning Rate 0.008

In [326]:
# Feature set: drop the IHP exclusions, the tract identifiers, DVI and the rate columns.
excluded_cols = ihp_exclude_cols + demo_exclude_cols + demo_dvi_col + demo_rate_cols
kept_cols = femaTrainData.columns[~femaTrainData.columns.isin(excluded_cols)]
femaDf = femaTrainData[kept_cols]

In [327]:
# Train on the whole frame (frac=1.0 skips the sampling step); run_mlp uses learning_rate_init=0.008.
model_mlp, y_pred, y_test = run_mlp(femaDf, 1.0)

In [328]:
# Print the hold-out metrics for the model trained in the previous cell.
evaluate(y_test, y_pred)

Mean Absolute Error: 79.62948180089535
Mean Squared Error: 319310.21712897916
Root Mean Squared Error: 565.0754083562468
R-squared: 0.9492175795866931


Total HA Amount actual:    $385,808,834.81
Total HA Amount predicted: $385,179,175.88


Percentage of predictions that are off by more than 20%: 5.25


### IHP - with DVI

In [308]:
# Feature set: demo_dvi_col is left out of the exclusion list, so the 'dvi' column is kept.
excluded_cols = ihp_exclude_cols + demo_exclude_cols + demo_rate_cols
kept_cols = femaTrainData.columns[~femaTrainData.columns.isin(excluded_cols)]
femaDf = femaTrainData[kept_cols]

In [309]:
# Train on the whole frame (frac=1.0 skips the sampling step) with the DVI column included.
model_mlp, y_pred, y_test = run_mlp(femaDf, 1.0)

In [310]:
# Print the hold-out metrics for the model trained in the previous cell.
evaluate(y_test, y_pred)

Mean Absolute Error: 91.16777553572913
Mean Squared Error: 335289.9008132428
Root Mean Squared Error: 579.0422271417196
R-squared: 0.9466762045495198


Total HA Amount actual:    $385,808,834.81
Total HA Amount predicted: $405,418,588.13


Percentage of predictions that are off by more than 20%: 5.87


### IHP - with rate demographics

In [311]:
# Feature set: demo_rate_cols is left out of the exclusion list, so the four rate columns are kept.
excluded_cols = ihp_exclude_cols + demo_exclude_cols + demo_dvi_col
kept_cols = femaTrainData.columns[~femaTrainData.columns.isin(excluded_cols)]
femaDf = femaTrainData[kept_cols]

In [312]:
# Train on the whole frame (frac=1.0 skips the sampling step) with the rate columns included.
model_mlp, y_pred, y_test = run_mlp(femaDf, 1.0)

In [313]:
# Print the hold-out metrics for the model trained in the previous cell.
evaluate(y_test, y_pred)

Mean Absolute Error: 77.8405918101761
Mean Squared Error: 320196.253444546
Root Mean Squared Error: 565.8588635380257
R-squared: 0.9490766662483002


Total HA Amount actual:    $385,808,834.81
Total HA Amount predicted: $387,586,600.55


Percentage of predictions that are off by more than 20%: 5.20


### Grid Search - IHP only, run on a 50% sample of the data to keep the search time manageable

In [317]:
# Feature set: drop the IHP exclusions, the tract identifiers, DVI and the rate columns.
excluded_cols = ihp_exclude_cols + demo_exclude_cols + demo_dvi_col + demo_rate_cols
kept_cols = femaTrainData.columns[~femaTrainData.columns.isin(excluded_cols)]
femaDf = femaTrainData[kept_cols]

In [318]:
# Grid-search on a random half of the rows (frac=0.5); model_mlp becomes the best estimator found.
model_mlp, y_pred, y_test = run_mlp_grid_search(femaDf, 0.5)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best params:
 {'activation': 'relu', 'hidden_layer_sizes': (15, 10, 5), 'learning_rate': 'constant', 'learning_rate_init': 0.008, 'max_iter': 300, 'solver': 'adam'}


In [319]:
# Print the hold-out metrics for the best estimator returned by the grid search.
evaluate(y_test, y_pred)

Mean Absolute Error: 78.44164053677102
Mean Squared Error: 347239.98531831946
Root Mean Squared Error: 589.2707232828723
R-squared: 0.9459570569394383


Total HA Amount actual:    $193,860,176.64
Total HA Amount predicted: $188,011,821.32


Percentage of predictions that are off by more than 20%: 5.34


In [329]:
# Look at actual versus predicted values side by side
# (rows keep y_test's index; predictions are attached by position).
df = pd.DataFrame({'Actual': y_test}).assign(Predicted=y_pred)
df

Unnamed: 0,Actual,Predicted
3400246,0.0,0.097991
901899,1014.0,1020.344917
3226677,0.0,0.097991
1347233,0.0,0.097991
1809092,0.0,0.097991
...,...,...
3181447,0.0,0.097991
1977809,0.0,0.097991
2151645,0.0,0.097991
2406434,0.0,0.097991


In [330]:
# Save the model.
# The file is opened in a `with` block so the handle is flushed and closed
# even if pickling fails.
# NOTE: only the regressor is persisted; the MinMaxScaler fitted inside
# run_mlp is not part of this file.
with open('mlp.sav', 'wb') as model_file:
    pickle.dump(model_mlp, model_file)

In [331]:
# Load saved model and show its hyper-parameters.
# The file is opened in a `with` block so the handle is closed after reading.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) produced.
with open('mlp.sav', 'rb') as model_file:
    model_mlp_sav = pickle.load(model_file)
model_mlp_sav.get_params()

{'activation': 'relu',
 'alpha': 0.0001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (15, 10, 5),
 'learning_rate': 'constant',
 'learning_rate_init': 0.008,
 'max_fun': 15000,
 'max_iter': 300,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': 42,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}