In [3]:
# Importing libraries and packages
import geopandas as gpd
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold

# Metrics
from sklearn import metrics

### Load the TX/FL datasets

In [4]:
# Load FEMA data
flData = pd.read_csv("FEMA-Large-DR-4337-FL.csv")
txData = pd.read_csv("FEMA-Large-DR-4332-TX.csv")

# Combine FL + TX datasets
femaDf = pd.concat([flData, txData])

In [5]:
print(f'There are {len(femaDf)} records combined.')

There are 3538955 records combined.


In [6]:
# Select subset of columns
cols = ['householdComposition', 'specialNeeds', 'ownRent', 'residenceType', 'homeOwnersInsurance', 'floodInsurance', 
        'inspected', 'rpfvl', 'habitabilityRepairsRequired', 'destroyed', 'waterLevel', 'floodDamage', 
        'foundationDamage', 'roofDamage', 'tsaEligible', 'rentalAssistanceEligible', 'rentalAssistanceAmount', 
        'repairAssistanceEligible', 'repairAmount', 'replacementAssistanceEligible', 'replacementAmount', 'sbaEligible', 
        'primaryResidence']
femaDf = femaDf.loc[:, cols]

### Feature Engineering

In [7]:
# Convert dtype from object to boolean
femaDf['habitabilityRepairsRequired'] = femaDf['habitabilityRepairsRequired'].astype('bool')
femaDf['primaryResidence'] = femaDf['primaryResidence'].astype('bool')

In [8]:
# Convert boolean columns to int
bool_cols = ['specialNeeds', 'homeOwnersInsurance', 'floodInsurance', 'inspected', 'destroyed', 
             'habitabilityRepairsRequired', 'floodDamage', 'foundationDamage', 'roofDamage', 
             'tsaEligible', 'rentalAssistanceEligible', 'repairAssistanceEligible', 
             'replacementAssistanceEligible', 'sbaEligible', 'primaryResidence']
femaDf[bool_cols] = femaDf[bool_cols].astype(int)

In [9]:
# One-hot encode the categorical variables
cat_cols = ['ownRent', 'residenceType']
for col in cat_cols:
    femaDf = pd.concat([femaDf, pd.get_dummies(femaDf[col], prefix=col)], axis=1)
femaDf.drop(cat_cols, axis=1, inplace=True)

In [10]:
# Fill the na's with zero's in waterLevel
femaDf['waterLevel'].fillna(0, inplace=True)

In [11]:
# Define all the dollar amount columns
amount_cols = ['rentalAssistanceAmount', 'replacementAmount','repairAmount', 'rpfvl']
# Set them to zero if null
femaDf[amount_cols] = femaDf[amount_cols].fillna(0)
# Calculate the aggregated haAmount
femaDf['haAmount'] = femaDf['rentalAssistanceAmount'] + femaDf['replacementAmount'] + femaDf['repairAmount']

In [12]:
# Drop the individual amount columns
femaDf.drop(['rentalAssistanceAmount', 'replacementAmount','repairAmount'], axis=1, inplace=True)

### Split in Train and Test

In [13]:
# Split the data into train and test data
X = femaDf.loc[:, femaDf.columns != 'haAmount']
y = femaDf.loc[:, 'haAmount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(2831164, 34) (2831164,)
(707791, 34) (707791,)


In [14]:
# Function to get cross validation scores
def get_cv_scores(model):
    scores = cross_val_score(model,
                             X_train,
                             y_train,
                             cv=5,
                             scoring='r2')
    
    print('CV Mean: ', np.mean(scores))
    print('STD: ', np.std(scores))

### Linear Regression (Ordinary Least Squares)

In [15]:
# LinearRegression default model
model_lr = LinearRegression()

# Fit the LR model to the training
model_lr.fit(X_train, y_train)

# Get cross val scores
get_cv_scores(model_lr)

CV Mean:  0.7031206384780899
STD:  0.0007893505036995003


In [16]:
# Predict
y_pred = model_lr.predict(X_test)

# Look at actual versus predicted
predDf = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
predDf

Unnamed: 0,Actual,Predicted
1624564,920.0,633.265717
2091582,0.0,103.256239
1738799,1014.0,1449.788623
77868,0.0,-559.942261
1110471,0.0,-215.300813
...,...,...
2200400,0.0,131.048104
2186364,0.0,33.447657
181173,0.0,-158.830087
1732289,0.0,116.755035


In [17]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Mean Absolute Percentage Error:', metrics.mean_absolute_percentage_error(y_test, y_pred))

Mean Absolute Error: 429.888273860699
Mean Squared Error: 1882964.9101146152
Root Mean Squared Error: 1372.211685606348
Mean Absolute Percentage Error: 7.042780630736899e+17


In [22]:
# Get feature importance
impt = zip(X_train.columns, model_lr.coef_)
sort_impt = sorted(impt, key = lambda x: abs(x[1]), reverse=True)
for x, y in sort_impt:
    print(f'{x}: {y}')

replacementAssistanceEligible: 18921.414986664182
repairAssistanceEligible: 6232.89153354628
destroyed: -3282.3685983740506
rentalAssistanceEligible: 1320.394680503326
roofDamage: -863.4869327382406
foundationDamage: -689.6487522266149
floodInsurance: -685.7879407668682
sbaEligible: 633.9394264738618
residenceType_Boat: -575.6965287710591
floodDamage: 315.32622690799775
inspected: -218.36925242898698
habitabilityRepairsRequired: -206.12765903724582
homeOwnersInsurance: 199.43237846463182
residenceType_Travel Trailer: -194.97319718614503
residenceType_Unknown: 185.27366021971116
ownRent_Owner: -127.08141239873937
residenceType_College Dorm: 118.56120236684059
residenceType_Mobile Home: -116.65169069349685
residenceType_Townhouse: 108.54644016191581
residenceType_Condo: 102.32546654269508
tsaEligible: 99.44974179118432
ownRent_Renter: 91.72697405804453
primaryResidence: -90.9898628459262
residenceType_House/Duplex: 87.08874232962708
residenceType_Other: 72.03040323178313
residenceType_Mi

### Ridge

In [52]:
# Initialize Ridge default model
model_rdg = Ridge()

# Create param grid
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Conduct grid search
grid_search = GridSearchCV(estimator = model_rdg, param_grid = param_grid, scoring = 'neg_mean_absolute_error', cv = 3, n_jobs = -1, verbose = 2)

In [53]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 7 candidates, totalling 21 fits


GridSearchCV(cv=3, estimator=Ridge(), n_jobs=-1,
             param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
             scoring='neg_mean_absolute_error', verbose=2)

In [54]:
grid_search.best_params_

{'alpha': 1000}

In [55]:
# Predict
bestModel = grid_search.best_estimator_
y_pred = bestModel.predict(X_test)

In [56]:
# Look at actual versus predicted
predDf = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
predDf

Unnamed: 0,Actual,Predicted
1896425,0.0,-14.165707
1775561,0.0,60.791266
1201741,0.0,-559.718047
1333585,0.0,125.944875
393160,0.0,-179.865159
...,...,...
1829549,920.0,1435.875147
2548030,0.0,100.780287
2357859,0.0,2.477066
1926370,0.0,10.998881


In [59]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Mean Absolute Percentage Error:', metrics.mean_absolute_percentage_error(y_test, y_pred))

Mean Absolute Error: 429.2021105043105
Mean Squared Error: 1936785.5704951524
Root Mean Squared Error: 1391.6844363917965
Mean Absolute Percentage Error: 6.827772897345399e+17


In [60]:
# Get feature importance
impt = zip(X_train.columns, bestModel.coef_)
sort_impt = sorted(impt, key = lambda x: abs(x[1]), reverse=True)
for x, y in sort_impt:
    print(f'{x}: {y}')

replacementAssistanceEligible: 6149.70250954469
repairAssistanceEligible: 6087.606500644544
rentalAssistanceEligible: 1326.1002746171087
roofDamage: -829.4679427121873
floodInsurance: -699.5697289318133
sbaEligible: 618.7977932592112
foundationDamage: -576.989638998238
residenceType_Boat: -350.45888187863517
floodDamage: 311.59335654329436
inspected: -210.46439746849424
habitabilityRepairsRequired: -193.8216244753124
homeOwnersInsurance: 181.9622173214632
residenceType_Travel Trailer: -150.35343792000157
ownRent_Owner: -115.9401768234867
residenceType_Townhouse: 112.14860200083812
residenceType_Condo: 103.9782892241435
residenceType_Mobile Home: -102.17578454257564
tsaEligible: 98.30322167594748
residenceType_House/Duplex: 90.16424478828199
primaryResidence: -88.69428624270326
ownRent_Renter: 83.91844839991623
residenceType_Other: 73.06802889919673
residenceType_Apartment: 64.99965689170295
residenceType_College Dorm: 53.59963615353418
destroyed: 53.489817457379594
specialNeeds: 42.948

### Lasso

In [61]:
# Initialize Lasso default model
model_las = Lasso()

# Create param grid
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Conduct grid search
grid_search = GridSearchCV(estimator = model_las, param_grid = param_grid, scoring = 'neg_mean_absolute_error', cv = 3, n_jobs = -1, verbose = 2)

In [62]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 7 candidates, totalling 21 fits


GridSearchCV(cv=3, estimator=Lasso(), n_jobs=-1,
             param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
             scoring='neg_mean_absolute_error', verbose=2)

In [63]:
grid_search.best_params_

{'alpha': 10}

In [64]:
# Predict
bestModel = grid_search.best_estimator_
y_pred = bestModel.predict(X_test)

In [65]:
# Look at actual versus predicted
predDf = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
predDf

Unnamed: 0,Actual,Predicted
1896425,0.0,-68.785350
1775561,0.0,-19.419140
1201741,0.0,-517.847909
1333585,0.0,50.130560
393160,0.0,-119.233516
...,...,...
1829549,920.0,1363.503128
2548030,0.0,50.130560
2357859,0.0,-2.276878
1926370,0.0,-68.785350


In [66]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Mean Absolute Percentage Error:', metrics.mean_absolute_percentage_error(y_test, y_pred))

Mean Absolute Error: 371.7921038419338
Mean Squared Error: 1999466.9495712966
Root Mean Squared Error: 1414.0250880275416
Mean Absolute Percentage Error: 4.2389385963486176e+17


In [67]:
# Get feature importance
impt = zip(X_train.columns, bestModel.coef_)
sort_impt = sorted(impt, key = lambda x: abs(x[1]), reverse=True)
for x, y in sort_impt:
    print(f'{x}: {y}')

repairAssistanceEligible: 5658.929048270398
rentalAssistanceEligible: 1301.031015974845
floodInsurance: -569.5535856604749
sbaEligible: 118.31689429409064
inspected: -66.5084722695119
ownRent_Owner: -62.789718550440405
residenceType_Mobile Home: -58.109420778417885
tsaEligible: 52.40743794413293
homeOwnersInsurance: 39.68172985914522
roofDamage: -29.552147063469487
waterLevel: 17.62550113436901
householdComposition: 12.341552548717967
rpfvl: 0.2928848797084562
specialNeeds: 0.0
habitabilityRepairsRequired: -0.0
destroyed: 0.0
floodDamage: 0.0
foundationDamage: -0.0
replacementAssistanceEligible: 0.0
primaryResidence: -0.0
ownRent_Renter: 0.0
ownRent_Unknown: 0.0
residenceType_Apartment: 0.0
residenceType_Assisted Living Facility: -0.0
residenceType_Boat: -0.0
residenceType_College Dorm: 0.0
residenceType_Condo: 0.0
residenceType_Correctional Facility: -0.0
residenceType_House/Duplex: 0.0
residenceType_Military Housing: 0.0
residenceType_Other: 0.0
residenceType_Townhouse: 0.0
residence