# Feature Engineering - FEMA Large Disasters with Demographics

In [2]:
# Basic libraries
import geopandas as gpd
import pandas as pd
import numpy as np
import warnings

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

  import pandas.util.testing as tm


In [3]:
# Notebook-wide settings: silence every warning and lift pandas' display
# limits so printed frames show all rows and all columns.
warnings.filterwarnings(action='ignore')
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

### Load FEMA Data

In [4]:
# Read the FEMA DR-4393 (NC) CSV extract into the working dataframe
fema_csv = "../../data/open-fema/FEMA-Large-DR-4393-NC.csv"
df = pd.read_csv(fema_csv)

In [5]:
# Report how many rows the NC extract contains
record_count = df.shape[0]
print(f"There are {record_count} records in the dataframe.")

There are 132634 records in the dataframe.


In [32]:
# Look at data types for each of the columns.
# Shows the dtype pandas inferred for every column at load time; columns that
# were not inferred as intended (e.g. `object` instead of `bool`) need an
# explicit cast before they can be used as features.
df.dtypes

disasterNumber                       int64
damagedCity                         object
damagedStateAbbreviation            object
damagedZipCode                       int64
householdComposition                 int64
grossIncome                        float64
specialNeeds                          bool
ownRent                             object
residenceType                       object
homeOwnersInsurance                   bool
floodInsurance                        bool
inspected                             bool
rpfvl                              float64
habitabilityRepairsRequired         object
destroyed                             bool
waterLevel                         float64
floodDamage                           bool
foundationDamage                      bool
foundationDamageAmount             float64
roofDamage                            bool
roofDamageAmount                   float64
tsaEligible                           bool
tsaCheckedIn                          bool
rentalAssis

In [33]:
# Remove exact duplicate rows, then report how many rows remain
df = df.drop_duplicates()
print(f"There are {df.shape[0]} records after dropping any duplicates.")

There are 132634 records after dropping any duplicates.


In [34]:
# Change incorrect datatype
# A bare astype('bool') on a column that contains NaN turns every missing
# value into True (bool(nan) is True), so missing flags are set to False
# before the cast.
# NOTE(review): assumes a blank flag means "not flagged" - confirm against the
# FEMA data dictionary.
for flag_col in ['habitabilityRepairsRequired', 'primaryResidence']:
    df[flag_col] = df[flag_col].fillna(False).astype('bool')

# The block ID is cast to text and a trailing ".0" (presumably left over from
# the column being parsed as float) is removed. The pattern is a raw string
# anchored with `$`, so only a suffix can be stripped, never a ".0" elsewhere
# in the value. Missing IDs still become the literal string 'nan', as before.
df['censusBlockId'] = df['censusBlockId'].astype('str').str.replace(r'\.0$', '', regex=True)

print(f"habitabilityRepairsRequired datatype: {df['habitabilityRepairsRequired'].dtype}")
print(f"primaryResidence datatype: {df['primaryResidence'].dtype}")
print(f"censusBlockId datatype: {df['censusBlockId'].dtype}")

habitabilityRepairsRequired datatype: bool
primaryResidence datatype: bool
censusBlockId datatype: object


In [35]:
# Create census tract IDs for FEMA dataset
# The tract ID is the first 11 characters of the block ID. The slice is done
# with the vectorized .str accessor instead of a row-wise apply(axis=1),
# which runs a Python lambda once per row.
df['censusTractId'] = df['censusBlockId'].str[:11]
# 'tractid' is a second copy of the same values under a different column name
df['tractid'] = df['censusTractId']
df[['censusBlockId', 'censusTractId', 'tractid']].head()


Unnamed: 0,censusBlockId,censusTractId,tractid
0,370499609001004,37049960900,37049960900
1,370939702022004,37093970202,37093970202
2,371539710002065,37153971000,37153971000
3,370499610024029,37049961002,37049961002
4,370190204042003,37019020404,37019020404


In [36]:
# Count distinct tracts overall and among rows whose damaged state is NC
all_tracts = df['tractid'].nunique(dropna=False)
nc_rows = df['damagedStateAbbreviation'] == 'NC'
nc_tracts = df.loc[nc_rows, 'tractid'].nunique(dropna=False)
print(f"There are {all_tracts} tracts represented in the dataset.")
print(f"Of these, {nc_tracts} are from NC.")

There are 1297 tracts represented in the dataset.
Of these, 1297 are from NC.


### Feature Engineering

In [37]:
# Columns that hold dollar amounts
amountCols = [
    'foundationDamageAmount',
    'roofDamageAmount',
    'rentalAssistanceAmount',
    'replacementAmount',
    'repairAmount',
    'rpfvl',
    'ppfvl',
]

# A missing amount is treated as zero
df[amountCols] = df[amountCols].fillna(0)

# haAmount = rental assistance + replacement + repair, summed per row
df['haAmount'] = (
    df['rentalAssistanceAmount']
    .add(df['replacementAmount'])
    .add(df['repairAmount'])
)

# Totals over all rows and over rows whose state is not PR
total_ha = df['haAmount'].sum()
non_pr_ha = df.loc[df['damagedStateAbbreviation'] != 'PR', 'haAmount'].sum()
print(f'Total ${total_ha:,.2f} in HA payouts')
print(f'Total ${non_pr_ha:,.2f} in mainland HA payouts')

Total $92,365,426.73 in HA payouts
Total $92,365,426.73 in mainland HA payouts


In [38]:
# Cast the True/False indicator columns to integers
bool_cols = [
    'specialNeeds',
    'homeOwnersInsurance',
    'floodInsurance',
    'inspected',
    'destroyed',
    'habitabilityRepairsRequired',
    'floodDamage',
    'foundationDamage',
    'roofDamage',
    'tsaEligible',
    'rentalAssistanceEligible',
    'repairAssistanceEligible',
    'replacementAssistanceEligible',
    'sbaEligible',
    'primaryResidence',
]

flags_as_int = df[bool_cols].astype(int)
df[bool_cols] = flags_as_int

In [39]:
# Fill the na's with zero's in waterLevel
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the df['waterLevel'] selection: with pandas
# Copy-on-Write that selection is a separate object, so an in-place fill on it
# never reaches df.
df['waterLevel'] = df['waterLevel'].fillna(0)

In [40]:
# Replace each categorical column with its one-hot indicator columns
cat_cols = ['ownRent', 'residenceType']
dummy_frames = [pd.get_dummies(df[name], prefix=name) for name in cat_cols]
# Indicator columns are appended in cat_cols order, then the originals are removed
df = pd.concat([df, *dummy_frames], axis=1)
df = df.drop(columns=cat_cols)

In [41]:
# Test (NC) data: keep only the rows whose damaged state is NC
testDf = df.loc[df['damagedStateAbbreviation'] == 'NC']

# Write test to csv
testDf.to_csv("../../data/open-fema/FEMA-Large-NC.csv", index=False, encoding='utf-8')
# The label previously said "PR" although the frame is filtered to NC
print('Test (NC) rows:', len(testDf))

Test (PR) rows: 132634


In [42]:
# Drop rows that received an HA amount without having been inspected
paid_but_uninspected = (df['haAmount'] > 0) & (df['inspected'] == False)
rows_expected = len(df) - paid_but_uninspected.sum()
print(f"Expected amount of rows after dropped: {rows_expected}")
uninspected = df[paid_but_uninspected].index
df.drop(uninspected, inplace=True)
print(f"Amount after dropped: {len(df)}")

Expected amount of rows after dropped: 132633
Amount after dropped: 132633


In [43]:
# Drop rows whose HA amount is above 33300
over_ha_limit = df['haAmount'].gt(33300)
rows_expected = len(df) - over_ha_limit.sum()
print(f"Expected amount of rows after dropped: {rows_expected}")
maxLimit_ha = df[over_ha_limit].index
df.drop(maxLimit_ha, inplace=True)
print(f"Amount after dropped: {len(df)}")

Expected amount of rows after dropped: 132417
Amount after dropped: 132417


In [44]:
# Drop rows whose household composition is above 15
over_household_limit = df['householdComposition'].gt(15)
rows_expected = len(df) - over_household_limit.sum()
print(f"Expected amount of rows after dropped: {rows_expected}")
maxLimit_house = df[over_household_limit].index
df.drop(maxLimit_house, inplace=True)
print(f"Amount after dropped: {len(df)}")

Expected amount of rows after dropped: 132415
Amount after dropped: 132415


In [45]:
# Drop rows whose water level is above 120
over_water_limit = df['waterLevel'].gt(120)
rows_expected = len(df) - over_water_limit.sum()
print(f"Expected amount of rows after dropped: {rows_expected}")
maxLimit_water = df[over_water_limit].index
df.drop(maxLimit_water, inplace=True)
print(f"Amount after dropped: {len(df)}")

Expected amount of rows after dropped: 132384
Amount after dropped: 132384


### Create NC Test - Outliers Removed

In [46]:
# Create test without outliers: the NC rows that remain after the drops
testDf = df.loc[df['damagedStateAbbreviation'] == 'NC']

# Write test to csv
testDf.to_csv("../../data/open-fema/FEMA-Large-NC-clean.csv", index=False, encoding='utf-8')
# The label previously said "PR" although the frame is filtered to NC
print('Test (NC) rows:', len(testDf))

Test (PR) rows: 132384


### Split FL/TX into Train/Dev

In [5]:
# Load the FL/TX csv that the following cells split into train and dev
train_csv = '../../data/open-fema/FEMA-Large-Demographics-FL-TX-clean.csv'
trainDf = pd.read_csv(train_csv)
print('Train (FL/TX) rows:', trainDf.shape[0])

Train (FL/TX) rows: 3534889


In [6]:
# Hold out 20% of the rows (shuffled, fixed seed).
# X is the whole frame, so the haAmount target column is present in both
# X_train and X_test as well as in y.
X = trainDf
y = trainDf['haAmount']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [7]:
# Save the training portion of the split and report its size
train_path = "./predictions/train.csv"
X_train.to_csv(train_path, index=False, encoding='utf-8')
print('Train rows:', X_train.shape[0])

Train rows: 2827911


In [8]:
# Save the held-out (dev) portion of the split and report its size
dev_path = "./predictions/dev.csv"
X_test.to_csv(dev_path, index=False, encoding='utf-8')
print('Test rows:', X_test.shape[0])

Test rows: 706978
