# Hurricane Harvey, Hurricane Irma & Hurricane Maria HA Preliminary Clustering with Demographics Data
Kai Nham

In [76]:
# Importing libraries and packages
import geopandas as gpd
import pandas as pd
import numpy as np
import sklearn

import seaborn as sns
import matplotlib.pyplot as plt

# Lift pandas' display limits so wide/long frames are rendered without truncation
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

## Load FEMA dataset, check datatypes, etc.

In [54]:
# Load FEMA data (one CSV per state: FL, TX, PR).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunked inference is what produces mixed-type
# columns and the DtypeWarning that goes with them.
flData = pd.read_csv("FEMA-Large-DR-4337-FL.csv", low_memory=False)
txData = pd.read_csv("FEMA-Large-DR-4332-TX.csv", low_memory=False)
prData = pd.read_csv("FEMA-Large-DR-4339-PR.csv", low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [79]:
# Combine the FL, TX and PR datasets into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# input frame keeps its own labels, so the same label would appear once per
# state and any later label-based alignment would be ambiguous.
femaDf = pd.concat([flData, txData, prData], ignore_index=True)
print(f"There are {len(femaDf)} records in the dataframe.")

There are 4661523 records in the dataframe.


In [80]:
# Look at data types for each of the columns to spot any that were inferred
# with a dtype that needs an explicit cast before analysis
femaDf.dtypes

disasterNumber                       int64
damagedCity                         object
damagedStateAbbreviation            object
damagedZipCode                     float64
householdComposition                 int64
grossIncome                        float64
specialNeeds                          bool
ownRent                             object
residenceType                       object
homeOwnersInsurance                   bool
floodInsurance                        bool
inspected                             bool
rpfvl                              float64
habitabilityRepairsRequired         object
destroyed                             bool
waterLevel                         float64
floodDamage                           bool
foundationDamage                      bool
foundationDamageAmount             float64
roofDamage                            bool
roofDamageAmount                   float64
tsaEligible                           bool
tsaCheckedIn                          bool
rentalAssis

In [81]:
# Remove rows that are exact duplicates of an earlier row, then report the count
femaDf = femaDf.drop_duplicates()
print(f"There are {len(femaDf)} records after dropping any duplicates.")

There are 4661523 records after dropping any duplicates.


In [82]:
# Change incorrect datatypes
# A bare .astype('bool') on an object column treats NaN as True (NaN is
# truthy), and would do the same for any non-empty string such as 'False'.
# Testing membership in an explicit list of true values keeps real True flags
# and maps missing or unrecognised entries to False instead.
trueValues = [True, 'True', 'true', 'TRUE']
for flagCol in ['habitabilityRepairsRequired', 'primaryResidence']:
    femaDf[flagCol] = femaDf[flagCol].isin(trueValues)

# Block IDs are identifiers, not quantities: keep them as text so they can be
# sliced by character position.
femaDf['censusBlockId'] = femaDf['censusBlockId'].astype('str')

print(f"habitabilityRepairsRequired datatype: {femaDf['habitabilityRepairsRequired'].dtype}")
print(f"primaryResidence datatype: {femaDf['primaryResidence'].dtype}")
print(f"censusBlockId datatype: {femaDf['censusBlockId'].dtype}")

habitabilityRepairsRequired datatype: bool
primaryResidence datatype: bool
censusBlockId datatype: object


In [83]:
# Create census tract IDs for FEMA dataset: the tract ID is the first 11
# characters of the block ID string. The vectorised .str accessor slices the
# whole column at once instead of calling a Python lambda once per row.
femaDf['censusTractId'] = femaDf['censusBlockId'].str[:11]

In [60]:
# Preview the block ID next to the tract ID derived from it
femaDf.head()[['censusBlockId', 'censusTractId']]

Unnamed: 0,censusBlockId,censusTractId
0,120111105011006.0,12011110501
1,120860011032003.0,12086001103
2,120830012043000.0,12083001204
3,120860135002000.0,12086013500
4,120090641241018.0,12009064124


## Load demographics data & perform merge with FEMA data

In [77]:
# Load demographics data
# tractid is the join key, so read it as text: parsing it as an integer would
# drop any leading zero and change the identifier. low_memory=False infers the
# remaining dtypes from the whole file, avoiding mixed-type columns and the
# DtypeWarning raised by chunked inference.
demoDf = pd.read_csv(
    "w210-bluetarp/data/census-tract/census-tract-demographics.csv",
    dtype={'tractid': str},
    low_memory=False,
)
demoDf.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,censusid,gisjoin,tractid,tractname,county,state,total_population,male_population,female_population,total_population_edu,high_school_grad,bachelors_degree,graduate_prof_degree,high_school_grad_rate,bachelors_degree_rate,graduate_prof_degree_rate,bachelors_plus_degree_rate,total_population_employ,labor_force_total,civilian_labor_force,employed_labor_force,unemployed_labor_force,armed_forces_labor_force,not_labor_force_total,labor_force_rate,civilian_labor_rate,employed_labor_rate,unemployed_labor_rate,armed_forces_labor_rate,not_labor_force_rate,total_population_income,income_less_10k,income_10k_15k,income_15k_20k,income_20k_25k,income_25k_30k,income_30k_35k,income_35k_40k,income_40k_45k,income_45k_50k,income_50k_60k,income_60k_75k,income_75k_100k,income_100k_125k,income_125k_150k,income_150k_200k,income_200k_more,median_earnings_total,median_earnings_male,median_earnings_male_fulltime,median_earnings_male_other,median_earnings_female,median_earnings_female_fulltime,median_earnings_female_other,total_households_poverty,below_poverty,below_poverty_family,below_poverty_family_married,below_poverty_family_other,below_poverty_family_other_male_no_spouse,below_poverty_family_other_female_no_spouse,below_poverty_nonfamily,below_poverty_nonfamily_male,below_poverty_nonfamily_female,above_poverty,above_poverty_family,above_poverty_family_married,above_poverty_family_other,above_poverty_family_other_male_no_spouse,above_poverty_family_other_female_no_spouse,above_poverty_nonfamily,above_poverty_nonfamily_male,above_poverty_nonfamily_female,below_poverty_rate,below_poverty_family_rate,below_poverty_family_married_rate,below_poverty_family_other_rate,below_poverty_family_other_male_no_spouse_rate,below_poverty_family_other_female_no_spouse_rate,below_poverty_nonfamily_rate,below_poverty_nonfamily_male_rate,below_poverty_nonfamily_female_rate,above_poverty_rate,above_poverty_family_rate,above_poverty_family_married_rate,above_poverty_family_other_rate,above_poverty_fam
ily_other_male_no_spouse_rate,above_poverty_family_other_female_no_spouse_rate,above_poverty_nonfamily_rate,above_poverty_nonfamily_male_rate,above_poverty_nonfamily_female_rate,total_population_assist,with_assistance,without_assistance,with_assistance_rate,without_assistance_rate,total_housing_units,total_housing_units_rate,occupied_housing_units,occupied_housing_units_rate,vacant_housing_units,vacant_housing_units_rate,homeowner_vacancy_rate,homeowner_vacancy_rate_rate,rental_vacancy_rate,rental_vacancy_rate_rate,tenure_occupied_housing_units,tenure_occupied_housing_units_rate,owner_occupied,owner_occupied_rate,renter_occupied,renter_occupied_rate,avg_size_owner_unit,avg_size_owner_unit_rate,avg_size_renter_unit,avg_size_of_renter__unit_rate,moved_in_2015_or_later,moved_in_2015_or_later_rate,moved_in_2010_to_2014,moved_in_2010_to_2014_rate,moved_in_2000_to_2009,moved_in_2000_to_2009_rate,moved_in_1990_to_1999,moved_in_1990_to_1999_rate,moved_in_1980_to_1989,moved_in_1980_to_1989_rate,moved_in_1979_and_earlier,moved_in_1979_and_earlier_rate,lacking_complete_plumbing_facilities,lacking_complete_plumbing_facilities_rate,lacking_complete_kitchen_facilities,lacking_complete_kitchen_facilities_rate,no_telephone_service_available,no_telephone_service_available_rate,value_less_50k,value_less_50k_rate,value_50k_100k,value_50k_100k_rate,value_100k_150k,value_100k_150k_rate,value_150k_200k,value_150k_200k_rate,value_200k_300k,value_200k_300k_rate,value_300k_500k,value_300k_500k_rate,value_500k_1M,value_500k_1M_rate,1M_more,1M_more_rate,median_value,median_value_rate,mortgage,mortgage_rate,mortgage_less_20_percent,mortgage_less_20_percent_rate,mortgage_20_25_percent,mortgage_20_25_percent_rate,mortgage_25_30_percent,mortgage_25_30_percent_rate,mortgage_30_35_percent,mortgage_30_35_percent_rate,mortgage_35_percent_plus,mortgage_35_percent_plus_rate,mortgage_not_computed,mortgage_not_computed_rate,without_mortgage,without_mortgage_rate,without_mortgage_less_10_percent,without_m
ortgage_less_10_percent_rate,without_mortgage_10_15_percent,without_mortgage_10_15_percent_rate,without_mortgage_15_20_percent,without_mortgage_15_20_percent_rate,without_mortgage_20_25_percent,without_mortgage_20_25_percent_rate,without_mortgage_25_30_percent,without_mortgage_25_30_percent_rate,without_mortgage_30_35_percent,without_mortgage_30_35_percent_rate,without_mortgage_35_percent_plus,without_mortgage_35_percent_plus_rate,without_mortgage_not_computed,without_mortgage_not_computed_rate,occupied_units_paying_rent,occupied_units_paying_rent_rate,rent_less_15_percent,rent_less_15_percent_rate,rent_15_20_percent,rent_15_20_percent_rate,rent_20_25_percent,rent_20_25_percent_rate,rent_25_30_percent,rent_25_30_percent_rate,rent_30_35_percent,rent_30_35_percent_rate,rent_35_percent_plus,rent_35_percent_plus_rate,rent_not_computed,rent_not_computed_rate
0,1400000US12001000200,G1200010000200,12001000200,"Census Tract 2, Alachua County, Florida",Alachua County,Florida,6834,3096,3738,1941,564,256,479,0.290572,0.131891,0.24678,0.378671,6681,3151,3147,2923,224,4,3530,0.471636,0.471037,0.437509,0.033528,0.000599,0.528364,345,28,49,30,0,0,31,0,0,0,71,49,14,21,30,22,0,6752,12345,25520,6125,4405,27139,3134,2364,1328,91,22,69,0,69,1237,622,615,1036,254,52,202,80,122,782,431,351,0.56176,0.038494,0.009306,0.029188,0.0,0.029188,0.523266,0.263113,0.260152,0.43824,0.107445,0.021997,0.085448,0.033841,0.051607,0.330795,0.182318,0.148477,2364,196,2168,0.08291,0.91709,3374,3374,2364,70.1,1010,29.9,10.1,,25.7,,2364,2364,417,17.6,1947,82.4,2.62,,2.4,,602,25.5,1320,55.8,129,5.5,178,7.5,70,3.0,65,2.7,0,0.0,29,1.2,93.0,3.9,34,8.2,173,41.5,62,14.9,91,21.8,41,9.8,16,3.8,0,0.0,0,0,106300,,221,221,36,16.3,26,11.8,0,0.0,0,0.0,159,71.9,0,,191,191,120,62.8,14,7.3,19,9.9,21,11.0,0,0.0,0,0.0,17,8.9,5,,1617,1617,50,3.1,56,3.5,152,9.4,101,6.2,107,6.6,1151,71.2,330,
1,1400000US12001000301,G1200010000301,12001000301,"Census Tract 3.01, Alachua County, Florida",Alachua County,Florida,3849,1806,2043,2443,467,570,440,0.191158,0.23332,0.180106,0.413426,3499,2397,2397,2161,236,0,1102,0.685053,0.685053,0.617605,0.067448,0.0,0.314947,592,55,71,12,63,5,10,53,39,66,55,24,86,0,14,23,16,20481,22639,36750,11402,18427,34900,9127,1751,608,151,94,57,0,57,457,244,213,1143,441,314,127,22,105,702,251,451,0.34723,0.086236,0.053684,0.032553,0.0,0.032553,0.260994,0.139349,0.121645,0.65277,0.251856,0.179326,0.07253,0.012564,0.059966,0.400914,0.143347,0.257567,1751,487,1264,0.278127,0.721873,2014,2014,1751,86.9,263,13.1,0.0,,11.0,,1751,1751,360,20.6,1391,79.4,2.33,,2.14,,459,26.2,747,42.7,351,20.0,48,2.7,52,3.0,94,5.4,0,0.0,5,0.3,72.0,4.1,25,6.9,100,27.8,69,19.2,80,22.2,75,20.8,11,3.1,0,0.0,0,0,143500,,150,150,50,33.3,55,36.7,7,4.7,0,0.0,38,25.3,0,,170,170,80,47.1,50,29.4,15,8.8,25,14.7,0,0.0,0,0.0,0,0.0,40,,1332,1332,118,8.9,59,4.4,191,14.3,184,13.8,93,7.0,687,51.6,59,
2,1400000US12001000302,G1200010000302,12001000302,"Census Tract 3.02, Alachua County, Florida",Alachua County,Florida,2374,1151,1223,1700,464,246,194,0.272941,0.144706,0.114118,0.258824,2047,1241,1241,1064,177,0,806,0.606253,0.606253,0.519785,0.086468,0.0,0.393747,433,8,22,12,106,24,29,19,0,13,34,57,22,41,23,23,0,25108,31607,45063,11434,23733,28911,14018,1101,269,53,40,13,0,13,216,117,99,832,380,215,165,35,130,452,243,209,0.244323,0.048138,0.036331,0.011807,0.0,0.011807,0.196185,0.106267,0.089918,0.755677,0.345141,0.195277,0.149864,0.031789,0.118074,0.410536,0.220708,0.189827,1101,284,817,0.257947,0.742053,1489,1489,1101,73.9,388,26.1,9.6,,22.2,,1101,1101,469,42.6,632,57.4,2.14,,2.12,,230,20.9,404,36.7,224,20.3,94,8.5,48,4.4,101,9.2,0,0.0,33,3.0,54.0,4.9,69,14.7,153,32.6,163,34.8,36,7.7,19,4.1,6,1.3,23,4.9,0,0,103100,,296,296,87,29.4,44,14.9,15,5.1,48,16.2,102,34.5,0,,173,173,58,33.5,24,13.9,25,14.5,39,22.5,7,4.0,20,11.6,0,0.0,0,,565,565,59,10.4,63,11.2,43,7.6,41,7.3,49,8.7,310,54.9,67,
3,1400000US12001000400,G1200010000400,12001000400,"Census Tract 4, Alachua County, Florida",Alachua County,Florida,5996,2617,3379,3683,1303,429,260,0.353788,0.116481,0.070595,0.187076,4258,2565,2565,2333,232,0,1693,0.602395,0.602395,0.54791,0.054486,0.0,0.397605,1139,101,54,134,39,82,79,74,65,45,55,152,141,79,0,0,39,27784,35645,41438,16402,26691,35379,11884,2127,571,278,117,161,0,161,293,77,216,1556,861,436,425,65,360,695,358,337,0.268453,0.130701,0.055007,0.075693,0.0,0.075693,0.137753,0.036201,0.101551,0.731547,0.404795,0.204984,0.199812,0.030559,0.169252,0.326751,0.168312,0.158439,2127,535,1592,0.251528,0.748472,2469,2469,2127,86.1,342,13.9,6.2,,17.5,,2127,2127,1453,68.3,674,31.7,2.78,,2.87,,153,7.2,497,23.4,806,37.9,549,25.8,65,3.1,57,2.7,20,0.9,20,0.9,83.0,3.9,45,3.1,849,58.4,396,27.3,72,5.0,76,5.2,0,0.0,0,0.0,15,1,96400,,971,971,443,45.6,109,11.2,71,7.3,51,5.3,297,30.6,42,,402,402,176,43.8,19,4.7,76,18.9,111,27.6,0,0.0,0,0.0,20,5.0,38,,610,610,0,0.0,32,5.2,39,6.4,59,9.7,81,13.3,399,65.4,64,
4,1400000US12001000500,G1200010000500,12001000500,"Census Tract 5, Alachua County, Florida",Alachua County,Florida,5202,2617,2585,3581,422,950,1147,0.117844,0.265289,0.320302,0.585591,4647,3286,3286,2825,461,0,1361,0.707123,0.707123,0.607919,0.099204,0.0,0.292877,887,125,46,34,17,56,0,0,38,0,32,29,185,67,23,58,177,28010,31128,52750,10578,24841,42346,10286,2649,745,215,110,105,29,76,530,228,302,1904,672,512,160,55,105,1232,567,665,0.281238,0.081163,0.041525,0.039638,0.010948,0.02869,0.200076,0.08607,0.114005,0.718762,0.253681,0.19328,0.0604,0.020763,0.039638,0.465081,0.214043,0.251038,2649,252,2397,0.09513,0.90487,2850,2850,2649,92.9,201,7.1,0.0,,7.6,,2649,2649,1261,47.6,1388,52.4,2.27,,1.58,,408,15.4,1095,41.3,688,26.0,210,7.9,32,1.2,216,8.2,0,0.0,0,0.0,70.0,2.6,193,15.3,204,16.2,210,16.7,177,14.0,248,19.7,186,14.8,43,3.4,0,0,153600,,861,861,483,56.1,133,15.4,33,3.8,33,3.8,179,20.8,29,,371,371,138,37.2,42,11.3,43,11.6,15,4.0,71,19.1,0,0.0,62,16.7,0,,1309,1309,66,5.0,210,16.0,155,11.8,122,9.3,67,5.1,689,52.6,79,


In [84]:
# Cast tractid to string so it can be matched against the string tract IDs
# built for the FEMA records
demoDf = demoDf.astype({'tractid': 'str'})
print(f"tractid datatype: {demoDf['tractid'].dtype}")

tractid datatype: object


In [85]:
# Attach tract-level demographics to every FEMA record.
# how='inner' is the pandas default, spelled out here: FEMA rows whose tract ID
# has no match in the demographics table are left out of the result.
df = femaDf.merge(demoDf, how='inner', left_on='censusTractId', right_on='tractid')

In [88]:
# Compare the merged row count with the pre-merge FEMA row count to see how
# many records the merge left out
mergedCount = len(df)
droppedCount = len(femaDf) - mergedCount
print(f"There are {mergedCount} records in the merged dataset.")
print(f"This is a {droppedCount} difference b/w the original dataset and the merged.")

There are 4602394 records in the merged dataset.
This is a 59129 difference b/w the original dataset and the merged.


In [94]:
# Check how many census tracts are represented overall and per state (PR, TX, FL)
def tractsInState(stateAbbr):
    """Return the number of distinct censusid values among rows damaged in stateAbbr."""
    inState = df['damagedStateAbbreviation'] == stateAbbr
    return len(df.loc[inState, 'censusid'].unique())


print(f"There are {len(df['tractid'].unique())} tracts represented in the dataset.")
print(f"Of these, {tractsInState('PR')} are from PR.")
print(f"Of these, {tractsInState('TX')} are from TX.")
print(f"Of these, {tractsInState('FL')} are from FL.")

There are 7512 tracts represented in the dataset.
Of these, 896 are from PR.
Of these, 2461 are from TX.
Of these, 4158 are from FL.


In [99]:
# Dollar-amount columns; a missing amount is treated as zero
amountCols = ['foundationDamageAmount', 'roofDamageAmount', 'rentalAssistanceAmount',
              'replacementAmount', 'repairAmount', 'rpfvl', 'ppfvl']
df[amountCols] = df[amountCols].fillna(0)

# Aggregate haAmount = rental assistance + replacement + repair
df['haAmount'] = (
    df['rentalAssistanceAmount']
    .add(df['replacementAmount'])
    .add(df['repairAmount'])
)

# Report the total, and the total excluding PR records
isMainland = df['damagedStateAbbreviation'] != 'PR'
print('Total ${:,.2f} in HA payouts'.format(df['haAmount'].sum()))
print('Total ${:,.2f} in mainland HA payouts'.format(df.loc[isMainland, 'haAmount'].sum()))

Total $2,580,177,998.38 in HA payouts
Total $1,931,168,720.31 in mainland HA payouts


## Feature Engineering

In [96]:
# Boolean indicator columns to recode as integers (True -> 1, False -> 0)
bool_cols = [
    'specialNeeds', 'homeOwnersInsurance', 'floodInsurance', 'inspected',
    'destroyed', 'habitabilityRepairsRequired', 'floodDamage',
    'foundationDamage', 'roofDamage', 'tsaEligible',
    'rentalAssistanceEligible', 'repairAssistanceEligible',
    'replacementAssistanceEligible', 'sbaEligible', 'primaryResidence',
]

# Cast one column at a time
for flag in bool_cols:
    df[flag] = df[flag].astype(int)

In [97]:
# Fill the NaNs in waterLevel with zeros.
# The filled column is assigned back explicitly: fillna(..., inplace=True)
# called on df['waterLevel'] acts on a selected Series, which pandas flags as
# chained assignment and which does not update df under copy-on-write.
df['waterLevel'] = df['waterLevel'].fillna(0)

In [100]:
# One-hot encode the categorical variables
cat_cols = ['ownRent']
for col in cat_cols:
    # Join the indicator columns, then carry the result forward in df so that
    # (a) df actually receives the encoded columns and (b) each additional
    # categorical column builds on the previous ones instead of overwriting them.
    # Previously the joined frame was only ever stored in fdf, so df lost the
    # variable entirely once the raw column was dropped below.
    fdf = df.join(pd.get_dummies(df[col], prefix=col))
    df = fdf

# Drop the raw categorical columns from df now that they are encoded.
# A non-in-place drop leaves fdf (raw + indicator columns) untouched, since
# this cell has always defined that name and other cells may reference it.
df = df.drop(columns=cat_cols)