# Feature Engineering - FEMA Large Disasters with Demographics

In [4]:
# Basic libraries
import geopandas as gpd
import pandas as pd
import numpy as np
import warnings

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [5]:
# Notebook-wide settings: silence all warnings and remove pandas' display
# limits so frames are rendered with every row and column.
warnings.filterwarnings('ignore')
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

### Load FEMA Data

In [6]:
# Read the FEMA large-disaster extract (file: FEMA-Large-DR-4393-NC.csv)
# into the working frame `df`.
fema_csv = "../../data/open-fema/FEMA-Large-DR-4393-NC.csv"
df = pd.read_csv(fema_csv)

In [7]:
# Report how many rows were loaded from the NC extract
n_records = df.shape[0]
print(f"There are {n_records} records in the dataframe.")

There are 132634 records in the dataframe.


In [9]:
# Look at data types for each of the columns.
# Displayed as the cell's output only; nothing is modified here.
df.dtypes

disasterNumber                       int64
damagedCity                         object
damagedStateAbbreviation            object
damagedZipCode                       int64
householdComposition                 int64
grossIncome                        float64
specialNeeds                          bool
ownRent                             object
residenceType                       object
homeOwnersInsurance                   bool
floodInsurance                        bool
inspected                             bool
rpfvl                              float64
habitabilityRepairsRequired         object
destroyed                             bool
waterLevel                         float64
floodDamage                           bool
foundationDamage                      bool
foundationDamageAmount             float64
roofDamage                            bool
roofDamageAmount                   float64
tsaEligible                           bool
tsaCheckedIn                          bool
rentalAssis

In [11]:
# Remove exact duplicate rows and report the remaining row count
df = df.drop_duplicates()
print(f"There are {df.shape[0]} records after dropping any duplicates.")

There are 132634 records after dropping any duplicates.


In [12]:
# Change incorrect datatypes.
#
# habitabilityRepairsRequired is loaded as `object` (primaryResidence
# presumably too -- verify). Casting such a column straight to bool turns every
# missing value into True, because bool(NaN) is True, so missing values are
# filled with False before the cast.
# NOTE(review): treating a missing flag as False is an assumption -- confirm
# against the FEMA data dictionary.
for flag_col in ['habitabilityRepairsRequired', 'primaryResidence']:
    df[flag_col] = df[flag_col].fillna(False).astype('bool')

# Census block IDs are identifiers, not numbers: store them as strings and
# strip the ".0" suffix that appears when the column was loaded as float.
# The pattern is a raw string anchored at the end so only that suffix is removed.
block_id = df['censusBlockId'].astype('str').replace(r'\.0$', '', regex=True)

# Left-pad to the 15-character block ID so a leading zero lost by a numeric
# load is restored. Missing IDs keep the 'nan' placeholder produced by
# astype('str'), exactly as before.
df['censusBlockId'] = block_id.where(block_id == 'nan', block_id.str.zfill(15))

print(f"habitabilityRepairsRequired datatype: {df['habitabilityRepairsRequired'].dtype}")
print(f"primaryResidence datatype: {df['primaryResidence'].dtype}")
print(f"censusBlockId datatype: {df['censusBlockId'].dtype}")

habitabilityRepairsRequired datatype: bool
primaryResidence datatype: bool
censusBlockId datatype: object


In [13]:
# Create census tract IDs for FEMA dataset.
# The tract ID is the first 11 characters of the (string) block ID; a
# vectorised .str slice replaces the row-by-row apply and yields the same values.
df['censusTractId'] = df['censusBlockId'].str[:11]
df[['censusBlockId', 'censusTractId']].head()

Unnamed: 0,censusBlockId,censusTractId
0,370499609001004,37049960900
1,370939702022004,37093970202
2,371539710002065,37153971000
3,370499610024029,37049961002
4,370190204042003,37019020404


In [14]:
# Create county FIPS codes.
# The county code is the first 5 characters of the (string) block ID; a
# vectorised .str slice replaces the row-by-row apply and yields the same values.
df['fips'] = df['censusBlockId'].str[:5]
df[['censusTractId', 'fips']].head()

Unnamed: 0,censusTractId,fips
0,37049960900,37049
1,37093970202,37093
2,37153971000,37153
3,37049961002,37049
4,37019020404,37019


### Load Census Demographics

In [15]:
# Read the census-tract demographics table and preview its first three rows
demo_csv = "../../data/census-tract/census-tract-demographics.csv"
demoDf = pd.read_csv(demo_csv)
demoDf.head(3)

Unnamed: 0,censusid,gisjoin,tractid,tractname,county,state,total_population,male_population,female_population,total_population_edu,high_school_grad,bachelors_degree,graduate_prof_degree,high_school_grad_rate,bachelors_degree_rate,graduate_prof_degree_rate,bachelors_plus_degree_rate,total_population_employ,labor_force_total,civilian_labor_force,employed_labor_force,unemployed_labor_force,armed_forces_labor_force,not_labor_force_total,labor_force_rate,civilian_labor_rate,employed_labor_rate,unemployed_labor_rate,armed_forces_labor_rate,not_labor_force_rate,total_population_income,income_less_10k,income_10k_15k,income_15k_20k,income_20k_25k,income_25k_30k,income_30k_35k,income_35k_40k,income_40k_45k,income_45k_50k,income_50k_60k,income_60k_75k,income_75k_100k,income_100k_125k,income_125k_150k,income_150k_200k,income_200k_more,median_earnings_total,median_earnings_male,median_earnings_male_fulltime,median_earnings_male_other,median_earnings_female,median_earnings_female_fulltime,median_earnings_female_other,total_households_poverty,below_poverty,below_poverty_family,below_poverty_family_married,below_poverty_family_other,below_poverty_family_other_male_no_spouse,below_poverty_family_other_female_no_spouse,below_poverty_nonfamily,below_poverty_nonfamily_male,below_poverty_nonfamily_female,above_poverty,above_poverty_family,above_poverty_family_married,above_poverty_family_other,above_poverty_family_other_male_no_spouse,above_poverty_family_other_female_no_spouse,above_poverty_nonfamily,above_poverty_nonfamily_male,above_poverty_nonfamily_female,below_poverty_rate,below_poverty_family_rate,below_poverty_family_married_rate,below_poverty_family_other_rate,below_poverty_family_other_male_no_spouse_rate,below_poverty_family_other_female_no_spouse_rate,below_poverty_nonfamily_rate,below_poverty_nonfamily_male_rate,below_poverty_nonfamily_female_rate,above_poverty_rate,above_poverty_family_rate,above_poverty_family_married_rate,above_poverty_family_other_rate,above_poverty_fam
ily_other_male_no_spouse_rate,above_poverty_family_other_female_no_spouse_rate,above_poverty_nonfamily_rate,above_poverty_nonfamily_male_rate,above_poverty_nonfamily_female_rate,total_population_assist,with_assistance,without_assistance,with_assistance_rate,without_assistance_rate,total_housing_units,total_housing_units_rate,occupied_housing_units,occupied_housing_units_rate,vacant_housing_units,vacant_housing_units_rate,homeowner_vacancy_rate,homeowner_vacancy_rate_rate,rental_vacancy_rate,rental_vacancy_rate_rate,built_total_housing_units,built_total_housing_units_rate,built_2014_or_later,built_2014_or_later_rate,built_2010_to_2013,built_2010_to_2013_rate,built_2000_to_2009,built_2000_to_2009_rate,built_1990_to_1999,built_1990_to_1999_rate,built_1980_to_1989,built_1980_to_1989_rate,built_1970_to_1979,built_1970_to_1979_rate,built_1960_to_1969,built_1960_to_1969_rate,built_1950_to_1959,built_1950_to_1959_rate,built_1940_to_1949,built_1940_to_1949_rate,built_1939_or_earlier,built_1939_or_earlier_rate,tenure_occupied_housing_units,tenure_occupied_housing_units_rate,owner_occupied,owner_occupied_rate,renter_occupied,renter_occupied_rate,avg_size_owner_unit,avg_size_owner_unit_rate,avg_size_renter_unit,avg_size_of_renter__unit_rate,moved_occupied_housing_unit,moved_occupied_housing_unit_rate,moved_in_2015_or_later,moved_in_2015_or_later_rate,moved_in_2010_to_2014,moved_in_2010_to_2014_rate,moved_in_2000_to_2009,moved_in_2000_to_2009_rate,moved_in_1990_to_1999,moved_in_1990_to_1999_rate,moved_in_1980_to_1989,moved_in_1980_to_1989_rate,moved_in_1979_and_earlier,moved_in_1979_and_earlier_rate,lacking_complete_plumbing_facilities,lacking_complete_plumbing_facilities_rate,lacking_complete_kitchen_facilities,lacking_complete_kitchen_facilities_rate,no_telephone_service_available,no_telephone_service_available_rate,value_less_50k,value_less_50k_rate,value_50k_100k,value_50k_100k_rate,value_100k_150k,value_100k_150k_rate,value_150k_200k,value_150k_200k_rate,value_200k_300k,val
ue_200k_300k_rate,value_300k_500k,value_300k_500k_rate,value_500k_1M,value_500k_1M_rate,value_1M_more,value_1M_more_rate,median_value,median_value_rate,mortgage,mortgage_rate,mortgage_less_20_percent,mortgage_less_20_percent_rate,mortgage_20_25_percent,mortgage_20_25_percent_rate,mortgage_25_30_percent,mortgage_25_30_percent_rate,mortgage_30_35_percent,mortgage_30_35_percent_rate,mortgage_35_percent_plus,mortgage_35_percent_plus_rate,mortgage_not_computed,mortgage_not_computed_rate,without_mortgage,without_mortgage_rate,without_mortgage_less_10_percent,without_mortgage_less_10_percent_rate,without_mortgage_10_15_percent,without_mortgage_10_15_percent_rate,without_mortgage_15_20_percent,without_mortgage_15_20_percent_rate,without_mortgage_20_25_percent,without_mortgage_20_25_percent_rate,without_mortgage_25_30_percent,without_mortgage_25_30_percent_rate,without_mortgage_30_35_percent,without_mortgage_30_35_percent_rate,without_mortgage_35_percent_plus,without_mortgage_35_percent_plus_rate,without_mortgage_not_computed,without_mortgage_not_computed_rate,occupied_units_paying_rent,occupied_units_paying_rent_rate,rent_less_15_percent,rent_less_15_percent_rate,rent_15_20_percent,rent_15_20_percent_rate,rent_20_25_percent,rent_20_25_percent_rate,rent_25_30_percent,rent_25_30_percent_rate,rent_30_35_percent,rent_30_35_percent_rate,rent_35_percent_plus,rent_35_percent_plus_rate,rent_not_computed,rent_not_computed_rate,built_1959_or_earlier_rate,built_1960_to_1989_rate,built_1990_or_later_rate,built_1979_or_earlier_rate,built_1980_or_later_rate,total_population_nativity,total_native,native_in_state,native_out_state,native_outside_us,native_outside_us_puerto,native_outside_us_islands,native_outside_us_abroad,total_foreign,foreign_citizen,foreign_non_citizen,total_native_rate,native_in_state_rate,native_out_state_rate,native_outside_us_rate,native_outside_us_puerto_rate,native_outside_us_islands_rate,native_outside_us_abroad_rate,total_foreign_rate,foreign_citizen_rate,foreign
_non_citizen_rate
0,1400000US12001000200,G1200010000200,12001000200,"Census Tract 2, Alachua County, Florida",Alachua County,Florida,6834.0,3096.0,3738.0,1941.0,564.0,256.0,479.0,0.290572,0.131891,0.24678,0.378671,6681.0,3151.0,3147.0,2923.0,224.0,4.0,3530.0,0.471636,0.471037,0.437509,0.033528,0.000599,0.528364,345.0,28.0,49.0,30.0,0.0,0.0,31.0,0.0,0.0,0.0,71.0,49.0,14.0,21.0,30.0,22.0,0.0,6752.0,12345.0,25520.0,6125.0,4405.0,27139.0,3134.0,2364.0,1328.0,91.0,22.0,69.0,0.0,69.0,1237.0,622.0,615.0,1036.0,254.0,52.0,202.0,80.0,122.0,782.0,431.0,351.0,0.56176,0.038494,0.009306,0.029188,0.0,0.029188,0.523266,0.263113,0.260152,0.43824,0.107445,0.021997,0.085448,0.033841,0.051607,0.330795,0.182318,0.148477,2364.0,196.0,2168.0,0.08291,0.91709,3374.0,3374.0,2364.0,70.1,1010.0,29.9,10.1,,25.7,,3374.0,3374.0,94.0,2.8,273.0,8.1,931.0,27.6,378.0,11.2,307.0,9.1,216.0,6.4,335.0,9.9,312.0,9.2,196.0,5.8,332.0,9.8,2364.0,2364.0,417.0,0.176396,1947.0,0.823604,2.62,,2.4,,2364.0,2364.0,602.0,25.5,1320.0,55.8,129.0,5.5,178.0,7.5,70.0,3.0,65.0,2.7,0.0,0.0,29.0,1.2,93.0,3.9,34.0,8.2,173.0,41.5,62.0,14.9,91.0,21.8,41.0,9.8,16.0,3.8,0.0,0.0,0.0,0.0,106300.0,,221.0,221.0,36.0,16.3,26.0,11.8,0.0,0.0,0.0,0.0,159.0,71.9,0.0,,191.0,191.0,120.0,62.8,14.0,7.3,19.0,9.9,21.0,11.0,0.0,0.0,0.0,0.0,17.0,8.9,5.0,,1617.0,1617.0,50.0,3.1,56.0,3.5,152.0,9.4,101.0,6.2,107.0,6.6,1151.0,71.2,330.0,,0.248963,0.254298,0.49674,0.41227,0.58773,6834.0,6322.0,4479.0,1754.0,89.0,0.0,0.0,89.0,512.0,128.0,384.0,0.92508,0.655399,0.256658,0.013023,0.0,0.0,0.013023,0.07492,0.01873,0.05619
1,1400000US12001000301,G1200010000301,12001000301,"Census Tract 3.01, Alachua County, Florida",Alachua County,Florida,3849.0,1806.0,2043.0,2443.0,467.0,570.0,440.0,0.191158,0.23332,0.180106,0.413426,3499.0,2397.0,2397.0,2161.0,236.0,0.0,1102.0,0.685053,0.685053,0.617605,0.067448,0.0,0.314947,592.0,55.0,71.0,12.0,63.0,5.0,10.0,53.0,39.0,66.0,55.0,24.0,86.0,0.0,14.0,23.0,16.0,20481.0,22639.0,36750.0,11402.0,18427.0,34900.0,9127.0,1751.0,608.0,151.0,94.0,57.0,0.0,57.0,457.0,244.0,213.0,1143.0,441.0,314.0,127.0,22.0,105.0,702.0,251.0,451.0,0.34723,0.086236,0.053684,0.032553,0.0,0.032553,0.260994,0.139349,0.121645,0.65277,0.251856,0.179326,0.07253,0.012564,0.059966,0.400914,0.143347,0.257567,1751.0,487.0,1264.0,0.278127,0.721873,2014.0,2014.0,1751.0,86.9,263.0,13.1,0.0,,11.0,,2014.0,2014.0,0.0,0.0,93.0,4.6,156.0,7.7,321.0,15.9,175.0,8.7,437.0,21.7,313.0,15.5,257.0,12.8,140.0,7.0,122.0,6.1,1751.0,1751.0,360.0,0.205597,1391.0,0.794403,2.33,,2.14,,1751.0,1751.0,459.0,26.2,747.0,42.7,351.0,20.0,48.0,2.7,52.0,3.0,94.0,5.4,0.0,0.0,5.0,0.3,72.0,4.1,25.0,6.9,100.0,27.8,69.0,19.2,80.0,22.2,75.0,20.8,11.0,3.1,0.0,0.0,0.0,0.0,143500.0,,150.0,150.0,50.0,33.3,55.0,36.7,7.0,4.7,0.0,0.0,38.0,25.3,0.0,,170.0,170.0,80.0,47.1,50.0,29.4,15.0,8.8,25.0,14.7,0.0,0.0,0.0,0.0,0.0,0.0,40.0,,1332.0,1332.0,118.0,8.9,59.0,4.4,191.0,14.3,184.0,13.8,93.0,7.0,687.0,51.6,59.0,,0.257696,0.459285,0.283019,0.630089,0.369911,3849.0,3718.0,1812.0,1750.0,156.0,75.0,0.0,81.0,131.0,74.0,57.0,0.965965,0.470772,0.454664,0.04053,0.019486,0.0,0.021044,0.034035,0.019226,0.014809
2,1400000US12001000302,G1200010000302,12001000302,"Census Tract 3.02, Alachua County, Florida",Alachua County,Florida,2374.0,1151.0,1223.0,1700.0,464.0,246.0,194.0,0.272941,0.144706,0.114118,0.258824,2047.0,1241.0,1241.0,1064.0,177.0,0.0,806.0,0.606253,0.606253,0.519785,0.086468,0.0,0.393747,433.0,8.0,22.0,12.0,106.0,24.0,29.0,19.0,0.0,13.0,34.0,57.0,22.0,41.0,23.0,23.0,0.0,25108.0,31607.0,45063.0,11434.0,23733.0,28911.0,14018.0,1101.0,269.0,53.0,40.0,13.0,0.0,13.0,216.0,117.0,99.0,832.0,380.0,215.0,165.0,35.0,130.0,452.0,243.0,209.0,0.244323,0.048138,0.036331,0.011807,0.0,0.011807,0.196185,0.106267,0.089918,0.755677,0.345141,0.195277,0.149864,0.031789,0.118074,0.410536,0.220708,0.189827,1101.0,284.0,817.0,0.257947,0.742053,1489.0,1489.0,1101.0,73.9,388.0,26.1,9.6,,22.2,,1489.0,1489.0,0.0,0.0,0.0,0.0,64.0,4.3,69.0,4.6,114.0,7.7,383.0,25.7,220.0,14.8,357.0,24.0,172.0,11.6,110.0,7.4,1101.0,1101.0,469.0,0.425976,632.0,0.574024,2.14,,2.12,,1101.0,1101.0,230.0,20.9,404.0,36.7,224.0,20.3,94.0,8.5,48.0,4.4,101.0,9.2,0.0,0.0,33.0,3.0,54.0,4.9,69.0,14.7,153.0,32.6,163.0,34.8,36.0,7.7,19.0,4.1,6.0,1.3,23.0,4.9,0.0,0.0,103100.0,,296.0,296.0,87.0,29.4,44.0,14.9,15.0,5.1,48.0,16.2,102.0,34.5,0.0,,173.0,173.0,58.0,33.5,24.0,13.9,25.0,14.5,39.0,22.5,7.0,4.0,20.0,11.6,0.0,0.0,0.0,,565.0,565.0,59.0,10.4,63.0,11.2,43.0,7.6,41.0,7.3,49.0,8.7,310.0,54.9,67.0,,0.429147,0.481531,0.089322,0.834117,0.165883,2374.0,2221.0,1372.0,828.0,21.0,7.0,0.0,14.0,153.0,112.0,41.0,0.935552,0.577928,0.348778,0.008846,0.002949,0.0,0.005897,0.064448,0.047178,0.01727


In [16]:
# List the column names of the demographics frame.
# pandas abbreviates the middle of a long Index when it is printed.
print(demoDf.columns)

Index(['censusid', 'gisjoin', 'tractid', 'tractname', 'county', 'state',
       'total_population', 'male_population', 'female_population',
       'total_population_edu',
       ...
       'total_native_rate', 'native_in_state_rate', 'native_out_state_rate',
       'native_outside_us_rate', 'native_outside_us_puerto_rate',
       'native_outside_us_islands_rate', 'native_outside_us_abroad_rate',
       'total_foreign_rate', 'foreign_citizen_rate',
       'foreign_non_citizen_rate'],
      dtype='object', length=250)


In [17]:
# Change the tractid datatype to string so it can be joined to the string
# censusTractId built from the FEMA block IDs.
# Left-pad to the 11-character tract ID: an integer column cannot hold the
# leading zero of state codes 01-09, so the plain cast would yield 10-character
# keys for those states. IDs that are already 11 characters are unchanged.
demoDf['tractid'] = demoDf['tractid'].astype('str').str.zfill(11)
print(f"tractid datatype: {demoDf['tractid'].dtype}")

tractid datatype: object


In [19]:
# Narrow the demographics frame to its identifier columns plus five
# tract-level indicators, keeping this column order.
demo_keep = [
    'censusid', 'tractid', 'tractname', 'county', 'state',
    'below_poverty_rate', 'median_earnings_total', 'unemployed_labor_rate',
    'built_1979_or_earlier_rate', 'owner_occupied_rate',
]
demoDf = demoDf[demo_keep]

demoDf.head(3)

Unnamed: 0,censusid,tractid,tractname,county,state,below_poverty_rate,median_earnings_total,unemployed_labor_rate,built_1979_or_earlier_rate,owner_occupied_rate
0,1400000US12001000200,12001000200,"Census Tract 2, Alachua County, Florida",Alachua County,Florida,0.56176,6752.0,0.033528,0.41227,0.176396
1,1400000US12001000301,12001000301,"Census Tract 3.01, Alachua County, Florida",Alachua County,Florida,0.34723,20481.0,0.067448,0.630089,0.205597
2,1400000US12001000302,12001000302,"Census Tract 3.02, Alachua County, Florida",Alachua County,Florida,0.244323,25108.0,0.086468,0.834117,0.425976


### Combine Demographics with FEMA data

In [21]:
# Merge demographics and FEMA datasets.
# This is an inner join: FEMA rows whose censusTractId has no matching tractid
# are dropped. Record the row count first and report the change so that loss
# is visible instead of silent.
rows_before_demo = len(df)
df = pd.merge(df, demoDf, how='inner', left_on='censusTractId', right_on='tractid')

print(f"There are {len(df)} records in the dataset merged with demographics.")
print(f"This is a {rows_before_demo - len(df)} difference b/w the original dataset and the merged.")

### HUD Fair Market Rents

In [22]:
# Read the HUD Fair Market Rents (FMR) table and preview its first three rows
hud_csv = "../../data/hud/HUD-FMR-with-demo.csv"
hudDf = pd.read_csv(hud_csv)
hudDf.head(3)

Unnamed: 0,fips,county,occupied_housing_units_rate,vacant_housing_units_rate,owner_occupied_rate,fmr0,fmr1,fmr2,fmr3,fmr4,metro
0,12001,Alachua County,0.845622,0.154378,0.536,624,717,878,1177,1339,1
1,12003,Baker County,0.843566,0.156434,0.775877,477,645,752,951,1266,1
2,12005,Bay County,0.676942,0.323058,0.624478,693,813,949,1359,1609,1


In [23]:
# List the column names of the HUD frame
print(hudDf.columns)

Index(['fips', 'county', 'occupied_housing_units_rate',
       'vacant_housing_units_rate', 'owner_occupied_rate', 'fmr0', 'fmr1',
       'fmr2', 'fmr3', 'fmr4', 'metro'],
      dtype='object')


In [24]:
# Change the fips datatype to string so it can be joined to the string fips
# column derived from the FEMA block IDs.
# Left-pad to the 5-character county code: an integer column cannot hold the
# leading zero of state codes 01-09, so the plain cast would yield 4-character
# keys for those states. Codes that are already 5 characters are unchanged.
hudDf['fips'] = hudDf['fips'].astype('str').str.zfill(5)
print(f"fips datatype: {hudDf['fips'].dtype}")

fips datatype: object


In [25]:
# Keep the join key, the five fmr columns and the three housing-rate columns,
# in this order.
hud_keep = [
    'fips',
    'fmr0', 'fmr1', 'fmr2', 'fmr3', 'fmr4',
    'occupied_housing_units_rate', 'vacant_housing_units_rate', 'owner_occupied_rate',
]
hudDf = hudDf[hud_keep]

hudDf.head(3)

Unnamed: 0,fips,fmr0,fmr1,fmr2,fmr3,fmr4,occupied_housing_units_rate,vacant_housing_units_rate,owner_occupied_rate
0,12001,624,717,878,1177,1339,0.845622,0.154378,0.536
1,12003,477,645,752,951,1266,0.843566,0.156434,0.775877
2,12005,693,813,949,1359,1609,0.676942,0.323058,0.624478


### Combine HUD with FEMA Data

In [26]:
# Row count going into the HUD join
original_len = len(df)

# Attach the HUD columns through the shared 'fips' key (default inner join)
df = df.merge(hudDf, on='fips')

merged_len = len(df)
print(f"There are {merged_len} records in the dataset merged with HUD.")
print(f"This is a {original_len - merged_len} difference b/w the original dataset and the merged.")

There are 132586 records in the dataset merged with HUD.
This is a 0 difference b/w the original dataset and the merged.


### Feature Engineering

In [27]:
# Dollar-amount columns; a null amount is replaced with 0
amountCols = ['foundationDamageAmount', 'roofDamageAmount', 'rentalAssistanceAmount',
              'replacementAmount', 'repairAmount', 'rpfvl', 'ppfvl']
df[amountCols] = df[amountCols].fillna(0)

# haAmount = rental assistance + replacement + repair amounts
df['haAmount'] = (
    df['rentalAssistanceAmount']
    .add(df['replacementAmount'])
    .add(df['repairAmount'])
)

# Totals over all rows and over rows whose state abbreviation is not 'PR'
ha_total = df['haAmount'].sum()
ha_mainland = df.loc[df['damagedStateAbbreviation'] != 'PR', 'haAmount'].sum()
print(f'Total ${ha_total:,.2f} in HA payouts')
print(f'Total ${ha_mainland:,.2f} in mainland HA payouts')

Total $92,355,415.13 in HA payouts
Total $92,355,415.13 in mainland HA payouts


In [28]:
# Cast each listed flag column to int in a single astype call
bool_cols = [
    'specialNeeds', 'homeOwnersInsurance', 'floodInsurance', 'inspected', 'destroyed',
    'habitabilityRepairsRequired', 'floodDamage', 'foundationDamage', 'roofDamage',
    'tsaEligible', 'rentalAssistanceEligible', 'repairAssistanceEligible',
    'replacementAssistanceEligible', 'sbaEligible', 'primaryResidence',
]

df = df.astype({flag_col: int for flag_col in bool_cols})

In [29]:
# Fill the na's with zero's in waterLevel.
# The result is assigned back to the column. Calling fillna(inplace=True) on
# the selection df['waterLevel'] is a chained in-place update: pandas
# deprecates it and, with Copy-on-Write enabled, it leaves df unchanged.
# The deprecation warning would not be seen here because this notebook
# silences all warnings.
df['waterLevel'] = df['waterLevel'].fillna(0)

In [30]:
# One-hot encode the categorical variables.
# get_dummies with `columns=` appends the indicator columns (prefixed with the
# source column name) after the remaining columns and removes the source
# columns, all in one call.
cat_cols = ['ownRent', 'residenceType']
df = pd.get_dummies(df, columns=cat_cols)

### Create NC Test - With Outliers

In [31]:
# Test (NC) data, written before any outlier rows are removed
testDf = df.loc[df['damagedStateAbbreviation'] == 'NC']

# Write test to csv.
# The path uses the same ../../data tree as every other read and write in this
# notebook (it previously pointed at ../data), and the label now names the
# state actually selected (NC, not PR).
testDf.to_csv("../../data/open-fema/FEMA-Large-Demographics-hud-NC.csv", index=False, encoding='utf-8')
print('Test (NC) rows:', len(testDf))

Test (PR) rows: 132586


### Treat Outliers

In [32]:
# Remove records that have a positive HA amount but were not inspected
ha_without_inspection = (df['haAmount'] > 0) & (df['inspected'] == False)
print(f"Expected amount of rows after dropped: {len(df) - len(df[ha_without_inspection])}")
uninspected = df.index[ha_without_inspection]
df = df.drop(index=uninspected)
print(f"Amount after dropped: {len(df)}")

Expected amount of rows after dropped: 132585
Amount after dropped: 132585


In [33]:
# Remove records whose HA amount is above the 33300 limit used here
ha_cap = 33300
over_ha_cap = df['haAmount'] > ha_cap
print(f"Expected amount of rows after dropped: {len(df) - len(df[over_ha_cap])}")
maxLimit_ha = df.index[over_ha_cap]
df = df.drop(index=maxLimit_ha)
print(f"Amount after dropped: {len(df)}")

Expected amount of rows after dropped: 132369
Amount after dropped: 132369


In [34]:
# Remove records whose household composition is greater than 15
household_cap = 15
over_household_cap = df['householdComposition'] > household_cap
print(f"Expected amount of rows after dropped: {len(df) - len(df[over_household_cap])}")
maxLimit_house = df.index[over_household_cap]
df = df.drop(index=maxLimit_house)
print(f"Amount after dropped: {len(df)}")

Expected amount of rows after dropped: 132367
Amount after dropped: 132367


In [35]:
# Remove records whose water level is greater than 120
water_cap = 120
over_water_cap = df['waterLevel'] > water_cap
print(f"Expected amount of rows after dropped: {len(df) - len(df[over_water_cap])}")
maxLimit_water = df.index[over_water_cap]
df = df.drop(index=maxLimit_water)
print(f"Amount after dropped: {len(df)}")

Expected amount of rows after dropped: 132336
Amount after dropped: 132336


### Create NC Test - Outliers Removed

In [36]:
# NC records that remain after the outlier rows were removed
is_nc = df['damagedStateAbbreviation'] == 'NC'
testDf = df.loc[is_nc]

# Save the cleaned test set and report its size
clean_csv = "../../data/open-fema/FEMA-Large-Demographics-hud-NC-clean.csv"
testDf.to_csv(clean_csv, index=False, encoding='utf-8')
print('Test (NC) rows:', len(testDf))

Test (NC) rows: 132336
