In [1]:
# Basic libraries
import geopandas as gpd
import pandas as pd
import numpy as np

### Load the FEMA data for PR

In [2]:
femaDf = pd.read_csv("../data/open-fema/FEMA-Large-DR-4339-PR.csv")
print(f"There are {len(femaDf)} records in the dataframe.")

  interactivity=interactivity, compiler=compiler, result=result)


There are 1122568 records in the dataframe.


In [3]:
femaDf.drop_duplicates(inplace=True)
print(f"There are {len(femaDf)} records after dropping any duplicates.")

There are 1122568 records after dropping any duplicates.


In [4]:
# Change incorrect datatype
femaDf['habitabilityRepairsRequired'] = femaDf['habitabilityRepairsRequired'].astype('bool')
femaDf['primaryResidence'] = femaDf['primaryResidence'].astype('bool')
femaDf['censusBlockId'] = femaDf['censusBlockId'].astype('str').replace('\.0', '', regex=True)
print(f"habitabilityRepairsRequired datatype: {femaDf['habitabilityRepairsRequired'].dtype}")
print(f"primaryResidence datatype: {femaDf['primaryResidence'].dtype}")
print(f"censusBlockId datatype: {femaDf['censusBlockId'].dtype}")

habitabilityRepairsRequired datatype: bool
primaryResidence datatype: bool
censusBlockId datatype: object


In [5]:
# Create census tract IDs for FEMA dataset
femaDf['censusTractId'] = femaDf.apply(lambda row: row.censusBlockId[0:11], axis=1)
femaDf['county_fips'] = femaDf.apply(lambda row: row.censusBlockId[0:5], axis=1)
femaDf[['censusBlockId', 'censusTractId', 'county_fips']].head()

Unnamed: 0,censusBlockId,censusTractId,county_fips
0,,,
1,720310510021022.0,72031051002.0,72031.0
2,721031704003001.0,72103170400.0,72103.0
3,721390604042000.0,72139060404.0,72139.0
4,720439539002080.0,72043953900.0,72043.0


In [6]:
# Define all dollar amount columns, set them to zero if null
amountCols = ['foundationDamageAmount', 'roofDamageAmount', 'rentalAssistanceAmount', 
              'replacementAmount', 'repairAmount', 'rpfvl', 'ppfvl']

femaDf[amountCols] = femaDf[amountCols].fillna(0)

In [7]:
# Convert boolean columns to int
bool_cols = ['specialNeeds', 'homeOwnersInsurance', 'floodInsurance', 'inspected', 'destroyed', 
             'habitabilityRepairsRequired', 'floodDamage', 'foundationDamage', 'roofDamage', 
             'tsaEligible', 'rentalAssistanceEligible', 'repairAssistanceEligible', 
             'replacementAssistanceEligible', 'sbaEligible', 'primaryResidence']
femaDf[bool_cols] = femaDf[bool_cols].astype(int)

### Load demographics data

In [8]:
# Load demographics data
demoDf = pd.read_csv("../data/census-tract/census-tract-demographics.csv")
print(f"There are {len(demoDf)} records in the dataframe.")

There are 12650 records in the dataframe.


In [9]:
# Change to tractid datatype to string
demoDf['tractid'] = demoDf['tractid'].astype('str')
print(f"tractid datatype: {demoDf['tractid'].dtype}")

tractid datatype: object


In [10]:
# Select subset of variables
demoDf = demoDf[['censusid', 'tractid', 'tractname', 'county', 'state', 'total_population', 'below_poverty_rate', 'median_earnings_total',
                 'unemployed_labor_rate', 'built_1979_or_earlier_rate', 'owner_occupied_rate']]

### Merge femaDf and demoDf

In [11]:
# Merge demographics and FEMA datasets
df = pd.merge(femaDf, demoDf, left_on='censusTractId', right_on='tractid')
print(f"There are {len(df)} records in the merged dataset.")
print(f"This is a {len(femaDf) - len(df)} difference b/w the original dataset and the merged.")

There are 1067075 records in the merged dataset.
This is a 55493 difference b/w the original dataset and the merged.


### Load wind-speed data for Hurricane Maria PR

In [12]:
windDataDf = pd.read_csv('../data/hurricane/ARA-wind-data/maria-wind-data-county.csv')
print(f"There are {len(windDataDf)} records in the dataframe.")

There are 7647 records in the dataframe.


In [13]:
# Change to tractid datatype to string
windDataDf['tract_id'] = windDataDf['tract_id'].astype('str')
print(f"tractid datatype: {windDataDf['tract_id'].dtype}")
windDataDf.rename(columns={'Vg (mph)': 'pwg_mph'}, inplace=True)

tractid datatype: object


In [14]:
# Create county averages 
windDataCountyDf = windDataDf.groupby(['county_fips']).agg(pwg_county_avg_mph=('pwg_mph', 'mean')).reset_index()
windDataCountyDf['county_fips'] = windDataCountyDf['county_fips'].astype('str')

### Merge femaDf and windDataDf

In [15]:
# Merge FEMA dataset and wind-data
df = df.merge(windDataDf.loc[:, ['tract_id', 'pwg_mph']], left_on='censusTractId', right_on='tract_id', how='left')
df.drop('tract_id', inplace=True, axis=1)

In [16]:
print(f"There are {df['pwg_mph'].isna().sum()} missing wind-data observations.")
print(f"Here is a breakdown by county,")
df[df['pwg_mph'].isna()]['county'].value_counts()

There are 117135 missing wind-data observations.
Here is a breakdown by county,


San Juan Municipio         42936
Bayamón Municipio          14359
Carolina Municipio         11549
Caguas Municipio            7163
Cataño Municipio            5450
Ponce Municipio             5410
Guaynabo Municipio          4739
Toa Baja Municipio          3527
Trujillo Alto Municipio     3126
Camuy Municipio             2505
Yabucoa Municipio           2227
Coamo Municipio             1964
Mayagüez Municipio          1953
Arecibo Municipio           1802
Aguadilla Municipio         1670
Yauco Municipio             1170
Cayey Municipio              935
Vega Baja Municipio          799
Fajardo Municipio            780
Loíza Municipio              768
Guánica Municipio            617
Guayama Municipio            537
Añasco Municipio             456
Hormigueros Municipio        390
Gurabo Municipio             299
Palm Beach County              1
Pinellas County                1
Ceiba Municipio                1
Polk County                    1
Name: county, dtype: int64

### Use the county average wind speed for missing tracts

In [17]:
# Merge FEMA dataset and wind-data
df = df.merge(windDataCountyDf.loc[:, ['county_fips', 'pwg_county_avg_mph']], on='county_fips', how='left')

### Drop uneeded columns and save results

In [18]:
df.drop(['renterDamageLevel', 'rentalAssistanceEndDate', 'rentalResourceCity',
       'rentalResourceStateAbbreviation', 'rentalResourceZipCode', 'censusBlockId', 'censusid', 'tractid'], inplace=True, axis=1)

In [21]:
df.columns

Index(['disasterNumber', 'damagedCity', 'damagedStateAbbreviation',
       'damagedZipCode', 'householdComposition', 'grossIncome', 'specialNeeds',
       'ownRent', 'residenceType', 'homeOwnersInsurance', 'floodInsurance',
       'inspected', 'rpfvl', 'habitabilityRepairsRequired', 'destroyed',
       'waterLevel', 'floodDamage', 'foundationDamage',
       'foundationDamageAmount', 'roofDamage', 'roofDamageAmount',
       'tsaEligible', 'tsaCheckedIn', 'rentalAssistanceEligible',
       'rentalAssistanceAmount', 'repairAssistanceEligible', 'repairAmount',
       'replacementAssistanceEligible', 'replacementAmount', 'sbaEligible',
       'primaryResidence', 'personalPropertyEligible', 'ppfvl', 'censusYear',
       'id', 'censusTractId', 'county_fips', 'tractname', 'county', 'state',
       'total_population', 'below_poverty_rate', 'median_earnings_total',
       'unemployed_labor_rate', 'built_1979_or_earlier_rate',
       'owner_occupied_rate', 'pwg_mph', 'pwg_county_avg_mph'],
      

In [22]:
df.to_csv('../data/open-fema/FEMA-Large-Demographics-WindSpeed-PR.csv', index=False, encoding='utf-8')
print('{} rows saved to: {}'.format(len(df), '../data/open-fema/FEMA-Large-Demographics-WindSpeed-PR.csv'))

10211619 rows saved to: ../data/open-fema/FEMA-Large-Demographics-WindSpeed-PR.csv
