In [1]:
import sys
import json
import math

import urllib.request

import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from datetime import datetime

### Download FEMA data using their API

In [None]:
# Adapted from https://www.fema.gov/about/openfema/developer-resources

#!/usr/bin/env python3
# Paging example using Python 3. Output in JSON.

def fema_api_download(baseUrl, fileName):
    top = 1000      # number of records to get per call
    skip = 0        # number of records to skip

    # Return 1 record with your criteria to get total record count. 
    # Specifying only 1 column here to reduce amount of data returned. 
    # Need inlinecount to get record count. 
    webUrl = urllib.request.urlopen(baseUrl + '&$inlinecount=allpages&$select=id&$top=1')
    result = webUrl.read()
    jsonData = json.loads(result.decode())

    # Calculate the number of calls we will need to get all of our data (using the maximum of 1000)
    recCount = jsonData['metadata']['count']
    loopNum = math.ceil(recCount / top)
    
    # Logging
    print('Starting download @ {}, {} records, {} records per call, {} iterations needed'.format(
          str(datetime.now()), str(recCount), str(top), str(loopNum)))

    # Initialize the output file. 
    outFile = open(fileName, 'w')
    outFile.write('{"fema_open_api":[')

    # Loop and call the API endpoint changing the record start each iteration. The metadata is being
    #   suppressed as we no longer need it.
    i = 0
    while (i < loopNum):
        # By default data is returned as a JSON object, the data set name being the root element. Unless
        #   you extract records as you process, you will end up with 1 distinct JSON object for EVERY 
        #   call/iteration. An alternative is to return the data as JSONA (an array of json objects) with 
        #   no root element - just a bracket at the start and end. This is easier to manipulate.
        webUrl = urllib.request.urlopen(baseUrl + "&$metadata=off&$format=jsona&$skip=" + str(skip) + "&$top=" + str(top))
        result = webUrl.read()

        # The data is already returned in a JSON format. There is no need to decode and load as a JSON object.
        #   If you want to begin working with and manipulating the JSON, import the json library and load with
        #   something like: jsonData = json.loads(result.decode())

        # Append results to file, trimming off first and last JSONA brackets, adding comma except for last call,
        #   AND root element terminating array bracket and brace to end unless on last call. The goal here is to 
        #   create a valid JSON file that contains ALL the records. This can be done differently.
        if (i == (loopNum - 1)):
            # on the last so terminate the single JSON object
            outFile.write(str(result[1:-1],'utf-8') + "]}")
        else:
            outFile.write(str(result[1:-1],'utf-8') + ",")

        # increment the loop counter and skip value
        i += 1
        skip = i * top

        print("Iteration " + str(i) + " done")

    print('Data downloaded to {}'.format(fileName))
    outFile.close()

In [None]:
# BaseUrl for Hurricane Irma Disaster (4337) in FL  Large Dataset
baseUrl = 'https://www.fema.gov/api/open/v1/IndividualAssistanceHousingRegistrantsLargeDisasters?$filter=disasterNumber%20eq%204337%20and%20damagedStateAbbreviation%20eq%20%27FL%27'

fema_api_download(baseUrl=baseUrl, fileName='../data/FEMA-Large-DR-4337-FL.json')

In [3]:
# Check the number of records downloaded
femaFile = open('../data/FEMA-Large-DR-4337-FL.json', 'r')
femaData = json.load(femaFile)
print('Found {} records in file'.format(str(len(femaData['fema_open_api']))))

# Convert to Pandas DataFrame
femaDf = pd.json_normalize(femaData['fema_open_api'])
femaFile.close()
femaData = None

Found 2643443 records in file


In [4]:
# Drop any duplicates
femaDf.drop_duplicates(keep=False, inplace=True)

In [None]:
# Convert to CSV
femaDf.to_csv('FEMA-Large-DR-4337-FL.csv', index=False, encoding='utf-8')

In [5]:
# Describe Numeric Columns
femaDf.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
disasterNumber,2643443.0,4337.0,0.0,4337.0,4337.0,4337.0,4337.0,4337.0
householdComposition,2643443.0,2.153942,1.445825,1.0,1.0,2.0,3.0,99.0
grossIncome,2126795.0,65072.689816,554051.246906,0.0,16000.0,28000.0,48000.0,99999999.0
rpfvl,216499.0,1733.766603,4424.632604,0.0,499.5,499.5,1000.0,169212.99
waterLevel,999985.0,0.423967,3.425269,0.0,0.0,0.0,0.0,480.0
rentalAssistanceAmount,401149.0,1268.382867,659.683584,367.2,1002.0,1295.0,1307.0,32800.0
repairAmount,48010.0,3936.477762,5340.755882,40.81,1000.0,1000.0,5660.0,33300.0
replacementAmount,345.0,26891.435797,5725.360937,4734.2,28336.2,29764.8,30110.0,33300.0
ppfvl,999985.0,113.610223,535.272476,0.0,0.0,0.0,0.0,32527.8


In [6]:
# All Columns
femaDf.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
disasterNumber,2643440.0,,,,4337.0,0.0,4337.0,4337.0,4337.0,4337.0,4337.0
damagedCity,2643435.0,2291.0,MIAMI,352105.0,,,,,,,
damagedStateAbbreviation,2643443.0,1.0,FL,2643443.0,,,,,,,
damagedZipCode,2643435.0,1548.0,33311,21070.0,,,,,,,
householdComposition,2643440.0,,,,2.15394,1.44582,1.0,1.0,2.0,3.0,99.0
grossIncome,2126800.0,,,,65072.7,554051.0,0.0,16000.0,28000.0,48000.0,100000000.0
specialNeeds,2643443.0,2.0,False,2589504.0,,,,,,,
ownRent,2643443.0,3.0,Renter,1425311.0,,,,,,,
residenceType,2643443.0,13.0,House/Duplex,1515849.0,,,,,,,
homeOwnersInsurance,2643443.0,2.0,False,1829550.0,,,,,,,


In [8]:
femaDf.head()

Unnamed: 0,disasterNumber,damagedCity,damagedStateAbbreviation,damagedZipCode,householdComposition,grossIncome,specialNeeds,ownRent,residenceType,homeOwnersInsurance,...,rentalAssistanceEndDate,rentalResourceCity,rentalResourceStateAbbreviation,rentalResourceZipCode,primaryResidence,personalPropertyEligible,ppfvl,censusBlockId,censusYear,id
0,4337,MIRAMAR,FL,33023,1,0.0,False,Renter,Apartment,False,...,,,,,True,False,,120111105011006,2017,5c12d8dbf2238e6fe40c7edf
1,4337,MIAMI SHORES,FL,33150,5,23920.0,False,Renter,House/Duplex,False,...,,,,,True,False,,120860011032003,2017,5c12d8dbf2238e6fe40c7ee9
2,4337,OCALA,FL,34472,6,,False,Renter,House/Duplex,False,...,,,,,True,False,,120830012043000,2017,5c12d8dbf2238e6fe40c7eda
3,4337,HIALEAH,FL,33016,4,75000.0,False,Renter,House/Duplex,False,...,,,,,True,False,,120860135002000,2017,5c12d8dbf2238e6fe40c7ef3
4,4337,MELBOURNE,FL,32935,3,68700.0,False,Owner,House/Duplex,False,...,,,,,True,False,0.0,120090641241018,2017,5c12d8dbf2238e6fe40c7ee4


In [45]:
(~femaDf['inspected']).sum()

1643458

In [43]:
femaDf['inspected'].value_counts()

False    1643458
True      999985
Name: inspected, dtype: int64

In [49]:
# Visualize some columns related to insurance/inspections
import plotly.graph_objects as go

fig1 = go.Figure(data=[go.Bar(x=['Home Insurance',  'Flood Insurance', 'Inspected'], 
                              y =[femaDf['homeOwnersInsurance'].sum(),
                                  femaDf['floodInsurance'].sum(), 
                                  femaDf['inspected'].sum()], 
                                  name='True', marker_color='indianred'),
                       go.Bar(x=['Home Insurance',  'Flood Insurance', 'Inspected'], 
                              y =[(~femaDf['homeOwnersInsurance']).sum(),
                                  (~femaDf['floodInsurance']).sum(), 
                                  (~femaDf['inspected']).sum()], 
                                  name='False', marker_color='lightsalmon')])
fig1.update_layout(barmode='stack',   bargap=0.07, width=600, height=400)

In [50]:
# Visualize some columns related to damage
import plotly.graph_objects as go

fig1 = go.Figure(data=[go.Bar(x=['Flood Damage',  'Foundation Damage', 'Roof Damage', 'Repairs Required', 'Destroyed'], 
                              y =[femaDf['floodDamage'].sum(),
                                  femaDf['foundationDamage'].sum(), 
                                  femaDf['roofDamage'].sum(),
                                  femaDf['habitabilityRepairsRequired'].sum(),
                                  femaDf['destroyed'].sum()], 
                                  name='True', marker_color='indianred'),
                       go.Bar(x=['Flood Damage',  'Foundation Damage', 'Roof Damage', 'Repairs Required', 'Destroyed'], 
                              y =[(~femaDf['floodDamage']).sum(),
                                  (~femaDf['foundationDamage']).sum(), 
                                  (~femaDf['roofDamage']).sum(),
                                  femaDf['habitabilityRepairsRequired'].value_counts()[False],
                                  (~femaDf['destroyed']).sum()], 
                                  name='False', marker_color='lightsalmon')])
fig1.update_layout(barmode='stack',   bargap=0.07, width=600, height=400)

In [52]:
# Visualize some boolean columns
import plotly.graph_objects as go

fig1 = go.Figure(data=[go.Bar(x=['TSA Eligible',  'Rental Assistance Eligible', 'Repair Assistance Eligible', 'Replacement Assistance Eligible', 'SBA Eligble', 'Personal Property Eligible'], 
                              y =[femaDf['tsaEligible'].sum(),
                                  femaDf['rentalAssistanceEligible'].sum(), 
                                  femaDf['repairAssistanceEligible'].sum(),
                                  femaDf['replacementAssistanceEligible'].sum(),
                                  femaDf['sbaEligible'].sum(),
                                  femaDf['personalPropertyEligible'].sum()], 
                                  name='True', marker_color='indianred'),
                       go.Bar(x=['TSA Eligible',  'Rental Assistance Eligible', 'Repair Assistance Eligible', 'Replacement Assistance Eligible', 'SBA Eligble', 'Personal Property Eligible'], 
                              y =[(~femaDf['tsaEligible']).sum(),
                                  (~femaDf['rentalAssistanceEligible']).sum(), 
                                  (~femaDf['repairAssistanceEligible']).sum(),
                                  (~femaDf['replacementAssistanceEligible']).sum(),
                                  (~femaDf['sbaEligible']).sum(),
                                  (~femaDf['personalPropertyEligible']).sum()], 
                                  name='False', marker_color='lightsalmon')])
fig1.update_layout(barmode='stack',   bargap=0.07, width=600, height=400)

### Calculate HA payouts

In [55]:
femaDf['haAmount'] = femaDf['rentalAssistanceAmount'] + femaDf['repairAmount'] + femaDf['replacementAmount']
femaDf['haAmount'].sum()

667853.15