In [None]:
import json

import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

### Read the downloaded CSV/JSON files into pandas DataFrames

In [None]:
# Load the large FEMA DR-4339-PR extract from CSV into its own DataFrame.
femaLargeDf = pd.read_csv(
    '../data/FEMA-Large-DR-4339-PR.csv',
    encoding='utf-8',
)

In [None]:
# Read from JSON
# The file is opened explicitly as UTF-8 (the same encoding the CSV readers in
# this notebook request) so parsing does not depend on the platform's locale
# default encoding.
with open('../data/open-fema/FEMA-DR-4339-PR.json', 'r', encoding='utf-8') as f:
    # json.load parses straight from the file handle.
    femaData = json.load(f)
# Flatten the records stored under the 'fema_open_api' key into a DataFrame.
femaDf = pd.json_normalize(femaData['fema_open_api'])

# Read from CSV
# femaDf = pd.read_csv('../data/open-fema/FEMA-DR-4339-PR.csv', encoding='utf-8')

In [None]:
# Report how many rows femaDf holds.
print('Found {} records'.format(femaDf.shape[0]))

In [None]:
# Preview the first five rows
femaDf.head(n=5)

In [None]:
# Show every column label as a plain Python list
list(femaDf.columns)

### Aggregate 'haStatus' column across all counties - Top 30 reasons

In [None]:
# Number of rows per haStatus value; display the 30 most frequent values.
(
    femaDf
    .groupby('haStatus')['haStatus']
    .count()
    .nlargest(30)
)

### Aggregate 'inspnReturned' column, grouped by county - Top 20 counties with lowest rates of inspection

In [None]:
# Mean of inspnReturned per county, scaled by 100; display the 20 lowest.
# (presumably inspnReturned is a boolean/0-1 flag, making this a percentage - confirm)
(
    femaDf
    .groupby('county')['inspnReturned']
    .mean()
    .nsmallest(20)
    .mul(100)
)

### Aggregate 'ihpAmount' column, grouped by county - Top 20 counties

In [None]:
# Total ihpAmount per county; display the 20 largest totals.
(
    femaDf
    .groupby('county')['ihpAmount']
    .sum()
    .nlargest(20)
)

### Aggregate 'roofDamage' column, grouped by county - Top 20 counties with most roof damage

In [None]:
# Mean of roofDamage per county, scaled by 100; display the 20 highest.
# (presumably roofDamage is a boolean/0-1 flag, making this a percentage - confirm)
(
    femaDf
    .groupby('county')['roofDamage']
    .mean()
    .nlargest(20)
    .mul(100)
)

### Count registrations by 'tsaEligible' value

In [None]:
# Frequency of each distinct tsaEligible value
femaDf.loc[:, 'tsaEligible'].value_counts()

### Some stats around inspnReturned, homeOwnersInsurance, floodInsurance, roofDamage, homeDamage, utilitiesOut

In [None]:
# Frequency of each distinct inspnReturned value
femaDf.loc[:, 'inspnReturned'].value_counts()

In [None]:
# Frequency of each distinct homeOwnersInsurance value
femaDf.loc[:, 'homeOwnersInsurance'].value_counts()

In [None]:
# Frequency of each distinct floodInsurance value
femaDf.loc[:, 'floodInsurance'].value_counts()

In [None]:
# Overall mean of the roofDamage column
femaDf['roofDamage'].agg('mean')

In [None]:
# Frequency of each distinct homeDamage value
femaDf.loc[:, 'homeDamage'].value_counts()

In [None]:
# Overall mean of the utilitiesOut column
femaDf['utilitiesOut'].agg('mean')

In [None]:
# Overall mean of the homeDamage column
femaDf['homeDamage'].agg('mean')

In [None]:
# Number of rows where ihpAmount is exactly 0 and the homeDamage flag is set
femaDf.loc[femaDf['ihpAmount'].eq(0) & (femaDf['homeDamage'])].shape[0]

### Importing PR county data (../data/county-demographics.csv)

In [None]:
import unidecode

# Load county data scraped from Wikipedia
countyDemographicsDf = pd.read_csv('../data/county-demographics.csv', delimiter=',')

# Format the county column with column-wise operations (no row-by-row apply):
#   1. transliterate each name to plain ASCII with unidecode; missing names are
#      left as-is (na_action='ignore') instead of raising,
#   2. keep only the text before the ' Municipio' suffix.
countyDemographicsDf['county_name'] = (
    countyDemographicsDf['county_name']
    .map(unidecode.unidecode, na_action='ignore')
    .str.split(' Municipio')
    .str[0]
)

countyDemographicsDf.head(20)

### Merge the data sources

In [None]:
# Extract the aggregated ihpAmounts per county
ihpAndIncomePerCapitaDf = femaDf.groupby(['county'])['ihpAmount'].agg('sum').reset_index()

# Drop the Statewide row. The explicit .copy() makes the result an independent
# frame, so the column assignment below does not raise SettingWithCopyWarning.
ihpAndIncomePerCapitaDf = ihpAndIncomePerCapitaDf[ihpAndIncomePerCapitaDf['county'] != 'Statewide'].copy()

# Format the county column: keep only the text that precedes " (".
# The pattern is a raw string because '\(' is an invalid escape sequence in a
# normal string literal; expand=False returns a Series rather than a DataFrame.
ihpAndIncomePerCapitaDf['county'] = ihpAndIncomePerCapitaDf['county'].str.extract(r'(.+?) \(', expand=False)

# Attach population, income and FIPS code. This is an inner join, so counties
# whose formatted name has no exact match in county_name are dropped.
ihpAndIncomePerCapitaDf = pd.merge(ihpAndIncomePerCapitaDf,
                                   countyDemographicsDf[['county_name', 'pop_estimates_2019', 'per_capita_income_past_year', 'fips_code']],
                                   left_on='county', right_on='county_name', how='inner')

# Calculate the ihpAmount relative to population, rounded to 2 decimals.
# NOTE(review): the ratio is multiplied by 100, so the stored value is the
# amount per 100 residents rather than per person - confirm this is intended.
ihpAndIncomePerCapitaDf['ihpAmountPerCapita'] = np.round(
    ihpAndIncomePerCapitaDf['ihpAmount'] / ihpAndIncomePerCapitaDf['pop_estimates_2019'] * 100,
    decimals=2,
)

ihpAndIncomePerCapitaDf.head(20)

In [None]:
# Total haAmount per county
femaDf.groupby('county')['haAmount'].sum()

In [None]:
# Extract the number of registrations (rows) per county.
# rename_axis + reset_index(name=...) produces the columns
# ['county', 'registrations'] on every pandas version. Renaming 'index' and
# 'county' after a bare reset_index() only works on pandas < 2.0, because newer
# versions already label the columns 'county' and 'count'.
registrationsPerCountyDf = (
    femaDf['county']
    .value_counts()
    .rename_axis('county')
    .reset_index(name='registrations')
)
# Alternative: count only registrations with home damage by starting from
# femaDf[femaDf['homeDamage'] == True]['county'] instead of femaDf['county'].

# Drop the Statewide row. The explicit .copy() makes the result an independent
# frame, so the column assignment below does not raise SettingWithCopyWarning.
registrationsPerCountyDf = registrationsPerCountyDf[registrationsPerCountyDf['county'] != 'Statewide'].copy()

# Format the county column: keep only the text that precedes " (".
# The pattern is a raw string because '\(' is an invalid escape sequence in a
# normal string literal; expand=False returns a Series rather than a DataFrame.
registrationsPerCountyDf['county'] = registrationsPerCountyDf['county'].str.extract(r'(.+?) \(', expand=False)

# Attach population and FIPS code. This is an inner join, so counties whose
# formatted name has no exact match in county_name are dropped.
registrationsPerCountyDf = pd.merge(registrationsPerCountyDf,
                                    countyDemographicsDf[['county_name', 'pop_estimates_2019', 'fips_code']],
                                    left_on='county', right_on='county_name', how='inner')

# Calculate registrations per 100 residents (registrations / population * 100),
# rounded to 2 decimals.
registrationsPerCountyDf['registrationsPerCapita'] = np.round(
    registrationsPerCountyDf['registrations'] / registrationsPerCountyDf['pop_estimates_2019'] * 100,
    decimals=2,
)

registrationsPerCountyDf.head(20)

### Choropleth Map using plotly (https://plotly.com/python/county-choropleth/#single-state)

In [None]:
# Install plotly dependencies
# The install commands below are left commented out; uncomment them to install
# into the current environment.
# NOTE(review): the pinned versions presumably come from the plotly
# county-choropleth guide - confirm they are still the required ones.
#!pip install plotly
#!pip install plotly-geo
#!pip install geopandas==0.3.0
#!pip install pyshp==1.2.10
#!pip install shapely==1.6.3
# The figure factory provides create_choropleth, used by the map cells below.
import plotly.figure_factory as ff

In [None]:
# Per Capita Income (2019) by county
fips = ihpAndIncomePerCapitaDf['fips_code'].tolist()
values = ihpAndIncomePerCapitaDf['per_capita_income_past_year'].tolist()

# Five evenly spaced bin endpoints running from the smallest to the largest value.
endpts = list(np.mgrid[min(values):max(values):5j])
# Six colours: one more than the number of endpoints.
colorscale = [
    'rgb(193, 193, 193)',
    'rgb(239,239,239)',
    'rgb(195, 196, 222)',
    'rgb(144,148,194)',
    'rgb(101,104,168)',
    'rgb(65, 53, 132)',
]
incomePerCapitaFig = ff.create_choropleth(
    fips=fips,
    values=values,
    scope=['PR'],
    show_state_data=True,
    colorscale=colorscale,
    binning_endpoints=endpts,
    round_legend_values=True,
    plot_bgcolor='rgb(229,229,229)',
    paper_bgcolor='rgb(229,229,229)',
    legend_title='Income Per Capita by County',
    county_outline=dict(color='rgb(255,255,255)', width=0.25),
    exponent_format=True,
)
# Clear the layout template before rendering.
incomePerCapitaFig.layout.template = None
incomePerCapitaFig.show()

In [None]:
# ihp Amount Per Capita by County
fips = ihpAndIncomePerCapitaDf['fips_code'].tolist()
values = ihpAndIncomePerCapitaDf['ihpAmountPerCapita'].tolist()

# Five evenly spaced bin endpoints running from the smallest to the largest value.
endpts = list(np.mgrid[min(values):max(values):5j])
# Six colours: one more than the number of endpoints.
colorscale = [
    'rgb(193, 193, 193)',
    'rgb(239,239,239)',
    'rgb(195, 196, 222)',
    'rgb(144,148,194)',
    'rgb(101,104,168)',
    'rgb(65, 53, 132)',
]
ihpPerCapitaFig = ff.create_choropleth(
    fips=fips,
    values=values,
    scope=['PR'],
    show_state_data=True,
    colorscale=colorscale,
    binning_endpoints=endpts,
    round_legend_values=True,
    plot_bgcolor='rgb(229,229,229)',
    paper_bgcolor='rgb(229,229,229)',
    legend_title='IHP Amount Awarded Per Capita by County',
    county_outline=dict(color='rgb(255,255,255)', width=0.25),
    exponent_format=True,
)
# Clear the layout template before rendering.
ihpPerCapitaFig.layout.template = None
ihpPerCapitaFig.show()

In [None]:
# Registrations Per Capita by County
fips = registrationsPerCountyDf['fips_code'].tolist()
values = registrationsPerCountyDf['registrationsPerCapita'].tolist()

# Five evenly spaced bin endpoints running from the smallest to the largest value.
endpts = list(np.mgrid[min(values):max(values):5j])
# Six colours: one more than the number of endpoints.
colorscale = [
    'rgb(193, 193, 193)',
    'rgb(239,239,239)',
    'rgb(195, 196, 222)',
    'rgb(144,148,194)',
    'rgb(101,104,168)',
    'rgb(65, 53, 132)',
]
registrationsPerCapitaFig = ff.create_choropleth(
    fips=fips,
    values=values,
    scope=['PR'],
    show_state_data=True,
    colorscale=colorscale,
    binning_endpoints=endpts,
    round_legend_values=True,
    plot_bgcolor='rgb(229,229,229)',
    paper_bgcolor='rgb(229,229,229)',
    legend_title='Applications Per Capita by County',
    county_outline=dict(color='rgb(255,255,255)', width=0.25),
    exponent_format=True,
)
# Clear the layout template before rendering.
registrationsPerCapitaFig.layout.template = None
registrationsPerCapitaFig.show()

### haStatus codes for denied applications

In [None]:
from collections import defaultdict

# Rows treated as denied: ihpAmount is 0, haStatus is present, and haStatus is
# not the Transitional Sheltering Assistance entry.
deniedMask = (
    (femaDf['ihpAmount'] == 0)
    & (femaDf['haStatus'] != 'TSA - Transitional Sheltering Assistance')
    & femaDf['haStatus'].notnull()
)
# To tally codes over every row with a status, use femaDf['haStatus'].notnull()
# alone as the mask.

# Take the text before the first (optionally space-preceded) hyphen and split it
# on commas into a list of codes per row. The pattern is a raw string because
# '\s' is an invalid escape sequence in a normal string literal.
statusCodes = (
    femaDf.loc[deniedMask, 'haStatus']
    .str.extract(r'(.+?)\s?-', expand=False)
    .str.split(',')
)
# Rows whose haStatus contains no hyphen yield NaN; skip them.
statusCodes = statusCodes.dropna()

# Tally how often each code occurs.
haStatusCodes = defaultdict(int)
for codes in statusCodes:
    for statusCode in codes:
        # Strip surrounding whitespace so "A, B" counts "B" and not " B".
        haStatusCodes[statusCode.strip()] += 1

# One row per status code, ordered from least to most frequent.
haStatusCodesDf = pd.DataFrame(list(haStatusCodes.items()), columns=['statusCode', 'Count'])
haStatusCodesDf = haStatusCodesDf.sort_values('Count', ascending=True)

In [None]:
import plotly.express as px

# Pie chart of the status-code tallies; each slice is labelled inside with its
# code and percentage, and the legend is hidden.
fig = px.pie(
    haStatusCodesDf,
    names='statusCode',
    values='Count',
    color_discrete_sequence=px.colors.sequential.tempo,
)
fig.update_traces(
    textinfo='label+percent',
    textposition='inside',
    hoverinfo='label+percent+name',
)
fig.update_layout(
    title_text='Housing Assistance Status Codes for Denied Applications',
    showlegend=False,
)
fig.show()

In [None]:
import plotly.express as px

# Bar chart of the status-code tallies with the count printed above each bar.
fig = px.bar(
    haStatusCodesDf,
    x='statusCode',
    y='Count',
    text='Count',
    color_continuous_scale="darkmint",
)
# Label format/position and bar styling applied in a single trace update.
fig.update_traces(
    texttemplate='%{text:.2s}',
    textposition='outside',
    marker_color='rgb(158,202,225)',
    marker_line_color='rgb(8,48,107)',
    marker_line_width=1.5,
    opacity=0.6,
)
fig.update_layout(
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    plot_bgcolor='rgb(229,229,229)',
)
fig.show()

In [None]:
# Column index of femaDf (DataFrame.keys() returns the columns)
femaDf.keys()

In [None]:
# Predictor columns, selected from femaDf by name (column order preserved).
X = femaDf.loc[:, [
    'accessFunctionalNeeds', 'applicantAge', 'autoDamage',
    'county', 'damagedCity', 'damagedStateAbbreviation', 'damagedZipCode',
    'destroyed', 'emergencyNeeds',
    'floodDamage', 'floodInsurance',
    'foodNeed', 'foundationDamage', 'grossIncome',
    'habitabilityRepairsRequired', 'homeDamage', 'homeOwnersInsurance',
    'householdComposition', 'incidentType',
    'occupants19to64', 'occupants2to5', 'occupants65andOver',
    'occupants6to18', 'occupantsUnderTwo',
    'roofDamage', 'utilitiesOut', 'shelterNeed',
]]

In [None]:
# One-hot encode X via pd.get_dummies, dropping the first level of each
# encoded column (drop_first=True).
X = pd.get_dummies(X, drop_first=True)

In [None]:
# Target: the inspnIssued column, kept as a one-column DataFrame
Y = femaDf['inspnIssued'].to_frame()

In [None]:
# Preview the first five rows of the encoded feature matrix
X.head(n=5)

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=40,
)

# Fit ordinary least squares on the training rows (fit returns the estimator),
# then predict the held-out rows.
# NOTE(review): the target 'inspnIssued' looks like a yes/no flag, so the
# predictions are continuous scores rather than class labels - confirm that a
# linear regression is the intended model.
regr = linear_model.LinearRegression().fit(X_train, Y_train)
predicted = regr.predict(X_test)