In [261]:
import pandas as pd
import json
import altair as alt
import numpy as np
from vega_datasets import data as datasets
from scipy.optimize import curve_fit

### Load Data

In [262]:

# Load USA level data
usa_data = pd.read_csv('data/usa_data.csv')

# Load state level data
data = pd.read_csv('data/full_data.csv')


# Load regions dictionary
with open('data/regions.json', 'r') as j:
    regions_dict = json.loads(j.read())
    
# Load state abbreviations dictionary
with open('data/us_state_abbrv.json', 'r') as j:
    state_abbrv_dict = json.loads(j.read())

Let's start off by taking a peek at the first few rows of the data

In [263]:
data.head()

Unnamed: 0,cost,costRank,count,countRank,deaths,deathsRank,disaster,region,year
0,5.0,3.0,1.0,1.0,1260.0,1.0,drought,CCR,1980
1,0.0,42.0,0.0,42.0,0.0,42.0,drought,CCR,1981
2,0.0,42.0,0.0,42.0,0.0,42.0,drought,CCR,1982
3,1.6,8.0,1.0,1.0,0.0,42.0,drought,CCR,1983
4,0.0,42.0,0.0,42.0,0.0,42.0,drought,CCR,1984


Each row corresponds to data for a certain ``disaster`` that took place in a specific ``year`` at a certain ``region``. The ``count`` field indicates the number of times that disaster took place during that year, while the ``cost`` field indicates the total dollar-amount of the damages (in billions). Similarly, the ``death`` field tells us how many people died that year due to that disaster in that region. The three ``*Rank`` rank the various years in terms of the corresponding fields -- we won't be using these ranks.

As an example, the first row of the table tells us that in 1980, we had 1 drought in the CCR region, which lead to 1260 deaths and had an economic cost of $5 billion.


Let's get some more details on the two categorical fields here, namely ``disaster`` and ``region``. Mainly, we want to know what all possible values are for these categorical variables

In [264]:
alt.Chart(data).mark_circle().encode(
    alt.Y('disaster:N'),
    alt.X('region:N'),
    alt.Size('sum(count)')
).properties(
    width=450,
    height=300
)

#### First Impressions

From the y-axis, we see that there are 7 different disasters included in our data. Note that there is also an ``all-disasters`` type, which is likely a sum of all the other disasters. We'll check this later to make sure. Ignoring this (probably) cumulative ``all-disasters`` for now, we see that severe-storms are the most represented disaster in our dataset

From the x-axis, we see that there are 13 regions. We have more information about these regions in the ``regions_dict`` dictionary. Let's use this dictionary to get a better sense of these regions.

#### Regions

The ``regions_dict`` has one key corresponding to eeach of the 13 regions

In [265]:
all_regions = np.array(list(regions_dict.keys()))
all_regions

array(['CCR', 'ENCCR', 'NECR', 'NWCR', 'SCR', 'SECR', 'SWCR', 'WCR',
       'WNCCR', 'GCS', 'GLS', 'SP', 'TA'], dtype='<U5')

For every key, we have a separate dict that gives us the full form of the region's acronym and also all the states included within it. For example,

In [266]:
regions_dict[all_regions[0]]

{'name': 'Central Climate Region',
 'states': ['Illinois',
  'Indiana',
  'Kentucky',
  'Missouri',
  'Ohio',
  'Tennessee',
  'West Virginia']}

Let's turn this dict into a dataframe so we can prod it some with visualizations

In [267]:
# Convert regions_dict into a dataframe
regions = pd.DataFrame.from_dict(regions_dict).transpose().reset_index()
regions.rename(columns={'index':'region'}, inplace=True)
regions = regions.explode('states')
regions.reset_index(drop=True, inplace=True)
regions['abbrv'] = regions['states'].map(state_abbrv_dict)
regions.head()

Unnamed: 0,region,name,states,abbrv
0,CCR,Central Climate Region,Illinois,IL
1,CCR,Central Climate Region,Indiana,IN
2,CCR,Central Climate Region,Kentucky,KY
3,CCR,Central Climate Region,Missouri,MO
4,CCR,Central Climate Region,Ohio,H


Here are the names associated with each region acronym

In [268]:
regions.drop_duplicates('region')[['region','name']].reset_index(drop=True)

Unnamed: 0,region,name
0,CCR,Central Climate Region
1,ENCCR,East North Central Climate Region
2,NECR,Northeast Climate Region
3,NWCR,Northwest Climate Region
4,SCR,South Climate Region
5,SECR,Southeast Climate Region
6,SWCR,Southwest Climate Region
7,WCR,West Climate Region
8,WNCCR,West North Climate Region
9,GCS,Gulf Coast States


In [269]:
# Get a list of all region names for later
all_region_names = regions['name'].unique()

Let's start off by checking what states are contained within what regions by plotting the various regions on map. To do this, we first need to get ``states`` dataset of all states from ``vega-datasets``, which has an ID associated with each state. Our dataset has names of states, so we'll need to map these to IDs. Once our dataset has the same state IDs as the ``states`` dataset, we can use lookups to make our cartographic plots

In [270]:
# Pull geographic info on all states from vega datasets
states = alt.topo_feature(datasets.us_10m.url, 'states')
# Pull state IDs from another vega dataset so we can use to do lookups
state_ids = datasets.population_engineers_hurricanes()
# Match ids to state names in our dataset
state_ids = state_ids.loc[:,['state','id']]
state_ids = dict(zip(state_ids.state, state_ids.id))
regions['id'] = regions['states'].map(state_ids)
regions.head()

Unnamed: 0,region,name,states,abbrv,id
0,CCR,Central Climate Region,Illinois,IL,17
1,CCR,Central Climate Region,Indiana,IN,18
2,CCR,Central Climate Region,Kentucky,KY,21
3,CCR,Central Climate Region,Missouri,MO,29
4,CCR,Central Climate Region,Ohio,H,39


In [271]:
def chloropleth_map(regions_df, projection='albers'):
    return alt.Chart(states).mark_geoshape(stroke='black').encode(
        alt.Color('name:N', scale=alt.Scale(scheme='tableau20'))
    ).transform_lookup(
        lookup='id',
        from_=alt.LookupData(regions_df, 'id', list(regions.columns))
    ).properties(
        width=500,
        height=350
    ).project(
        type=projection
    )

chloropleth_map(regions)

Two things to note here:
First, our dataset only seems to cover the contiguous US as Hawaii, Alaska, and Puerto Rico show up as ``null``. This is good to know, but it also raises some completeness questions. For example, we know that this dataset does not cover the recent devastating hurricane Maria in Puerto Rico.

Second, we saw earlier that there were 13 regions in our dataset, but we only see 11 unique labels here. One possible reason could be that some of the regions overlap with one another. For example, we don't see ``East North Central Climate Region`` or ``West North Climate Region``. Let's check if we have regions overlap with one another

In [323]:
alt.Chart(regions).mark_rect().encode(
    alt.Y('abbrv:O'),
    alt.X('region:N'),
    alt.Color('name:N', scale=alt.Scale(scheme='tableau20'))
).properties(
    width=650,
    height=700
)


In [325]:
def interactive_region_hist(default_region_names):
    
    # Interactive selector
    selection = alt.selection_multi(fields=['name'], 
                                    init = [{"name": name} for name in default_region_names])
    color = alt.condition(selection, 'region:N', alt.value('lightgray'))
    region_selector = alt.Chart(regions).mark_rect(stroke='black').encode(
        y='name', color=color).add_selection(    
        selection
    ).properties(
        width=10
    )
    
    
    # The bar chart
    chart = alt.Chart(regions).mark_bar().add_selection(
        selection
    ).encode(
        alt.Y('states:O', axis=alt.Axis(labelAngle=0)),
        alt.X('count(states):Q'),
        alt.Color('region:N', scale=alt.Scale(scheme='tableau20'), legend=None)
    ).properties(
        width=600,
        height=500
    ).transform_filter(
        selection
    )
    
    # Full plot
    full_plot = alt.hconcat(region_selector, chart)
    return full_plot

interactive_region_hist(all_region_names)

Aha! Just as we thought -- there are many states that are included in multiple regions. For instance, Alabama (the first bar AL) is included in 2 different regions; Texas is included in 4 different regions.

In the above interactive plot, we can pick which regions should be included in our dataset. We'd want to keep a set of regions around such that each state is only included in one region. A logical first attempt might be to remove the anamolies i.e. all regions that do not end with 'Regions' e.g. Southern Plains, Tornado Alley. Below, we have the interactive plot again, this time with the anamolous regions removed by default

So, the overlaps were coming from the regions whose names do not end in 'Region'. Namely, we need to remove the following regions: ``[Great Lakes States, Gulf Coast States, Southern Plains, Tornado Alley']``

In [274]:
region_names_to_drop = ['Great Lakes States', 'Gulf Coast States', 'Southern Plains', 'Tornado Alley']
reduced_region_names = [x for x in all_region_names if x not in region_names_to_drop]
interactive_region_hist(reduced_region_names)


Great! If we just keep these regions around, we don't have any overlaps. Let's filter our regions dataset to only keep these regions, and give our chloropleth plot another shot

In [275]:
# Filter unwanted regions from our regions dataset
regions = regions.loc[regions['name'].isin(reduced_region_names)]
chloropleth_map(regions, projection='albersUsa')

That's much better. We now have 9 (non-null) categories on this map. We no longer have that earlier issue where certain regions disappeared due to overlaps.

In [276]:
reduced_regions = regions.copy().loc[regions['name'].isin(reduced_region_names), 'region'].unique()
reduced_regions

array(['CCR', 'ENCCR', 'NECR', 'NWCR', 'SCR', 'SECR', 'SWCR', 'WCR',
       'WNCCR'], dtype=object)

### Fun With Maps

In [326]:
cols_to_keep = ['Location','Date','Value']

precip = pd.read_csv('data/statewide_precip.csv', usecols=cols_to_keep)
max_temps = pd.read_csv('data/statewide_max_temp.csv', usecols=cols_to_keep)
min_temps = pd.read_csv('data/statewide_min_temp.csv', usecols=cols_to_keep)
drought_severity = pd.read_csv('data/statewide_drought_severity.csv', usecols=cols_to_keep)


all_df_names = ['precip','max_temps','min_temps', 'drought_severity']
all_dfs = [precip, max_temps, min_temps, drought_severity]

# Rename seom columns
[x.rename(columns={'Value': '{}'.format(y)}, inplace=True) for x,y in zip(all_dfs,all_df_names)]
[x.rename(columns={'Location': 'states'}, inplace=True) for x in all_dfs]
[x.rename(columns={'Date': 'date'}, inplace=True) for x in all_dfs]



state_to_region_map = regions[['region','states']].set_index('states').to_dict()['region']

for idx,df in enumerate(all_dfs):
    df['year'] = (df['date'] / 100).astype(np.int)
#     df['month'] = precip['date'] % 100
    df.drop('date', axis='columns', inplace=True)
    #TODO Should we use max, min, mean?
    all_dfs[idx] = df.groupby(['states','year']).median().reset_index()
    all_dfs[idx]['id'] = all_dfs[idx]['states'].map(state_ids)
    all_dfs[idx] = all_dfs[idx].loc[all_dfs[idx]['year'] > 1990, :].reset_index(drop=True)
    all_dfs[idx]['region'] = all_dfs[idx]['states'].map(state_to_region_map)

precip, max_temps, min_temps, drought_severity = all_dfs

In [327]:
# precip = precip.loc[precip['year'] > 1980]
# max_temps = max_temps.loc[max_temps['year'] > 1980]
# min_temps = min_temps.loc[min_temps['year'] > 1980]
# drought_severity = drought_severity.loc[drought_severity['year'] > 1980]

In [328]:
# # dat = max_temps.loc[max_temps['year'] == 1985,:]
# # dat = max_temps.iloc[0:1000,:]
# dat = max_temps

# alt.Chart(states).mark_geoshape().encode(
#     color='max_temps:Q'
# ).transform_lookup(
#     lookup='id',
#     from_=alt.LookupData(dat, 'id', list(dat.columns))
# ).project(
#     type='albersUsa'
# ).properties(
#     width=500,
#     height=300
# )

In [280]:
def interactive_chloropleth(df, metric_name, start_year, end_year, scale_domain):
    
    dat = df.loc[df['year'] > start_year,:]

    dat = dat.pivot(index='id', columns='year', values=metric_name).reset_index()
    dat.rename(columns = {x:y for x,y in zip(dat.columns, [str(z) for z in dat.columns]) }, inplace=True)
    dat_columns = [str(year) for year in range(start_year, end_year)]


    slider = alt.binding_range(min=start_year, max=end_year, step=1)
    select_year = alt.selection_single(name="year", fields=['year'],
                                       bind=slider, init={'year': int((start_year + end_year)/2)})

    return alt.Chart(states).mark_geoshape(
        stroke='black',
        strokeWidth=0.05
    ).project(
        type='albersUsa'
    ).transform_lookup(
        lookup='id',
        from_=alt.LookupData(dat, 'id', dat_columns)
    ).transform_fold(
        dat_columns, as_=['year', metric_name]
    ).transform_calculate(
        year='parseInt(datum.year)'
    ).encode(
        alt.Color('{}:Q'.format(metric_name), scale=alt.Scale(scheme='blues', domain=scale_domain))
    ).add_selection(
        select_year
    ).properties(
        width=700,
        height=400
    ).transform_filter(
        select_year
    )

In [281]:
interactive_chloropleth(all_dfs[0], all_df_names[0], 1990, 2021, [0,10])

In [282]:
interactive_chloropleth(all_dfs[1], all_df_names[1], 1990, 2020, [30,100])

In [283]:
interactive_chloropleth(all_dfs[2], all_df_names[2], 1980, 2020, [0,90])

In [284]:
interactive_chloropleth(all_dfs[3], all_df_names[3], 1990, 2021, [-10,10])

In [285]:
# alt.Chart(states).mark_geoshape(stroke='black').project(
#     type='albersUsa'
# ).encode(
#     alt.Color(alt.repeat('repeat'), type='nominal')
# ).transform_lookup(
#     lookup='id',
#     from_=alt.LookupData(regions, 'id', list(regions.columns))
# ).properties(
#     width=150,
#     height=150
# ).repeat(
#     repeat=all_region_names, # Repeated plots for each region name column
#     columns=4
# ).resolve_scale(
#     color='independent'
# )

In [286]:
# def sub_region_plot(region_name):
#     sub_regions = regions.loc[regions['name'] == region_name, :]

#     chart = alt.Chart(states).mark_geoshape(stroke='black').encode(
#         alt.Color('name:N', scale=alt.Scale(scheme='tableau20'))
#     ).transform_lookup(
#         lookup='id',
#         from_=alt.LookupData(sub_regions, 'id', list(regions.columns))
#     ).properties(
#         width=500,
#         height=400
#     ).project(
#         type='albersUsa'
#     )
    
#     return chart

# region_name = 'Central Climate Region'    
# chartList = [sub_region_plot(region) for region in all_region_names]

## Disaster Frequency Viz

Let's start by looking at the trends in the US in general. Let's plot the total number of billion dollar disasters each year across the entire united states. We'll also fit a curve to the data, and extrapolate it to the next few years to get a sense of what we can expect

In [287]:
data = usa_data
disaster = 'all-disasters'
metric = 'count'

# Select out rows from dataset for the disaster
all_disaster_data = data.loc[data['disaster'] == disaster, :].reset_index(drop=True)

# Get metric data as our targets
regression_data = data.loc[data['disaster'] == disaster,metric].reset_index(drop=True)

# Define a fit function for a given exponent n
def fit_func(n): 
    return lambda x,a,b: a + b*x**n

# Set the list of exponents to try
exponents = {'x^2': 2,'x^3': 3,'x^4': 4,'x^5': 5}
colors = ['red','blue','green','orange']
num_future_yrs = 15

preds = np.zeros((len(exponents),len(regression_data) + num_future_yrs-1))
rmse_errs = np.zeros(len(exponents))

for idx,exp_name in enumerate(exponents.keys()):
    fn = fit_func(exponents[exp_name])
    params, cov = curve_fit(fn, np.arange(1,len(regression_data)+1), regression_data)
    preds[idx,:] = fn(np.arange(1,len(regression_data)+num_future_yrs), params[0], params[1])
    rmse_errs[idx] = np.sqrt(np.sum((preds[idx, 0:len(regression_data)] - regression_data)**2))

# Collect the predictions    
pred_df = pd.DataFrame(preds).T
pred_df.columns = exponents.keys()
pred_df['year'] = np.arange(1980,2020+num_future_yrs)
pred_df = pred_df.reset_index(drop=True).melt('year') # Turn columns to rows


# Plot the raw data for disasters as points
chart = alt.Chart(all_disaster_data).mark_point().encode(
    alt.X('year:N', axis=alt.Axis(title='Years', values=np.arange(1980,2020+num_future_yrs,5), labelAngle=0)),
    alt.Y('{}'.format(metric), axis=alt.Axis(title='Number of Disasters'))
).properties(
    width=850,
    height=450
)

# Plot the fitted curves
fit_charts = alt.Chart(pred_df).mark_line().encode(
    alt.X('year:N'),
    alt.Y('value:Q'),
    color='variable:N',
    opacity = alt.condition(alt.datum.variable == 'x^3', alt.value(1.0), alt.value(0.3))
)

# Add label texts for fit curves
annotations = pd.DataFrame({'x': [2034]*4, 'y': [73,55,40,30], 'text': ['y=x^5', 
                                                                        'y=x^4',
                                                                        'y=x^3',
                                                                        'y=x^2']})
fit_label_chart = alt.Chart(annotations).mark_text(size=10).encode(
    x = alt.X('x:N'),
    y = alt.Y('y:Q'),
    text = 'text',
)

# Shade the Observed and Predicted Regions
cutoffs = pd.DataFrame({
    'start': [1979,2021],
    'stop': [2021,2021+num_future_yrs]
})
shade_chart = alt.Chart(
    cutoffs.reset_index()
).mark_rect(
    opacity=0.2
).encode(
    x='start:N',
    x2='stop:N',
    y=alt.value(0),
    y2=alt.value(450),
    color=alt.Color('index:N', legend=None)
)

# Add text for 'observed' and 'predicted'
annotations = pd.DataFrame({'x': [1984,2025], 'y': [70,70], 'text': ['Observed', 'Predicted']})
text_chart = alt.Chart(annotations).mark_text(size=20).encode(
    x = alt.X('x:N'),
    y = alt.Y('y:Q'),
    text = 'text',
)

# Extra comment annotations
annotations = pd.DataFrame({'x': [2020], 'y': [25], 'text': ['Record number of 22 disasters in 2020!']})
comments_chart = alt.Chart(annotations).mark_text(size=12).encode(
    x = alt.X('x:N'),
    y = alt.Y('y:Q'),
    text = 'text',
)


chart + fit_charts + shade_chart + text_chart + fit_label_chart + comments_chart

We see that there is clearly an increase in the number of annual billion-dollar disasters over the years. In 2020, we saw a record number of 22 billion dollar disasters.

We've also drawn up some projections based on different polynomial fits to the data. The cubic polynomial fit the data the best among all the other options tried. If this trend were true, we'd need to brace ourselves for around 35 annual disasters.

Let's now take a look at how these disasters break down into different categories. Here are all the different types of disasters in our dataset

In [288]:
# chart = alt.Chart(usa_data).mark_bar().transform_filter(
#     alt.datum.disaster != 'all-disasters'
# ).encode(
#     x = alt.X('year:N', axis=alt.Axis(title='Years', values=np.arange(1980,2030,5), labelAngle=0)),
#     y = alt.Y('count:Q'),
#     color = alt.Color('disaster:N'),
#     order= alt.Order(
#       'count',
#       sort='ascending'
#     )
# )


# # Shade the Observed and Predicted Regions
# cutoffs = pd.DataFrame({
#     'start': [1980,2007],
#     'stop': [2007, 2021]
# })
# shade_chart = alt.Chart(
#     cutoffs.reset_index()
# ).mark_rect(
#     opacity=0.2
# ).encode(
#     x='start:N',
#     x2='stop:N',
#     y=alt.value(0),
#     y2=alt.value(50),
#     color=alt.Color('index:N', legend=None)
# )

# # Add comment annotations
# annotations = pd.DataFrame({'x': [2010, 2007], 'y': [20, 21], 'text': ['Severe storms begin driving the increase in annual disasters',
#                                                                       '2007']})
# comments_chart = alt.Chart(annotations).mark_text(size=12).encode(
#     x = alt.X('x:N'),
#     y = alt.Y('y:Q'),
#     text = 'text',
# )

# chart 
# # + comments_chart + shade_chart

In the above plot, we have sorted each of the stacks by the number of times that disaster occured in the given year i.e. the most commonly occuring disaster is at the top of the stack. 

In the last two decades, the bulk of the billion-dollar disasters have been severe storms. The annual number of severe-storms have increased substatially in the last two decades, starting sometime around 2007. In fact, it appears that the increase in billion-dollar disasters is being driven mostly by severe storms.

The above plot clearly shows how severe-storms have dominated the billion-dollar disasters in the past decade. However, it is a bit difficult to get a sense of how the frequency of different disasters has been changing over time. This is a bit more visceral in the following plot

In [289]:
alt.Chart(usa_data).mark_circle().transform_filter(
    alt.datum.disaster != 'all-disasters'
).encode(
    x = alt.X('year:N', axis=alt.Axis(title='Years', values=np.arange(1980,2030,5), labelAngle=0)),
    y = alt.Y('disaster:N'),
    size = 'count'
).properties(
    width = 800,
    height=400
)

# TODO: Add another circle plot at the bottom for all-disasters

We now see some interesting patterns emerging, other than how severe-storms have been increasing in frequency. 

First, we see that wildfires have been becoming a more regular occurence in the past two decades. Prior to 2000, it was common to have many years without any wildfires. We did not have any from 1980 to 1990. Starting 2000, there are fewer spells of years without any wildfires at all. 

Freezes and winter-storms seem to have become less frequent in the last two decades. However, both floods and droughts, disasters that are at two extremes, seem to both be occuring more frequently in the last decade.


This figure suggests that the last two decades have seen some marked changes. Let's plot the same frequency data, but using the average from 1980-1999 as a baseline

In [319]:
cutoff_year = 1999

# Get baseline averages
baseline_data = usa_data.copy().loc[usa_data['year'] < cutoff_year, ['count','disaster']].reset_index(drop=True)
baseline_data = baseline_data.groupby(['disaster']).mean().reset_index()
baseline_data = baseline_data.rename(columns={'count':'avg_count'})

# Add average info and diff from avg to our data
usa_data_avg = usa_data.merge(baseline_data, left_on='disaster',right_on='disaster')
usa_data_avg['avg_diff'] = usa_data_avg['count'] - usa_data_avg['avg_count']

alt.Chart(usa_data_avg).mark_bar().encode(
    x = alt.X('year:N', axis=alt.Axis(title='Years', values=np.arange(1980,2030,5), labelAngle=0)),
    y = alt.Y('avg_diff:Q'),
    color=alt.condition(
        alt.datum.avg_diff > 0,
        alt.value("red"),  # The positive color
        alt.value("green")  # The negative color
)).transform_filter(
    alt.datum.year > cutoff_year
).facet(
    row = 'disaster:N'
).transform_filter(
    alt.datum.disaster != 'all-disasters'
)

In [320]:
# # Get baseline averages
# target_data = usa_data.copy().loc[usa_data['year'] > cutoff_year, ['count','disaster']].reset_index(drop=True)
# target_data = target_data.groupby(['disaster']).mean().reset_index()
# target_data = target_data.rename(columns={'count':'avg_count'})


# diff_data = pd.DataFrame()
# diff_data['Average between 1980-1999']= baseline_data['avg_count']
# diff_data['Average between 2000-2020']= target_data['avg_count']
# diff_data['disaster'] = baseline_data['disaster']
# diff_data = diff_data.melt('disaster').sort_values(['disaster','variable'], ascending=[True,False]).reset_index(drop=True)

# pos_slope = diff_data.groupby('disaster')['value'].diff().dropna().reset_index(drop=True)
# disaster_names = diff_data['disaster'].drop_duplicates().reset_index(drop=True)
# slope_dict = {x:y for (x,y) in zip(disaster_names, pos_slope)}

# diff_data['pos_slope'] = diff_data['disaster'].map(slope_dict)

# chart = alt.Chart(diff_data).mark_line(point=True).encode(
#     x = alt.X('variable:N', 
#               sort=alt.EncodingSortField(field='variable', order='descending'),
#              axis=alt.Axis(title='Change over the Decades', labelAngle=0)),
#     y = alt.Y('value'),
#     size = 'disaster:N',
#     color = alt.condition(alt.datum.pos_slope > 0, alt.value("red"), alt.value("green"))
# ).properties(
#     width = 400
# ).transform_filter(
#     alt.datum.disaster != 'all-disasters'
# ).properties(
#     width = 600
# )

# # Extra comment annotations
# # annotations = pd.DataFrame({'x': ['Average between 2000-2020']*7, 
# #                             'y': [1,2,3,5.5,5,6,7], 
# #                             'text': disaster_names[1:]})
# # comments_chart = alt.Chart(annotations).mark_text(size=12).encode(
# #     x = alt.X('x:N'),
# #     y = alt.Y('y:Q'),
# #     text = 'text',
# # )


# chart

Apart from winter-storms, we've been getting more frequent disasters across all categories compared to the average of the prior 2-decade period (1980-1999)

Let us now look at the human deaths due to these disasters

In [292]:
chart = alt.Chart(usa_data).mark_bar().encode(
    x = alt.X('year:N', axis=alt.Axis(title='Years', values=np.arange(1980,2030,5), labelAngle=0)),
    y = alt.Y('deaths'),
    color='disaster',
    order =  alt.Order(
      'deaths:N',
      sort='ascending'
    )
).transform_filter(
    alt.datum.disaster != 'all-disasters'
)

# Add comment annotations
# Extra comment annotations
annotations = pd.DataFrame({'x': [2005, 2017, 1990, 2011], 
                            'y': [2100, 3400, 1800, 1000], 
                            'text': ['Hurricane Katrina?', 
                                     'Hurricane Harvey?',
                                     '1980-2000: Droughts caused the most fatalities',
                                     '2000-2020: Droughts not as life-threatening']})
comments_chart = alt.Chart(annotations).mark_text(size=12).encode(
    x = alt.X('x:N'),
    y = alt.Y('y:Q'),
    text = 'text',
)

chart + comments_chart

We notice two main things here. 
1) Fortunately, there does not seem to be sustained increase in the number of deaths over the years i.e. the billion-dollar disasters have not been becoming more and more deadly over the years. 
2) There were a few devastating tragedies that have caused human deaths. From 1980-2000, the most deadly of such events tended to be droughts. In the last two decades, we do not see as much of loss of life due to droughts. In general, there seem to have been fewer deaths overall. The main killer in the past 20 decades has instead been tropical cyclones.

Let us now look at the economic toll these billion-dollar disasters have had over the years

In [321]:
chart = alt.Chart(usa_data).mark_bar().encode(
    x = alt.X('year:N', axis=alt.Axis(title='Years', values=np.arange(1980,2030,5), labelAngle=0)),
    y = alt.Y('cost'),
    color='disaster',
    order =  alt.Order(
      'cost:N',
      sort='ascending'
    )
).transform_filter(
    alt.datum.disaster != 'all-disasters'
)

# Extra comment annotations
annotations = pd.DataFrame({'x': [2005, 2012, 2017], 
                            'y': [250, 150, 340], 
                            'text': ['Hurricane Katrina?', 'Hurricane Sandy?', 'Hurricane Harvey?']})
comments_chart = alt.Chart(annotations).mark_text(size=12).encode(
    x = alt.X('x:N'),
    y = alt.Y('y:Q'),
    text = 'text',
)

chart + comments_chart

We see that while severe-storms were driving the trend in terms of frequency of disasters, tropical-cyclones dominate in terms of their economic impact. In the earlier plot, we saw that tropical cyclones in 2005 and 2017 were outliers in their fatalities. However, we see that it is quite common for thee tropical storms to cause massive economic damage, particularly in the last two decades.

Let's see how the data looks like if we removed the effect of tropical-cyclones

In [322]:
alt.Chart(usa_data).mark_bar(point=True).encode(
    x = alt.X('year:N', axis=alt.Axis(title='Years', values=np.arange(1980,2030,5), labelAngle=0)),
    y = alt.Y('cost'),
    color='disaster',
    order =  alt.Order(
      'cost:N',
      sort='ascending'
    )
).transform_filter(
    ((alt.datum.disaster != 'all-disasters') & (alt.datum.disaster != 'tropical-cyclone'))
)

A few things to note here:
1) We see that droughts continue to cause significant economic damage, even though they do not cause as much death. 
2) Once we remove tropical-cyclones, we see severe-storms dominating the economic damages in the last two decades. One thing to note is that the various disasters are not completely independent.
3) We see that in the last three years (2018-2020), wildfires have been wreaking heavy economic damage.

Question: Are deaths and economic impact related?

In [299]:
alt.Chart(usa_data).mark_point().encode(
    alt.X('deaths'),
    alt.Y('cost')
).transform_filter(
    alt.datum.disaster == 'all-disasters'
)

In [316]:
alt.Chart(usa_data).mark_circle(size=100).encode(
    alt.X('deaths:Q'),
    alt.Y('cost:Q'),
    alt.Color('disaster:N')
).transform_filter(
    alt.datum.disaster != 'all-disasters'
).transform_filter(
    alt.datum.cost < 200
).properties(
    width = 600,
    height=500
)

In [314]:
alt.Chart(usa_data).mark_circle(size=100).encode(
    alt.X('deaths:Q'),
    alt.Y('cost:Q'),
    alt.Color('disaster:N')
).transform_filter(
    alt.datum.disaster != 'all-disasters'
).transform_filter(
    alt.datum.cost < 200
).transform_filter(
    alt.datum.deaths < 400
).properties(
    width = 600,
    height=500
)

### Split up By Regions

We have thus far focused on the aggregate statistics of the US. Let us now try to drill things down and focus on region specific data

Here, we'll mostly be working with count data

In [318]:
data.head()

Unnamed: 0,year,count,cost,deaths,lower75,upper75,lower90,upper90,lower95,upper95,countRank,costRank,deathsRank,disaster
0,1980,1.0,33.9,1260.0,26.9,26.9,25.0,25.0,23.9,23.9,1.0,3.0,1.0,drought
1,1981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,42.0,42.0,drought
2,1982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,42.0,42.0,drought
3,1983,1.0,8.0,0.0,5.6,5.6,5.1,5.1,4.7,4.7,1.0,8.0,42.0,drought
4,1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,42.0,42.0,drought


In [317]:
data = data.loc[data['region'].isin(reduced_regions), :]

KeyError: 'region'

In [79]:
# Print out all the disasters
disasters = data['disaster'].unique()
# data = data.loc[data['region'].isin(all_regions),:]
print(disasters)

['drought' 'flooding' 'freeze' 'severe-storm' 'tropical-cyclone'
 'wildfire' 'winter-storm' 'all-disasters']


We see that there are three disaster categories that fall under some form of storms. Tropical cyclones are revolving storms that begin in the tropics. Winter storms are storms in the winter. Let's just categorize all of these under the single term 'severe-storms' to prevent potential double counting.

In [80]:
data = data.replace('tropical-cyclone', 'severe-storm')
data = data.replace('winter-storm', 'severe-storm')

##### Sanity check:
We see that there is an 'all-disasters' here. Are all the disasters exclusive? Let's check if the sum of counts over all categories sum to the total disasters

In [81]:
alt.layer(
    alt.Chart(data).mark_bar().transform_filter(
        (alt.datum.disaster != 'all-disasters')
    ).encode(
        x = 'region:N',
        y = 'sum(count):Q'
    ),
    alt.Chart(data).mark_circle(color="#FFAA00",fillOpacity=1.0,size=100).transform_filter(
        (alt.datum.disaster == 'all-disasters')
    ).encode(
        x = 'region:N',
        y = 'sum(count):Q'
    )
).properties(
    width=500,
    height=250
    )

Ok. Looks like we have all the categories that also that they are exclusive -- otherwise, the sum over individual disasters should have been higher than 'all-disasters'

In [82]:
data.loc[((data['disaster'] == 'all-disasters') & (data['year'] == 2020)),
         :]

Unnamed: 0,cost,costRank,count,countRank,deaths,deathsRank,disaster,region,year
327,10.4,6.0,12.0,2.0,124.0,18.0,all-disasters,CCR,2020
655,11.8,3.0,5.0,3.0,59.0,13.0,all-disasters,ENCCR,2020
983,5.3,3.0,7.0,1.0,65.0,19.0,all-disasters,NECR,2020
1311,2.8,1.0,2.0,2.0,91.0,7.0,all-disasters,NWCR,2020
1639,33.9,4.0,18.0,1.0,184.0,13.0,all-disasters,SCR,2020
1967,15.4,8.0,14.0,1.0,125.0,21.0,all-disasters,SECR,2020
2295,2.1,6.0,2.0,10.0,91.0,10.0,all-disasters,SWCR,2020
2623,12.5,3.0,2.0,4.0,91.0,6.0,all-disasters,WCR,2020
2951,1.6,16.0,5.0,1.0,130.0,6.0,all-disasters,WNCCR,2020


### Visualizing Total Disaster Counts

First, let's take a look at whether there has been an increase in the number of annual billion-dollar disasters

In [90]:
data.groupby('disaster')['count'].sum()

disaster
all-disasters    913.0
drought          150.0
flooding          70.0
freeze            22.0
severe-storm     596.0
wildfire          75.0
Name: count, dtype: float64

In [110]:
# Plotting the total number of disasters across the regions
alt.Chart(data).mark_circle(size=100).transform_filter(
        (alt.datum.disaster == 'all-disasters') & 
        (alt.datum.region != 'CCR')
    ).encode(
    x = 'year:O',
    y = 'sum(count):Q',
    color ='region:N',
).properties(
    width=750,
    height=500
)

In [36]:
#TODO: Fit a curve here. Should show an exponential behavior

We see that severe-storms have consistently accounted for most of the billion-dollar disasters in the US. More worrying is that the frequency of such severe storms has been increasing consistently. The least frequent of these billion-dollar disasters have been `freeze` events.

We also see that there were essentially no billion-dollar wildfires upto 1990. Starting in 1999, economically devastating wildfires have been increasing in frequency. Since 2006, it has been fairly common to see about 5 billion-dollar wildires every year

The above graph gives us a sense of how the total number of annual disasters has been increasing over the years. Let's see the same data in a different visualization, that gives us a more visceral sense of change in frequence of disasters over the years

In [37]:
alt.Chart(data).mark_circle().transform_filter(
        (alt.datum.disaster != 'all-disasters') & (alt.datum.region != 'SCR')
    ).encode(
    x = 'year:O',
    size = 'sum(count):Q',
    y ='disaster:N',
).properties(
    width=800,
    height=500
)

In [38]:
#TODO: Focus on what has happened in the last two decades. Take 1980-2000 to be the average.
# Then plot things in relation to this average

We see that with the exception of freezing, we've been seeing all other forms of billion-dollar disasters more frequently. 

In [39]:
# TODO: Combine the plot above with the plot below. Allow user to select region, and see frequency of different disasters
# Also allow picking the type of disaster (versus all disasters)

In [40]:
# TODO: Sort by total number of disasters

In [41]:
# Plotting the total number of disasters across the regions
alt.Chart(data).mark_circle().transform_filter(
        alt.datum.disaster != 'all-disasters'
    ).encode(
    x = 'year:O',
    size = 'sum(count):Q',
    y ='region:N',
    color='region:N'
).properties(
    width=800,
    height=500
)

Clearly, 'severe storms' account for most of the billion-dollar natural disasters in the US. Also, we see that there has been a big increase in the number of severe storms. We also see an increase inflooding. 

In [330]:
data

Unnamed: 0,year,count,cost,deaths,lower75,upper75,lower90,upper90,lower95,upper95,countRank,costRank,deathsRank,disaster
0,1980,1.0,33.9,1260.0,26.9,26.9,25.0,25.0,23.9,23.9,1.0,3.0,1.0,drought
1,1981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,42.0,42.0,drought
2,1982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,42.0,42.0,drought
3,1983,1.0,8.0,0.0,5.6,5.6,5.1,5.1,4.7,4.7,1.0,8.0,42.0,drought
4,1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,42.0,42.0,drought
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,2016,15.0,51.5,138.0,41.4,41.4,39.2,39.2,37.8,37.8,4.0,11.0,24.0,all-disasters
324,2017,16.0,327.8,3278.0,266.7,266.7,242.6,242.6,228.1,228.1,2.0,1.0,1.0,all-disasters
325,2018,14.0,94.7,247.0,78.3,78.3,72.1,72.1,65.8,65.8,5.0,5.0,15.0,all-disasters
326,2019,14.0,46.1,44.0,37.0,37.0,34.9,34.9,32.5,32.5,5.0,12.0,36.0,all-disasters


In [329]:
alt.Chart(data).mark_circle().transform_filter(
        alt.datum.disaster != 'all-disasters'
    ).encode(
    x = 'year:O',
    y = 'region:N',
    size='sum(count)',
    row = 'disaster'
).properties(
    width=600,
    height=200
    )

In [43]:
#TODO: make this interactive so that you highlight the region when you select rows

In [44]:
# billion-dollar droughts and wildfires became very frequent sometime around the 2000s
# Flooding started becoming frequent around 2006
# 

## Economic Impact of Natural Disasters

Have these billion-dollar disasters been causing more financial damage over the years?

First, let's look at the trends in the total economic cost of all disasters over the years

In [45]:
alt.Chart(data).mark_bar().transform_filter(
        alt.datum.disaster == 'all-disasters'
    ).encode(
    alt.X('year:O'),
#     alt.Size('sum(cost)'),
#     alt.Y('region'),
#     alt.Fill('sum(deaths)'),
    alt.Y('sum(cost)')
).properties(
    width=800,
    height=450
    )

We see that there seems to be a general increase. Since we're only looking at billion-dollar disasters, it doesn't quite make sense to compute a trend line. We're not looking at 'annual cost of disasters' since we're missing many disasters. What we should be taking away from this figure is that it is becoming more frequenty to observse years where we have natural disasters with devastating economic impact. 

From 1980-2000, we didn't really see years with more than ~60 billion in damages. However, as we step into the past two decades, we see we've seen years with ~80 billion in damages every 4 years or so. Particularly, we've seen 3 disasters with costs of >100 billion within the span of 15 years, with a frequency of about 5 years.

## The Human Toll of Billion-Dollar Disasters

Let's compare the trends of deaths over time

In [398]:
alt.Chart(data).mark_bar().transform_filter(
        alt.datum.disaster != 'all-disasters'
    ).encode(
    alt.X('year:O'),
    alt.Y('disaster'),
    alt.Color('sum(deaths)')
).properties(
    width=800,
    height=450
)

Fortunately, we don't seem to be seeing an increasing trend in the number of deaths over the years.
We seem to be seeing a few notable events of droughts and sever storms

#### Droughts
In 1980, we see a major drought that took a huge human life toll. This was the result of a major heat wave. We had similar years with bad droughts in 1989, and in 1995. The year 1988-1989 was marked as one of the worst periods of droughts in North America.

In [393]:
alt.Chart(data).mark_bar().transform_filter(
        alt.datum.disaster == 'drought'
    ).encode(
    alt.X('year:O'),
    alt.Y('region'),
    alt.Color('sum(deaths)')
).properties(
    width=800,
    height=450
)

Overall, the western parts of the US have been less affected by these droughts. It is the central parts of the country that seem to have been hit hard thorugh all of these periods of intense drought.

##### Severe Storms

In [394]:
alt.Chart(data).mark_bar().transform_filter(
        alt.datum.disaster == 'severe-storm'
    ).encode(
    alt.X('year:O'),
    alt.Y('region'),
    alt.Color('sum(deaths)')
).properties(
    width=800,
    height=450
)

The major human death event in 2005 was centered around three regions -- CCR, SCR, and SECR. The same three regions also saw another big human loss of life in 2011-2012. These events correspond to Hurricanes Katrina and Harvey.

In [388]:
# TODO: Add annotations. 
# 2005 was hurricane Katrina in SCT (Louisianna)
# 2012 was hurricane sandy in NECR
# 2017 was probably Hurricane Harvey

In [400]:
data

Unnamed: 0,cost,costRank,count,countRank,deaths,deathsRank,disaster,region,year,death_per_disaster,cost_per_disaster
0,5.0,3.0,1.0,1.0,1260.0,1.0,drought,CCR,1980,1260.000000,5.000000
1,0.0,42.0,0.0,42.0,0.0,42.0,drought,CCR,1981,,
2,0.0,42.0,0.0,42.0,0.0,42.0,drought,CCR,1982,,
3,1.6,8.0,1.0,1.0,0.0,42.0,drought,CCR,1983,0.000000,1.600000
4,0.0,42.0,0.0,42.0,0.0,42.0,drought,CCR,1984,,
...,...,...,...,...,...,...,...,...,...,...,...
2947,1.5,18.0,4.0,5.0,23.0,18.0,all-disasters,WNCCR,2016,5.750000,0.375000
2948,5.1,6.0,5.0,1.0,56.0,11.0,all-disasters,WNCCR,2017,11.200000,1.020000
2949,0.6,19.0,3.0,7.0,106.0,9.0,all-disasters,WNCCR,2018,35.333333,0.200000
2950,6.2,4.0,3.0,7.0,10.0,23.0,all-disasters,WNCCR,2019,3.333333,2.066667
