In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import adjustText as aT
import geopandas as gpd
%matplotlib inline

# Explore Data

In [None]:
# Load the statewide case data. Parse `date` while reading so the later
# date-range filters compare timestamps with timestamps instead of comparing
# raw strings against pd.to_datetime(...) values.
df = pd.read_csv("statewide_cases.csv", parse_dates=["date"])

In [None]:
# Sanity check: (number of rows, number of columns) of the loaded data
df.shape

In [None]:
# Number of distinct non-null values in the county column
df['county'].nunique()

In [None]:
# Column names as a plain Python list
list(df.columns)

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Summary statistics, rounded to two decimals
df.describe().round(2)

In [None]:
# Non-null counts per column among rows reporting a negative new-case count
df.loc[df['newcountconfirmed'].lt(0)].count()

In [None]:
# Non-null counts per column among rows reporting a negative new-death count
df.loc[df['newcountdeaths'].lt(0)].count()

In [None]:
# For each county, the number of distinct values found in every other column
df_county = df.groupby('county').nunique()

In [None]:
# Drop rows whose county is a placeholder ("Unassigned" / "Out Of Country")
# rather than a real county name
df = df[~df["county"].isin(['Unassigned', 'Out Of Country'])]

# Merge Datasets and Calculate Indices

In [None]:
# Load the county boundary shapefile
fp = "CA_Counties/CA_Counties_TIGER2016.shp"
map_df = gpd.read_file(fp)
map_df.head()

# Left-join the case rows onto the county shapes (shape NAME == case county),
# so every county shape is kept even if it has no case rows
merged = map_df.merge(df, left_on='NAME', right_on='county', how='left')

# Keep only the county name, geometry, case/death counts and date
merged2 = merged[[
    'NAME', 'geometry',
    'totalcountconfirmed', 'totalcountdeaths',
    'newcountconfirmed', 'newcountdeaths',
    'date',
]]

In [None]:
# Add County population data
county_pop = pd.read_csv('california_county_pop.csv')

# Drop growth rate column (reassigned rather than mutated in place)
county_pop = county_pop.drop(columns='GrowthRate')

# Strip the " County" text from the county names so they can be merged with
# the geography frame's NAME values. The result is assigned back to the
# column: calling replace(..., inplace=True) on a selected column acts on a
# temporary object under pandas Copy-on-Write and leaves county_pop unchanged.
county_pop['CTYNAME'] = county_pop['CTYNAME'].replace(' County', '', regex=True)

In [None]:
# Master frame: geography + case data + county population, matched on county
# name; the population table's name column is dropped once the join is done
merged3 = (
    merged2
    .merge(county_pop, how='left', left_on='NAME', right_on='CTYNAME')
    .drop(columns=['CTYNAME'])
)

In [None]:
# Replace negative daily counts with 0.
# Each count column is clipped on its own: indexing the whole frame with a
# boolean row mask and assigning 0 overwrites every column of the matching
# rows (county name, geometry, date, population), not just the negative count.
for count_col in ('newcountdeaths', 'newcountconfirmed'):
    merged3[count_col] = merged3[count_col].clip(lower=0)

In [None]:
# Per-capita indices: each count column divided by the `pop2018` column
county_population = merged3['pop2018']
merged3['incidence_rate'] = merged3['newcountconfirmed'].div(county_population)
merged3['mortality_rate'] = merged3['newcountdeaths'].div(county_population)
merged3['prevalence'] = merged3['totalcountconfirmed'].div(county_population)

# Subdivide Dataset into Months, Groupby and Calculate Means

In [None]:
# Split dataset into 4 consecutive one-month windows, each [start, end).
# All masks are built from merged3 (the frame being filtered); the previous
# version referenced an undefined `merged4`.
# Converting here makes the comparison valid whether `date` is still stored as
# strings or has already been parsed to timestamps.
record_dates = pd.to_datetime(merged3['date'])
month_edges = pd.to_datetime([
    '2020-03-18', '2020-04-18', '2020-05-18', '2020-06-18', '2020-07-18',
])

month_1, month_2, month_3, month_4 = (
    merged3[(record_dates >= start) & (record_dates < end)]
    for start, end in zip(month_edges[:-1], month_edges[1:])
)

In [None]:
# Columns averaged per county within each month window
MONTHLY_VALUE_COLS = [
    'newcountconfirmed', 'newcountdeaths', 'totalcountconfirmed', 'totalcountdeaths',
    'pop2018', 'incidence_rate', 'mortality_rate', 'prevalence',
]


def _county_monthly_means(month_df, counties):
    """Average one month's rows per county and re-attach the county geometry.

    Parameters
    ----------
    month_df : DataFrame or GeoDataFrame
        Rows for one month window; must contain 'NAME' and every column in
        MONTHLY_VALUE_COLS.
    counties : GeoDataFrame
        County shapes; must contain 'NAME' and 'geometry'.

    Returns
    -------
    GeoDataFrame
        One row per row of `counties`, with columns NAME, geometry and the
        per-county mean of each MONTHLY_VALUE_COLS column. Counties without
        rows in `month_df` get NaN means (left join).
    """
    # Select the value columns before aggregating so that non-numeric columns
    # (geometry, date) never reach .mean().
    means = (
        month_df[['NAME'] + MONTHLY_VALUE_COLS]
        .groupby('NAME', as_index=False)  # keep NAME as a column for the merge
        .mean()
    )
    # The aggregation drops geometry; join back onto the county shapes.
    return counties[['NAME', 'geometry']].merge(means, how='left', on='NAME')


# Per-county monthly means, with geometry restored
month_1_grp = _county_monthly_means(month_1, map_df)
month_2_grp = _county_monthly_means(month_2, map_df)
month_3_grp = _county_monthly_means(month_3, map_df)
month_4_grp = _county_monthly_means(month_4, map_df)


# Create Choropleths

In [None]:
# Add a centroid column to each monthly frame; the centroids serve as anchor
# points for the county labels
for monthly_frame in (month_1_grp, month_2_grp, month_3_grp, month_4_grp):
    monthly_frame['center'] = monthly_frame['geometry'].centroid

# Copies of the monthly frames whose active geometry is the centroid column
month_1_points = month_1_grp.copy().set_geometry('center')
month_2_points = month_2_grp.copy().set_geometry('center')
month_3_points = month_3_grp.copy().set_geometry('center')
month_4_points = month_4_grp.copy().set_geometry('center')

In [None]:
# Month 1 incidence choropleth.
# Counties are coloured by their averaged monthly incidence rate; a county is
# labelled only when its average is at or above the mean of all county averages
# for the month.

# Colour scale limits. Series.min()/max() skip NaN, unlike the builtins.
# NOTE(review): the limits come from the per-row rates in merged3, so the
# monthly averages may occupy only the lower part of the scale - confirm this
# is the intended common scale.
vmin, vmax = merged3['incidence_rate'].min(), merged3['incidence_rate'].max()

# Plot the per-county averages (month_1_grp), not the raw daily rows (month_1)
ax = month_1_grp.plot(column='incidence_rate', figsize=(10, 10), cmap='Reds', linewidth=1.0, edgecolor='0.7', vmin=vmin, vmax=vmax, legend=True, norm=plt.Normalize(vmin=vmin, vmax=vmax))
ax.set_axis_off()
ax.set_title('Average Incidence', fontsize=17)

# Label only counties at or above the month's mean; NaN averages never pass
# the comparison and are therefore left unlabelled
label_threshold = month_1_grp['incidence_rate'].mean()
texts = [
    ax.text(x, y, label, fontsize=8)
    for x, y, label, metric in zip(
        month_1_points.geometry.x,
        month_1_points.geometry.y,
        month_1_points['NAME'],
        month_1_points['incidence_rate'],
    )
    if metric >= label_threshold
]

# Nudge overlapping labels apart, drawing an arrow back to each anchor point
aT.adjust_text(texts, ax=ax, arrowprops=dict(arrowstyle="->", color='black'))