## Aim: Exploring the temporal and topographic pattern of Covid19 confirmed cases in the US
### 1. How do the case numbers fluctuate across the states in the US?
### 2. How are the states related to each other in terms of their case fluctuation?

## Methods:
1. Calculate the covariation of the case fluctuations among all the states from mid-February to the present (pairwise Pearson correlation matrix)
2. Perform clustering on the covariation pattern to group the states into clusters

## Results
### There are several temporal patterns across the Covid19 case fluctuation in the US (50 states + D.C.), e.g.,
    * NY and neighbouring states saw an early steep rise and have been okay since (e.g., NY, CT)
    * The South saw a steep rise during the summer (e.g., AZ, FL)
    * Mid-atlantic has been stably high to moderate except a short period in June (e.g., MD)
    * Of course it has been high everywhere recently...
    * Hawaii is by its own, with high rise around August and hasn't been affected too much by the recent surge.
    
## Conclusion
* States may be grouped by geography and/or similar socioeconomic status.
* It would be interesting to see how things are going to unfold in the future.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Preparation and having a glance of the overall data

In [None]:
from matplotlib import pyplot as pl
from sklearn import cluster,metrics
from scipy.cluster import hierarchy as hc
from scipy import signal
def flattenMatrix(m):
    """Return the strict upper triangle of a square matrix as a flat vector.

    Values are collected row by row (m[0,1], m[0,2], ..., m[1,2], ...), so the
    diagonal and the lower triangle are skipped.

    Parameters
    ----------
    m : array-like
        Square 2-D matrix (n x n); anything np.asarray accepts.

    Returns
    -------
    numpy.ndarray or None
        Float vector of length n*(n-1)/2, or None (after printing a message)
        when the input is not a square 2-D matrix.
    """
    m = np.asarray(m)
    if m.ndim != 2 or m.shape[0] != m.shape[1]:
        # Wrap the shape in a 1-tuple: '%' would otherwise unpack the shape
        # tuple itself and raise TypeError instead of printing the message.
        print('Input is not a square %s' % (m.shape,))
        return None
    # k=1 starts one step above the diagonal; indices come out in row-major
    # order, matching a nested "for i / for j > i" traversal.
    rows, cols = np.triu_indices(m.shape[0], k=1)
    return m[rows, cols].astype(float)
# Load the daily confirmed-case table for the US.
# The CSV has a two-row column header (header=[0, 1]); the code below treats
# the first header level as the state name. The first CSV column is the index.
df_us_conf = pd.read_csv(
    '/kaggle/input/covid19-data-from-john-hopkins-university/CONVENIENT_us_confirmed_cases.csv',
    header=[0, 1],
    index_col=0,
)

# Distinct (sorted) names found in the first column level.
states = np.unique(df_us_conf.columns.get_level_values(0))
n_states = len(states)

# Row labels are '/'-separated strings: drop the last field and re-join the
# rest with '-' (presumably 'm/d/yy' -> 'm-d'; confirm against the CSV).
dates = df_us_conf.index.map(lambda label: '-'.join(label.split('/')[:-1]))
n_days = len(dates)

# ds[i, t] = value for states[i] on row t, summed over all of its columns.
ds = np.zeros((n_states, n_days))
for ist, st in enumerate(states):
    # To exclude the 'Out of ...' and 'Unassigned' entries one could filter
    # the columns before summing, e.g.:
    #     counties = df_us_conf[st].axes[1]
    #     ctr = (counties.map(lambda x: x.startswith('Out of')))
    #     ctr = np.asarray((ctr.values | (counties=='Unassigned')),dtype=bool)
    #     counties = counties[~ctr]
    state_frame = df_us_conf[st]
    print('%d %s totalN=%d' % (ist, st, state_frame.sum().sum()))
    # Collapse the per-column values into one series per state.
    ds[ist] = state_frame.sum(axis=1).values

In [None]:
# Overview figure: the raw daily series of every state/region on one axis.
pl.figure(figsize=(18, 5))
pl.plot(ds.T)
# Write each name at the peak of its own curve.
for series, st in zip(ds, states):
    pl.text(series.argmax(), series.max(), st)
# One x tick every 30 rows, labelled with the matching date string.
tick_pos = np.arange(0, n_days, 30)
pl.xticks(tick_pos, dates[tick_pos])
pl.show()

In [None]:
# Simple smoothing to mitigate day-to-day variation that has uninteresting
# causes (e.g., weekends).
lmbd = 1  # smoothing coefficient handed to cspline1d
day_idx = np.arange(n_days, dtype=float)  # positions at which to evaluate the spline
ds_sm = np.empty_like(ds)
for ist, st in enumerate(states):
    # cspline1d returns the B-spline *coefficients* of the smoothing spline,
    # not the smoothed samples. Evaluate the spline at the original sample
    # positions to get the actual smoothed series.
    coeffs = signal.cspline1d(ds[ist], lmbd)
    ds_sm[ist] = signal.cspline1d_eval(coeffs, day_idx)
pl.figure(figsize=(18,5))
pl.plot(ds_sm.T)
# Label every curve at its peak (coordinates truncated to integers).
for ist, st in enumerate(states):
    pl.text(int(ds_sm[ist].argmax()), int(ds_sm[ist].max()), st)
pl.xticks(np.arange(0,n_days,15),dates[np.arange(0,n_days,15)])
pl.show()

# 2. Clustering the states based on their pairwise correlation 
* Use Pearson correlation to make the comparison scale invariant, since the numbers are not normalized by population.
* Selecting a time-period by specifying start and end day
* Excluding some states/regions: here I chose to exclude the territories due to potential unreliability

In [None]:
# Start and end date ('month-day') of the period used for the correlation.
date_begin = '2-15'
date_end = '11-18'
# Look the two dates up separately. A single OR-ed mask cannot tell which hit
# is the begin and which is the end, and silently misbehaves when a date is
# missing or occurs more than once.
begin_hits = np.flatnonzero(dates == date_begin)
if begin_hits.size == 0:
    raise ValueError('date_begin %r not found in dates' % date_begin)
end_hits = np.flatnonzero(dates == date_end)
end_hits = end_hits[end_hits > begin_hits[0]]  # first end date after the begin date
if end_hits.size == 0:
    raise ValueError('date_end %r not found after date_begin %r' % (date_end, date_begin))
# Row indices of [begin, end]; kept as a 2-element array for later cells.
time_period = np.array([begin_hits[0], end_hits[0]])
# Regions/states to exclude (territories and cruise ships).
states_exclude = ['American Samoa','Diamond Princess', 'Grand Princess','Northern Mariana Islands','Guam','Virgin Islands','Puerto Rico']
# NOTE: despite its name this mask is True for the states that are KEPT.
lbl_exclude_states = np.asarray([(st not in states_exclude) for st in states])
states_sub = states[lbl_exclude_states]
# "+ 1" so that the end date itself is part of the analysed period.
ds_tmp = ds_sm[lbl_exclude_states, time_period[0]:time_period[1] + 1]
print(ds_tmp.shape)
# Pairwise Pearson correlation between the kept states' time-courses.
cm = np.corrcoef(ds_tmp)
pl.figure(figsize=(5,5))
pl.imshow(cm,cmap='jet')
# Tick count comes from the states actually kept, so it stays right even if
# some name in states_exclude does not occur in the data.
pl.xticks(range(len(states_sub)),states_sub,rotation=90)
pl.yticks(range(len(states_sub)),states_sub)
pl.colorbar()
pl.show()

In [None]:
# Simple hierarchical clustering of the states.
# Correlation r is turned into a dissimilarity 1 - r; entries where the
# correlation is undefined (NaN) get a dissimilarity of 1.
dm = 1 - cm
dm = np.where(np.isnan(dm), 1, dm)
# linkage receives the flattened upper triangle of the dissimilarity matrix.
condensed = flattenMatrix(dm)
Z = hc.linkage(condensed, method='ward')
pl.figure(figsize=(10, 4))
dn = hc.dendrogram(Z, labels=states_sub)
pl.show()

In [None]:
# Cut the dendrogram into flat clusters, asking for n_clusters of them
# ('maxclust' criterion). The loop below reports labels 1..n_clusters.
n_clusters = 6
clustering = hc.fcluster(Z, t=n_clusters, criterion='maxclust')
for i in range(1, n_clusters + 1):
    in_cluster = clustering == i
    # Cluster id and size, followed by the member names.
    print('#%d n=%d' % (i, in_cluster.sum()))
    print(states_sub[in_cluster])

# 3. Visualize the mean time-course of each cluster of states
### Note that the absolute numbers (i.e., y-axis) are not of interest here (thus the values are scaled).
### The focus is the trend along the time of each cluster.

In [None]:
# Legend label per cluster: the name of its first state followed directly by
# the cluster size. Replace with hand-picked labels if preferred.
clusterNames = [
    states_sub[clustering == i][0] + str(sum(clustering == i))
    for i in range(1, 1 + n_clusters)
]

pl.figure(figsize=(18, 5))
kept_series = ds_sm[lbl_exclude_states]  # smoothed series of the non-excluded states
day_axis = range(n_days)
for targCluster, cluster_label in enumerate(clusterNames, start=1):
    subds = kept_series[clustering == targCluster]
    # Scale with the mean/std taken over the whole cluster block (all member
    # states and days together); this is for visualization only.
    subds = (subds - np.nanmean(subds)) / np.nanstd(subds)
    # Across-state mean per day, with the standard error as the error bar.
    cluster_mean = np.mean(subds, 0)
    cluster_sem = np.std(subds, axis=0) / np.sqrt(subds.shape[0])
    pl.errorbar(day_axis, cluster_mean, yerr=cluster_sem,
                linewidth=2, label=cluster_label)

tick_pos = np.arange(0, n_days, 10)
pl.xticks(tick_pos, dates[tick_pos])
# Shade the time period that was used for the clustering.
pl.axvspan(xmin=time_period[0], xmax=time_period[1], color='gray', alpha=.16)
pl.legend()
pl.xlabel('Date')
pl.ylabel('Normalized confirmed case N')
pl.show()