In [1]:
import pandas as pd   
import numpy as np   
import plotly.graph_objects as go
import plotly.express as px
import math
from utils import StatesDataFrame, AnimatedBase1
# from utils.AnimatedBase import AnimatedBase1
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation #For clustering

cleaning


In [2]:
df = StatesDataFrame().df


cleaning


In [2]:
url = 'https://covidtracking.com/api/v1/states/daily.csv'

In [3]:
df = pd.read_csv(url, parse_dates=['date']).sort_index()

In [6]:
df.columns

Index(['date', 'state', 'positive', 'negative', 'pending',
       'hospitalizedCurrently', 'hospitalizedCumulative', 'inIcuCurrently',
       'inIcuCumulative', 'onVentilatorCurrently', 'onVentilatorCumulative',
       'recovered', 'dataQualityGrade', 'lastUpdateEt', 'dateModified',
       'checkTimeEt', 'death', 'hospitalized', 'dateChecked', 'fips',
       'positiveIncrease', 'negativeIncrease', 'total', 'totalTestResults',
       'totalTestResultsIncrease', 'posNeg', 'deathIncrease',
       'hospitalizedIncrease', 'hash', 'commercialScore',
       'negativeRegularScore', 'negativeScore', 'positiveScore', 'score',
       'grade'],
      dtype='object')

In [21]:
df = df.fillna(0)

In [22]:
df = df[df['date'] == max(df['date'])]

In [23]:
df.drop(columns=['index', 'date', 'fips'], inplace=True)

In [24]:
df = df.set_index('state')

In [25]:
df.loc['AK'].to_numpy().shape

(20,)

In [3]:
def cosine_sim(state):
    df = StatesDataFrame().df
    df = df.fillna(0)
    df = df[df['date'] == max(df['date'])]
    df.drop(columns=['index', 'date', 'fips'], inplace=True)
    df = df[['state','new positive (per capita)', 'tests last week (per capita)', 
    'testing rate of change (last 7 days average)', 'positive case pct rate of change (last 7 days average)',
    'positive cases rate of change (last 7 days average)']]
    df = df.set_index('state')
    sim_list = []
    states = df.index.to_numpy()
    a = df.loc[state].fillna(0).to_numpy()
    for stateB in states:
        b = df.loc[stateB].fillna(0).to_numpy()
        try:
            similarity = cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))
            if stateB != state:
                entry = {'state': stateB, 'similarity': similarity[0][0]}
                sim_list.append(entry)
        except:
            print(stateB)
    simDf = pd.DataFrame(sim_list).sort_values(by='similarity', ascending=False)
    return simDf.head(10)

In [28]:
class AnimatedHeatMap(object):
    
    def __init__(self, criteria1, criteria2):
        self.df = StatesDataFrame().df.sort_values(by='date')
        self.dates = self.df.date.unique()
        self.criteria1, self.criteria2 = criteria1, criteria2
        print('prepping and initializing')
        self.prepareDataFrame()
        self.createFrames()
        print('creating frames')
        self.Animate.finalizeSliders()
        
        
    def _KmeansClustering(self, X, nclust=4):
        model = KMeans(nclust)
        model.fit(X)
        clust_labels = model.predict(X)
        return clust_labels

    def prepareDataFrame(self):
        self.df = self.df[['date', 'state', self.criteria1, self.criteria2]]
        fig = go.Figure()
        self.Animate = AnimatedBase1(fig)
        first_date = min(self.df.date)
        
        df0 = self.df[self.df['date'] == first_date]
      
#         df0['clusters'] = self._KmeansClustering(df0.iloc[:, 2:])
        trace0 = go.Heatmap(
        x = df0[criteria1],
        y = df0[criteria2],
        z = df0['clusters'],
        zsmooth = "best",
        colorscale='Viridis'
        )
        trace1 = go.Scatter(
        x = df0[self.criteria1],
        y = df0[self.criteria2],
        mode = 'markers+text',
        text = df0['state'],
        marker=dict(
            color = df0['clusters']
        ),
        textposition = 'top center',
        showlegend=False,
        textfont = {'color': 'white'}
        )  
        self.plots = [trace1]
        self.Animate.setInitial(self.plots)
                
                
    def createFrames(self):
        frames = []
        for date in self.dates[1:]:
                print(date)
                frames_dict = dict(
                    name = str(date).split('T')[0],
                    data = []
                )
                df_i = self.df[self.df['date'] == date]
                try:
                    df_i['clusters'] = self._KmeansClustering(df_i.iloc[:, 2:])
#                     trace0 = go.Heatmap(
#                     x = df_i[self.criteria1],
#                     y = df_i[self.criteria2],
#                     z = df_i['clusters'],
#                     zsmooth = "best",
#                     colorscale='Viridis'
#                     )
                    trace1 = go.Scatter(
                    x = df_i[self.criteria1],
                    y = df_i[self.criteria2],
                    mode = 'markers+text',
                    text = df_i['state'],
                    marker=dict(
                        color = df_i['clusters']
                    ),
                    textposition = 'top center',
                    showlegend=False,
                    textfont = {'color': 'white'}
                    )  
#                     frames_dict['data'].append(trace0)
                    frames_dict['data'].append(trace1)
                except:
                    trace1 = go.Scatter(
                    x = df_i[self.criteria1],
                    y = df_i[self.criteria2],
                    mode = 'markers+text',
                    text = df_i['state'],
#                     marker=dict(
#                         color = df_i['clusters']
#                     ),
                    textposition = 'top center',
                    showlegend=False,
                    textfont = {'color': 'white'}
                    )  
#                     frames_dict['data'].append(trace0)
                    frames_dict['data'].append(trace1)
                    
                self.Animate.addSliderStep(str(date).split('T')[0])
                frames.append(frames_dict)
        self.Animate.fig.update(frames=frames)
            

In [38]:
def KmeansClustering(X, nclust=4):
    model = KMeans(nclust)
    model.fit(X)
    clust_labels = model.predict(X)
    return clust_labels

In [98]:
def compare_scatter(criteria1, criteria2):
    df = StatesDataFrame().df
    df = df.fillna(0)
#     df = df[df['date'] == max(df['date'])]
    last_date = max(df.date)
#     df.drop(columns=['index', 'fips'], inplace=True)
#     df = df[['date''state','new positive (per capita)', 'tests last week (per capita)', 
#     'testing rate of change (last 7 days average)', 'positive case pct rate of change (last 7 days average)',
#     'positive cases rate of change (last 7 days average)']]
    df = df[['date', 'state', criteria1, criteria2]]
    df0 = df[df['date'] == last_date]
    states = df0.state
    X_forClust = df0.iloc[:, 2:]
    df0['clusters'] = KmeansClustering(X_forClust)
    trace0 = go.Heatmap(
        x = df0[criteria1],
        y = df0[criteria2],
        z = df0['clusters'],
        zsmooth = "best",
        colorscale=['#F5BE05', '#2EC152', '#2E39C1', '#8E2EC1'],
        opacity=0.65,
        colorbar = {'tickvals': [0, 1, 2, 3]}
        )
    trace1 = go.Scatter(
    x = df0[criteria1],
    y = df0[criteria2],
    mode = 'markers+text',
    text = df0['state'],
    marker=dict(
        color = df0['clusters']
    ),
    textposition = 'top center',
    showlegend=False,
    textfont = {'color': 'white'}
    )
    fig = go.Figure()
    fig.add_trace(trace0)
    fig.add_trace(trace1)
    fig.update_layout(
        title='{} vs {}'.format(criteria1, criteria2),
        xaxis_title = criteria1,
        yaxis_title = criteria2
    )
    return fig.show()

In [78]:
heatmap = AnimatedHeatMap('new positive (per capita)', 'testing rate of change (last 7 days average)')

cleaning
prepping and initializing
2020-01-23T00:00:00.000000000
2020-01-24T00:00:00.000000000
2020-01-25T00:00:00.000000000
2020-01-26T00:00:00.000000000
2020-01-27T00:00:00.000000000
2020-01-28T00:00:00.000000000
2020-01-29T00:00:00.000000000
2020-01-30T00:00:00.000000000
2020-01-31T00:00:00.000000000
2020-02-01T00:00:00.000000000
2020-02-02T00:00:00.000000000
2020-02-03T00:00:00.000000000
2020-02-04T00:00:00.000000000
2020-02-05T00:00:00.000000000
2020-02-06T00:00:00.000000000
2020-02-07T00:00:00.000000000
2020-02-08T00:00:00.000000000
2020-02-09T00:00:00.000000000
2020-02-10T00:00:00.000000000
2020-02-11T00:00:00.000000000
2020-02-12T00:00:00.000000000
2020-02-13T00:00:00.000000000
2020-02-14T00:00:00.000000000
2020-02-15T00:00:00.000000000
2020-02-16T00:00:00.000000000
2020-02-17T00:00:00.000000000
2020-02-18T00:00:00.000000000
2020-02-19T00:00:00.000000000
2020-02-20T00:00:00.000000000
2020-02-21T00:00:00.000000000
2020-02-22T00:00:00.000000000
2020-02-23T00:00:00.000000000
2020-



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2020-03-08T00:00:00.000000000
2020-03-09T00:00:00.000000000
2020-03-10T00:00:00.000000000
2020-03-11T00:00:00.000000000
2020-03-12T00:00:00.000000000
2020-03-13T00:00:00.000000000
2020-03-14T00:00:00.000000000
2020-03-15T00:00:00.000000000
2020-03-16T00:00:00.000000000
2020-03-17T00:00:00.000000000
2020-03-18T00:00:00.000000000
2020-03-19T00:00:00.000000000
2020-03-20T00:00:00.000000000
2020-03-21T00:00:00.000000000
2020-03-22T00:00:00.000000000
2020-03-23T00:00:00.000000000
2020-03-24T00:00:00.000000000
2020-03-25T00:00:00.000000000
2020-03-26T00:00:00.000000000
2020-03-27T00:00:00.000000000
2020-03-28T00:00:00.000000000
2020-03-29T00:00:00.000000000
2020-03-30T00:00:00.000000000
2020-03-31T00:00:00.000000000
2020-04-01T00:00:00.000000000
2020-04-02T00:00:00.000000000
2020-04-03T00:00:00.000000000
2020-04-04T00:00:00.000000000
2020-04-05T00:00:00.000000000
2020-04-06T00:00:00.000000000
2020-04-07T00:00:00.000000000
2020-04-08T00:00:00.000000000
2020-04-09T00:00:00.000000000
2020-04-10

In [31]:
heatmap.Animate.fig.show()

In [99]:
compare_scatter('new positive (per capita)', 'testing rate of change (last 7 days average)')

cleaning




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [88]:
compare_scatter('new positive (per capita)', 'testing rate of change (last 7 days average)')

cleaning


In [15]:
df = cosine_sim('HI').reset_index().drop(columns=['index']).reset_index()

cleaning


In [16]:
df['index'] = df['index'] +1

In [17]:
plt.scatter(d)

Unnamed: 0,index,state,similarity
0,1,CA,0.863466
1,2,KY,0.766549
2,3,GA,0.71148
3,4,WA,0.686438
4,5,OK,0.68613
5,6,SC,0.685984
6,7,OR,0.685063
7,8,ND,0.684173
8,9,LA,0.683528
9,10,MO,0.682864


In [32]:
df

Unnamed: 0,index,date,state,fips,positive,negative,death,hospitalizedCumulative,onVentilatorCumulative,new positive cases,...,tests last week,tests last week (per capita),testing rate of change,testing rate of change (last 7 days average),positive case pct,positive case pct (last 7 days average),positive case pct rate of change,positive case pct rate of change (last 7 days average),positive cases rate of change,positive cases rate of change (last 7 days average)
4393,5224,2020-01-22,WA,53,1.0,,,,,0.0,...,0.0,0.000000,0.000000,0.000000,,0.000000,,0.000000,,0.000000
4394,5223,2020-01-23,WA,53,1.0,,,,,0.0,...,0.0,0.000000,0.000000,0.000000,,0.000000,,0.000000,,0.000000
4395,5222,2020-01-24,WA,53,1.0,,,,,0.0,...,0.0,0.000000,0.000000,0.000000,,0.000000,,0.000000,,0.000000
4396,5221,2020-01-25,WA,53,1.0,,,,,0.0,...,0.0,0.000000,0.000000,0.000000,,0.000000,,0.000000,,0.000000
4397,5220,2020-01-26,WA,53,1.0,,,,,0.0,...,0.0,0.000000,0.000000,0.000000,,0.000000,,0.000000,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4392,51,2020-06-06,VT,50,1046.0,39037.0,55.0,,,19.0,...,7416.0,0.011841,0.000294,0.000496,0.009304,0.005584,3.401355,-2.742126,0.894737,-0.098371
4529,52,2020-06-06,WA,53,22993.0,367870.0,1149.0,3639.0,,264.0,...,42630.0,0.005657,0.000283,0.000052,0.045086,0.050660,-14.115090,-15.848946,-0.162879,0.048871
4624,53,2020-06-06,WI,55,20571.0,314802.0,645.0,2832.0,,322.0,...,74235.0,0.012769,0.000335,0.000179,0.031535,0.040993,-9.685223,9.531618,-0.624224,-0.601321
4717,54,2020-06-06,WV,54,2131.0,107592.0,84.0,,,12.0,...,13833.0,0.007660,-0.000724,0.000133,0.011350,0.014853,43.688906,-32.945304,-0.916667,-0.788095
