In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import time

## Read data

In [6]:
timestr = time.strftime("%Y_%m_%d")

In [7]:
df = pd.read_csv(r'C:\Users\pspat\Documents\school_related\MSiA\MSiA_423\project\dev_data\covid19_time_series_{}.csv'.format(timestr))

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,Country,CountryCode,Province,City,CityCode,Lat,Lon,Confirmed,Deaths,Recovered,Active,Date
0,0,Afghanistan,AF,,,,33.94,67.71,0,0,0,0,2020-01-22T00:00:00Z
1,1,Afghanistan,AF,,,,33.94,67.71,0,0,0,0,2020-01-23T00:00:00Z
2,2,Afghanistan,AF,,,,33.94,67.71,0,0,0,0,2020-01-24T00:00:00Z
3,3,Afghanistan,AF,,,,33.94,67.71,0,0,0,0,2020-01-25T00:00:00Z
4,4,Afghanistan,AF,,,,33.94,67.71,0,0,0,0,2020-01-26T00:00:00Z


In [11]:
df.describe()

Unnamed: 0.1,Unnamed: 0,CityCode,Lat,Lon,Confirmed,Deaths,Recovered,Active
count,291579.0,269833.0,291579.0,291579.0,291579.0,291579.0,291579.0,291579.0
mean,145789.0,32090.92956,36.042511,-80.930828,100.071514,5.19332,24.397686,0.0
std,84171.751407,17822.444503,11.416278,40.661389,2294.112659,195.420274,906.264386,0.0
min,0.0,60.0,-51.8,-170.13,0.0,0.0,0.0,0.0
25%,72894.5,19037.0,33.62,-97.48,0.0,0.0,0.0,0.0
50%,145789.0,30045.0,37.94,-88.7,0.0,0.0,0.0,0.0
75%,218683.5,47011.0,41.68,-81.17,0.0,0.0,0.0,0.0
max,291578.0,99999.0,71.71,179.41,170099.0,20465.0,64727.0,0.0


In [41]:
##Active is always zero...manually correct via simple calculation
df['Active'] = df['Confirmed'] - df['Deaths'] - df['Recovered']

## Aggregate to the Country by date level

In [42]:
country_date_df = df.groupby(['Country','Date']).sum()[["Confirmed","Recovered","Active","Deaths"]].reset_index()
country_date_df.isna().sum()

Country      0
Date         0
Confirmed    0
Recovered    0
Active       0
Deaths       0
dtype: int64

In [43]:
country_date_df.head()

Unnamed: 0,Country,Date,Confirmed,Recovered,Active,Deaths
0,Afghanistan,2020-01-22T00:00:00Z,0,0,0,0
1,Afghanistan,2020-01-23T00:00:00Z,0,0,0,0
2,Afghanistan,2020-01-24T00:00:00Z,0,0,0,0
3,Afghanistan,2020-01-25T00:00:00Z,0,0,0,0
4,Afghanistan,2020-01-26T00:00:00Z,0,0,0,0


In [45]:
country_date_df.describe()

Unnamed: 0,Confirmed,Recovered,Active,Deaths
count,15189.0,15189.0,15189.0,15189.0
mean,1921.044967,468.355586,1352.994667,99.694713
std,15974.590166,4524.167198,12929.272778,948.445579
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,47.0,2.0,39.0,0.0
max,580223.0,78039.0,513221.0,23520.0


## Aggregate to global level and make trend plots

In [46]:
global_date_df = country_date_df.groupby('Date').sum()[["Confirmed","Recovered","Active","Deaths"]].reset_index()

In [47]:
fig = px.line(global_date_df, x="Date", y="Confirmed", 
              height=500, title='Confirmed COVID-19 Cases Globally by Date'
             )
fig.show()

In [48]:
fig = px.line(country_date_df.loc[country_df_regres['Country']=='United States of America'], x="Date", y="Confirmed", 
              height=500, title='Global Confirmed Cases'
             )
fig.show()

In [49]:
fig = px.line(country_df_regres.loc[country_df_regres['Country']=='China'], x="Date", y="Confirmed", 
              height=500, title='Global Confirmed Cases'
             )
fig.show()

In [50]:
global_date_df[-1:]

Unnamed: 0,Date,Confirmed,Recovered,Active,Deaths
82,2020-04-13T00:00:00Z,1916203,448036,1348706,119461


In [52]:
temp = global_date_df.groupby('Date')['Recovered', 'Deaths', 'Confirmed'].sum().reset_index()
temp = temp.melt(id_vars="Date", value_vars=['Recovered', 'Deaths', 'Confirmed'],
                 var_name='Case', value_name='Count')
temp.head()

fig = px.area(temp, x="Date", y="Count", color='Case', height=600,
             title='Cases over time', color_discrete_sequence = ['green', 'red', 'cyan'])
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [55]:
df_data = country_date_df.groupby(['Date', 'Country'])['Confirmed', 'Deaths'].max().reset_index()
df_data["Date"] = pd.to_datetime( df_data["Date"]).dt.strftime('%m/%d/%Y')

fig = px.scatter_geo(df_data, locations="Country", locationmode='country names', 
                     color=np.power(df_data["Confirmed"],0.3)-2 , size= np.power(df_data["Confirmed"]+1,0.25)-1, hover_name="Country",
                     hover_data=["Confirmed"],
                     range_color= [0, max(np.power(df_data["Confirmed"],0.25))], 
                     projection="natural earth", animation_frame="Date", 
                     color_continuous_scale=px.colors.sequential.Plasma,
                     title='COVID-19: Progression of spread'
                    )
fig.update_coloraxes(colorscale="YlOrRd")
fig.update(layout_coloraxis_showscale=False)
fig.show()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



<Figure size 2880x2880 with 0 Axes>