### Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Use pip install plotly==4.7.1
#or
#  conda install -c plotly plotly=4.7.1
# to install plotly for this project

## Loading Data  

#### The following read commands contain links to the raw files of the COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University, hosted on GitHub. Below is the link to the repository:

### https://github.com/CSSEGISandData/COVID-19

#### The above repository contains many data files, but the ones we use are the CSSE COVID-19 time series. Below is the direct link:
### https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series

In [None]:
# All five tables live in the same folder of the CSSE repository, so the shared
# URL prefix is spelled out once and only the file name varies per table.
_CSSE_TIME_SERIES_URL = ('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
                         'csse_covid_19_data/csse_covid_19_time_series/')

# Confirmed cases (US-only table and global table)
df_confirmed_us = pd.read_csv(_CSSE_TIME_SERIES_URL + 'time_series_covid19_confirmed_US.csv')
df_confirmed_global = pd.read_csv(_CSSE_TIME_SERIES_URL + 'time_series_covid19_confirmed_global.csv')

# Deaths (US-only table and global table)
df_deaths_us = pd.read_csv(_CSSE_TIME_SERIES_URL + 'time_series_covid19_deaths_US.csv')
df_deaths_global = pd.read_csv(_CSSE_TIME_SERIES_URL + 'time_series_covid19_deaths_global.csv')

# Recovered cases (global table only)
df_recovered_global = pd.read_csv(_CSSE_TIME_SERIES_URL + 'time_series_covid19_recovered_global.csv')

In [None]:
# Display the global confirmed-cases table exactly as it was read from the CSV.
df_confirmed_global

### To calculate total global cases per day, we sum the values for each day across all 266 rows. 

In [None]:
# Total cases per day: drop the location columns, then add up every row for
# each remaining numeric (date) column.
confirmed_global = (
    df_confirmed_global
    .drop(columns=['Province/State', 'Lat', 'Long'])
    .sum(axis=0, numeric_only=True)
)
confirmed_global

### The following piece of code will produce an interactive plot. You can interact with the plot by hovering the mouse on the curve, zoom in/out, panning etc.

In [None]:
# Line chart of the global confirmed-case totals, one point per date column.
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=confirmed_global.index,
    y=confirmed_global.values
))

fig.update_layout(title_text="Confirmed Cases per day (Global)",
                  xaxis_title='Days',
                  yaxis_title='Number of Cases')

fig.show()

In [None]:
# Re-use the figure built in the previous cell and switch its y-axis to a log scale.
log_layout = {
    'title': 'Confirmed Cases (Global) per day (Log Scale)',
    'yaxis_title': 'Number of Cases (Log)',
    'yaxis_type': "log",
}
fig.update_layout(**log_layout)
fig.show()

### In the following piece of code we try to evaluate whether the number of cases has peaked or not. 

In [None]:
# Running total of confirmed cases at the end of every 7-day window
# (positions 6, 13, 20, ...). The Series is indexed by date labels, so the
# positions are selected with .iloc; passing integer positions to plain []
# on a label index is deprecated in recent pandas versions.
confirmed_global_perweek = confirmed_global.iloc[6::7]

# New cases per week = difference between consecutive weekly totals.
# prepend=0 makes the first entry equal to the first week's own total, so the
# result has one value per week and lines up with the weekly index.
newcases_global_perweek = pd.Series(
    np.diff(confirmed_global_perweek.to_numpy(), prepend=0),
    index=confirmed_global_perweek.index,
)
newcases_global_perweek



In [None]:
# Line chart of the weekly new confirmed cases (global).
fig = go.Figure(
    data=[go.Scatter(x=newcases_global_perweek.index,
                     y=newcases_global_perweek.values)],
    layout=go.Layout(
        title='Confirmed New Cases (Global) per week',
        xaxis_title='Weeks (ending on)',
        yaxis_title='New Cases'
    )
)

fig.show()

### Repeating the same procedure for US cases

In [None]:
# Display the US confirmed-cases table exactly as it was read from the CSV.
df_confirmed_us

In [None]:
# Total US cases per day: remove the identifier/location columns, then add up
# every row for each remaining numeric (date) column.
us_non_date_columns = ['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Lat', 'Long_']
confirmed_us = (
    df_confirmed_us
    .drop(columns=us_non_date_columns)
    .sum(axis=0, numeric_only=True)
)
confirmed_us

In [None]:
# Line chart of the US confirmed-case totals, one point per date column.
fig = go.Figure(
    data=[go.Scatter(x=confirmed_us.index,
                     y=confirmed_us.values,
                     line={'color': 'firebrick'})],
    layout=go.Layout(
        title='Confirmed Cases per day (US)',
        xaxis_title='Days',
        yaxis_title='Number of Cases'
    )
)

fig.show()

In [None]:
# Re-use the US figure from the previous cell and switch its y-axis to a log scale.
log_layout = {
    'title': 'Confirmed Cases (US) per day (Log Scale)',
    'yaxis_title': 'Number of Cases (Log)',
    'yaxis_type': "log",
}
fig.update_layout(**log_layout)
fig.show()

### Checking the plateau for the US

In [None]:
# Running total of US confirmed cases at the end of every 7-day window
# (positions 6, 13, 20, ...). The Series is indexed by date labels, so the
# positions are selected with .iloc; passing integer positions to plain []
# on a label index is deprecated in recent pandas versions.
confirmed_us_perweek = confirmed_us.iloc[6::7]

# New cases per week = difference between consecutive weekly totals.
# prepend=0 makes the first entry equal to the first week's own total, so the
# result has one value per week and lines up with the weekly index.
newcases_us_perweek = pd.Series(
    np.diff(confirmed_us_perweek.to_numpy(), prepend=0),
    index=confirmed_us_perweek.index,
)
newcases_us_perweek

In [None]:
# Line chart of the weekly new confirmed cases (US).
fig_us_newcases = go.Figure(
    data=[go.Scatter(x=newcases_us_perweek.index,
                     y=newcases_us_perweek.values,
                     line={'color': 'firebrick'})],
    layout=go.Layout(
        title='Confirmed New Cases (US) per week',
        xaxis_title='Weeks (ending on)',
        yaxis_title='Number of New Cases '
    )
)

fig_us_newcases.show()

### Analyzing the recovery dataset in the same manner 

In [None]:
# Recovered cases (global): drop the location columns, then add up every row
# for each remaining numeric (date) column.
recovered_global = (
    df_recovered_global
    .drop(columns=['Province/State', 'Lat', 'Long'])
    .sum(axis=0, numeric_only=True)
)
recovered_global

In [None]:
# Line chart of the global recovered-case totals, one point per date column.
fig = go.Figure(
    data=[go.Scatter(x=recovered_global.index,
                     y=recovered_global.values,
                     line={'color': 'seagreen'})],
    layout=go.Layout(
        title='Recovered Cases per day (Global)',
        xaxis_title='Days',
        yaxis_title='Number of Cases '
    )
)

fig.show()

In [None]:
# Re-use the recovered-cases figure from the previous cell with a log-scale y-axis.
log_layout = {
    'title': 'Recovered Cases (Global) per day (Log Scale)',
    'yaxis_title': 'Number of Cases (Log)',
    'yaxis_type': "log",
}
fig.update_layout(**log_layout)
fig.show()

### Checking new recovered cases per week 

In [None]:
# Running total of recovered cases at the end of every 7-day window
# (positions 6, 13, 20, ...). The Series is indexed by date labels, so the
# positions are selected with .iloc; passing integer positions to plain []
# on a label index is deprecated in recent pandas versions.
recovered_global_perweek = recovered_global.iloc[6::7]

# Newly recovered per week = difference between consecutive weekly totals.
# prepend=0 makes the first entry equal to the first week's own total, so the
# result has one value per week and lines up with the weekly index.
new_recovered_perweek = pd.Series(
    np.diff(recovered_global_perweek.to_numpy(), prepend=0),
    index=recovered_global_perweek.index,
)
new_recovered_perweek

In [None]:
# Line chart of the weekly newly recovered cases (global).
fig = go.Figure(
    data=[go.Scatter(x=new_recovered_perweek.index,
                     y=new_recovered_perweek.values,
                     line={'color': 'seagreen'})],
    layout=go.Layout(
        title='Recovered New Cases (Global) per week',
        xaxis_title='Weeks (ending on)',
        yaxis_title='Number of New Cases'
    )
)

fig.show()

### After preliminary data analysis, we move forward to ask more insightful questions. Like:

#### 1: Which countries have the highest number of cases
#### 2: How many deaths are there in those countries
#### 3: What's the death rate in those countries (deaths /cases) * 100

In [None]:
# Finding the top ten countries based on the most recent cases.
top_ten_confirmed = df_confirmed_global.drop(labels=['Lat', 'Long', 'Province/State'], axis='columns')

# The table can hold several rows for the same country (one per Province/State),
# so the rows are summed per country before ranking. Ranking the raw rows would
# compare provinces instead of countries and could list a country more than once.
# The grouping also makes 'Country/Region' the index, which the following cell
# relies on when it transposes the table into one column per country.
top_ten_confirmed = top_ten_confirmed.groupby(['Country/Region']).sum()

# The last column is the most recent date; keep='all' retains ties on that value.
top_ten_confirmed = top_ten_confirmed.nlargest(n=10, columns=top_ten_confirmed.columns[-1], keep='all')

top_ten_confirmed.head()

In [None]:
# Preparing the df for charts: one row per date, one column per country,
# the date labels copied into a trailing 'date' column, and a plain 0..n-1 index.
top_ten_confirmed = (
    top_ten_confirmed
    .T
    .assign(date=lambda frame: frame.index)
    .reset_index(drop=True)
)
top_ten_confirmed

In [None]:
# Wide -> long: one row per (date, country) pair, which is the layout the
# plotly line chart in the next cell expects.
top_ten_confirmed_tidy = pd.melt(
    top_ten_confirmed,
    id_vars=['date'],
    var_name='Country',
    value_name='Total Cases',
)
top_ten_confirmed_tidy

### One of the killer features of plotly is that you can interactively hide/unhide certain lines by clicking on the country names in the legend given. Because the curve of US is overshadowing all other curves, this is particularly helpful for a careful analysis of the lesser growing counterparts. Simply click on 'US', it will hide that chart.

In [None]:
import plotly.express as px

# One line per country, coloured by the 'Country' column of the long-format table.
fig = px.line(
    data_frame=top_ten_confirmed_tidy,
    x="date",
    y="Total Cases",
    color="Country",
    title='Line charts for top 10 countries',
)
fig.show()

##### From the above charts we can see that although Italy was earlier leading the world in total COVID-19 cases, somewhere around March 27 the number of cases in the US overtook that of every other country in the world. One of the major reasons is that Italy declared a nationwide lockdown on March 09 (see the lockdown table below), whereas most US states only started declaring 'shelter-in-place' somewhere around March 19-24.

#### Once we hide the US chart by clicking on it in the legend and wait for the figure to recalibrate, we can see each country's chart more clearly. We can see that Spain, Italy, Germany, France and Turkey show a significant decrease in slope somewhere around April 17, 2020, whereas the US is yet to see that change. The reason, although different for each country, is in the most general sense that all of these countries declared a 'national lockdown' well before the US states did. The US never really declared a national lockdown; it fell upon each state to declare 'shelter-in-place' for itself.

## For this cell, enter the command mode to bring the data back to its proper format.


Lockdown data via wikipedia 


Country State               Start date  End date 
                            year-mm-dd  year-mm-dd
US      California	        2020-03-19	N/A	        State
        Clark County, NV	2020-03-20	N/A	        County
        Connecticut	        2020-03-23	2020-04-22	State
        Illinois	        2020-03-21	2020-05-30	
        Kansas City, KS	    2020-03-24	2020-04-19	City
        Massachusetts	    2020-03-24	2020-05-04	
        Michigan	        2020-03-24	2020-04-13	
        New York	        2020-03-20	2020-05-15	
        Oregon	            2020-03-24	N/A	

Russia	Moscow	            2020-03-30	2020-05-12	Metropolitan 
                            2020-03-28	2020-04-30	Nation wide

UK                          2020-03-23  N/A  

Spain                       2020-03-14	2020-05-09  Nation wide


Italy                       2020-03-09	2020-05-03  Nation wide

Brazil Santa Catarina	    2020-03-17	2020-04-07	State
       São Paulo	        2020-03-24	2020-05-10  State 
       
France                      2020-03-17	2020-05-11  Nation wide


Germany	                    2020-03-23	2020-05-10 Nation wide

Turkey                      2020-04-23	2020-04-27 Nation wide

Iran                        2020-03-14	2020-04-20 Nation wide




In [None]:
## I created this friendly function to create a line chart for whatever country you are interested in
## The line chart will show new cases per week. Enter the country's name with the first letter capitalized.

def create_chart(string):
    '''
        Show a line chart of the new confirmed cases per week for one country.

        string: name of the country you are interested in, spelled exactly as it
                appears in the 'Country/Region' column of df_confirmed_global

        Returns nothing; the figure is displayed with fig.show().
        Raises ValueError when no row of df_confirmed_global matches the name.
    '''
    temp_df = df_confirmed_global.drop(labels=['Province/State','Lat', 'Long'], axis='columns')
    country_rows = temp_df.loc[temp_df['Country/Region'] == string]
    if country_rows.empty:
        # Fail early with a readable message instead of an obscure indexing error later on.
        raise ValueError(f"No rows found for country {string!r} in df_confirmed_global")

    # A country can span several rows (one per Province/State); add them up per date.
    country_cases = country_rows.drop(labels=['Country/Region'], axis='columns').sum(axis='index')

    # Running total at the end of every 7-day window (positions 6, 13, 20, ...).
    # .iloc is used because the Series is indexed by date labels; integer
    # positions passed to plain [] on a label index are deprecated in recent pandas.
    total_cases_perweek = country_cases.iloc[6::7]

    # Weekly new cases = difference of consecutive weekly totals; prepend=0 keeps
    # the first week's total as its own entry so the lengths match the index.
    new_cases_perweek = pd.Series(np.diff(total_cases_perweek.to_numpy(), prepend=0),
                                  index=total_cases_perweek.index)
    fig = go.Figure()

    fig.add_trace(go.Scatter(
            x=new_cases_perweek.index,
            y=new_cases_perweek.values
        ))

    fig.update_layout(title=f'Confirmed New Cases ({string}) per week',
                   xaxis_title='Weeks (ending on)',
                   yaxis_title='Number of New Cases ')

    fig.show()

### At the time of writing this, these are some of the countries that came up in top 10 countries. But if something new comes up on another execution of this notebook, feel free to change the name of those countries !

In [None]:
# Weekly new confirmed cases for Spain.
create_chart('Spain')

In [None]:
# Weekly new confirmed cases for Italy.
create_chart('Italy')

In [None]:
# Weekly new confirmed cases for France.
create_chart('France')

In [None]:
# Weekly new confirmed cases for Iran.
create_chart('Iran')

In [None]:
# Weekly new confirmed cases for the US.
create_chart('US')

#### As we see from the above charts, cases in the US not only peaked later than in other countries, but the number of new cases is also far higher than in other countries. Some of that could be attributed to the fact that the US is the third most populous country on the planet. But most of it could be attributed to the fact that no other country has as much global exposure as the US does. The other two countries more populous than the US, i.e. China and India, have negative net migration rates. (courtesy- https://www.worldometers.info/world-population/population-by-country/).

#### So a national lockdown, including shutdown of air traffic, international and domestic could have been really helpful for the US. 

#### P.S.: We also wanted to study air traffic in the world for this year to see how much impact it had on the outbreak, but we were running out of time.

### Creating a table of top 50 countries based on cases. Supplementing the table with the total deaths and death rate 

In [None]:
# Sum the rows per country so that countries listed once per Province/State
# are combined into a single total for each date.
top_50_confirmed_cases = (
    df_confirmed_global
    .drop(columns=['Lat', 'Long', 'Province/State'])
    .groupby(['Country/Region'])
    .sum()
)

# The last column is the most recent date; rank the countries on it.
last_column = top_50_confirmed_cases.columns[-1]
top_50_confirmed_cases = top_50_confirmed_cases.nlargest(n=50, columns=last_column, keep='all')

# Remember which countries made the list so the deaths table can be aligned to it.
top_50_inxed_confirmed = top_50_confirmed_cases.index

# Keep only the most recent totals as a Series and preview the first ten.
top_50_confirmed_cases = top_50_confirmed_cases.iloc[:, -1]
top_50_confirmed_cases.head(10)


In [None]:
## Doing the same thing for deaths

# Sum the rows per country so that countries listed once per Province/State
# are combined into a single total for each date.
top_50_confirmed_deaths = (
    df_deaths_global
    .drop(columns=['Lat', 'Long', 'Province/State'])
    .groupby(['Country/Region'])
    .sum()
)

deaths_last_column = top_50_confirmed_deaths.columns[-1]

# Pick the same countries, in the same order, as the confirmed-cases ranking,
# then keep only the most recent totals as a Series.
top_50_confirmed_deaths = top_50_confirmed_deaths.loc[top_50_inxed_confirmed].iloc[:, -1]
top_50_confirmed_deaths.head(10)

In [None]:
# Put the two Series side by side; the keys become the column names of the table.
merged_table = pd.concat(
    [top_50_confirmed_cases, top_50_confirmed_deaths],
    axis=1,
    sort=False,
    keys=['Confirmed Cases', 'Deaths'],
)

# Deaths as a percentage of confirmed cases.
death_share = merged_table['Deaths'] / merged_table['Confirmed Cases']
merged_table['Death rate(%)'] = death_share * 100

print("Table of top 50 countries with the highest cases and deaths as on ", last_column)
merged_table

### Now we have the table, for the bar chart below we will focus on top 10 countries. 

In [None]:
# First ten rows of the table (the ten countries with the most cases),
# reordered from the highest to the lowest death rate.
merged_table_10 = merged_table.head(10).sort_values(by='Death rate(%)', ascending=False)
merged_table_10

In [None]:

# Grouped bars: confirmed cases next to deaths for each of the ten countries.
# NOTE(review): both traces are bound to yaxis='y', so the 'yaxis2' entry in the
# layout has no trace attached to it - confirm whether 'Deaths' was meant to use 'y2'.
cases_bar = go.Bar(
    name='Covid19 Cases',
    x=merged_table_10.index,
    y=merged_table_10['Confirmed Cases'],
    yaxis='y',
    offsetgroup=1,
)
deaths_bar = go.Bar(
    name='Deaths',
    x=merged_table_10.index,
    y=merged_table_10['Deaths'],
    yaxis='y',
    offsetgroup=2,
)
axes_layout = {
    'yaxis': {'title': 'Covid19 Cases'},
    'yaxis2': {'title': 'Deaths', 'overlaying': 'y', 'side': 'right'}
}

fig = go.Figure(data=[cases_bar, deaths_bar], layout=axes_layout)

# Change the bar mode
fig.update_layout(barmode='group')
fig.show()

### Bar chart for the comparison of death rates 

In [None]:
# Bar chart of the death rate (%) for each of the ten countries.
fig = go.Figure(
    data=[go.Bar(x=merged_table_10.index,
                 y=merged_table_10['Death rate(%)'])],
    layout=go.Layout(
        title='Death Rates (Total Deaths/Total Cases)*100 of ten countries with the most cases',
        xaxis_title='Countries',
        yaxis_title='Death Rate (%) '
    )
)

fig.show()

#### It is quite interesting to see that the US has a lower death rate than most of the other countries, and this is a testament to US healthcare and its medical human resources.

#### PS: We also wanted to do a study of ratio of senior citizen per million population in top 10 countries based on deaths to see if we can see any correlation. But unfortunately, we were out of time.

### In the following cells we present Plotly animated bubble charts. By pressing the play button, you can see how the cases are growing on a daily basis in different parts of the world.

In [None]:
# Long-format table of the global cases for the plotly chart:
# one row per (location, date) pair with the case count in 'cases'.
df_global = df_confirmed_global.drop(columns=['Province/State'])
cases_global_tidy = pd.melt(
    df_global,
    id_vars=['Country/Region', 'Lat', 'Long'],
    var_name='date',
    value_name='cases',
)
cases_global_tidy

In [None]:
# Long-format table of the global deaths, built the same way as the cases table.
death_global = df_deaths_global.drop(columns=['Province/State'])
death_global_tidy = pd.melt(
    death_global,
    id_vars=['Country/Region', 'Lat', 'Long'],
    var_name='date',
    value_name='deaths',
)

# Attach the deaths as an extra column; the values are matched by row index.
# NOTE(review): assumes the cases and deaths CSVs list the same locations and
# dates in the same order - confirm, otherwise rows would be mismatched.
cases_global_tidy = cases_global_tidy.assign(deaths=death_global_tidy['deaths'])
cases_global_tidy

In [None]:
# Animated bubble map of cases around the globe: one bubble per location,
# sized by the case count, with one animation frame per date.
cases_map_options = {
    'lat': 'Lat',
    'lon': 'Long',
    'hover_name': 'Country/Region',
    'size': 'cases',
    'projection': "natural earth",
    'title': 'World COVID-19 Cases',
    'animation_frame': "date",
}
fig = px.scatter_geo(cases_global_tidy, **cases_map_options)
fig.show()

In [None]:
# Animated bubble map of deaths around the globe: one bubble per location,
# sized by the death count, with one animation frame per date.
deaths_map_options = {
    'lat': 'Lat',
    'lon': 'Long',
    'hover_name': 'Country/Region',
    'size': 'deaths',
    'projection': "natural earth",
    'title': 'World COVID-19 Deaths',
    'animation_frame': "date",
}
fig = px.scatter_geo(cases_global_tidy, **deaths_map_options)
fig.show()