In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Reading the data-set (week-4 COVID-19 global forecasting training file)
dataset = pd.read_csv('../input/covid19-global-forecasting-week-4/train.csv')
# Parse the whole 'Date' column with a single vectorized call rather than
# invoking pd.to_datetime once per row through .apply().
dataset['Date'] = pd.to_datetime(dataset['Date'], format = '%Y-%m-%d')
dataset.head(5)

## Data Exploration

I will be exploring the data in two ways, as of now - 
- Province wise per Country.
- On a Global level.

The first makes more sense, as I feel it can clearly bring out the four stages of the Covid-19 spread.

In [None]:
# Rows without a province get the sentinel 'NA', so that the
# (country, province) lookups used later -- e.g. ('Italy', 'NA') -- have a
# concrete key to match on.
province_col = dataset['Province_State']
dataset['Province_State'] = province_col.where(province_col.notna(), 'NA')

In [None]:
# Date of the first confirmed case for every (country, province) pair.
# Only rows that already report at least one confirmed case take part, so
# pairs that never report a case are absent from `inception_dates`.
group_keys = ['Country_Region', 'Province_State']
has_cases = dataset['ConfirmedCases'] > 0
inception_dates = dataset.loc[has_cases].groupby(group_keys)['Date'].min()

# Attach the inception date to every row of its (country, province) pair with
# a single keyed join. The previous nested loop iterated over a country list
# containing one entry per province, so multi-province countries were
# re-processed many times, each pass scanning the whole frame.
# Joining from the key columns only keeps this cell safe to re-run; pairs
# without any confirmed case end up with NaT.
dataset['date_of_inception'] = dataset[group_keys].join(
    inception_dates.rename('date_of_inception'), on = group_keys
)['date_of_inception']

In [None]:
# Number of days between each observation and the first confirmed case of its
# (country, province) pair; negative before the first case, NaN when the pair
# has no inception date.
# The previous row-wise guard `row['date_of_inception'] != np.nan` was always
# True (NaN never compares equal to anything, itself included), so it never
# protected the subtraction. Vectorized timedelta arithmetic handles missing
# dates natively: NaT propagates and `.dt.days` yields NaN for those rows.
inception = pd.to_datetime(dataset['date_of_inception'])
dataset['days'] = (dataset['Date'] - inception).dt.days

### Visualization for Italy

In [None]:
# Italian rows from the day of the first confirmed case onwards
italy_rows = (dataset['Country_Region'] == 'Italy') & (dataset['days'] >= 0)
data = dataset.loc[italy_rows]

# Italian govt imposed a lockdown on 9th of March
lockdown_date = pd.to_datetime('2020-03-09', format = '%Y-%m-%d')
day_of_lockdown = (lockdown_date - inception_dates.loc[('Italy', 'NA')]).days

fix, (ax1, ax2) = plt.subplots(1, 2, figsize = (30, 8))

# One panel per cumulative series:
# (axes, column, title, y-label, top of the dashed lockdown marker)
panels = [
    (ax1, 'ConfirmedCases', 'Confirmed Cases in Italy', 'Confirmed Cases', 150000),
    (ax2, 'Fatalities', 'Fatalities in Italy', 'Fatalities', 20000),
]
for ax, column, title, ylabel, marker_top in panels:
    ax.plot(data['days'], data[column])
    ax.set(title = title, xlabel = 'Days since first case', ylabel = ylabel)
    ax.vlines(day_of_lockdown, ymin = 0, ymax = marker_top, linestyle = '--', color = 'orange')
    ax.grid(True)

The number of confirmed cases seems to have taken a linear shape by the 50th day since the first case. Lately, it has started to show a change in convexity, with a point of inflection around the 60th day.

A similar trend can be seen for the Fatalities curve too.

Let's look at the day-wise newly reported cases.

In [None]:
# Italy, from the first confirmed case onwards
italy_rows = (dataset['Country_Region'] == 'Italy') & (dataset['days'] >= 0)
days = dataset.loc[italy_rows, 'days']
# Day-over-day difference of the cumulative counts = newly reported per day
daily_new_cases = dataset.loc[italy_rows, 'ConfirmedCases'].diff()
daily_new_fatalities = dataset.loc[italy_rows, 'Fatalities'].diff()

fix, (ax1, ax2) = plt.subplots(1, 2, figsize = (30, 8))

# (axes, series, title, y-label, top of the dashed lockdown marker);
# `day_of_lockdown` is the value computed for Italy in the earlier cell.
panels = [
    (ax1, daily_new_cases, 'Daily Reported New Cases in Italy', 'New Cases', 6000),
    (ax2, daily_new_fatalities, 'Daily Reported Fatalities in Italy', 'New Fatalities', 800),
]
for ax, series, title, ylabel, marker_top in panels:
    ax.plot(days, series)
    ax.set(title = title, xlabel = 'Days since first case', ylabel = ylabel)
    ax.vlines(day_of_lockdown, ymin = 0, ymax = marker_top, linestyle = '--', color = 'orange')
    ax.grid(True)

It is very comforting that the number of new cases reported daily and the number of fatalities reported daily are decreasing! Let's look at these figures in the near-linear zone of the curve.

In [None]:
# Zoom in on Italy from day 50 after the first confirmed case
late_italy_rows = (dataset['Country_Region'] == 'Italy') & (dataset['days'] >= 50)
data = dataset.loc[late_italy_rows]

fix, (ax1, ax2) = plt.subplots(1, 2, figsize = (30, 8))

# (axes, column, title, y-label) for the two cumulative curves
panels = [
    (ax1, 'ConfirmedCases', 'Confirmed Cases in Italy', 'Confirmed Cases'),
    (ax2, 'Fatalities', 'Fatalities in Italy', 'Fatalities'),
]
for ax, column, title, ylabel in panels:
    ax.plot(data['days'], data[column])
    ax.set(title = title, xlabel = 'Days after 50th day from first reported case', ylabel = ylabel)
    ax.grid(True)

These curves show quite a huge deviation from the initial logarithmic model! The hypothesis at this point in time can be that the spread of the virus follows two different dynamics -
- Natural spread: In the absence of government regulations, conscious hygiene maintenance etc.
- Restricted spread: After the government interventions have started to take effect, keeping in mind the incubation period of the virus.

These two factors lead to a very sigmoid-like model for the spread of the disease.

### Visualizations for Spain

In [None]:
# Spanish rows from the day of the first confirmed case onwards.
# (This cell previously filtered on 'Italy', so the "Spain" figures were
# re-plotting the Italian curves under Spanish titles.)
data = dataset.loc[(dataset['Country_Region'] == 'Spain') & (dataset['days'] >= 0)]

# Spanish govt imposed a lockdown on 14th of March
lockdown_date = pd.to_datetime('2020-03-14', format = '%Y-%m-%d')
# Lockdown expressed on the same "days since first case" axis as the plots
day_of_lockdown = (lockdown_date - inception_dates.loc[('Spain', 'NA')]).days

fix, (ax1, ax2) = plt.subplots(1, 2, figsize = (30, 8))

# Left panel: cumulative confirmed cases, dashed orange line marks the lockdown
ax1.set_title('Confirmed Cases in Spain')
ax1.plot(data['days'], data['ConfirmedCases'])
ax1.set_ylabel('Confirmed Cases')
ax1.set_xlabel('Days since first case')
ax1.vlines(day_of_lockdown, ymin = 0, ymax = 150000, linestyle = '--', color = 'orange')
ax1.grid(True)

# Right panel: cumulative fatalities with the same lockdown marker
ax2.set_title('Fatalities in Spain')
ax2.plot(data['days'], data['Fatalities'])
ax2.set_ylabel('Fatalities')
ax2.set_xlabel('Days since first case')
ax2.vlines(day_of_lockdown, ymin = 0, ymax = 20000, linestyle = '--', color = 'orange')
ax2.grid(True)

In [None]:
# Spain, from the first confirmed case onwards
spain_rows = (dataset['Country_Region'] == 'Spain') & (dataset['days'] >= 0)
days = dataset.loc[spain_rows, 'days']
# Day-over-day difference of the cumulative counts = newly reported per day
daily_new_cases = dataset.loc[spain_rows, 'ConfirmedCases'].diff()
daily_new_fatalities = dataset.loc[spain_rows, 'Fatalities'].diff()

fix, (ax1, ax2) = plt.subplots(1, 2, figsize = (30, 8))

# (axes, series, title, y-label, top of the dashed lockdown marker);
# `day_of_lockdown` comes from the preceding Spain cell.
panels = [
    (ax1, daily_new_cases, 'Daily Reported New Cases in Spain', 'New Cases', 6000),
    (ax2, daily_new_fatalities, 'Daily Reported Fatalities in Spain', 'New Fatalities', 800),
]
for ax, series, title, ylabel, marker_top in panels:
    ax.plot(days, series)
    ax.set(title = title, xlabel = 'Days since first case', ylabel = ylabel)
    ax.vlines(day_of_lockdown, ymin = 0, ymax = marker_top, linestyle = '--', color = 'orange')
    ax.grid(True)

In [None]:
# Zoom in on Spain from day 50 after the first confirmed case
late_spain_rows = (dataset['Country_Region'] == 'Spain') & (dataset['days'] >= 50)
data = dataset.loc[late_spain_rows]

fix, (ax1, ax2) = plt.subplots(1, 2, figsize = (30, 8))

# (axes, column, title, y-label) for the two cumulative curves
panels = [
    (ax1, 'ConfirmedCases', 'Confirmed Cases in Spain', 'Confirmed Cases'),
    (ax2, 'Fatalities', 'Fatalities in Spain', 'Fatalities'),
]
for ax, column, title, ylabel in panels:
    ax.plot(data['days'], data[column])
    ax.set(title = title, xlabel = 'Days after 50th day from first reported case', ylabel = ylabel)
    ax.grid(True)

From the curves, it can be easily inferred that the spread of the virus follows a similar two phased trend in Spain too. Let's look at the spread of the virus in USA.

### Visualizations for USA

In [None]:
# Every distinct US province/state present in the data
us_rows = dataset['Country_Region'] == 'US'
us_provinces = dataset.loc[us_rows, 'Province_State'].unique()

# 18 x 3 grid of panels, flattened row by row so panel k belongs to province k
fig, axes = plt.subplots(18, 3, figsize = (30, 180))
axes = list(axes.flat)

for slot, province in enumerate(us_provinces):
    data = dataset.loc[us_rows & (dataset['Province_State'] == province)]
    panel = axes[slot]
    panel.plot(data['days'], data['ConfirmedCases'])
    panel.set(
        title = 'Confirmed Cases in US: {}'.format(province),
        xlabel = 'Days since the first case',
        ylabel = 'Confirmed Cases',
    )
    panel.grid(True)

In [None]:
# 18 x 3 grid of panels, flattened row by row so panel k belongs to province k
# (`us_provinces` is the list built in the preceding cell).
fig, axes = plt.subplots(18, 3, figsize = (30, 180))
axes = list(axes.flat)

us_rows = dataset['Country_Region'] == 'US'
for slot, province in enumerate(us_provinces):
    data = dataset.loc[us_rows & (dataset['Province_State'] == province)]
    panel = axes[slot]
    panel.plot(data['days'], data['Fatalities'])
    panel.set(
        title = 'Fatalities in US: {}'.format(province),
        xlabel = 'Days since the first case',
        ylabel = 'Fatalities',
    )
    panel.grid(True)

In [None]:

# 18 x 3 grid of panels, flattened row by row so panel k belongs to province k
fig, axes = plt.subplots(18, 3, figsize = (30, 150))
axes = list(axes.flat)

us_rows = dataset['Country_Region'] == 'US'
for slot, province in enumerate(us_provinces):
    data = dataset.loc[us_rows & (dataset['Province_State'] == province)]
    # Day-over-day difference of the cumulative count = newly reported cases
    daily_new_cases = data['ConfirmedCases'].diff()
    panel = axes[slot]
    panel.plot(data['days'], daily_new_cases)
    panel.set(
        title = 'Daily Reported New Cases in US: {}'.format(province),
        xlabel = 'Days since the first case',
        ylabel = 'New Cases',
    )
    panel.grid(True)

In [None]:
# Daily newly reported fatalities, one panel per US province/state.
# 18 x 3 grid of panels, flattened row by row so panel k belongs to province k.
fig, axes = plt.subplots(18, 3, figsize = (30, 150))

axes = [ax for axis in axes for ax in axis]
for index, province in enumerate(us_provinces):
    data = dataset.loc[(dataset['Country_Region'] == 'US') & (dataset['Province_State'] == province)]
    # Day-over-day difference of the cumulative fatality count.
    # (A stray `daily_new_cases = data['Fatalities'].diff()` used to sit here,
    # overwriting the cases variable with fatality numbers; it was never used.)
    daily_new_fatalities = data['Fatalities'].diff()
    axes[index].set_title('Daily Reported New Fatalities in US: {}'.format(province))
    axes[index].plot(data['days'], daily_new_fatalities)
    # The y-axis shows fatalities, not cases
    axes[index].set_ylabel('New Fatalities')
    axes[index].set_xlabel('Days since the first case')
    axes[index].grid(True)

Inferences from the visualizations on US data - 

- The spread of the virus has not matured yet, i.e., the number of new reported cases daily is on the rise, on an average.

In order to say anything meaningful about the growth of the virus in the various regions of the US, let's draw a contrast between Italy and the US states that have suffered an appreciable number of cases.

In [None]:
# Finding the states of US with confirmed case > 15000
is_us = dataset['Country_Region'] == 'US'
us_provinces = dataset.loc[is_us & (dataset['ConfirmedCases'] > 15000), 'Province_State'].unique()

plt.figure(figsize = (15, 10))
italy_data = dataset[dataset['Country_Region'] == 'Italy']

for province in us_provinces:
    # Filter on the country as well as the province name: matching on
    # 'Province_State' alone would also pick up a same-named province that
    # belongs to another country and mix its rows into the US curve.
    province_data = dataset[is_us & (dataset['Province_State'] == province)]
    plt.plot(province_data['days'], province_data['ConfirmedCases'], label = 'US: {}'.format(province))

# Italy serves as the reference curve on the same "days since first case" axis
plt.plot(italy_data['days'], italy_data['ConfirmedCases'], label = 'Italy')

plt.title('Comparison of Confirmed Cases between Italy and provinces of US')
plt.ylabel('Confirmed Cases')
plt.xlabel('Days since the first case')

plt.legend()

There are two states that come out as alarming from the above contrast - New York and New Jersey. These are the two states that are exhibiting high growth rate in the number of confirmed cases. Italy was experiencing a daily new addition of ~500 cases at the stage New York and New Jersey are currently in (~30 day mark). However, both New York and New Jersey are reporting at a very high magnitude (~9000 for New York and ~4000 for New Jersey).

On the other hand, California, with a total of ~20000 cases, is showing a considerably lower reporting rate (~1000 cases per day, presently). Although this is greater than what Italy was experiencing, it is still considerably lower than that of the other two hugely affected states.

With California reporting its first case on Feb 26, in contrast to March 3 for New York, I would say the state of California has been responding very quickly and effectively to the pandemic.

### How is India faring? Is there some good news, or is it all the same?

In [None]:
# Indian rows from the day of the first confirmed case onwards
india_rows = (dataset['Country_Region'] == 'India') & (dataset['days'] >= 0)
data = dataset.loc[india_rows]

# Indian govt implicitly imposed a lockdown on 24th of March
lockdown_date = pd.to_datetime('2020-03-24', format = '%Y-%m-%d')
day_of_lockdown = (lockdown_date - inception_dates.loc[('India', 'NA')]).days

fix, (ax1, ax2) = plt.subplots(1, 2, figsize = (30, 8))

# One panel per cumulative series:
# (axes, column, title, y-label, top of the dashed lockdown marker)
panels = [
    (ax1, 'ConfirmedCases', 'Confirmed Cases in India', 'Confirmed Cases', 10000),
    (ax2, 'Fatalities', 'Fatalities in India', 'Fatalities', 300),
]
for ax, column, title, ylabel, marker_top in panels:
    ax.plot(data['days'], data[column])
    ax.set(title = title, xlabel = 'Days since first case', ylabel = ylabel)
    ax.vlines(day_of_lockdown, ymin = 0, ymax = marker_top, linestyle = '--', color = 'orange')
    ax.grid(True)

The curve for confirmed cases seems to have entered a roughly linear regime. However, the same can't be said for the Fatalities curve. It would be good to see the curve shift its convexity.

The linear zone for the Confirmed Cases curve seems to start around the 65th day.

Let's look at the reporting rate for the country.

In [None]:
# India, from the first confirmed case onwards
india_rows = (dataset['Country_Region'] == 'India') & (dataset['days'] >= 0)
days = dataset.loc[india_rows, 'days']
# Day-over-day difference of the cumulative counts = newly reported per day
daily_new_cases = dataset.loc[india_rows, 'ConfirmedCases'].diff()
daily_new_fatalities = dataset.loc[india_rows, 'Fatalities'].diff()

fix, (ax1, ax2) = plt.subplots(1, 2, figsize = (30, 8))

# (axes, series, title, y-label, top of the dashed lockdown marker);
# `day_of_lockdown` comes from the preceding India cell.
panels = [
    (ax1, daily_new_cases, 'Daily Reported New Cases in India', 'New Cases', 1000),
    (ax2, daily_new_fatalities, 'Daily Reported Fatalities in India', 'New Fatalities', 40),
]
for ax, series, title, ylabel, marker_top in panels:
    ax.plot(days, series)
    ax.set(title = title, xlabel = 'Days since first case', ylabel = ylabel)
    ax.vlines(day_of_lockdown, ymin = 0, ymax = marker_top, linestyle = '--', color = 'orange')
    ax.grid(True)

It can be seen that India is reporting ~900 cases per day (an alarming value, no doubt!). The reporting rate for Fatalities seems to be surely following an increasing trend.

In [None]:
# Zoom in on India from day 65 after the first confirmed case
late_india_rows = (dataset['Country_Region'] == 'India') & (dataset['days'] >= 65)
data = dataset.loc[late_india_rows]

fix, (ax1, ax2) = plt.subplots(1, 2, figsize = (30, 8))

# (axes, column, title, y-label) for the two cumulative curves
panels = [
    (ax1, 'ConfirmedCases', 'Confirmed Cases in India', 'Confirmed Cases'),
    (ax2, 'Fatalities', 'Fatalities in India', 'Fatalities'),
]
for ax, column, title, ylabel in panels:
    ax.plot(data['days'], data[column])
    ax.set(title = title, xlabel = 'Days after 65th day from first reported case', ylabel = ylabel)
    ax.grid(True)

As witnessed earlier, the curve for Confirmed Cases can be approximated by a linear one very confidently after the 65th day from the first case.

However, the curve for Fatalities still seems to have an upward convexity.

In [None]:
# Side-by-side comparison of India and Italy on the shared
# "days since first case" axis: confirmed cases (left), fatalities (right).
italy_data = dataset[dataset['Country_Region'] == 'Italy']
india_data = dataset[dataset['Country_Region'] == 'India']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (30, 10))

ax1.plot(italy_data['days'], italy_data['ConfirmedCases'], label = 'Italy')
ax1.plot(india_data['days'], india_data['ConfirmedCases'], label = 'India')
ax1.set_title('Comparison of Confirmed Cases between India & Italy')
ax1.set_xlabel('Days since first case')
ax1.set_ylabel('Confirmed Cases')
# The y-axis is clipped at 10000 cases; anything above is cut off
ax1.set_ylim([0, 10000])
ax1.legend()

ax2.plot(italy_data['days'], italy_data['Fatalities'], label = 'Italy')
ax2.plot(india_data['days'], india_data['Fatalities'], label = 'India')
# This panel plots 'Fatalities'; its title and y-label previously said
# 'Confirmed Cases' (copied from the left panel).
ax2.set_title('Comparison of Fatalities between India & Italy')
ax2.set_xlabel('Days since first case')
ax2.set_ylabel('Fatalities')
# The y-axis is clipped at 500 fatalities; anything above is cut off
ax2.set_ylim([0, 500])
ax2.legend()

Oh my! Although there is a lag in the number of confirmed cases, the growth rate for India seems to be almost similar to that of Italy. However, given that Italy didn't reach a linear zone in the time-frame shown in the plot (the y-limit), it might not be all gloomy for Indians right now.

Due to the lagged response, India is now observing the reporting rate which Italy observed around the mark of ~37-38 days, where the reporting rate was around 1200-1500 per day. In comparison to that India is facing a reporting rate of 800-900 per day.