<https://github.com/CSSEGISandData/COVID-19>

In [406]:
import re
from datetime import date, datetime, timedelta

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.express as px

init_notebook_mode(connected=True)

In [420]:
class AmericanDatesTransformer:
    
    PATTERN = r'(\d+)/(\d+)/(\d+)'
    
    def __init__(self, countries_col="Country/Region"):
        self.countries_col = countries_col
        
    @classmethod
    def get_day_month_year(cls, match):
        month = int(match.group(1))
        day = int(match.group(2))
        year = int('20' + match.group(3))
        return str(date(year, month, day))
    
    def transform_raw(self, df_raw):
        df = df_raw.copy()
        rename_dict = {}
        dates_cols = []
        for colname in df.columns.values:
            match = re.search(self.PATTERN, colname)
            if match:
                d = self.get_day_month_year(match)
                rename_dict[colname] = d
                dates_cols.append(d)
            else:
                rename_dict[colname] = colname
        return df.rename(rename_dict, axis=1), dates_cols

    def transpose_df(self, df, dates_cols):
        df_countries = df[[self.countries_col] + dates_cols]\
            .groupby(self.countries_col)\
            .sum().transpose()\
            .reset_index().rename({"index": "date"}, axis=1).copy()
        dates = df_countries.date
        date0 = dates[0]
        days = []
        for date in dates:
            days.append(
                (datetime.strptime(date, "%Y-%m-%d") - datetime.strptime(date0, "%Y-%m-%d")).days
            )
        df_countries['day'] = days
        df_countries.columns.name = ""
        return df_countries

    def transform(self, df_raw):
        df, dates_cols = self.transform_raw(df_raw)
        return self.transpose_df(df, dates_cols)
    
class EuropeanDatesTransformer(AmericanDatesTransformer):
    
    PATTERN = r'(\d+)/(\d+)/(\d+)'
        
    @classmethod
    def get_day_month_year(cls, match):
        day = int(match.group(1))
        month = int(match.group(2))
        year = int(match.group(3))
        return str(date(year, month, day))

In [421]:
american_transformer = AmericanDatesTransformer()
european_transformer = EuropeanDatesTransformer(countries_col="CCAA")

df_confirmed_raw = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv")
df_confirmed = american_transformer.transform(df_confirmed_raw)
df_confirmed.head()

Unnamed: 0,date,Afghanistan,Albania,Algeria,Andorra,Antigua and Barbuda,Argentina,Armenia,Aruba,Australia,...,Turkey,US,Ukraine,United Arab Emirates,United Kingdom,Uruguay,Venezuela,Vietnam,occupied Palestinian territory,day
0,2020-01-22,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,2020-01-23,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,2,0,1
2,2020-01-24,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,2,0,2
3,2020-01-25,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,2,0,3
4,2020-01-26,0,0,0,0,0,0,0,0,4,...,0,5,0,0,0,0,0,2,0,4


In [422]:
df_deaths_raw = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv")
df_deaths = american_transformer.transform(df_deaths_raw)
df_deaths.head()

Unnamed: 0,date,Afghanistan,Albania,Algeria,Andorra,Antigua and Barbuda,Argentina,Armenia,Aruba,Australia,...,Turkey,US,Ukraine,United Arab Emirates,United Kingdom,Uruguay,Venezuela,Vietnam,occupied Palestinian territory,day
0,2020-01-22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-01-23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2020-01-24,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,2020-01-25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,2020-01-26,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [423]:
df_spain_deaths_raw = pd.read_csv("https://raw.githubusercontent.com/datadista/datasets/master/COVID%2019/ccaa_covid19_fallecidos.csv")
df_spain_deaths = european_transformer.transform(df_spain_deaths_raw)
df_spain_deaths.head()

Unnamed: 0,date,Andalucía,Aragón,Asturias,Baleares,C. Valenciana,Canarias,Cantabria,Castilla y León,Castilla-La Mancha,...,Extremadura,Galicia,La Rioja,Madrid,Melilla,Murcia,Navarra,País Vasco,Total,day
0,2020-03-03,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-03-04,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,2020-03-05,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,3,2
3,2020-03-06,0,1,0,0,1,0,0,0,0,...,0,0,0,2,0,0,0,1,5,3
4,2020-03-09,0,1,0,0,1,0,0,0,0,...,0,0,0,8,0,0,0,5,16,6


In [424]:
all_ccaa = df_spain_deaths.columns.values[2:-1]
all_ccaa

array(['Aragón', 'Asturias', 'Baleares', 'C. Valenciana', 'Canarias',
       'Cantabria', 'Castilla y León', 'Castilla-La Mancha', 'Cataluña',
       'Ceuta', 'Extremadura', 'Galicia', 'La Rioja', 'Madrid', 'Melilla',
       'Murcia', 'Navarra', 'País Vasco', 'Total'], dtype=object)

In [425]:
df_spain_deaths.columns.values

array(['date', 'Andalucía', 'Aragón', 'Asturias', 'Baleares',
       'C. Valenciana', 'Canarias', 'Cantabria', 'Castilla y León',
       'Castilla-La Mancha', 'Cataluña', 'Ceuta', 'Extremadura',
       'Galicia', 'La Rioja', 'Madrid', 'Melilla', 'Murcia', 'Navarra',
       'País Vasco', 'Total', 'day'], dtype=object)

In [426]:
(datetime.strptime("2020-03-04", "%Y-%m-%d") - datetime.strptime("2020-03-02", "%Y-%m-%d")).days

2

In [427]:
df_spain_deaths

Unnamed: 0,date,Andalucía,Aragón,Asturias,Baleares,C. Valenciana,Canarias,Cantabria,Castilla y León,Castilla-La Mancha,...,Extremadura,Galicia,La Rioja,Madrid,Melilla,Murcia,Navarra,País Vasco,Total,day
0,2020-03-03,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-03-04,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,2020-03-05,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,3,2
3,2020-03-06,0,1,0,0,1,0,0,0,0,...,0,0,0,2,0,0,0,1,5,3
4,2020-03-09,0,1,0,0,1,0,0,0,0,...,0,0,0,8,0,0,0,5,16,6
5,2020-03-10,0,3,0,0,1,0,0,0,0,...,0,0,1,21,0,0,0,6,35,7
6,2020-03-11,0,4,0,0,1,0,0,0,0,...,0,0,2,31,0,0,0,6,47,8
7,2020-03-12,0,6,1,1,1,0,0,0,1,...,1,0,2,56,0,0,0,11,84,9
8,2020-03-13,0,7,1,1,1,0,0,1,5,...,1,0,2,81,0,0,0,14,120,10
9,2020-03-14,2,7,1,1,4,1,0,3,6,...,2,0,3,86,0,0,0,14,136,11


In [448]:
def fig_update(fig):
    fig.update_layout(
        xaxis=dict(
            showline=True,
            showgrid=False,
            showticklabels=True,
            linecolor='rgb(204, 204, 204)',
            linewidth=2,
            ticks='outside',
            tickfont=dict(
                family='Arial',
                size=12,
                color='rgb(82, 82, 82)',
            )
        ),
        yaxis=dict(
            showgrid=True,
            zeroline=False,
            showline=False,
            showticklabels=True,
            gridcolor='aliceblue'
        ),
        showlegend=True,
        plot_bgcolor='white'
    )
    return fig

class CountryStat:
    
    def __init__(self, country_name, global_df, start_with=0, max_days=None):
        self.country_name = country_name
        self.global_df = global_df
        self.start_with = start_with
        self.max_days = max_days
        self.df = self.global_df[["day", "date", country_name]].rename({country_name: 'total'}, axis=1)
        
    @property
    def data(self):
        cdf = self.df.copy()
        cdf['new_cases'] = cdf['total'] - cdf['total'].shift(1)    
        cdf = cdf[(cdf['new_cases'] != 0) & (cdf['new_cases'].notnull())]
        cdf = cdf[cdf['total'] >= self.start_with].copy()
        if len(cdf) == 0:
            return cdf
        cdf['day'] = cdf['day'] - cdf.iloc[0]['day']
        if self.max_days is not None:
            cdf = cdf[cdf.day <= self.max_days]
        return cdf
    
    def log_linear_prediction(self, last_day=None):
        data = self.data
        x_train = data["day"]
        if last_day is None:
            last_day = x_train.iloc[-1] + 1 + 7
        day0 = data["date"].iloc[0]
        X_train = np.array([x_train]).T
        y_train = np.log(data['total'])
        regr = LinearRegression()       
        regr.fit(X_train, y_train)
        days = list(range(x_train.iloc[0], last_day + 1))
        X_test = np.array([days]).T
        y_pred = regr.predict(X_test)
        pred = np.exp(y_pred)
        dt0 = datetime.strptime(day0, '%Y-%m-%d')
        dates = [dt0 + timedelta(days=day) for day in days]
        pred_df = pd.DataFrame({"day": days, "date": dates, "total": pred})       
        return pred_df
        
    def plotly_lm_plot(self, xcol='day', color=None):
        data = self.data
        if len(data) != 0:
            x=data[xcol]
            y=data.total
            trace = go.Scatter(
                x=x, y=y, mode='lines+markers', name=self.country_name)
            if color is not None:
                trace.line.color = color
            return trace
        return None
    
    def plotly_log_linear_prediction(self, xcol='day', last_day=None, color=None):
        data = self.log_linear_prediction(last_day)
        if len(data) != 0:
            x=data[xcol]
            y=data.total
            trace = go.Scatter(
                x=x, y=y, mode='lines', line = dict(dash='dot'), showlegend=False)
            if color is not None:
                trace.line.color = color
            return trace
        return None
    
    def plotly_fig(self, log_scale=False, xcol='day', color=None):
        trace = self.plotly_lm_plot(xcol, color)
        fig = go.Figure()
        if trace is not None:
            fig.add_trace(trace)
            fig = fig_update(fig)
            if log_scale:
                fig.layout.yaxis.update(type='log')
        return fig
    
    def plotly_fig_with_prediction(self, log_scale=False, xcol='day', last_day=None, color=None):
        fig = self.plotly_fig(log_scale, xcol, color)
        pred_trace = self.plotly_log_linear_prediction(xcol, last_day, color)
        fig.add_trace(pred_trace)
        return fig

In [449]:
spain = CountryStat("Spain", df_confirmed, start_with=100)

In [450]:
spain.log_linear_prediction()

Unnamed: 0,day,date,total
0,0,2020-03-02,105.433859
1,1,2020-03-03,148.179627
2,2,2020-03-04,208.255697
3,3,2020-03-05,292.688248
4,4,2020-03-06,411.352061
5,5,2020-03-07,578.125427
6,6,2020-03-08,812.513273
7,7,2020-03-09,1141.928356
8,8,2020-03-10,1604.897316
9,9,2020-03-11,2255.566543


In [451]:
iplot(spain.plotly_fig(True, color=COLOR_SCALE[0]))

In [452]:
iplot(spain.plotly_fig_with_prediction(True, 'date', 20, color=COLOR_SCALE[0]))

In [437]:
madrid = CountryStat("Madrid", df_spain_deaths, start_with=1)
madrid.log_linear_prediction(last_day=17)

Unnamed: 0,day,date,total
0,0,2020-03-05,1.167554
1,1,2020-03-06,1.967489
2,2,2020-03-07,3.315488
3,3,2020-03-08,5.587051
4,4,2020-03-09,9.414945
5,5,2020-03-10,15.865469
6,6,2020-03-11,26.735486
7,7,2020-03-12,45.05295
8,8,2020-03-13,75.920382
9,9,2020-03-14,127.936228


In [434]:
madrid.data

Unnamed: 0,day,date,total,new_cases
2,0,2020-03-05,1,1.0
3,1,2020-03-06,2,1.0
4,4,2020-03-09,8,6.0
5,5,2020-03-10,21,13.0
6,6,2020-03-11,31,10.0
7,7,2020-03-12,56,25.0
8,8,2020-03-13,81,25.0
9,9,2020-03-14,86,5.0
10,10,2020-03-15,213,127.0


In [435]:
madrid.plotly_fig(True, "date")

In [438]:
madrid.plotly_fig_with_prediction(True, 'date', 17, color="black")

In [466]:
class ConfirmedCasesStat:
    
    COLOR_SCALE = px.colors.sequential.Agsunset
    
    def __init__(self, countries, global_df, start_with=1, max_days=None, prediction=None):
        self.countries = countries
        self.global_df = global_df
        self.start_with = start_with
        self.max_days = max_days
        self.countries_stats = self.get_coutries_stats()
        self.prediction = prediction
        
    def get_coutries_stats(self):
        countries_stats = {}
        for country_name in self.countries:
            countries_stats[country_name] =\
                CountryStat(country_name, self.global_df, self.start_with, self.max_days)
        return countries_stats
   
    @property
    def plot_title(self):
        return "COVID-19: Confirmed Cases"
        
    @property
    def xaxis_title(self):
        return f"Days since {self.start_with} confirmed cases"
    
    def plotly_lm_plot(self, y_log_scale=False):      
        fig = go.Figure()
        fig = fig_update(fig)
        fig.update_layout(xaxis_title=self.xaxis_title)
        if y_log_scale:
            fig.layout.yaxis.update(type='log')
            fig.update_layout(title=self.plot_title+", log scale")
            fig.update_yaxes(tickvals=[100, 1000, 10000])
        else:
            fig.update_layout(title=self.plot_title)
        countries_stats = list(self.countries_stats.values())
        for country_idx in range(len( countries_stats)):
            country_stat = countries_stats[country_idx]
            color = self.COLOR_SCALE[country_idx % len(self.COLOR_SCALE)]
            plot = country_stat.plotly_lm_plot(color=color)
            if plot is not None:
                fig.add_trace(plot)
                if self.prediction is not None:
                    if (self.prediction == True) or self.prediction[country_idx]:
                        pred_plot = country_stat.plotly_log_linear_prediction(last_day=self.max_days, color=color)
                        fig.add_trace(pred_plot)
        return fig

class DeathsStat(ConfirmedCasesStat):
    
    @property
    def plot_title(self):
        return "COVID-19: Deaths"
        
    @property
    def xaxis_title(self):
        if self.start_with == 1:
            return "Days since first death"
        return f"Days since {self.start_with} deaths"

In [469]:
countries = ConfirmedCasesStat(
    ["Poland", "Spain"], 
    df_confirmed,
    start_with=25,
    max_days=5, 
    prediction=[True, False]
)

In [470]:
iplot(countries.plotly_lm_plot())

In [402]:
'United Kingdom', "France" "Spain","Japan", "France",

('United Kingdom', 'FranceSpain', 'Japan', 'France')

In [471]:
countries = ConfirmedCasesStat(
    ["Italy", "Spain"], 
    df_confirmed, 
    start_with=100, 
    max_days=21,
    prediction=[False, True]
)
countries.COLOR_SCALE = px.colors.sequential.Agsunset
iplot(countries.plotly_lm_plot(False))

In [458]:
countries = ConfirmedCasesStat(
    ["Italy",  "Germany", "US",  'United Kingdom',  "US", "Spain", "France"], 
    df_confirmed, 
    start_with=20, 
    max_days=22
)
countries.COLOR_SCALE = px.colors.sequential.Agsunset
iplot(countries.plotly_lm_plot(True))

In [478]:
countries = DeathsStat(
    ["Italy",  "Germany", "Spain", "France"], 
    df_deaths,
    start_with=5,
    max_days=19,
    prediction=True
)
iplot(countries.plotly_lm_plot(True))

In [405]:
ccaa = DeathsStat(
    all_ccaa, 
    df_spain_deaths,
     start_with=2,
)
ccaa.COLOR_SCALE = px.colors.sequential.Agsunset
iplot(ccaa.plotly_lm_plot(True))

In [481]:
ccaa = DeathsStat(
        ['Aragón', 'C. Valenciana', 'Castilla-La Mancha', 'Cataluña',
        'La Rioja', 'Madrid', 'País Vasco'], 
    df_spain_deaths,
    start_with=2,
    max_days=16,
    prediction=True
)
ccaa.COLOR_SCALE = px.colors.sequential.Agsunset
iplot(ccaa.plotly_lm_plot(True))