In [None]:
import re
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator

import seaborn as sns
import warnings 

import requests
from bs4 import BeautifulSoup
from itertools import product

# Silence every Python warning for the rest of the notebook (this also hides
# pandas deprecation / SettingWithCopy warnings).
warnings.filterwarnings("ignore")
# Use seaborn's "darkgrid" theme for all figures below.
sns.set(style="darkgrid")

def read_raw (path):
    """Read one JHU time-series CSV and normalise its two location columns.

    The "Province/State" and "Country/Region" headers are renamed to
    "Province_State" and "Country_Region". A missing province is replaced by
    the row's country; a missing country is then replaced by "-".

    Parameters
    ----------
    path : anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame with the renamed, filled location columns.
    """
    renamed = pd.read_csv(path).rename(
        columns={"Province/State": "Province_State", "Country/Region": "Country_Region"}
    )
    # The province fallback uses the country column *before* it is filled with "-".
    province_filled = renamed["Province_State"].fillna(renamed["Country_Region"])
    country_filled = renamed["Country_Region"].fillna("-")
    return renamed.assign(Province_State=province_filled, Country_Region=country_filled)

def extract ( df_raw, type_value, date_format='%m/%d/%y' ):
    """Reshape a wide JHU time-series frame into long format.

    Every column from the fifth onward is treated as one observation date.
    For each such column the two location columns are copied and paired with
    that column's values.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Frame with "Province_State" and "Country_Region" columns and one
        column per date starting at position 4.
    type_value : str
        Name given to the value column in the result (e.g. "Confirmed").
    date_format : str
        ``strftime`` pattern used to parse the date column headers.

    Returns
    -------
    pandas.DataFrame
        Columns: Province_State, Country_Region, ObservationDate (datetime),
        ``type_value``. Rows are ordered date by date and keep the original
        row labels of ``df_raw`` (so labels repeat once per date).
    """
    id_columns = ["Province_State", "Country_Region"]
    dates = df_raw.columns[4:]

    # Collect the per-date pieces and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    frames = []
    for d in dates:
        sites = df_raw[id_columns].copy()
        sites["ObservationDate"] = d
        sites[type_value] = df_raw[d]
        frames.append(sites)

    if frames:
        df = pd.concat(frames)
    else:
        # No date columns: return an empty frame with the expected schema
        # instead of failing on a missing "ObservationDate" column.
        df = pd.DataFrame(columns=id_columns + ["ObservationDate", type_value])

    df ["ObservationDate"] = pd.to_datetime(df ["ObservationDate"], format=date_format)

    return df



def load_data ():
    """Load the three JHU global time series and build one long analysis frame.

    Reads the confirmed / recovered / deaths CSV files from the working
    directory, reshapes each to long format and joins them on location and
    date. The joins use pandas' default inner merge, so a location/date that
    is missing from any of the three files does not appear in the result.

    Returns
    -------
    pandas.DataFrame sorted by ObservationDate with, per location and date:
        Lat, Long, Confirmed, Recovered, Death,
        Infected  = Confirmed - Recovered - Death,
        Day_<X>   = change of <X> since the location's previous row,
        beta      = Day_Confirmed / (Infected - Day_Infected), i.e. new
                    confirmed cases divided by the previous row's Infected.
                    The first row of each location is NaN; a zero denominator
                    yields inf/NaN, which the caller is expected to handle.
    """
    location_keys = ["Province_State", "Country_Region"]
    dated_keys = location_keys + ["ObservationDate"]

    raw_confirmed = read_raw("time_series_covid19_confirmed_global.csv")
    raw_recovered = read_raw("time_series_covid19_recovered_global.csv")
    raw_deaths = read_raw("time_series_covid19_deaths_global.csv")

    anag = raw_confirmed[location_keys + ["Lat", "Long"]].copy()
    confirmed = extract (raw_confirmed, "Confirmed")
    recovered = extract (raw_recovered, "Recovered")
    deaths = extract (raw_deaths, "Death")

    data = anag.merge(confirmed, on=location_keys)
    data = data.merge(recovered, on=dated_keys)
    data = data.merge(deaths, on=dated_keys)

    # ObservationDate is already datetime (extract parses it), so it is not
    # parsed again here; sorting makes shift() below step back in time.
    data = data.sort_values(by="ObservationDate", ascending=True)

    data ["Infected"] = data["Confirmed"] - data["Recovered"] - data["Death"]

    # Group only after "Infected" exists, so the groupby never has to look up
    # a column that was added behind its back.
    g = data.groupby(location_keys)

    for f in ["Confirmed", "Infected", "Death", "Recovered"]:
        data ["Day_" + f] = data[f] - g[f].shift()

    data ["beta"] = data["Day_Confirmed"]/(data["Infected"]-data["Day_Infected"])

    return data


def select_data (data, province = None, country = None):
    """Filter ``data`` by province and/or country.

    A filter is applied only when its argument is not None; with both left
    as None the input frame is returned untouched.
    """
    selected = data

    if province is not None:
        selected = selected.query ( "Province_State == @province" )

    if country is not None:
        selected = selected.query ( "Country_Region == @country" )

    return selected

def extract_pcps_by_country ( data, province = None, country = None, confirmed_threshold = 50 ):
    """Return the rows for a province/country whose Confirmed count exceeds a threshold.

    The optional province/country filters are delegated to ``select_data``;
    rows are then kept only where ``Confirmed > confirmed_threshold``.
    """
    selected = select_data (data, province, country)
    above_threshold = selected.query ("Confirmed > @confirmed_threshold")
    return above_threshold

# The virus spreads

## Facts

* Jan 21: Chinese officials acknowledged the risk of human-to-human transmission for COVID-19.
* Jan 23: The Chinese authorities locked down Wuhan, and many cities followed in the next few weeks. Travel across China nearly stopped.

* Mar 10: In Italy, a nationwide lockdown went into effect, restricting virtually all aspects of life for its 60 million citizens, including retail, leisure, worship, imprisonment, and travel.
* Mar 11: The World Health Organization (WHO) declared COVID-19 a pandemic
* Mar 14: Spain became the second European country to impose a nationwide quarantine.
* Mar 23: The UK went into full coronavirus lockdown on March 23.

* Apr 8: Chinese authorities allowed residents to leave the city of Wuhan for the first time since 23 January.

sources: [1](https://www.nytimes.com/interactive/2020/03/22/world/coronavirus-spread.html), [2](https://www.businessinsider.com/countries-on-lockdown-coronavirus-italy-2020-3?IR=T#germany-announced-a-shut-down-of-shops-churches-sports-facilities-bars-and-clubs-in-16-states-15)

I use data from [Johns Hopkins Github repository](https://github.com/CSSEGISandData/COVID-19) and from [Citymapper](https://citymapper.com/CMI) 


In [None]:
# Download the three JHU CSSE global time series (confirmed, recovered, deaths)
# and the Citymapper Mobility Index snapshot into the working directory.
# wget flags: -N = timestamping, -q = quiet.
!wget -N -q https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv
!wget -N -q https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv    
!wget -N -q https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv
!wget -N -q https://cdn.citymapper.com/data/cmi/Citymapper_Mobility_Index_20200809.csv
# List the directory to confirm the files are present.
!ls

In [None]:
def infected_plot ( d, p, ax ):
    """Draw the "Infected" series of the provinces listed in ``p`` on ``ax``.

    The y-axis is logarithmic and a major x tick is placed every 14 positions.
    One line is drawn per Province_State.
    """
    subset = d.query("Province_State.isin(@p)",engine="python")
    ax.set_yscale('log')
    ax.xaxis.set_major_locator(MultipleLocator(14))
    sns.lineplot(x="ObservationDate", y="Infected", hue="Province_State", data=subset, ax=ax)

# Keep, for every location, the single row with the highest Confirmed count.
data = load_data ()
data = data.sort_values(by="Confirmed",ascending=False)
data = data.drop_duplicates(["Province_State","Country_Region"], keep="first")

# Midnight of the current day. pd.datetime was removed in pandas 2.0, so use
# pd.Timestamp (a datetime subclass) instead.
today = pd.Timestamp.now().normalize()
data ["Age"] = today - data ["ObservationDate"]

# Drop locations whose peak row is older than five days; Hubei is always kept.
data = data [ (data ["Age"] < pd.Timedelta(days=5)) | (data["Province_State"] == "Hubei")  ]

# The 24 locations with the most confirmed cases, in descending order.
provinces = list(data.head(24)["Province_State"].values)

top=data.query("Province_State.isin(@provinces)",engine='python')  [["ObservationDate", "Country_Region","Province_State","Confirmed", "Infected", "Recovered", "Death", "beta"]].reset_index(drop=True)


# Full history, ordered so that the locations with the most infected come first.
d = load_data ().sort_values(by="Infected",ascending=False)

# Dates become "MM/DD" labels for the x-axis.
d["ObservationDate"] = d["ObservationDate"].dt.strftime("%m/%d")

# Worldwide totals per date.
d_global = (
    d.groupby(["ObservationDate"])
    .agg({
        "Confirmed":'sum',
        "Recovered": 'sum',
        "Infected": 'sum',
        "Death": 'sum'
    })
    .reset_index()
)
d_global["Province_State"]="all countries"

# Figure 1: global infected curve on a log scale.
fig, ax = plt.subplots(1, 1,figsize=(18,4))
fig.suptitle("Infected")
ax.set_yscale('log')
ax.xaxis.set_major_locator(MultipleLocator(7))
chart = sns.lineplot(x="ObservationDate", y="Infected", hue="Province_State",  data =d_global,  ax=ax)

plt.ylim(bottom=500)
plt.tight_layout()
plt.show()

# Figure 2: the top 20 provinces, five per panel, in a 2x2 grid (row-major).
fig, axs = plt.subplots(2, 2,figsize=(18,9), sharey=True, sharex=False )

for panel, first in zip(axs.flat, range(0, 20, 5)):
    infected_plot ( d, provinces[first:first + 5], panel )

plt.ylim(bottom=500)
plt.tight_layout()
plt.show()

In [None]:
def death_plot ( d, p, ax ):
    """Draw the "Day_Death_Mean" series of the provinces listed in ``p`` on ``ax``.

    The y-axis stays linear and a major x tick is placed every 14 positions.
    One line is drawn per Province_State.
    """
    subset = d.query("Province_State.isin(@p)",engine="python")
    ax.xaxis.set_major_locator(MultipleLocator(14))
    sns.lineplot(x="ObservationDate", y="Day_Death_Mean", hue="Province_State", data=subset, ax=ax)

# Keep, for every location, the single row with the highest Death count.
data = load_data ()
data = data.sort_values(by="Death",ascending=False)
data = data.drop_duplicates(["Province_State","Country_Region"], keep="first")

# Midnight of the current day. pd.datetime was removed in pandas 2.0, so use
# pd.Timestamp (a datetime subclass) instead.
today = pd.Timestamp.now().normalize()
data ["Age"] = today - data ["ObservationDate"]

# Drop locations whose peak row is older than five days; Hubei is always kept.
data = data [ (data ["Age"] < pd.Timedelta(days=5)) | (data["Province_State"] == "Hubei")  ]

# The 24 locations with the most deaths, in descending order.
provinces = list(data.head(24)["Province_State"].values)

top=data.query("Province_State.isin(@provinces)",engine='python')  [["ObservationDate", "Country_Region","Province_State","Confirmed", "Infected", "Recovered", "Death", "beta"]].reset_index(drop=True)


d = load_data ()

# 7-observation rolling mean of daily deaths per province. It must be computed
# on rows in chronological order: rolling() works on row position, so running
# it after the sort by Death below would average over the wrong neighbours.
# The result carries the original row labels, so assignment realigns it.
d["Day_Death_Mean"] = (
    d.sort_values(by="ObservationDate")
    .groupby("Province_State")["Day_Death"]
    .rolling(7)
    .mean()
    .reset_index(level=0, drop=True)
)

# Order rows so that the locations with the most deaths come first.
d = d.sort_values(by="Death",ascending=False)

# Dates become "MM/DD" labels for the x-axis.
d["ObservationDate"] = d["ObservationDate"].dt.strftime("%m/%d")

# Worldwide totals per date.
d_global = d.groupby(["ObservationDate"]).agg({
    "Confirmed":'sum',
    "Recovered": 'sum',
    "Infected": 'sum',
    "Death": 'sum',
    "Day_Death": 'sum',
}).reset_index()
d_global["Province_State"]="all countries"

# groupby returns the dates sorted, so this window runs over consecutive labels.
d_global["Rolling_Mean"]=d_global["Day_Death"].rolling(7).mean()

# Figure 1: global daily deaths with their 7-day rolling mean (linear scale).
fig, ax = plt.subplots(1, 1,figsize=(18,4))
fig.suptitle("Day_Death")
ax.xaxis.set_major_locator(MultipleLocator(7))
chart = sns.lineplot(x="ObservationDate", y="Day_Death", hue="Province_State",  data =d_global,  ax=ax)
chart = sns.lineplot(x="ObservationDate", y="Rolling_Mean",  data =d_global,  ax=ax)

plt.tight_layout()
plt.show()

# Figure 2: smoothed daily deaths of the top 20 provinces, five per panel.
fig, axs = plt.subplots(2, 2,figsize=(18,9), sharey=True, sharex=False )

death_plot ( d, provinces[0:5], axs[0,0] )
death_plot ( d, provinces[5:10], axs[0,1] )
death_plot ( d, provinces[10:15], axs[1,0] )
death_plot ( d, provinces[15:20], axs[1,1] )

plt.tight_layout()
plt.show()

The only way to slow the spread of COVID-19 - until a vaccine is developed - is to take actions that decrease the probability of contact.

It is important to measure this probability over time to evaluate the validity of those actions.


Let's see how - using a simple variation of the [SIR Model](https://mathworld.wolfram.com/Kermack-McKendrickModel.html) - it's possible to track the change of probability of contact over time.


### A SIR Model

\\(C(t) = I(t) + R(t) + D(t)\\)

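\\(I(t+1) = I(t) + {Pc(t) \cdot Ps(t) \cdot S(t) \cdot I(t)\over N} - \Delta D(t) - \Delta R(t) \\)

Here \\(\Delta D(t) = D(t+1) - D(t)\\) and \\(\Delta R(t) = R(t+1) - R(t)\\) are the deaths and recoveries newly recorded in that step; \\(D\\) and \\(R\\) themselves are cumulative, so only their increments leave the infected pool.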

where

* \\(C(t)\\) is the cumulative number of confirmed cases up to date \\(t\\)
* \\(I\\) is the number of currently infected (active) cases, i.e. confirmed cases that have neither recovered nor died
* \\(R\\) is the cumulative number of recovered cases
* \\(D\\) is the cumulative number of deaths 
* \\(S\\) is the number of people still susceptible to the virus
* \\(N\\) is the population
* \\(Ps\\) is the probability of spreading the virus (when a susceptible person is in contact with an infected one)
* \\(Pc\\) is the probability of contact between susceptible and infected people



### A proxy for the probability of contact over time 

Assume that 

* \\({S(t)\over N} \approx 1 \\)    that is, the whole population is susceptible to the virus and the confirmed cases are few compared with the whole population.  
* \\(Ps(t)=Ps\\)    that is, the probability of spreading is constant. 

we have that \\(\beta=Pc(t).Ps\\) is a good proxy of the probability of contact over time:


\\(\beta = Pc(t).Ps={C(t+1)-C(t)\over I(t)} \\)



# Current value for \\(\beta\\)

First of all, let's show the current value of \\(\beta=Pc(t)*Ps\\) for countries/provinces with high values of confirmed cases.


In [None]:
# Display the selected top provinces with their counts and beta.
top

# History of \\(\beta\\) for some countries

Let's see the \\(\beta\\) trend for some countries/provinces in relation to the quarantine policies applied there.

I extracted the starting date of each country's lockdown from the Wikipedia page on [Coronavirus quarantines](https://en.wikipedia.org/wiki/National_responses_to_the_2019%E2%80%9320_coronavirus_pandemic).

* "L" is the start of National lockdown for the country
* "P" is the start of Partial lockdown (City, Regional, ...) for the country

In [None]:
# Download the Wikipedia page that lists national lockdowns. A timeout keeps
# the cell from hanging forever, and raise_for_status() stops the notebook on
# an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/National_responses_to_the_2019–20_coronavirus_pandemic', timeout=30)
response.raise_for_status()
website_url = response.text
soup = BeautifulSoup(website_url,'lxml')

# First table carrying the classes of the lockdown overview table.
tag = soup.find('table',{'class':'wikitable sortable mw-collapsible'})
if tag is None:
    # Fail here with a clear message rather than later inside table_to_2d.
    raise ValueError("Lockdown table not found on the Wikipedia page; the page layout may have changed.")


def clean_country (x):
    """Rename the two countries this notebook spells differently; pass anything else through.

    "United States" becomes "US" and "Czech Republic" becomes "Czechia".
    """
    renames = (
        ("United States", "US"),
        ("Czech Republic", "Czechia"),
    )
    for source_name, target_name in renames:
        if x == source_name:
            return target_name

    return x

#https://stackoverflow.com/questions/48393253/how-to-parse-table-with-rowspan-and-colspan
def table_to_2d(tag):
    """Flatten an HTML table element into a DataFrame, expanding row/column spans.

    ``tag`` must offer ``find_all`` and its cells ``get`` / ``get_text`` (as a
    BeautifulSoup table element does). The table is processed in two passes:
    the first determines how many columns the grid needs, the second writes
    each cell's text into every grid position covered by its rowspan and
    colspan. Cell text is cleaned by removing newlines, bracketed numeric
    markers such as "[12]", and leading/trailing spaces.

    Returns
    -------
    pandas.DataFrame
        Built from the grid without its first two rows and its last row,
        with columns Country_Region, Province_State, Start_Date, End_Date,
        Level.
        NOTE(review): this requires the grid to be exactly five columns wide
        and presumably skips header/footer rows - confirm against the page.
    """
    rowspans = []  # track pending rowspans
    rows = tag.find_all('tr')

    # first scan, see how many columns we need
    colcount = 0
    for r, row in enumerate(rows):
        cells = row.find_all(['td', 'th'], recursive=False)
        # count columns (including spanned).
        # add active rowspans from preceding rows
        # we *ignore* the colspan value on the last cell, to prevent
        # creating 'phantom' columns with no actual cells, only extended
        # colspans. This is achieved by hardcoding the last cell width as 1.
        # a colspan of 0 means "fill until the end" but can really only apply
        # to the last cell; ignore it elsewhere.
        colcount = max(
            colcount,
            sum(int(c.get('colspan', 1)) or 1 for c in cells[:-1]) + len(cells[-1:]) + len(rowspans))
        # update rowspan bookkeeping; 0 is a span to the bottom.
        rowspans += [int(c.get('rowspan', 1)) or len(rows) - r for c in cells]
        # one row consumed: decrement every pending span, drop the finished ones
        rowspans = [s - 1 for s in rowspans if s > 1]

    # it doesn't matter if there are still rowspan numbers 'active'; no extra
    # rows to show in the table means the larger than 1 rowspan numbers in the
    # last table row are ignored.

    # build an empty matrix for all possible cells
    table = [[None] * colcount for row in rows]

    # fill matrix from row data
    rowspans = {}  # track pending rowspans, column number mapping to count
    for row, row_elem in enumerate(rows):
        span_offset = 0  # how many columns are skipped due to row and colspans 
        for col, cell in enumerate(row_elem.find_all(['td', 'th'], recursive=False)):
            # adjust for preceding row and colspans
            col += span_offset
            # skip grid columns still occupied by a rowspan from an earlier row
            while rowspans.get(col, 0):
                span_offset += 1
                col += 1

            # fill table data
            rowspan = rowspans[col] = int(cell.get('rowspan', 1)) or len(rows) - row
            colspan = int(cell.get('colspan', 1)) or colcount - col
            # next column is offset by the colspan
            span_offset += colspan - 1
            value = cell.get_text()
            # copy the cell text into every grid position the cell covers
            for drow, dcol in product(range(rowspan), range(colspan)):
                try:
                    # strip newlines and "[n]" markers, then leading/trailing spaces
                    # (re-applied on every covered position)
                    value =  re.sub(r'\[\d+\]', '',value.replace("\n",""))
                    value = re.sub (r'^ +','', value)
                    value = re.sub (r' +$','', value)
                    table[row + drow][col + dcol] = value
                    # mark each covered column as occupied for the following rows
                    rowspans[col + dcol] = rowspan
                except IndexError:
                    # rowspan or colspan outside the confines of the table
                    pass

        # update rowspan bookkeeping
        rowspans = {c: s - 1 for c, s in rowspans.items() if s > 1}
    
    
    # keep grid rows 2 .. second-to-last and label the five columns
    df = pd.DataFrame (
        data = np.array(table) [2:-1,:], 
        columns = ["Country_Region", "Province_State", "Start_Date", "End_Date", "Level"]
    )
    return df

df_lockdown = table_to_2d (tag)

# Align Wikipedia's country names with the spellings used in the JHU data.
df_lockdown["Country_Region"]=df_lockdown["Country_Region"].map ( clean_country )
df_lockdown["Province_State"]=df_lockdown["Province_State"].map ( clean_country )

# Hand-written Hubei entry; it is defined but not appended to df_lockdown.
# NOTE(review): Start_Date and End_Date use different date formats.
hubei = pd.DataFrame ({
    "Country_Region" : ["China"],
    "Province_State" : ["Hubei"],
    "Start_Date" : ["2020-01-24"],
    "End_Date" : ["08-04-2020"],
    "Level" : ["National"]
})

# "National" rows are full lockdowns; every other level counts as partial.
df_lockdown["Lockdown"] = df_lockdown["Level"].map(lambda x: "Lockdown" if x == "National" else "Partial") 

# Remove every single-letter footnote marker ("[a]", "[b]", ...) before
# parsing. A regex covers all letters, whereas the former hand-written chain
# of replacements skipped "[h]".
start_dates = df_lockdown ["Start_Date"].map(lambda x: re.sub(r"\[[a-z]\]", "", str(x)))
df_lockdown ["ObservationDate"] = pd.to_datetime(start_dates, format="%Y-%m-%d")

# Same "MM/DD" labels as the case data, one row per country and date,
# and the lockdown kind shortened to its first letter ("L" / "P").
df_lockdown["ObservationDate"] = df_lockdown["ObservationDate"].dt.strftime("%m/%d")
df_lockdown = df_lockdown.drop_duplicates(["Country_Region","ObservationDate"])
df_lockdown["Lockdown"] = df_lockdown["Lockdown"].fillna("").map(lambda x: x[0] if len(x) > 0 else "")

# Written once, after all transformations (an earlier intermediate write to
# the same file was always overwritten by this one).
df_lockdown.to_csv("wikipedia_lockdown.csv",index=False)

In [None]:
provinces = top["Province_State"].values

data = load_data ()

# Rows of the selected provinces once they passed 500 confirmed cases.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `data`.
d = data [ (data ["Confirmed"] > 500) & data["Province_State"].isin(provinces) ].copy()

# Cap beta to [0, 0.20] so that outliers (including inf) do not flatten the colour scale.
d["beta"] = d["beta"].clip(0,0.20)

d["ObservationDate"] = d["ObservationDate"].dt.strftime("%m/%d")

# Left join: every case row is kept; Lockdown is set only on a lockdown start date.
d = d.merge(df_lockdown [["Country_Region","ObservationDate", "Lockdown"]] , on=["Country_Region", "ObservationDate"], how="left")

# Two grids of identical shape: beta values and the lockdown letters to overlay.
hmap = d.pivot(index="Province_State", columns="ObservationDate", values="beta")
annot = d.pivot(index="Province_State", columns="ObservationDate", values="Lockdown").fillna("")

fig, ax = plt.subplots(figsize=(20,10))

sns.heatmap(hmap, cmap="Blues", ax=ax, annot = annot.values, fmt = '', annot_kws={"color": 'white', 'size': 16, "weight": "bold"})
plt.title("History of beta for some countries/provinces")
plt.show()

Let's see what happens when the series are shifted so that each region's maximum beta falls on the same day.

In [None]:
data = load_data ()

# Rows of the selected provinces once they passed 500 confirmed cases.
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of `data`.
d = data [ (data ["Confirmed"] > 500) & data["Province_State"].isin(provinces) ].copy()

d["ObservationDate"] = d["ObservationDate"].dt.strftime("%m/%d")
d = d.merge(df_lockdown [["Country_Region","ObservationDate", "Lockdown"]] , on=["Country_Region", "ObservationDate"], how="left")

# Wide frames, one column per province. `df` is re-indexed 0..n-1 so that its
# index is a day counter (drop must be the boolean True, not the string "True").
df = d.pivot(index="ObservationDate", columns="Province_State", values="beta").reset_index(drop=True)
df_annot = d.pivot(index="ObservationDate", columns="Province_State", values="Lockdown").fillna("")

# Both frames share the same positional day counter.
df["day"] = df.index
df_annot["day"] = df.index

# Row position of each province's maximum beta.
idx_max = df.idxmax()

# Shift every province (except Hubei) up so that its maximum beta lands on
# day 0; the lockdown annotations are shifted by the same amount.
for c in provinces:
    if c == "day" or c == "Hubei":
        continue
    idx = idx_max[c]
    df[c] = df[c].shift(-idx)
    df_annot[c] = df_annot[c].shift(-idx)

# Back to long format for pivoting into heatmap grids.
d = pd.melt(df,id_vars=["day"], value_vars=provinces, value_name="beta"   )
d["beta"] = d["beta"].clip(0,0.20)

d_annot = pd.melt(df_annot,id_vars=["day"], value_vars=provinces, value_name="Lockdown"   )

# Only the first 170 aligned days are shown.
d = d.query('day < 170')
d_annot = d_annot.query('day < 170')

fig, ax = plt.subplots(figsize=(20,10))

hmap = d.pivot(index="Province_State", columns="day", values="beta")
annot = d_annot.pivot(index="Province_State", columns="day", values="Lockdown").fillna("")

sns.heatmap(hmap, cmap="Blues", ax=ax,  annot = annot.values, fmt = '', annot_kws={"color": 'white', 'size': 16, "weight": "bold"})

plt.title("Relative History of beta for some countries/provinces")
plt.show()

## Citymapper Mobility Index

Data from [Citymapper](https://citymapper.com/CMI) 

In [None]:
# Read the Citymapper snapshot; the column headers sit on the fourth line below the preamble (header=3).
mobility_index= pd.read_csv("Citymapper_Mobility_Index_20200809.csv", header=3)
parsed_dates = pd.to_datetime(mobility_index ["Date"], format="%Y-%m-%d")
mobility_index ["Date"] = parsed_dates.dt.strftime("%m/%d")

fig, ax = plt.subplots(figsize=(20,10))

# One row per remaining CSV column, one column per "MM/DD" date, values as float32.
df = mobility_index.set_index("Date").T.astype(np.float32)

# Values above 1.0 are capped so they do not dominate the colour scale.
sns.heatmap(df.clip(upper=1.0), cmap="Blues", ax=ax, )

plt.title("Citymapper Mobility Index")
plt.show()

*to be continued*