In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor

### This article explains what the effect was of good- or bad communication:

https://theconversation.com/coronavirus-one-year-on-two-countries-that-got-it-right-and-three-that-got-it-wrong-155923

#### Good communication: South Korea and Ghana
#### Bad communication: UK, Brazil and India

### This article explains policy fatigue
PandemicPolicyFatigue_EBA_RG_AP_final.pdf


### This article is about lessons to learn
https://theconversation.com/six-lessons-the-uk-should-have-learned-one-year-on-from-its-first-lockdown-157518

## 1. Load data 'CORONAVIRUS GOVERNMENT RESPONSE TRACKER'  from University of Oxford

In [None]:
# Link to the CORONAVIRUS GOVERNMENT RESPONSE TRACKER data from the University of Oxford
link = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'

# text columns (kept as strings, missing region stays NaN)
name_columns = ['CountryName', 'RegionName']
# numeric columns used in this notebook
policy_columns = ['Date','C1_School closing','C2_Workplace closing',
                  'C3_Cancel public events','C4_Restrictions on gatherings',
                  'C5_Close public transport','C6_Stay at home requirements',
                  'C7_Restrictions on internal movement','C8_International travel controls',
                  'ConfirmedCases', 'H1_Public information campaigns',
                  'H2_Testing policy','H3_Contact tracing',
                  'H4_Emergency investment in healthcare','H5_Investment in vaccines',
                  'H6_Facial Coverings', 'H7_Vaccination policy',
                  'H7_Flag','H8_Protection of elderly people']

# Download and parse the file ONCE. Reading it several times is slow and the
# positional concat below is only valid when all parts come from the same snapshot.
df_data_all = pd.read_csv(link, keep_default_na=True,
                          dtype={name: 'str' for name in name_columns})

# Keep the file's own column order (same order read_csv(usecols=...) would give),
# because later cells select the policy columns by position.
df_data1 = df_data_all[name_columns].copy()
df_data2 = df_data_all[[col for col in df_data_all.columns if col in policy_columns]]

# missing indicators / case counts are treated as 0 so everything fits in int32
df_data2 = df_data2.fillna(0).astype('int32')

df_data = pd.concat([df_data1, df_data2], axis=1)

# Replace the yyyymmdd integer by a real timestamp; pop + assign moves 'Date'
# to the last column, which is where the following cells expect it.
# A malformed date now raises instead of silently leaving integers behind.
df_data['Date'] = pd.to_datetime(df_data.pop('Date'), format='%Y%m%d')



In [None]:
df_data.head()

In [None]:
df_data.tail()

### This article explains what the effect was of good- or bad communication:

https://theconversation.com/coronavirus-one-year-on-two-countries-that-got-it-right-and-three-that-got-it-wrong-155923

#### Good communication: South Korea and Ghana
#### Bad communication: UK, Brazil and India

In [None]:
def plot_country_C(country):
    '''
    Plot the daily new cases of one country together with its
    containment and closure policies (columns C1..C8).

    input:
    country: value of 'CountryName' in the global df_data

    Every policy level is multiplied by the mean number of daily cases so that
    the (small) ordinal levels are visible on the same axis as the case counts.
    '''
    df = df_data[df_data.CountryName == country].copy()

    # Some countries also carry one series per region. Mixing those rows makes
    # diff() jump between regions, so use the national rows when they exist.
    national = df.RegionName.isna()
    if national.any():
        df = df[national].copy()

    # cumulative -> daily; corrections that lower the cumulative count are clamped to 0
    df['dayCases'] = df['ConfirmedCases'].diff().clip(lower=0)
    df = df[1:]  # first row has no previous day

    # select the C1..C8 indicators by name instead of by column position
    cols = [c for c in df.columns if c[:1] == 'C' and c[1:2].isdigit()]

    plt.figure(figsize=(20,15))
    x = df.Date
    y1 = df.dayCases
    plt.plot(x, y1, label='daily cases')
    scale = y1.mean()
    for i in cols:
        plt.plot(x, df[i]*scale, label=i)
    plt.title('daily cases and Containment and closure policies')
    plt.legend()
    plt.show()



In [None]:
def plot_country_H(country):
    '''
    Plot the daily new cases of one country together with its
    health system policies (columns H1..H8 and H7_Flag).

    input:
    country: value of 'CountryName' in the global df_data

    H4 and H5 are left out, as in the original selection. Every policy level is
    multiplied by the mean number of daily cases so that the ordinal levels are
    visible on the same axis as the case counts.
    '''
    df = df_data[df_data.CountryName == country].copy()

    # Some countries also carry one series per region. Mixing those rows makes
    # diff() jump between regions, so use the national rows when they exist.
    national = df.RegionName.isna()
    if national.any():
        df = df[national].copy()

    # cumulative -> daily; corrections that lower the cumulative count are clamped to 0
    df['dayCases'] = df['ConfirmedCases'].diff().clip(lower=0)
    df = df[1:]  # first row has no previous day

    # select the H indicators by name instead of by column position
    skipped = ('H4_Emergency investment in healthcare', 'H5_Investment in vaccines')
    cols = [c for c in df.columns
            if c[:1] == 'H' and c[1:2].isdigit() and c not in skipped]

    plt.figure(figsize=(20,15))
    x = df.Date
    y1 = df.dayCases
    plt.plot(x, y1, label='daily cases')
    scale = y1.mean()
    for i in cols:
        plt.plot(x, df[i]*scale, label=i)
    plt.title('daily cases and Health system policies')
    plt.legend()
    plt.show()



In [None]:
plot_country_C('South Korea')

In [None]:
# the country is called 'South Korea' (with a space) in the data, as in the C-plot above
plot_country_H('South Korea')

In [None]:
plot_country_C('Ghana')

In [None]:
plot_country_H('Ghana')

In [None]:
plot_country_C('Brazil')

In [None]:
plot_country_H('Brazil')

In [None]:
plot_country_C('India')

In [None]:
plot_country_H('India')

In [None]:
plot_country_C('United Kingdom')

In [None]:
plot_country_H('United Kingdom')

In [None]:
plot_country_C('Belgium')

In [None]:
plot_country_H('Belgium')

In [None]:
plot_country_C('United Arab Emirates')

In [None]:
plot_country_H('United Arab Emirates')

In [None]:
plot_country_C('Israel')

In [None]:
plot_country_H('Israel')

In [None]:
# collecting the data as one feature in a new df_data_control with country as index and 'control_score' and 'first_action_delay'
# cumulate the Closing and Controls (C1..C8) per country
# for countries with Regions cumulate and divide by total number of Regions per Country
# access the number of days between the first case and the start of the first restriction C2/C4/C5/C6 any come first 
def count_restrictions(df):
    '''
    Summarise the policy response of one country or region.

    input:
    df: all rows of ONE country/region. The columns from position 2 up to
        (not including) the last two are treated as policy indicators;
        'ConfirmedCases' and 'Date' must be present.

    output: (first_action_delay, control_score)
    first_action_delay: number of days between the first row with
        ConfirmedCases > 0 and the earliest first row on which any indicator
        reached level 2 or higher. Negative when the action came before the
        first case. 365.0 when there are no cases or no indicator ever reached 2.
    control_score: sum of all indicator values over all rows.
    '''
    # NOTE(review): with the frame built in this notebook this slice also covers
    # the H* columns (incl. H7_Flag), not only C1..C8 - confirm that is intended.
    restrictions = df.columns.values[2:-2]
    control_score = sum(df[C].sum() for C in restrictions)

    rows_with_cases = df[df['ConfirmedCases'] > 0]
    if rows_with_cases.empty:
        return (365.0, control_score)
    date_of_first_case = rows_with_cases['Date'].iloc[0]

    delays = []
    for C in restrictions:
        strict_rows = df[df[C] >= 2]
        if strict_rows.empty:
            # this indicator never reached level 2; the remaining indicators
            # still have to be checked, so skip it instead of stopping the loop
            continue
        diff = strict_rows['Date'].iloc[0] - date_of_first_case
        delays.append(diff / pd.Timedelta(days=1))

    first_action_delay = float(min(delays)) if delays else 365.0
    return (first_action_delay, control_score)

# one row per country: delay until the first strict measure and the summed policy levels
country_as_index = df_data.CountryName.unique()
df_data_control = pd.DataFrame(index=country_as_index, columns=['first_action_delay', 'control_score'])

for ci in country_as_index:
    df_country = df_data[df_data.CountryName == ci].copy()
    regions = df_country.RegionName.unique()
    if len(regions) == 1:
        first_action_delay, control_score = count_restrictions(df_country)
        # measures taken before the first case are treated like "no action"
        if first_action_delay < 0:
            first_action_delay = 365
        df_data_control.at[ci, 'first_action_delay'] = first_action_delay
        df_data_control.at[ci, 'control_score'] = control_score

    else:
        # country with regional rows: average the per-region results
        print(ci)
        print(len(regions))
        first_action_delays = []
        control_scores = []
        for region in regions:
            # National rows have RegionName NaN, and NaN never compares equal to
            # itself, so they must be selected with isna(). Select inside
            # df_country so equally named regions of other countries are excluded.
            if pd.isna(region):
                in_region = df_country.RegionName.isna()
            else:
                in_region = df_country.RegionName == region
            first_action_delay, control_score = count_restrictions(df_country[in_region])
            first_action_delays.append(first_action_delay)
            control_scores.append(control_score)
        df_data_control.at[ci, 'first_action_delay'] = np.mean(first_action_delays)
        df_data_control.at[ci, 'control_score'] = np.mean(control_scores)

df_data_control = df_data_control.astype('float32')
df_data_control.info()

print(df_data_control[df_data_control.index == 'United States'])
print()
print(df_data_control[df_data_control.index == 'Belgium'])

## 2. load Covid-19 data from CSSEGISandData

In [None]:
# CSSE time series (one row per province/country, one column per day)
link_cases_raw = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
link_deaths_raw = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'

df_corona_2 = pd.read_csv(link_deaths_raw)
df_corona_3 = pd.read_csv(link_cases_raw)

# Keep the country column (position 1) and the most recent day (last column),
# add up the provinces of each country and expose the country as column 'Country'.
temp = df_corona_2.iloc[:,[1,-1]].copy()
df_corona_deaths = (
    temp.groupby(temp['Country/Region'], as_index=True).sum()
        .reset_index()
        .rename(columns={'Country/Region':'Country'})
)

temp = df_corona_3.iloc[:,[1,-1]].copy()
df_corona_cases = (
    temp.groupby(temp['Country/Region'], as_index=True).sum()
        .reset_index()
        .rename(columns={'Country/Region':'Country'})
)

print('deaths')
print(df_corona_deaths.head(3))
print()
print('confirmed cases')
print(df_corona_cases.head(3))

In [None]:
# show the tail of both inputs, then put deaths and cases side by side
print('deaths')
print(df_corona_deaths.tail(3))
print()
print('confirmed cases')
print(df_corona_cases.tail(3))

# last column of the cases frame = cumulative cases on the most recent day
latest_cases = df_corona_cases.iloc[:, -1]
df_corona = pd.concat([df_corona_deaths, latest_cases], axis=1)
df_corona.columns = ['Country', 'deaths', 'cases']

print('deaths')
print(df_corona.tail(3))


## 3. Load the data from the World Bank

In [None]:
# World Bank health indicators (long format: one row per country and series)
file = '../input/covid19correlationswithdatafromworldbank/World_Bank_Data_all_health.csv'
df_health = pd.read_csv(file)

# how many distinct series (data items) does the file contain?
number_of_series = df_health['Series Code'].nunique(dropna=False)
print('Total number of different data items:' , number_of_series)

### The country names in the two DataFrames are not the same
Make a dict with the names to be changed as keys and the replacement names as values.

In [None]:
# Mapping used to align country names between data sets:
# key   = country name as spelled in the World Bank data
# value = country name as spelled in df_corona (CSSE data)

dict_country_names_data_corona = {
    'Bahamas, The' : 'Bahamas',
    'Brunei Darussalam' : 'Brunei',
    'Congo, Rep.' : 'Congo (Brazzaville)',
    'Congo, Dem. Rep.'   : 'Congo (Kinshasa)',
    'Czech Republic' : 'Czechia',
    'Egypt, Arab Rep.' : 'Egypt',
    'Gambia, The' : 'Gambia',
    'Iran, Islamic Rep.' : 'Iran',
    'Korea, Rep.' : 'Korea, South',
    'Kyrgyz Republic' : 'Kyrgyzstan',
    'Lao PDR' : 'Laos',
    'Russian Federation' : 'Russia',
    'Slovak Republic' : 'Slovakia',
    'Syrian Arab Republic' : 'Syria',    
    'United States' : 'US',
    'Venezuela, RB' : 'Venezuela',
    'Yemen, Rep.' : 'Yemen'
        }

# change the names of the countries in df_health so that thy align with df_corona
def change_name(df, country, dictionary):
    '''
    Changes the names in the country column of df following the dictionary.

    input:
    df:  DataFrame where country names have to be changed (modified in place)
    country: the column with the country names to be changed
    dictionary: dict with old names as keys and new names as values

    output:
    returns the same DataFrame with the column 'Country' holding the corrected names
    returns a list with the old names that were changed (one entry per changed row)
    '''
    old_names = df[country]
    needs_change = old_names.isin(list(dictionary.keys()))
    list_of_changed_countries = old_names[needs_change].tolist()

    # One whole-column assignment: works for any index and does not rely on
    # chained assignment (df.Country.iat[...] = ...), which may write to a copy.
    df['Country'] = old_names.where(~needs_change, old_names.map(dictionary))

    return df, list_of_changed_countries

# changing the country names
df_health , the_old_names = change_name(df_health, 'Country', dict_country_names_data_corona)
#print(the_old_names)
print(len(dict_country_names_data_corona), len(the_old_names))

## 4. merge health data with covid-19 death cases

In [None]:
temp = df_health[df_health['Series Code']=='SP.POP.TOTL']
temp[temp.Country == 'Belgium'].Value.ravel()[0]

In [None]:
def add_corona_deaths_cases(df_corona, df_totals):
    '''
    Adds the covid-19 totals of df_corona to every row of df_totals, matched on 'Country'.

    input:
    df_corona: one row per country with the columns 'Country', 'deaths' and 'cases'
    df_totals: long-format health data with the columns 'Country', 'Series Code'
               and 'Value'; the rows with Series Code 'SP.POP.TOTL' hold the
               population of each country. Modified in place.

    output:
    df_totals with, for every country that has more than 0 deaths:
        'deaths', 'cases'        : totals from df_corona
        'deaths_pp', 'cases_pp'  : totals per 100000 inhabitants
    Rows of other countries keep their existing 'deaths'/'cases' values.
    When the population of a country is unknown, the per-capita values are NaN.
    '''
    # first entry per country wins, as in a first-match lookup
    corona = df_corona.drop_duplicates('Country').set_index('Country')
    is_population = df_totals['Series Code'] == 'SP.POP.TOTL'
    population_by_country = (df_totals[is_population]
                             .drop_duplicates('Country')
                             .set_index('Country')['Value'])

    # per-row lookups; countries that are not found give NaN
    deaths = df_totals['Country'].map(corona['deaths'])
    cases = df_totals['Country'].map(corona['cases'])
    population = df_totals['Country'].map(population_by_country)

    has_deaths = deaths.gt(0)  # NaN compares False, so unknown countries are skipped

    for column in ('deaths_pp', 'cases_pp'):
        if column not in df_totals.columns:
            df_totals[column] = float('nan')

    df_totals.loc[has_deaths, 'deaths'] = deaths[has_deaths]
    df_totals.loc[has_deaths, 'cases'] = cases[has_deaths]
    df_totals.loc[has_deaths, 'deaths_pp'] = (deaths * 100000 / population)[has_deaths]
    df_totals.loc[has_deaths, 'cases_pp'] = (cases * 100000 / population)[has_deaths]
    return df_totals

# start from the health data; -1.0 marks "no covid data attached yet"
df_totals = df_health.assign(deaths=-1.0, cases=-1.0)

df_totals = add_corona_deaths_cases(df_corona, df_totals)

sort_keys = ['Series Code', 'Country']

# keep only the countries that received covid data
df_totals = df_totals.sort_values(sort_keys, axis='index')
df_totals = df_totals[df_totals['deaths'] >= 0. ].reset_index(drop=True)

# drop the rows without an indicator value and report the size before/after
indexes_tot_drop = df_totals[df_totals.Value.isna() ].index.ravel()
print(df_totals.shape)
df_totals = df_totals.drop(indexes_tot_drop, axis='index')
print(df_totals.shape)

df_totals = df_totals.sort_values(sort_keys, axis='index').reset_index(drop=True)
df_totals.head()

In [None]:
# what are the series code

all_Series_Codes = df_totals['Series Code'].unique()
print(len(all_Series_Codes))

In [None]:
# For every series code, collect the row labels of df_totals that belong to it.
# The result maps code -> array of row labels; its length is the number of
# rows (countries) available for that series.
dict_S_Code_indexes = dict()
for code in all_Series_Codes:
    df_temp = df_totals[df_totals['Series Code'] == code]
    dict_S_Code_indexes[code] = df_temp.index.ravel()
    
for key, value in dict_S_Code_indexes.items():
    print('Series Code:', key, 'number of countries:', len(value))

# Print the population of every country (series 'SP.POP.TOTL').
# NOTE: the loop variables (country, dummy, index) stay defined at notebook level after this cell.
for country in df_totals.Country.unique():
    # boolean Series over this country's rows: True where the row is the population series
    dummy = (df_totals[df_totals.Country == country]['Series Code']=='SP.POP.TOTL')
    index = dummy[dummy].index.ravel()
    if len(index) > 0:
        index= index[0]
        # df_totals[index:index+1] slices by position - presumably equal to the label
        # here because df_totals was re-indexed 0..n-1 earlier; verify if that changes
        print(index, ';',country, 'population:', df_totals[index:index+1].Value.ravel()[0])
    else:
        # no population row for this country: show all of its rows instead
        print(country, df_totals[df_totals.Country== country])

In [None]:
# normalize unnormalized series by the population of the country:
# Gross domestic product 2019 (millions of US dollars)   :   GDP_USdollars
# Population ages 65 and above, total     : SP.POP.65UP.TO
# (Smoking prevalence, total, ages 15+ : SH.PRV.SMOK is not in the list below)
# NOTE: this cell rescales df_totals in place - run it only once per load.
series_to_normalize = ['GDP_USdollars', 'SP.POP.65UP.TO']

# population per country, taken from the 'SP.POP.TOTL' rows (first row per country)
population_by_country = (df_totals[df_totals['Series Code'] == 'SP.POP.TOTL']
                         .drop_duplicates('Country')
                         .set_index('Country')['Value'])

needs_scaling = df_totals['Series Code'].isin(series_to_normalize)
population = df_totals.loc[needs_scaling, 'Country'].map(population_by_country)

# value * 100 / population; countries without a population row get 0.
# Written with .loc on the frame itself: chained assignment
# (df_totals['Value'].at[...] = ...) may write to a temporary copy.
df_totals.loc[needs_scaling, 'Value'] = (
    df_totals.loc[needs_scaling, 'Value'] * 100 / population
).fillna(0.)


#import seaborn as sns
##sns.set_theme(style="ticks")
##A = dict_S_Code_indexes[all_Series_Codes[0]].ravel()
#B = dict_S_Code_indexes[all_Series_Codes[27]].ravel()
##C = dict_S_Code_indexes[all_Series_Codes[29]].ravel()
#D = dict_S_Code_indexes[all_Series_Codes[30]].ravel()
#E = dict_S_Code_indexes[all_Series_Codes[31]].ravel()
#indexes = np.concatenate((B,D,E), axis=0)

## Show the results of a linear regression within each dataset
#pl  = sns.lmplot(x="deaths", y="Value", col="Series Code", hue="Series Name", data=df_totals.loc[indexes],
#           col_wrap=2, ci=None, palette="muted", height=4, sharey=False,
#           scatter_kws={"s": 50, "alpha": 1})

#plt.savefig('corona_health_exp_3.jpg')

### Put each series in its own column

In [None]:
def make_df_wide_in_columns(df, code_col):
    '''
    Turns the long-format df (one row per country and series) into a wide
    DataFrame with one row per country and one column per series code.

    input:
    df: long-format data with the columns 'Country', code_col, 'Value',
        'deaths', 'deaths_pp', 'cases' and 'cases_pp'
    code_col: name of the column that holds the series codes

    output: df_wide
    columns: every series code (in order of first appearance), followed by
             'Country', 'deaths', 'deaths_pp', 'cases', 'cases_pp'
    rows:    one per country, in order of first appearance
    A series that is missing for a country is filled with -1.
    '''
    stat_columns = ['deaths', 'deaths_pp', 'cases', 'cases_pp']
    codes = df[code_col].unique().tolist()
    countries = df['Country'].unique()

    # one cell per (country, series); when a pair occurs twice the last row wins
    values = (df.drop_duplicates(['Country', code_col], keep='last')
                .pivot(index='Country', columns=code_col, values='Value')
                .reindex(index=countries, columns=codes)
                .fillna(-1.)
                .astype('float'))

    # the covid totals are repeated on every row of a country; take the last one
    stats = (df.drop_duplicates('Country', keep='last')
               .set_index('Country')[stat_columns]
               .reindex(countries))

    # Exactly one row per country: no pre-allocated spare rows, so no
    # placeholder 'nan' country with -1 values ends up in the statistics.
    df_wide = values
    df_wide['Country'] = countries
    for column in stat_columns:
        df_wide[column] = stats[column].astype('float')

    df_wide = df_wide.reset_index(drop=True)
    df_wide.columns.name = None
    return df_wide

# make wide df
df_totals_wide = make_df_wide_in_columns(df_totals, 'Series Code')

# deaths_pp and cases_pp are both computed as total * 100000 / population,
# i.e. both are expressed per 100.000 inhabitants
print('MAX Corona deaths per 100.000', df_totals_wide['deaths_pp'].max() )
print('MIN Corona deaths per 100.000' , df_totals_wide['deaths_pp'].min()  )
print('Mean Corona deaths per 100.000' , df_totals_wide['deaths_pp'].mean()  )
print('MAX Corona cases per 100.000', df_totals_wide['cases_pp'].max() )
print('MIN Corona cases per 100.000' , df_totals_wide['cases_pp'].min()  )
print('Mean Corona cases per 100.000' , df_totals_wide['cases_pp'].mean()  )

# sort on deaths_pp in ascending order and renumber the rows
df_totals_wide = df_totals_wide.sort_values(['deaths_pp'], axis=0).reset_index(drop=True)

y1 = df_totals_wide['deaths_pp'].values
# divided by 10 (-> cases per 10.000) so that both curves fit on one axis
y2 = df_totals_wide['cases_pp'].values / 10
x = df_totals_wide['Country'].values
plt.figure(figsize=(25,10))

plt.rc('xtick', labelsize=20) 
plt.rc('ytick', labelsize=20) 

# the 40 countries with the highest number of deaths per 100.000
plt.plot(x[-40:], y1[-40:], label='deaths per 100.000')
plt.plot(x[-40:], y2[-40:], label='cases per 10.000')
plt.legend()
plt.grid()
plt.xticks(x[-40:], rotation=90)
plt.show()

In [None]:
# horizontal bar chart: one bar per country, length = deaths per 100000 inhabitants
x = df_totals_wide['Country'].values
y = df_totals_wide['deaths_pp'].values

plt.rc('xtick', labelsize=10) 
plt.rc('ytick', labelsize=10) 
width = 0.8  # the width of the bars

fig, ax = plt.subplots(figsize=(20,40))
rects1 = ax.barh(x, y, width)

ax.set_title('Covid-19 deaths per 100000 inhabitants')

def autolabel(rects, y):
    """Write the value y[i] next to the end of the i-th horizontal bar in *rects*.

    Uses the module-level axes `ax`; the label is anchored at the bar's end
    (its width) and vertically at the middle of the bar.
    """
    for label_nr, rect in enumerate(rects):
        bar_thickness = rect.get_height()
        anchor = (rect.get_width(), rect.get_y() + bar_thickness/2)
        ax.annotate('{:0.1f}'.format(y[label_nr]),
                    xy=anchor,
                    xytext=(20, -5),  # offset in points from the bar end
                    textcoords="offset points",
                    ha='center', va='bottom')


autolabel(rects1, y)

plt.savefig('Covid-19 deaths per 10000 inhabitants_2.png')
plt.show()


## Add control data from Oxford University

In [None]:
# changing the country names
df_data_control = df_data_control.reset_index().rename(columns={'index':'Country'})
print(df_data_control.columns)
df_data_control , the_old_names = change_name(df_data_control, 'Country', dict_country_names_data_corona)
#print(the_old_names)
print(len(dict_country_names_data_corona), len(the_old_names))

# make country the index
df_data_control.index = df_data_control.Country.values


# add control data from Oxford University
df_totals_wide_control = df_totals_wide.copy()
df_totals_wide_control.index = df_totals_wide_control.Country.values

control_columns = ['first_action_delay', 'control_score']
# one control row per country name (first one wins if a name occurs twice)
control = df_data_control.loc[~df_data_control.index.duplicated(), control_columns]

# Countries without Oxford control data are dropped, and reported by name.
# This replaces a try/except that dropped rows on any exception and relied on
# a leftover global variable (`axis=index`).
has_control = df_totals_wide_control.index.isin(control.index)
for country in df_totals_wide_control.index[~has_control]:
    print('no control data for', country, '- row dropped')
df_totals_wide_control = df_totals_wide_control[has_control].copy()

# whole-column assignment instead of chained per-cell writes
for column in control_columns:
    df_totals_wide_control[column] = (
        control[column].reindex(df_totals_wide_control.index).to_numpy()
    )

# all columns except the targets
cols = [c for c in df_totals_wide_control.columns if c not in ('deaths', 'deaths_pp')]

# make a dict with Series Codes as key and Series Name as value.
# Code and name are taken from the same row, so they cannot get misaligned
# when the number of distinct names differs from the number of distinct codes.
dict_Series_Codes = (df_health[['Series Code', 'Series Name']]
                     .drop_duplicates('Series Code')
                     .set_index('Series Code')['Series Name']
                     .to_dict())

dict_Series_Codes['first_action_delay'] = 'number of days between first infection case and government action'
dict_Series_Codes['control_score'] = 'total sum of actions over the total period till now'





In [None]:

from sklearn.linear_model import LinearRegression

x = df_totals_wide_control.cases_pp.values
y = df_totals_wide_control.deaths_pp.values

# fit y = a*x + b on the rows where both values are known
finite = np.isfinite(x) & np.isfinite(y)
# Single feature column: LinearRegression fits the intercept itself, so no
# column of ones is added (its coefficient would just be 0 and is NOT b).
X = x[finite].reshape(-1, 1)
model = LinearRegression().fit(X, y[finite])
a = model.coef_[0]
b = model.intercept_

fig = plt.figure(figsize=(5,5))
plt.scatter(x,y)

# draw the fitted line including its intercept
x_line = np.array([0, x[finite].max()])
plt.plot(x_line, a*x_line + b)
plt.title('cases vs. deaths')
plt.show()


In [None]:
plt.hist(x/y, bins=100);

In [None]:
plt.plot(x/y)
plt.show()
print('mean of ratio cases_pp / deaths_pp', (x/y).mean() )

In [None]:
# There are outliers: countries where cases_pp / deaths_pp is more than 5 times the mean.
# It is simply not plausible that there are countries where cases are 2000 times larger than deaths, or in other
# words where there is only 1 death in 2000 cases of covid-19 when the average is 1 per 100.


In [None]:
df_totals_wide_control['cases_to_deaths'] = df_totals_wide_control.cases / df_totals_wide_control.deaths

In [None]:
# one boxplot per quantity: cases, deaths and their ratio
boxplot_inputs = [
    (x, 'boxplot casses per 100.000 inhabitants'),
    (y, 'boxplot deats per 100.000 inhabitants'),
    (df_totals_wide_control.cases_to_deaths.values, 'boxplot ratio cases to deaths'),
]
for boxplot_values, boxplot_title in boxplot_inputs:
    plt.boxplot(boxplot_values)
    plt.title(boxplot_title)
    plt.show()

We use the Interquartile Range Method to identify the best parameter settings. Data that is higher than the Interquartile maximum or lower than the -minimum is considered an outlier.


## remove outliers via ratio of cases to deaths and IsolationForest
https://ieeexplore.ieee.org/abstract/document/4781136


### removing outliers with the Minimum Covariance Determinant (MCD)
The Minimum Covariance Determinant (MCD) method is a highly robust estimator of multivariate location and scatter, for which a fast algorithm is available. […] It also serves as a convenient and efficient tool for outlier detection. (ref. : — Minimum Covariance Determinant and Extensions, 2017 https://arxiv.org/abs/1709.07045)

### LOF: Identifying Density-based Local Outliers, 2000.
https://dl.acm.org/citation.cfm?id=335388

### One-Class SVM (Support Vector Machine)
Estimating the Support of a High-Dimensional Distribution, 2001.
https://dl.acm.org/citation.cfm?id=1119749


### Standard Deviation Method


    - 1 Standard Deviation from the Mean: 68%
    - 2 Standard Deviations from the Mean: 95%
    - 3 Standard Deviations from the Mean: 99.7%
#### A value that falls outside of 3 standard deviations is part of the distribution, but it is an unlikely or rare event at approximately 1 in 370 samples.



In [None]:
class outlierdetect(object):
    '''
    Runs several scikit-learn outlier detectors on one single-column data set.

    Every method fits its detector on self.X and returns (mask, not_mask):
    mask is True for the rows the detector did not label -1 (inliers),
    not_mask is its boolean inverse (outliers).
    '''
    def __init__(self, X):
        '''
        initialize the detector
        X is the object to reduce: 2-D, more than one row and exactly one column
        '''
        assert X.shape[0] > 1
        assert X.shape[1] == 1
        self.X = X

    @staticmethod
    def _masks(yhat):
        '''turn the detector labels into (inlier mask, outlier mask)'''
        mask = yhat != -1 # select all rows that are not outliers
        return mask, ~mask

    def OneClassSVM(self, nu):
        '''
        nu: An upper bound on the fraction of training errors and a lower bound of
        the fraction of support vectors. Should be in the interval (0, 1].
        returns mask and the boolean inverse not_mask
        '''
        self.nu = nu
        print('nu', self.nu)
        ocsvm = OneClassSVM(nu=self.nu)
        return self._masks(ocsvm.fit_predict(self.X))

    def LocalOutlierFactor(self):
        '''
        Local Outlier Factor with the default settings.
        returns mask and the boolean inverse not_mask
        '''
        lof = LocalOutlierFactor()
        # fit on the data given to this detector, not on a notebook-level X
        return self._masks(lof.fit_predict(self.X))

    def EllipticEnvelope(self, contamination, random_state=None):
        '''
        contamination = The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Range is (0, 0.5)
        random_state: optional seed, forwarded to the estimator for reproducible results
        returns mask and the boolean inverse not_mask
        '''
        self.contamination = contamination
        # contamination must be passed by keyword: positionally it would not
        # land on the contamination parameter
        ee = EllipticEnvelope(contamination=self.contamination, random_state=random_state)
        return self._masks(ee.fit_predict(self.X))

    def IsolationForest(self, contamination, random_state=None):
        '''
        contamination : 'auto' or float
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the threshold
        on the scores of the samples.
        random_state: optional seed, forwarded to the estimator for reproducible results
        returns mask and the boolean inverse not_mask
        '''
        self.contamination = contamination
        IF = IsolationForest(contamination=self.contamination, random_state=random_state)
        return self._masks(IF.fit_predict(self.X))
            

        
        

In [None]:
# test class outlierdetect
X = df_totals_wide_control.loc[:,['cases_to_deaths']]
nu_OneClassSVM = 0.371
EllipticEnvelope_contamination = 0.18
IsolationForest_contamination = 0.4
my_detector = outlierdetect(X)
mask, not_mask = my_detector.IsolationForest(IsolationForest_contamination)
mask[:10], not_mask[:10]

In [None]:
def STD_inliers(df, limit=3):
    '''
    Keeps the rows whose 'cases_to_deaths' lies within limit standard
    deviations of the mean.

    input:
    df: DataFrame with a column 'cases_to_deaths'
    limit: the number the standard deviation is multiplied with

    output:
    the rows of df with  max(mean - limit*std, 0) < cases_to_deaths < mean + limit*std
    '''
    # statistics are taken from the frame that was passed in
    mean_cases_to_deaths = df.cases_to_deaths.mean()
    std_cases_to_deaths = df.cases_to_deaths.std()
    dev = limit*std_cases_to_deaths
    # a ratio cannot be negative, so the lower bound is never below 0
    min_dev = max(mean_cases_to_deaths - dev, 0)
    max_dev = mean_cases_to_deaths + dev
    within = (df.cases_to_deaths < max_dev) & (df.cases_to_deaths > min_dev)
    return df[within]



In [None]:
# the single feature every detector works on
X = df_totals_wide_control.loc[:,['cases_to_deaths']]

# method key -> [name of the reduced frame, parameter of the method]
dict_reduced_methods = {'ocSVM' :["df_ocSVM", 0.371],
                'LOF' : ["df_LOF", 0.],
                'EE_MCD' : ["df_EE_MCD", 0.18],
                'IF' : ["df_IF", 0.4],
                'STD' : ["df_STD", 3.]} 

methods = ['df_ocSVM', 'df_LOF', 'df_EE_MCD', 'df_IF', 'df_STD']
my_detector = outlierdetect(X)
dict_reduced = {}

# run every method once and store the inlier rows under the frame name
for key, value in dict_reduced_methods.items():
    frame_name, setting = value
    reduced = df_totals_wide_control
    if key == 'STD':
        reduced = STD_inliers(reduced, limit=3)
        shape_label = 'STD shape:'
    elif key == 'ocSVM':
        mask, not_mask = my_detector.OneClassSVM(setting)
        reduced = reduced.loc[mask]
        shape_label = 'ocSVM shape'
    elif key == 'LOF':
        mask, not_mask = my_detector.LocalOutlierFactor()
        reduced = reduced.loc[mask]
        shape_label = 'LOF shape'
    elif key == 'EE_MCD':
        mask, not_mask = my_detector.EllipticEnvelope(setting)
        reduced = reduced.loc[mask]
        shape_label = 'EE_MCD shape'
    elif key == 'IF':
        mask, not_mask = my_detector.IsolationForest(setting)
        reduced = reduced.loc[mask]
        shape_label = 'IF shape'
    else:
        shape_label = None
    dict_reduced[frame_name] = reduced
    if shape_label is not None:
        print(shape_label, reduced.shape)



        


In [None]:
# compare the full ratio distribution with the 3x-standard-deviation reduction
X_1D = X.cases_to_deaths.values
STD_1D = dict_reduced['df_STD'].cases_to_deaths.values
all_data = [X_1D, STD_1D]
tick_positions = list(range(1, len(all_data) + 1))

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 8))
violin_ax, box_ax = axes[0], axes[1]

# left: violin plot
violin_ax.violinplot(all_data,
                     showmeans=False,
                     showmedians=True)
violin_ax.set_title('violin plot')

# right: box plot
box_ax.boxplot(all_data)
box_ax.set_title('box plot')

# horizontal grid lines and axis labels on both panels
for ax in axes:
    ax.yaxis.grid(True)
    ax.set_xticks(tick_positions)
    ax.set_xlabel('')
    ax.set_ylabel('cases/deaths ratio')

# name the two data sets on the x-axis
plt.setp(axes, xticks=tick_positions,
         xticklabels=['all data', 'STD reduced with 3x standard deviation'])
plt.show()

In [None]:
# compare the ratio distributions that remain after each detector
ocSVM_1D = dict_reduced['df_ocSVM'].cases_to_deaths.values
LOF_1D = dict_reduced['df_LOF'].cases_to_deaths.values
EE_1D = dict_reduced['df_EE_MCD'].cases_to_deaths.values
IF_1D = dict_reduced['df_IF'].cases_to_deaths.values
all_data = [ocSVM_1D, LOF_1D, EE_1D, IF_1D]
tick_positions = list(range(1, len(all_data) + 1))

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 8))
violin_ax, box_ax = axes[0], axes[1]

# left: violin plot
violin_ax.violinplot(all_data,
                     showmeans=False,
                     showmedians=True)
violin_ax.set_title('violin plot')

# right: box plot
box_ax.boxplot(all_data)
box_ax.set_title('box plot')

# horizontal grid lines and axis labels on both panels
for ax in axes:
    ax.yaxis.grid(True)
    ax.set_xticks(tick_positions, minor=False)
    ax.set_xlabel('')
    ax.set_ylabel('cases/deaths ratio')

# slanted tick labels so the long method names do not overlap
for panel in (violin_ax, box_ax):
    plt.setp(panel.xaxis.get_majorticklabels(), rotation=15)

plt.setp(axes, xticks=tick_positions,
         xticklabels=['one-class SVM', 'Local Oulier Facor', 'Minimum Covariance Determinant', 'Isolation Forest'])
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
def linregplot(x, y):
    '''
    Ordinary least-squares fit of the straight line y = a*x + b.

    input: x, y -- 1-D sequences/arrays of equal length
    returns a, b -- slope and intercept of the fitted line
    '''
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # Design matrix [x, 1]: solving it directly (no separately fitted
    # intercept) makes the second coefficient the real intercept b.
    design = np.column_stack([x, np.ones_like(x)])
    coefficients = np.linalg.lstsq(design, y, rcond=None)[0]
    a, b = coefficients
    return a, b

# Scatter cases_pp against deaths_pp for every reduced data set and overlay
# the line returned by linregplot for that data set.
plt.figure(figsize=(7,7))
for key, value in dict_reduced.items():
    x = value.cases_pp.values
    y = value.deaths_pp.values
    a, b = linregplot(x, y)
    plt.scatter(x, y)
    # Draw y = a*x + b from x = 0 to the largest observed x, including the
    # intercept so the line matches what linregplot documents.
    x_end = x.max()
    plt.plot([0, x_end], [b, a * x_end + b], label=key[3:])  # key[3:] drops the 'df_' prefix

plt.title('best values (inliers) after removing outliers')
plt.legend()
plt.show()


In [None]:
# Positional indexes of the feature columns: every column of
# df_totals_wide_control except the ones named in list_not_include.
# Despite the name, the list is used further down as a positional column
# selector for .iloc.
# NOTE(review): those .iloc calls are applied to other frames too, which
# assumes they share this column order -- confirm.
list_of_all_columns = list(df_totals_wide_control.columns)

list_not_include = ['Country', 'deaths', 'deaths_pp', 'cases']

# enumerate() gives each column its own position (list.index() would return
# the first match for duplicated names) and yields the positions in ascending
# order, so no separate sort is needed.
cat_features = [
    position
    for position, col in enumerate(list_of_all_columns)
    if col not in list_not_include
]
print(cat_features)
print(list_of_all_columns)

In [None]:
# Re-key the frame by country name.
# drop=True discards the previous index instead of turning it into a column,
# so there is nothing to delete afterwards and the cell can be re-run (a plain
# reset_index would try to insert a second 'Country' column on the second run).
# The frame is modified in place so existing references keep pointing at it.
df_totals_wide_control.reset_index(drop=True, inplace=True)
df_totals_wide_control.index = df_totals_wide_control.Country
df_totals_wide_control.head()

In [None]:
from catboost import CatBoostRegressor, Pool
# Leave-one-out evaluation with CatBoost: for every reduced data set, each
# country is predicted by a model trained on all other rows of that data set.
# Countries that are not part of a data set keep the placeholder -0.00001 in
# the corresponding 'results: <method>' column.
df_results = df_totals_wide_control.copy()
for key, value in dict_reduced.items():
    results_name = 'results: ' + key[3:]  # key[3:] drops the 'df_' prefix
    print(results_name)
    df_results[results_name] = -0.00001
    for counter, country in enumerate(value.index):
        # training data: everything except the held-out country
        dummy = value.drop(index=country)
        X = dummy.iloc[:, cat_features]
        y = dummy.deaths_pp.values
        model = CatBoostRegressor(random_seed=20)
        model.fit(X, y, verbose=False, plot=False)
        # Select the held-out row positionally as a one-row frame so it has
        # exactly the columns (and dtypes) the model was trained on.
        X_test = value.iloc[[counter], cat_features]
        predict = model.predict(X_test)[0]
        # Single .at call on the frame: a chained df[col].at[...] assignment
        # may write to a temporary copy instead of df_results.
        df_results.at[country, results_name] = predict
        # light-weight progress output: every second country
        if counter % 2 == 0:
            print(counter, ',', end="")

In [None]:
# Observed deaths_pp next to the predictions stored in each 'results: ...'
# column of df_results, one x position per index entry.
plt.figure(figsize=(20,10))
X = df_results.index.values
y1 = df_totals_wide_control.deaths_pp.values
plt.scatter(X, y1, label='covid 19 deaths per 100 000')

# (results column, legend label), drawn in this order
prediction_series = (
    ("results: ocSVM", 'prediction with OC SVM inliers'),
    ('results: LOF', 'prediction with LOF inliers'),
    ('results: EE_MCD', 'prediction with MCD inliers'),
    ('results: IF', 'prediction with IF inliers'),
    ('results: STD', 'prediction with STD inliers'),
)
for column, legend_label in prediction_series:
    plt.scatter(X, df_results[column].values, label=legend_label)

plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def errors(y_true, y_hat):
    '''
    displays MAE, RMSE, MAPE, R2, Mean, Median, Min, Max
    MAE, RMSE, MAPE and R2 compare y_hat with y_true; Mean, Median, Minimum
    and Maximum describe y_hat only.
    input: y_true, prediction y_hat (same length)
    returns: None (results are printed)
    '''
    # Plain arrays make every metric pair the values by position; with pandas
    # Series the hand-written MAPE would align on the index instead.
    y_true = np.asarray(y_true, dtype=float)
    y_hat = np.asarray(y_hat, dtype=float)
    # The denominator is y_true + 1, not y_true, so this is a shifted variant
    # of the textbook MAPE.
    # NOTE(review): presumably to stay finite when y_true is 0 -- confirm.
    MAPE = np.mean(np.abs((y_true - y_hat) / (y_true + 1))) * 100
    MAE = mean_absolute_error(y_true, y_hat)
    RMSE = mean_squared_error(y_true, y_hat)**.5
    R2 = r2_score(y_true, y_hat)
    MEAN = np.mean(y_hat)
    MEDIAN = np.median(y_hat)
    MIN = np.min(y_hat)
    MAX = np.max(y_hat)
    print('Mean Absolute Error = %.2f' % MAE)
    print('root mean squared error = %.2f' % RMSE)
    print('Mean Absolute Percentage Error (MAPE) = %.2f' % MAPE)
    print('R2 = %.2f' % R2)
    print('Mean = %.2f' % MEAN)
    print('Median = %.2f' % MEDIAN)
    print('Minimum = %.2f' % MIN)
    print('Maximum = %.2f' % MAX)
    return



In [None]:
# Print the error metrics of every reduced data set, restricted to the rows
# whose prediction differs from the -0.00001 placeholder.
for key in dict_reduced:
    print(key)
    result = 'results: ' + key[3:]
    has_prediction = df_results[result] != -0.00001
    df_result = df_results[has_prediction]
    errors(df_result.deaths_pp, df_result[result])
    print()

In [None]:
# For every reduced data set, list the index labels whose prediction has a
# relative error below max_error and whose deaths_pp is greater than 5.
# Rows still holding the -0.00001 placeholder are ignored.
max_error = 0.05
dict_best_results = {}
for key in dict_reduced:
    print(key)
    result = 'results: ' + key[3:]
    df_result = df_results[df_results[result] != -0.00001]
    deaths = df_result.deaths_pp
    # Vectorized relative error; a zero deaths_pp yields inf/NaN, which fails
    # the comparison below and is excluded by the deaths > 5 condition anyway.
    relative_error = (df_result[result] - deaths).abs() / deaths
    is_best = (relative_error < max_error) & (deaths > 5)
    dict_best_results[key] = df_result.index[is_best].tolist()
            

In [None]:
# Flatten the per-method lists of dict_best_results into one list of
# countries and remove duplicates, printing the sizes along the way.
train_countries_list = [
    train_country
    for countries in dict_best_results.values()
    for train_country in countries
]
print('len list',len(train_countries_list))
train_countries_set = set(train_countries_list)
print('len set', len(train_countries_set))
# dict.fromkeys de-duplicates while keeping first-seen order; list(set(...))
# would give an order that can differ from one kernel session to the next.
train_countries_list = list(dict.fromkeys(train_countries_list))
print( 'len list' ,len(train_countries_list))

In [None]:
# Display the column labels of df_results (the original columns plus the
# 'results: ...' columns added above).
df_results.columns

In [None]:
# For every method key in dict_reduced_methods add two columns to df_results:
#   predict_error_<key>: relative error of 'results: <key>' against deaths_pp
#   bool_<key>:          True where that error is below max_error
max_error = 0.05
for key in dict_reduced_methods.keys():
    error_column = 'predict_error_' + key
    flag_column = 'bool_' + key
    predicted = df_results['results: ' + key]
    observed = df_results['deaths_pp']
    df_results[error_column] = (predicted - observed).abs() / observed
    df_results[flag_column] = df_results[error_column] < max_error


In [None]:
# A row is flagged for training when at least one of the per-method bool_
# columns is True.
flag_columns = ['bool_EE_MCD', 'bool_IF', 'bool_LOF', 'bool_ocSVM', 'bool_STD']
df_results['bool_train'] = df_results[flag_columns].any(axis=1)


In [None]:
# Keep only the rows flagged by bool_train and show the size of the result.
df_train = df_results[df_results['bool_train']]
df_train.shape

In [None]:
# Observed deaths_pp (wide horizontal markers) and the predictions stored in
# each 'results: ...' column, for the rows of df_train only.
plt.figure(figsize=(20,10))
X = df_train.index.values
y1 = df_train.deaths_pp.values
plt.scatter(X, y1, label='covid 19 deaths per 100 000', marker='_', s=1000)

# (results column, legend label), drawn in this order
prediction_series = (
    ('results: ocSVM', 'prediction with OC SVM inliers'),
    ('results: LOF', 'prediction with LOF inliers'),
    ('results: EE_MCD', 'prediction with MCD inliers'),
    ('results: IF', 'prediction with IF inliers'),
    ('results: STD', 'prediction with STD inliers'),
)
for column, legend_label in prediction_series:
    plt.scatter(X, df_train[column].values, label=legend_label)

plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
from catboost import CatBoostRegressor, Pool
# Fit a single CatBoost model on df_train and predict deaths_pp for every row
# of df_totals_wide_control; the predictions are stored in a new column.
# NOTE(review): df_train is a subset of the rows being predicted, so the
# predictions include the training rows themselves.
results_name = 'results_new_trainset'
print(results_name)
# placeholder value; replaced by the predictions at the end of the cell
df_totals_wide_control[results_name] = -0.00001

dummy = df_train.copy()
print('dummy.shape', dummy.shape)
print('dummy.shape', dummy.shape)

# features are selected by position, the target by name
X = dummy.iloc[:, cat_features]
print('X.shape', X.shape)
y = dummy['deaths_pp'].values
print('len(y)',len(y))

model = CatBoostRegressor(random_seed=20)
model.fit(X, y, verbose=False, plot=False)

X_test = df_totals_wide_control.iloc[:, cat_features]
print('X_test.shape', X_test.shape)
predict = model.predict(X_test)
print('predict shape', predict.shape)
df_totals_wide_control[results_name] = predict


In [None]:
# Observed deaths_pp (horizontal markers) against the values stored in the
# results_new_trainset column, one x position per index entry.
X = df_totals_wide_control.index.values
y1 = df_totals_wide_control['deaths_pp'].values
y2 = df_totals_wide_control['results_new_trainset'].values

plt.figure(figsize=(20, 10))
plt.scatter(X, y1, marker='_', s=100, label='covid 19 deaths per 100 000')
plt.scatter(X, y2, label='prediction with new train set')
plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
# Error metrics for the arrays left by the preceding plot cell:
# y1 = observed deaths_pp, y2 = the results_new_trainset column.
errors(y1,y2)

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
# Fit an ExtraTreesRegressor on df_train and predict deaths_pp for every row
# of df_totals_wide_control.
# min_impurity_split=None and max_features='auto' only restated the defaults
# and were removed from later scikit-learn releases, so they are omitted.
# NOTE(review): criterion='mae' is the older spelling; scikit-learn 1.0
# renamed it to 'absolute_error' -- adjust if the environment is newer.
model_etr = ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                    max_depth=None, max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_samples_leaf=4,
                    min_samples_split=3, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=8, oob_score=False,
                    random_state=1496, verbose=0, warm_start=True)


# run extratrees model
results_name = 'results_new_trainset_ETR'
print(results_name)
# placeholder value; replaced by the predictions at the end of the cell
df_totals_wide_control[results_name] = -0.00001
dummy = df_train.copy()
# features are selected by position, the target by name
X = dummy.iloc[:,cat_features]
y = dummy.deaths_pp.values
model_etr.fit(X, y)
X_test = df_totals_wide_control.iloc[:,cat_features]
# Predict with the ExtraTrees model fitted above; using `model` here would
# store the CatBoost predictions under the ETR column name.
predict = model_etr.predict(X_test)
df_totals_wide_control[results_name] = predict



In [None]:
# Observed deaths_pp (horizontal markers) against the values stored in the
# results_new_trainset_ETR column, one x position per index entry.
X = df_totals_wide_control.index.values
y1 = df_totals_wide_control['deaths_pp'].values
y2 = df_totals_wide_control['results_new_trainset_ETR'].values

plt.figure(figsize=(20, 10))
plt.scatter(X, y1, marker='_', s=100, label='covid 19 deaths per 100 000')
plt.scatter(X, y2, label='prediction with new train set model ExtraTreesRegressor')
plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
# Error metrics for the arrays left by the preceding plot cell:
# y1 = observed deaths_pp, y2 = the results_new_trainset_ETR column.
errors(y1,y2)

In [None]:
# Mean of the cases_to_deaths column over the rows of df_train.
cases_to_deats_mean = df_train['cases_to_deaths'].mean()
print('mean of cases to deaths ration in the 5pct best inlier countries: %0.2f' % cases_to_deats_mean)

In [None]:
# Histogram (10 bins) of the cases_to_deaths column of df_train; the trailing
# semicolon suppresses the textual return value in the notebook output.
ratio_values = df_train['cases_to_deaths']
plt.hist(ratio_values, bins=10);

In [None]:
# Standard deviation and median of the cases_to_deaths column of df_train,
# displayed as a (std, median) tuple.
ratio_column = df_train['cases_to_deaths']
(ratio_column.std(), ratio_column.median())