In [1]:
# Packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel
import eli5
from eli5 import show_prediction
from sklearn.inspection import partial_dependence
from sklearn.inspection import plot_partial_dependence
import plotly
from plotly import graph_objs as go, offline as po, tools
import numpy as np
import json
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import HTML
import plotly.graph_objs as go
from ipywidgets import interact
import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix
from IPython.display import Image



In [2]:
# Dataset
## Read the raw patient records from the CSV file into a DataFrame
path = "data/PatientInfo.csv"
df = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Dataprep
# Reduce data: drop the columns that are not used as model features.
# Re-assigning (instead of inplace=True) keeps every step an explicit new frame.
df = df.drop(columns=['patient_id', 'infection_case', 'symptom_onset_date', 'global_num',
                      'birth_year', 'country', 'city', 'infection_order', 'infected_by',
                      'contact_number', 'released_date', 'deceased_date'])

# New column "Days since first case": offset of each confirmation date from the
# earliest confirmation date in the data. Unparseable dates become NaT.
df['confirmed_date'] = pd.to_datetime(df['confirmed_date'], errors='coerce')
firstCase = df['confirmed_date'].min()
df['Days since first case'] = df['confirmed_date'] - firstCase
df = df.drop(columns=['confirmed_date'])

# Binarising sex (male -> 0, female -> 1); rows without a sex are discarded.
# .copy() makes the filtered frame independent so the column assignment below
# does not write into a slice (SettingWithCopyWarning).
df = df[df['sex'].notna()].copy()
gender = {'male': 0, 'female': 1}
# Any value outside `gender` maps to NaN, which astype(int) rejects loudly
# instead of letting an unknown category slip into the model.
df['sex'] = df['sex'].map(gender).astype(int)

# OneHotEncoding: empty prefix so the dummy columns are named after the raw values
df = pd.get_dummies(df, columns=["disease"], prefix='', prefix_sep='')
df = pd.get_dummies(df, columns=["province"], prefix='', prefix_sep='')

# Clean age: strip the trailing "s" (e.g. "20s" -> "20").
# regex=True is required explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the "s" in place and make the
# integer conversion below fail.
df['age'] = df['age'].str.replace(r's$', '', regex=True)
df = df[df['age'].notna()]
df = df.astype({"age": int})

# Convert timedeltas to integer number of days
df['Days since first case'] = df['Days since first case'].dt.days

# Remove "Isolated state": only rows with another state are kept
df = df[df['state'] != 'isolated']

# Rename the dummy column produced from "disease" to a readable feature name
df = df.rename(columns={'True': 'Existing illness'})

In [4]:
# Split into test and training data: 40% of the rows are held out for testing.
# The target is 'state'; the features are all other columns except 'Daegu'.
y = df[['state']]
X = df.drop(columns=['state', 'Daegu'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.40,
    shuffle=True,
    random_state=5,
)

In [5]:
# Train model with tuned parameters.
# fit() returns the estimator itself, so construction and fitting are chained;
# the single-column target frame is flattened to a 1-D array first.
rfc = RandomForestClassifier(
    n_estimators=20,
    max_depth=20,
    max_features=2,
    random_state=5,
).fit(X_train, y_train.to_numpy().ravel())

# <span style="color:orange">3:</span> What affects risk of fatal attacks for individuals

In [21]:
# Show the section's intro animation inside a borderless output widget
out = widgets.Output(layout=dict(border='0px solid black'))
with out:
    # Loaded inside the output context, exactly where it is displayed
    im = Image(filename="images/mathematical.gif")
    display(im)
out

Output(layout=Layout(border='0px solid black'))

In this section, anonymised health data from 3388 covid-19 patients in Korea will be analysed, with the goal of determining which factors increase the risk of having a fatal case, and which do not.

For each patient, the following data has been used:
* The sex of the patient
* The age of the patient
* When the patient contracted covid-19 relative to the first case in the country
* If the patient had an existing illness together with covid-19
* What region the patient is from

The analysis is done using a *Random Forest Classification Machine-learning model*, and in the following you will be walked through the following parts in order.

1. Presentation of the model and its performance
1. Analysis of the results
1. Presentation of a risk-profiling tool for you to play around with the model yourself

## <span style="color:orange">3.1:</span> The model and how it performs

The model tries to predict whether a person with covid-19 is going to suffer a fatal case ('Deceased') or survive ('Released'). It makes its predictions by inferring a great number of rules based on the information about the patients, but let's not discuss the details here; let's instead ask the question we really care about:

"*Is the model any good?*"

When tested on 579 patients the model had not seen before, it predicted 557 of them correctly, wrongly predicting 6 patients as 'Deceased' who were in fact 'Released', and 16 patients as 'Released' who were in fact 'Deceased'. Surely this performance is quite good for someone who does not even have a medical degree!

In the following, the brain of the model will be extracted, and you will have the opportunity to take a look inside of it to see what the model bases its decisions on. 

## <span style="color:orange">3.2:</span> Inside the brain: How does the model make decisions? 

In [9]:
# Show the "sagan" animation inside a borderless output widget
out = widgets.Output(layout=dict(border='0px solid black'))
with out:
    # Loaded inside the output context, exactly where it is displayed
    im = Image(filename="images/sagan.gif")
    display(im)
out

Output(layout=Layout(border='0px solid black'))

Indeed it does, Sagan, and so it does for the predictions of the patients; except the brain is not actually inside someone's head, rather it's a set of mathematical calculations being done on a chip... Anyhow... Time to investigate!

First, a bar-chart will be shown, presenting how important each feature (sex, age, etc.) is to the model when it makes a prediction. The chart can be seen below, try to hover your mouse over some of the bars!

In [10]:
# Extract feature importances for plotting, expressed as percentages
feature_importance = pd.Series(rfc.feature_importances_, index=X.columns)
featureNames = feature_importance.index.tolist()
featureImportance = np.multiply(feature_importance.tolist(), 100)

# Group regions: the first four columns are the individual features
# (sex, age, days since first case, existing illness); every column after
# them is a region dummy and is summed into one "Regions" bar.
# The slice must run to the END of the array; the previous `[4:-1]` silently
# left the last region column out of the total.
N_INDIVIDUAL_FEATURES = 4
regionImportance = featureImportance[N_INDIVIDUAL_FEATURES:].sum()
featureImportance = np.append(featureImportance[:N_INDIVIDUAL_FEATURES], regionImportance)
featureNames = featureNames[:N_INDIVIDUAL_FEATURES]
featureNames.append('Regions')

# Capitalise the two lower-case column names for display
featureNames[0] = 'Sex'
featureNames[1] = 'Age'

In [11]:
# Construct plotly barchart of the grouped feature importances.
# One text label per bar, in the same order as featureNames.
bar_labels = [
    'Potentially biased',
    'Good predicter',
    'Potentially biased',
    'Good predicter',
    'Potentially biased',
]

fig = go.FigureWidget()
fig.add_bar(x=featureNames, y=featureImportance, text=bar_labels)
fig.layout.title = "Relative importance when predicting chance of survival of COVID-19 patients"
# Bars are ordered from the largest to the smallest total
fig.update_layout(
    width=900,
    height=700,
    xaxis=dict(categoryorder='total descending'),
)
fig

FigureWidget({
    'data': [{'text': [Potentially biased, Good predicter, Potentially biased,
                …

Hopefully this graph has made you wonder: 'What is relative importance?' and 'Why do some bars say 'Potentially biased' or 'Good predicter' when I hover over them?'

**Relative importance** is simply how important the model thinks a feature is when making rules, and the model thinks features that are good at separating people are better. So if you could say "All women die" and "All men survive", then 'Sex' would be very very important! Luckily this is not the case!

**'Potentially biased' and 'Good predicter'** simply refer to the fact that not all of these features are proven to affect covid-19 fatality. 'Age' and 'Existing illness' are scientifically proven to increase the risk of suffering a fatal case, making them 'Good predictors'; however, 'Days since first case', 'Regions' and 'Sex' are not. This does not mean that the model could not have discovered something that is in fact true, it just means that there is a chance that the model has found something that it thinks is important, but which in reality is not. For instance, it may think that the region "ABCDEFG" is highly correlated with suffering fatal attacks while in reality it's just because this region is a special region only consisting of nursing homes! (Not a real example).

This graph only tells us how important the model thinks each feature is, but what does it mean if your 'Age' is 80, and what if it is 20? Time to zoom in some more!


In [12]:
# Show the "zoom" animation inside a borderless output widget
out = widgets.Output(layout=dict(border='0px solid black'))
with out:
    # Loaded inside the output context, exactly where it is displayed
    im = Image(filename="images/zoom.gif")
    display(im)
out

Output(layout=Layout(border='0px solid black'))

Below you will see an interactive plot of how each feature impacts a prediction. You can choose which feature from the bar-graph you wish to investigate using the dropdown menu (except regions, unfortunately there is nothing to see there!). Have a go at it!

In [13]:
# Get partial dependence graph data

def _pdp_in_percent(feature_index):
    """Partial dependence of `rfc` on one column of X_train.

    Returns a pair (y, x): y is the first row of the averaged predictions
    multiplied by 100, x is the first grid of evaluated feature values.
    """
    averaged, grid = partial_dependence(rfc, X_train, [feature_index])
    return np.multiply(averaged[0], 100), grid[0]


# Sex (column 0): the numeric grid is replaced by readable labels
sex_y = _pdp_in_percent(0)[0]
sex_x = ['Male', 'Female']

# Age (column 1)
age_y, age_x = _pdp_in_percent(1)

# Days since first case (column 2)
dsfc_y, dsfc_x = _pdp_in_percent(2)

# Existing illness (column 3): the numeric grid is replaced by readable labels
ei_y = _pdp_in_percent(3)[0]
ei_x = ['No existing illness', 'Existing illness present']

In [14]:
# Make interactive PDP plots

# Trace shown before anything is picked from the dropdown
initLine = go.Scatter(x=dsfc_x, y=dsfc_y)


def _restyle_button(label, x_values, y_values):
    """Return one dropdown entry that swaps the trace's x/y data.

    Only 'x' and 'y' are restyled. The earlier entries also carried
    'xaxis': "test" / "nooo" -- leftover debug values that are not valid axis
    ids and were being written onto the trace on every selection.
    """
    return {
        'method': 'restyle',
        'label': label,
        # Each attribute is wrapped in a list: one value per trace in the figure
        'args': [{'x': [x_values], 'y': [y_values]}],
    }


# Construct dropdown menu and graph-changes (first entry matches initLine)
updatemenus = [
    {
        'buttons': [
            _restyle_button('Days since first case', dsfc_x, dsfc_y),
            _restyle_button('Age', age_x, age_y),
            _restyle_button('Existing illness', ei_x, ei_y),
            _restyle_button('Sex', sex_x, sex_y),
        ],
        'direction': 'down',
        'showactive': True,
    }
]

# Static layout; the dropdown only replaces the plotted data
layout = go.Layout(
    title_text="How each feature impacts a prediction",
    yaxis_title="Chance of survival in %",
    xaxis_title="Value according to dropdown selected",
    width=900,
    height=500,
    updatemenus=updatemenus,
)

# Plot figure
figure = go.Figure(data=[initLine], layout=layout)
figure

*Age* seems to draw a clear picture: patients above 60 seem to be hit significantly harder by the disease than those below! *Existing illness* is the same, with nearly a 50% drop in chance of survival if an existing illness is present. No wonder it is recommended to avoid physical contact, especially with the sick and elderly!

*Sex* is quite interesting (no pun intended): it seems that females have an almost 5% higher chance of survival. However, remember that this is 'Potentially biased' and it is known that the dataset contains more females than males, so don't take this as hard fact.

*Days since first case* seems a bit harder to interpret. The feature could show nothing and purely be hidden bias, or perhaps the model is able to see something that we simply don't. Perhaps survival rate is a bit lower in the beginning due to the virus being all new, then it rises as awareness spreads around day 40, and starts flattening out at a generally higher level than earlier after day 70. This graph is hard to interpret: is it bias, or is the model on to something? What do you think?

Interpreting the model has given some interesting insights into which factors increase the risk of a fatal COVID-19 case, but besides just knowing this, how could this information be used in a smart way? This aspect will be explored in the final part of the analysis.

## <span style="color:orange">3.3:</span> Using technology to battle pandemics 

In [15]:
# Show the "phones" animation inside a borderless output widget
out = widgets.Output(layout=dict(border='0px solid black'))
with out:
    # Loaded inside the output context, exactly where it is displayed
    im = Image(filename="images/phones.gif")
    display(im)
out

Output(layout=Layout(border='0px solid black'))

Most people use their phones every day and carry them everywhere. They watch the news on them, communicate on them, look up recipes on them; people use their phones for almost everything, so why not use them as a tool in fighting global pandemics?

South Korea already uses smartphones in battling covid-19 by providing emergency messages if you have been near a person with covid-19 [(Source),](https://www.aljazeera.com/news/2020/04/korea-smartphone-apps-tracking-coronavirus-won-stop-buzzing-200408074008185.html) so why not use smartphones even more?

Below you will have the opportunity to type in your own personal information and let the model make a prediction about you as if you had just contracted covid-19. The model will provide you with two options:
1. **Risk profile:** Where you will get a personalized risk-profile as if it was given to you by the Korean government. The idea is that something like this could be implemented in a covid-19 app, and let citizens get personalized risk-assessments as a tool to make people take this more seriously.
1. **Prediction info:** Where you will be able to see what the model has based its recommendation on, and whether it predicts that you are likely to suffer a fatal case ('Deceased') from covid-19 or if you are more likely to survive ('Released'). This has been included for the more tech-savvy who wish to investigate how the predictions work hands-on.

Try to play around with some different inputs and see what outputs you get! (*Pssst! There are three different risk profiles*).

In [16]:
## Input widgets for the risk-profiling tool, grouped in one accordion

# Keyword arguments shared by the toggle-button pickers and by the sliders
_toggle_kwargs = dict(description='', disabled=False, button_style='')
_slider_kwargs = dict(
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d',
)

# Sex picker
sexPick = widgets.ToggleButtons(options=['Male', 'Female'], **_toggle_kwargs)

# Age picker: 0 to 90 in steps of 10
agePick = widgets.IntSlider(
    value=0, min=0, max=90, step=10,
    description='Pick closest:',
    **_slider_kwargs,
)

# Existing illness picker
eiPick = widgets.ToggleButtons(options=['Yes', 'No'], **_toggle_kwargs)

# 'Days since first case' picker: a (low, high) range between 0 and 73
dsfcPick = widgets.IntRangeSlider(
    value=[20, 40], min=0, max=73, step=1,
    description='Approx:',
    **_slider_kwargs,
)

# Regions picker
_region_options = [
    'Busan', 'Chungcheongbuk-do', 'Chungcheongnam-do',
    'Daejeon', 'Gangwon-do', 'Gwangju', 'Gyeonggi-do',
    'Gyeongsangbuk-do', 'Gyeongsangnam-do', 'Incheon',
    'Jeju-do', 'Jeollabuk-do', 'Jeollanam-do', 'Sejong', 'Seoul',
    'Ulsan',
]
regPick = widgets.ToggleButtons(
    options=_region_options,
    value='Busan',
    description='Choose',
    disabled=False,
)

# Build tabs: one accordion section per picker, titled in the same order
accordion = widgets.Accordion(children=[sexPick, agePick, eiPick, dsfcPick, regPick])
for _position, _title in enumerate(
        ['Sex', 'Age', 'Existing illness', 'Days since first case', 'Region']):
    accordion.set_title(_position, _title)
accordion

Accordion(children=(ToggleButtons(options=('Male', 'Female'), value='Male'), IntSlider(value=0, continuous_upd…

In [17]:
# Converts the raw widget values into the single-row frame used for prediction
def personalInfo(sex, age, ei, dsfc, reg):
    """Build a one-row DataFrame describing a person.

    Parameters
    ----------
    sex : 'Male' is encoded as 0, any other value as 1.
    age : stored unchanged in the "age" column.
    ei : "Yes" is encoded as 1, any other value as 0 ("Existing illness").
    dsfc : pair of numbers; their mean (sum / 2) becomes "Days since first case".
    reg : region name; the column with that exact name is set to 1, all other
        region columns to 0. An unrecognised name leaves every region at 0.

    Returns
    -------
    pandas.DataFrame with one row and the columns: sex, age,
    Days since first case, Existing illness, followed by the 16 region columns.
    """
    region_columns = (
        "Busan", "Chungcheongbuk-do", "Chungcheongnam-do", "Daejeon",
        "Gangwon-do", "Gwangju", "Gyeonggi-do", "Gyeongsangbuk-do",
        "Gyeongsangnam-do", "Incheon", "Jeju-do", "Jeollabuk-do",
        "Jeollanam-do", "Sejong", "Seoul", "Ulsan",
    )

    # Insertion order of this dict is the column order of the result
    row = {
        "sex": [0 if sex == 'Male' else 1],
        "age": [age],
        "Days since first case": [sum(dsfc) / 2],
        "Existing illness": [1 if ei == "Yes" else 0],
    }

    # One-hot encode the region
    for column in region_columns:
        row[column] = [1 if reg == column else 0]

    return pd.DataFrame(row)

In [18]:
# Maps a prediction profile onto one of three risk labels:
# - "High risk"   when the class-1 probability is at most 0.5
# - "Medium risk" when it is above 0.5 and at most 0.8
# - "Low risk"    otherwise
def riskProfile(personal_info):
    """Return the risk label for the given profile.

    The profile is passed to `rfc.predict_proba`; the label is decided from
    the probability in column 1 of the first row.
    NOTE(review): column 1 is presumably the 'released' (survival) class --
    confirm against rfc.classes_.
    """
    class1_probability = rfc.predict_proba(personal_info)[0][1]

    # Checked from the lowest band upwards
    if class1_probability <= 0.5:
        return "High risk"
    if class1_probability <= 0.8:
        return "Medium risk"
    return "Low risk"

In [19]:
# Construct the two result buttons ("risk-profile" and "prediction info").
#
# Both buttons write into ONE output area that has its own dedicated name.
# Previously the handlers looked up the generic global `out` at click time, so
# re-running any other cell that rebinds `out` redirected (and cleared) the
# results; a first Output was also created but never displayed, and both
# handlers shared the name `btn_eventhandler`, the second silently shadowing
# the first.

# Risk-profile images
highrisk = Image(filename='images/highrisk.png', width=700)
mediumrisk = Image(filename='images/mediumrisk.png', width=700)
lowrisk = Image(filename='images/lowrisk.png', width=700)

# Shared, dedicated result area
result_out = widgets.Output(layout={'border': '0px solid black'})
out = result_out  # legacy alias: `out` still names the displayed result area


def _current_person():
    """Read the picker widgets and return the matching prediction frame."""
    return personalInfo(sexPick.value, agePick.value, eiPick.value,
                        dsfcPick.value, regPick.value)


def show_risk_profile(obj):
    """Click handler: replace the result area with the matching risk image."""
    result_out.clear_output()
    prof = riskProfile(_current_person())
    with result_out:
        if prof == "High risk":
            display(highrisk)
        elif prof == "Medium risk":
            display(mediumrisk)
        else:
            display(lowrisk)


def show_prediction_info(obj):
    """Click handler: replace the result area with the eli5 explanation."""
    result_out.clear_output()
    info = _current_person()
    with result_out:
        print("This table shows the prediction y = 'prediction' together with the probability of survival.")
        print("The contribution indicates how much 'Feature' with 'Value' contributed to the prediction.")
        print("Existing illness present = 1. Sex female = 1. Region = 1 means person is from that region.")
        print("If no features specifically lowers chance of survival, the model will have a high <BIAS> towards saying 'Released'.")
        display(show_prediction(rfc, info, show_feature_values=True, feature_names = info.columns.to_numpy()))


# Construct button with images
btn = widgets.Button(description='Get risk-profile')
btn.on_click(show_risk_profile)
display(btn)

# Construct button with risk predictions
btn2 = widgets.Button(description='Get prediction info')
btn2.on_click(show_prediction_info)
display(btn2)

# The result area is the cell's last expression, so it renders below the buttons
result_out

Button(description='Get risk-profile', style=ButtonStyle())

Button(description='Get prediction info', style=ButtonStyle())

Output(layout=Layout(border='0px solid black'))

------------

Let's wrap it up with some key takeaways.

Specifically... 
1. Higher age and existing illness significantly lower the chance of surviving COVID-19
1. Males *might* be slightly more prone to suffering fatal COVID-19 cases 
1. 'Days since first case' was an important feature for the model when trying to predict fatal cases

And on a more general level...
1. Machine learning could be a good tool to learn more about pandemics by utilizing patient data as the pandemic spreads
1. Machine learning could be used to provide personalized recommendations to increase citizens' compliance and reduce spreading


In [20]:
# Show the closing animation inside its own borderless output widget
out2 = widgets.Output(layout=dict(border='0px solid black'))
with out2:
    # Loaded inside the output context, exactly where it is displayed
    im = Image(filename="images/bye.gif")
    display(im)
out2

Output(layout=Layout(border='0px solid black'))