In [3]:
# Libraries
import pandas as pd
import numpy as np
from time import strptime
import datetime
import re

import matplotlib.pyplot as plt
import seaborn as sns

import chart_studio.plotly as py
import cufflinks as cf
%matplotlib inline
import ipywidgets as widgets
from plotly import tools
import plotly.graph_objs as go
import plotly.express as px
import warnings

cf.go_offline()

SyntaxError: invalid syntax (<ipython-input-3-2e7e89a2ebf4>, line 21)

# Project 3: Coronavirus: Demographical and Epidemiological Visualisation
-------



## Introduction

Coronavirus is a reality that has been affecting the way people live around the globe. There is a lot of useful information out there, but also a lot of fake or inaccurate data, and misconceptions due to personal beliefs/bias, etc.

Hopefully this visual guide will help tackle some of these issues by checking the spread of the virus against the facts, and show whether or not we can do something to help.

Initially this project aimed to do a deeper analysis on demographics and comorbidity, but this data was impossible to find. One reason why this kind of data is scarce might be the ongoing crisis and lack of resources, which doesn't allow the authorities to collect the necessary information.

There are some numbers and theories regarding comorbidity, but they are all in scientific research papers, and the data used for them is not accessible.

------------

## Dataset

Five datasets were used for this project. The data comes mostly from Kaggle and Our World in Data. The data had some irrelevant or empty values that needed to be removed, and some other values had to be modified in order to work with them.

The data can be downloaded from the following link: https://drive.google.com/drive/u/1/folders/1XWpKmPrmCkuOfCmukV_3tNkPz2VTb68u

----------------

## Demographics

Let's start with some basic distribution visualisation on age and gender and see if there are important facts to summarize.

In [4]:
#Paolo: be careful with filenames, file names should not have spaces, same for similar cases below.  
# I modified the name removing spaces. Also a local data folder (as per project instructions) was not directly provided

# Columns of the line-list file that the demographic plots do not use.
drop_cols = [
    'id', 'case_in_country', 'reporting date', 'Unnamed: 3', 'summary', 'location', 'country', 'symptom_onset', 
    'recovered', 'symptom', 'source', 'link', 'If_onset_approximated', 'hosp_visit_date', 'exposure_start',
    'visiting Wuhan', 'from Wuhan', 'death', 'exposure_end', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 
    'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26',
]

# Read the case line list and keep only the columns not listed above.
demog = pd.read_csv('data/COVID19_line_list_data (1).csv').drop(drop_cols, axis=1)

In [5]:
@widgets.interact(
    data=widgets.ToggleButtons(
        options=['gender', 'age', 'gender/age'],
        value='gender',
        description='data:',
        disabled=False,
        # One tooltip per option, in the same order as `options`.
        tooltips=[
            'See Gender Distribution',
            'See Age Distribution',
            'See Gender/Age Distribution',
        ],
    )
)
def chart(data):
    """Draw a percentage histogram of the selected demographic variable.

    Parameters
    ----------
    data : str
        One of 'gender', 'age' or 'gender/age' (the toggle-button options).
        'gender/age' overlays the age histograms of the male and female
        cases; any other value is used directly as a column of ``demog``.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    fig = go.Figure()
    if data == 'gender/age':
        # One age histogram per gender, overlaid with distinct colours.
        for gender, colour in (('male', '#37ced2'), ('female', '#e3ff00')):
            fig.add_trace(go.Histogram(
                x=demog[demog['gender'] == gender]['age'],
                histnorm='percent',
                name=gender,
                marker_color=colour,
                opacity=0.75,
            ))
    else:
        fig.add_trace(go.Histogram(
            x=demog[data],
            histnorm='percent',
            name='control',
            marker_color='#3dc546',
            opacity=0.75,
        ))

    annotations = [
        # Source line, centred below the plot area.
        dict(
            xref='paper',
            yref='paper',
            x=0.5,
            y=-0.16,
            xanchor='center',
            yanchor='top',
            text='Source: Kaggle Coronavirus (COVID-19) Visualization & Prediction',
            font=dict(family='Arial', size=12, color='rgb(150,150,150)'),
            showarrow=False,
        ),
        # Title, left-aligned above the plot area.
        dict(
            xref='paper',
            yref='paper',
            x=0.0,
            y=1.05,
            xanchor='left',
            yanchor='bottom',
            text=f'Distribution of population on {data}',
            font=dict(family='Arial', size=30, color='rgb(37,37,37)'),
            showarrow=False,
        ),
    ]

    if data in ('age', 'gender/age'):
        # Mean age of all cases, shown at the right edge of the plot.
        # No y/yref is given, so the vertical placement is left to Plotly.
        annotations.append(
            dict(
                xref='paper',
                x=1,
                xanchor='left',
                yanchor='middle',
                text=f'mean {int(np.mean(demog["age"]))}',
                font=dict(family='Arial', size=16),
                showarrow=False,
            )
        )

    fig.update_layout(
        bargap=0.2,
        bargroupgap=0.1,
        xaxis=dict(
            title=data,
            showticklabels=True,
        ),
        yaxis=dict(
            title='Percentage',
            nticks=20,
        ),
        annotations=annotations,
    )
    fig.show()

# Box plots of case age, one box per gender, each showing mean and standard deviation.
fig = go.Figure(
    data=[
        go.Box(
            y=demog[demog['gender'] == 'female']['age'],
            name='Female',
            marker_color='indianred',
            boxmean='sd',
        ),
        go.Box(
            y=demog[demog['gender'] == 'male']['age'],
            name='Male',
            marker_color='lightseagreen',
            boxmean='sd',
        ),
    ]
)

annotations = [
    # Source line, centred below the plot area.
    dict(
        xref='paper',
        yref='paper',
        x=0.5,
        y=-0.16,
        xanchor='center',
        yanchor='top',
        text='Source: Kaggle Coronavirus (COVID-19) Visualization & Prediction',
        font=dict(family='Arial', size=12, color='rgb(150,150,150)'),
        showarrow=False,
    ),
    # Title, left-aligned above the plot area.
    dict(
        xref='paper',
        yref='paper',
        x=0.0,
        y=1.05,
        xanchor='left',
        yanchor='bottom',
        text='Distribution of population on gender/age',
        font=dict(family='Arial', size=30, color='rgb(37,37,37)'),
        showarrow=False,
    ),
]

fig.update_layout(
    bargap=0.2,
    bargroupgap=0.1,
    yaxis=dict(title='Age', nticks=20),
    plot_bgcolor='white',
    annotations=annotations,
)
fig.show()
#Paolo: great visualizations!

From these plots we can assume the following:

- There is a predominance of COVID in male individuals.
- We can observe a negative skewness on the age distribution.
- "Young" people are also concerned.
- Age distribution doesn't change much according to sex.
- Children and teenagers are the lowest risk population.

--------

## Evolution of COVID-19

With the next series of plots, we will see the chronological evolution of the virus and its impact on the different countries.

The data used for these plots are the total number of Coronavirus cases, Recoveries and deaths per country per day.

In [8]:
#Paolo: again weird file name with space in it and (1), I renamed it 

# Sum the records per country and observation date, then drop the summed
# serial-number column and rename the date column to 'Date Long'.
time = (
    pd.read_csv('data/covid_19_data (1).csv')
    .groupby(['Country/Region', 'ObservationDate'])
    .agg('sum')
    .reset_index()
    .drop('SNo', axis=1)
    .rename(columns={'ObservationDate': 'Date Long'})
)
# 'Date' is 'Date Long' with the trailing "/<year>" part stripped by the regex;
# it is the label used on the x axis of the plots.
time['Date'] = time['Date Long'].replace(regex=True, to_replace=r'(\/\d{2})\w+', value=r'')
time = time.sort_values(by=['Date Long'])

In [9]:
# Alphabetical list of every country present in the cumulative data set.
countries_list = sorted(time['Country/Region'].unique())

@widgets.interact(
    num_dates=(1, time['Date'].nunique()),
    countries=widgets.SelectMultiple(
        options=countries_list,
        value=['Netherlands', 'US', 'France', 'Spain', 'Mainland China', 'Italy', 'Germany'],
        description='Countries',
        disabled=False,
    ),
    data=widgets.Dropdown(
        options=['Confirmed', 'Deaths', 'Recovered'],
        value='Confirmed',
        description='Data:',
        disabled=False,
    ),
    from_date=widgets.DatePicker(
        description='From',
        disabled=False,
    ),
    to_date=widgets.DatePicker(
        description='To',
        disabled=False,
    ),
)
def chart(data, countries, from_date, to_date, num_dates):
    """Plot the cumulative series `data` over time, one line per country.

    Parameters
    ----------
    data : str
        Column of ``time`` to plot ('Confirmed', 'Deaths' or 'Recovered').
    countries : sequence of str
        Countries to show; an empty selection shows every country.
    from_date, to_date : date-like or None
        Optional bounds picked in the date widgets. A bound is applied only
        when it falls inside the range of the data still selected, exactly
        as before; otherwise it is ignored.
    num_dates : int
        Number of ticks requested on the x axis.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    df = time
    if len(countries) > 0:
        df = df[df['Country/Region'].isin(countries)]

    # Compare real timestamps rather than 'MM/DD/YYYY' strings: string order
    # is only chronological within one calendar year ('01/05/2021' sorts
    # before '12/31/2020'). Assumes 'Date Long' is in MM/DD/YYYY format,
    # which the original strftime-based comparison relied on as well.
    if from_date is not None:
        start = pd.Timestamp(from_date)
        dates = pd.to_datetime(df['Date Long'], format='%m/%d/%Y')
        # With an empty selection dates.max() is NaT and the test is False.
        if start < dates.max():
            df = df[dates >= start]

    if to_date is not None:
        end = pd.Timestamp(to_date)
        dates = pd.to_datetime(df['Date Long'], format='%m/%d/%Y')
        if end > dates.min():
            df = df[dates <= end]

    fig = px.line(
        df,
        x="Date",
        y=data,
        color="Country/Region",
    )

    annotations = [
        # Source line, centred below the plot area.
        dict(
            xref='paper',
            yref='paper',
            x=0.5,
            y=-0.16,
            xanchor='center',
            yanchor='top',
            text='Source: Kaggle Coronavirus (COVID-19) Visualization & Prediction',
            font=dict(family='Arial', size=12, color='rgb(150,150,150)'),
            showarrow=False,
        ),
        # Title, left-aligned above the plot area.
        dict(
            xref='paper',
            yref='paper',
            x=0.0,
            y=1.05,
            xanchor='left',
            yanchor='bottom',
            text=f'Evolution of cases ({data}) per country',
            font=dict(family='Arial', size=30, color='rgb(37,37,37)'),
            showarrow=False,
        ),
    ]

    fig.update_layout(
        xaxis=dict(
            nticks=num_dates,
            showticklabels=True,
            tickangle=45,
        ),
        yaxis=dict(
            nticks=20,
        ),
        plot_bgcolor='white',
        annotations=annotations,
    )
    fig.show()


Insights:

- The China curve stabilized 1 month after the records from this database.
- The US speed of spread is alarmingly high (more than twice that of China, for a country with 1/3 of the population).
- Other countries that are heavily affected (Spain, France, Italy) have also high curves, but not as high.
- In comparison to other countries, the Netherlands' numbers might not seem high, but we need to take into consideration that the Netherlands is a much smaller country, with a smaller population. Its population density is higher, however, so the risk of contagion is greater.
- The confirmed case curve and the death curve are highly similar.
- China is leading in recovered cases.
-------

In [None]:
#Paolo: great plots, great that you can also select multiple plots! These plots display pretty much 
#exponential trend. Usually a log scale on y is much better instead of a linear scale to plot exponentials. 
#An exponential is a line in logscale, try it!

## Bending the curve

Are there any countries that have been able to revert the curve? 

To verify this we created 2 plots with datasets that are linked to our previous one, but show us different information.

There is one dataset that provides us with information of the daily new confirmed cases per country (and also world), and the second one provides us with information of the daily new deaths.

In [13]:
def _load_daily_frame(path, renames):
    """Read one daily-figures CSV and normalise its date columns.

    The 'Date' column is reformatted from '%b %d, %Y' to '%m/%d/%Y', rows
    dated 12/31/2019 are removed, the columns are renamed with `renames`
    (which must map 'Date' to 'Date Long'), a short 'Date' label with the
    trailing "/<year>" stripped is added, and the result is sorted by
    'Date Long'.
    """
    frame = pd.read_csv(path)
    frame['Date'] = frame['Date'].apply(
        lambda raw: datetime.datetime.strptime(raw, '%b %d, %Y').strftime('%m/%d/%Y')
    )
    frame = frame[frame['Date'] != '12/31/2019']
    frame = frame.rename(columns=renames)
    frame['Date'] = frame['Date Long'].replace(regex=True, to_replace=r'(\/\d{2})\w+', value=r'')
    return frame.sort_values(by=['Date Long'])


deaths = _load_daily_frame(
    'data/covid-confirmed-daily-deaths (1).csv',
    {'Date': 'Date Long', 'Daily confirmed deaths': 'Daily Confirmed Deaths', 'Entity': 'Country'},
)

cases = _load_daily_frame(
    'data/covid-confirmed-daily-cases (1).csv',
    {'Date': 'Date Long', 'Daily confirmed cases': 'Daily Confirmed Cases', 'Entity': 'Country'},
)

In [14]:
# Alphabetical list of every country present in the daily-deaths data set.
countries_list = sorted(deaths['Country'].unique())

@widgets.interact(
    num_dates=(1, deaths['Date'].nunique()),
    countries=widgets.SelectMultiple(
        options=countries_list,
        value=['Netherlands', 'France', 'Spain', 'China', 'United States', 'Germany'],
        description='Countries',
        disabled=False,
    ),
    data=widgets.Dropdown(
        options=['Daily Confirmed Cases', 'Daily Confirmed Deaths'],
        value='Daily Confirmed Cases',
        description='Data:',
        disabled=False,
    ),
    from_date=widgets.DatePicker(
        description='From',
        disabled=False,
    ),
    to_date=widgets.DatePicker(
        description='To',
        disabled=False,
    ),
)
def chart(data, countries, from_date, to_date, num_dates):
    """Plot the daily series `data` over time, one line per country.

    Parameters
    ----------
    data : str
        'Daily Confirmed Cases' (plots ``cases``) or 'Daily Confirmed Deaths'
        (plots ``deaths``). Any other value raises ``KeyError``.
    countries : sequence of str
        Countries to show; an empty selection shows every country.
    from_date, to_date : date-like or None
        Optional bounds picked in the date widgets. A bound is applied only
        when it falls inside the range of the data still selected, exactly
        as before; otherwise it is ignored.
    num_dates : int
        Number of ticks requested on the x axis.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # Explicit lookup: an unexpected option fails here with a clear KeyError
    # instead of leaving `df` unbound further down.
    df = {
        'Daily Confirmed Cases': cases,
        'Daily Confirmed Deaths': deaths,
    }[data]

    if len(countries) > 0:
        df = df[df['Country'].isin(countries)]

    # Compare real timestamps rather than 'MM/DD/YYYY' strings: string order
    # is only chronological within one calendar year ('01/05/2021' sorts
    # before '12/31/2020'). Assumes 'Date Long' is in MM/DD/YYYY format,
    # which the original strftime-based comparison relied on as well.
    if from_date is not None:
        start = pd.Timestamp(from_date)
        dates = pd.to_datetime(df['Date Long'], format='%m/%d/%Y')
        # With an empty selection dates.max() is NaT and the test is False.
        if start < dates.max():
            df = df[dates >= start]

    if to_date is not None:
        end = pd.Timestamp(to_date)
        dates = pd.to_datetime(df['Date Long'], format='%m/%d/%Y')
        if end > dates.min():
            df = df[dates <= end]

    fig = px.line(
        df,
        x="Date",
        y=data,
        color="Country",
    )

    annotations = [
        # Source line, centred below the plot area.
        dict(
            xref='paper',
            yref='paper',
            x=0.5,
            y=-0.16,
            xanchor='center',
            yanchor='top',
            text='Source: https://ourworldindata.org/',
            font=dict(family='Arial', size=12, color='rgb(150,150,150)'),
            showarrow=False,
        ),
        # Title, left-aligned above the plot area.
        dict(
            xref='paper',
            yref='paper',
            x=0.0,
            y=1.05,
            xanchor='left',
            yanchor='bottom',
            text=f'{data} per country',
            font=dict(family='Arial', size=30, color='rgb(37,37,37)'),
            showarrow=False,
        ),
    ]

    fig.update_layout(
        xaxis=dict(
            nticks=num_dates,
            showticklabels=True,
            tickangle=45,
        ),
        yaxis=dict(
            nticks=20,
        ),
        plot_bgcolor='white',
        annotations=annotations,
    )
    fig.show()

Insights:

- As in the previous plots, the distribution between the confirmed daily cases and confirmed deaths is relatively similar.
- So far, only 1 country has been able to bend the curve successfully: China.
- According to this data, France might also be on the right path to bend the curve, going from its highest peak of 7578 new cases on April 1st to 2116 on April 4th, the lowest number since March 23rd.
- Daily cases in the Netherlands have been consistently growing.

----

## Higher risk countries.

To define which are the High risk countries, we used some insight gained from our previous plots (on which we see that the most concerned population is older people), and country data on percentage of population 70 years old or over and the quantity of physicians per 1000 citizens.

Assuming that countries with higher rate of population >= 70 years old  will have a higher need of physicians and medical facilities to handle complications related to Coronavirus, we consider higher risk those countries who present a higher than average percentage of elderly people and lower than average quantity of physicians per 1000 citizens.

---

In [None]:
#Paolo: what is the average of elderly people you are talking about, a defined number, where is it?

In [16]:
# Physicians per 1,000 people: one row per country holding the column-wise maximum.
# NOTE(review): agg('max') takes each column's maximum independently, so the
# 'Year' and 'data' values of a country may come from different rows - confirm
# this is intended.
doctors = (
    pd.read_csv('data/physicians-per-1000-people.csv')
    .rename(columns={'Physicians (per 1,000 people) (per 1,000 people)': 'data', 'Entity': 'Country'})
    .groupby('Country')
    .agg('max')
    .reset_index()
)

# Share of the population aged 70 or over (%), reduced the same way.
age = (
    pd.read_csv('data/age.csv')
    .rename(columns={'Share who is 70 or over (%)': 'data', 'Entity': 'Country'})
    .groupby('Country')
    .agg('max')
    .reset_index()
)

# Inner join on country; keep the age-side Code/Year and give the two
# 'data' columns meaningful names.
merged_df = (
    pd.merge(age, doctors, on='Country')
    .drop(columns=['Code_y', 'Year_y'])
    .rename(columns={'Code_x': 'Code', 'Year_x': 'Year', 'data_x': 'age', 'data_y': 'doctors'})
)

In [17]:
# Alphabetical list of every country present in the physicians data set.
countries_list = sorted(doctors['Country'].unique())

@widgets.interact(
    countries=widgets.SelectMultiple(
        options=countries_list,
        description='Countries',
        disabled=False,
    ),
    data=widgets.Dropdown(
        options=['High Risk Countries(Age/Doctors Ratio)', 'Low Risk Countries(Age/Doctors Ratio)', "Overall"],
        value="Overall",
        description='Options:',
        disabled=False,
    ),
)
def chart(data, countries):
    """Scatter the share of people aged 70+ against physicians per 1000, per country.

    Parameters
    ----------
    data : str
        Dropdown option. The high-risk option keeps countries with
        age > 9 and doctors < 4, the low-risk option keeps countries with
        age < 9 and doctors > 4, and "Overall" applies no risk filter.
    countries : sequence of str
        Country selection; only used with "Overall", where a non-empty
        selection restricts the plot to those countries.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # Hard-coded cut-offs used by both risk filters.
    age_cutoff = 9
    doctors_cutoff = 4

    if data == 'High Risk Countries(Age/Doctors Ratio)':
        selected = merged_df[(merged_df['age'] > age_cutoff) & (merged_df['doctors'] < doctors_cutoff)]
    elif data == 'Low Risk Countries(Age/Doctors Ratio)':
        selected = merged_df[(merged_df['age'] < age_cutoff) & (merged_df['doctors'] > doctors_cutoff)]
    elif data == 'Overall' and len(countries) > 0:
        selected = merged_df[merged_df['Country'].isin(countries)]
    else:
        selected = merged_df

    fig = px.scatter(
        selected,
        x="age",
        y="doctors",
        color="Country",
        hover_data=['Country', 'age', 'doctors'],
    )

    source_note = dict(
        xref='paper',
        yref='paper',
        x=0.5,
        y=-0.16,
        xanchor='center',
        yanchor='top',
        text='Source: https://ourworldindata.org/',
        font=dict(family='Arial', size=12, color='rgb(150,150,150)'),
        showarrow=False,
    )
    title_note = dict(
        xref='paper',
        yref='paper',
        x=0.0,
        y=1.05,
        xanchor='left',
        yanchor='bottom',
        text='Countries risk according population age/doctors per 1000',
        font=dict(family='Arial', size=30, color='rgb(37,37,37)'),
        showarrow=False,
    )

    fig.update_layout(
        xaxis=dict(showticklabels=True, tickangle=45),
        yaxis=dict(nticks=20),
        plot_bgcolor='white',
        annotations=[source_note, title_note],
    )
    fig.show()


In [None]:
#Paolo: I do not understand the age variable: is it the age of the population? Why does it only go up to low numbers like 10, 15? At the beginning
#it is not clear how to use the plot, as if you do not do anything it only displays a point. Maybe better not to have
# the option and show the data for all countries directly. Or explain how to use the plot

Insights:

- Japan, Italy and Germany are amongst the countries with the oldest population. Germany and Italy are amongst the countries most affected by coronavirus.
- Cuba and Greece are the countries with the highest quantity of physicians per 1000 citizens.
- There are multiple high risk countries, and amongst these some countries that are in the list of most affected countries (like Netherlands, France, US).
- In low risk countries we have only 2, Israel and Turkmenistan.

---

## Conclusions

- Contrary to popular belief, younger adults are also affected by the coronavirus. Infection rate is lower, but still highly significant.
- So far, China has been the only country that has experienced a contamination crisis and has managed to revert it.
- The worst part is yet to come, if the data follows the same trend then the next days/week will be the worst part of the crisis.
- Social distancing is the only tool we have at the moment to face this crisis and manage to bend the curve like China.

## Improvements

- Get some more data on demographics.
- Get more detailed data on demographics and contagion/deaths/recoveries.
- Be able to link this data with comorbidity.
- Interactive maps.