In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#plotly allows to use interactive visualizations
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

import warnings
warnings.filterwarnings('ignore')

## Prepare pytest for unit testing

In [None]:
# Install ipytest so the pytest-style tests further down can run inside notebook cells.
!pip install ipytest

import pytest
import ipytest

# Let ipytest configure itself for notebook use.
# NOTE(review): the test cell below uses the `%%run_pytest[clean]` magic and the
# install above is unpinned -- confirm the installed ipytest still provides that magic.
ipytest.autoconfig()

# Get data (data is relevant to August 7, 2020)

## Data from https://www.worldometers.info/coronavirus/

In [None]:
# Per-country snapshot table; preview the first rows.
world_data = pd.read_csv(
    '../input/corona-virus-report/worldometer_data.csv'
)
world_data.head()

## Day wise no. of cases (Doesn't have country level data)

In [None]:
# Day-wise table (one row per date); preview the first rows.
daywise = pd.read_csv(
    '../input/corona-virus-report/day_wise.csv'
)
daywise.head()

## Day to day country wise no. of cases (Has County/State/Province level data) 

In [None]:
# Per-date, per-country table; preview the first rows.
full_grouped = pd.read_csv(
    '../input/corona-virus-report/full_grouped.csv'
)
full_grouped.head()

With the first cases dating back to November 2019 \[[1](https://www.scmp.com/news/china/society/article/3074991/coronavirus-chinas-first-confirmed-covid-19-case-traced-back), [2](https://www.theguardian.com/world/2020/mar/13/first-covid-19-case-happened-in-november-china-government-records-show-report)\], the novel coronavirus disease COVID-19 had spread outside China no later than January 2020 \[[3](https://www.who.int/news/item/27-04-2020-who-timeline---covid-19)\].

## The total number of COVID cases grew exponentially throughout the first half of 2020

In [None]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [None]:
def show_cases_by_date(data, country):
    """Draw a stacked area chart of deaths, recovered and active cases for one country.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Country/Region', 'Date', 'Deaths',
        'Recovered' and 'Active'.
    country : str
        Value to look up in 'Country/Region'. When no row matches (for example
        while a name is still being typed into the combobox) nothing is drawn.

    Returns
    -------
    None
    """
    country_rows = data[data['Country/Region'] == country]
    if country_rows.empty:
        return

    # Reshape wide -> long in a single vectorized step instead of appending one
    # row at a time (DataFrame.append is quadratic and was removed in pandas 2.0).
    # value_vars order fixes the order in which the metrics first appear, and
    # therefore the stacking order of the areas.
    area_data = country_rows.melt(
        id_vars='Date',
        value_vars=['Deaths', 'Recovered', 'Active'],
        var_name='Metric',
        value_name='Cases',
    )

    fig = px.area(area_data,
                  title=f'Cases by date in {country}',
                  x='Date',
                  y='Cases',
                  color='Metric',
                  line_group='Metric'
                  )
    fig.show()


data = full_grouped[['Country/Region',
                     'Date',
                     'Deaths',
                     'Recovered',
                     'Active']]

# Add one synthetic 'all countries' row per date holding the sum over every
# country. A single groupby + concat replaces the per-date DataFrame.append
# loop, which was quadratic and no longer exists in pandas >= 2.0.
world_totals = (
    data.groupby('Date', as_index=False)[['Deaths', 'Recovered', 'Active']]
    .sum()
    .assign(**{'Country/Region': 'all countries'})
)
data = pd.concat([data, world_totals[data.columns]], ignore_index=True)

options = list(np.unique(data['Country/Region']))
interactive(show_cases_by_date,
            data=fixed(data),
            country=widgets.Combobox(options=options,
                                     value='all countries'))

## The highest daily count, 282,756 new cases, was registered on July 23

In [None]:
# Row of the day-wise table with the largest number of new cases.
peak = (
    daywise.iloc[daywise['New cases'].argmax()]
    [['Date', 'New cases']]
)

date, cases = peak['Date'], peak['New cases']
fig = px.line(daywise,
              x='Date',
              y=['New cases', 'New recovered', 'New deaths'],
              labels={'variable': 'Metric', 'value': 'Cases'},
              title=f'New cases by date. Greatest number of {cases:,} cases on {date}',
              custom_data=['variable', 'Date', 'value']
              )
# Hover shows the date and the count with thousands separators. The format
# needs explicit precision digits: ':,.f' is not a valid d3-format specifier,
# ':,.0f' is.
fig.update_traces(hovertemplate='<br>'.join([
                  'Date=%{customdata[1]}',
                  'Cases=%{customdata[2]:,.0f}'
                  ]))
fig.show()


## 16,480,485 cases were confirmed from January 22 to July 27

In [None]:
# First and last rows of the day-wise table bound the covered period.
first_day = daywise.iloc[0]
last_day = daywise.iloc[-1]

print('Start date: {}. Confirmed cases: {:,}'
      .format(first_day['Date'], first_day['Confirmed']))
print('End date: {}. Confirmed cases: {:,}'
      .format(last_day['Date'], last_day['Confirmed']))

# Only the two date strings are kept; later cells read `start` and `end`.
start = first_day['Date']
end = last_day['Date']

In [None]:
# Animated bubble map: one frame per date, bubble size = confirmed cases,
# bubble color = WHO region.
data = full_grouped.copy()
spread_title = f'COVID spread over the world from {start} to {end}'

fig = px.scatter_geo(
    data,
    locations='Country/Region',
    locationmode='country names',
    hover_name='Country/Region',
    color='WHO Region',
    size='Confirmed',
    animation_frame='Date',
    projection='natural earth',
    title=spread_title,
)

fig.show()

## 654,036 people died of COVID

In [None]:
# Deaths value in the last row of the day-wise table.
total_deaths = daywise.iloc[-1]['Deaths']
print('Deaths: {:,}'.format(total_deaths))

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_country_codes():
    """Download the COUNTRY/CODE lookup table once and reuse it on later calls.

    The returned frame is shared between calls, so callers must not modify it
    in place.
    """
    return (
        pd.read_csv(
            'https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')
        .drop('GDP (BILLIONS)', axis=1)
        # Rename two entries -- presumably to match the spelling used in the
        # 'Country/Region' column they are merged against below.
        .replace(['United States', 'Czech Republic'],
                 ['US', 'Czechia'])
    )


def country_death_by_date(full_grouped, region):
    """Show an animated choropleth of deaths per country, one frame per date.

    Parameters
    ----------
    full_grouped : pandas.DataFrame
        Must contain 'Date', 'Country/Region', 'Deaths' and 'WHO Region'.
    region : str
        A 'WHO Region' value to restrict the map to, or 'all regions' to keep
        every row.

    Notes
    -----
    Countries are matched to codes by name with an inner merge, so rows whose
    'Country/Region' has no counterpart in the lookup table are not drawn.
    """
    # Cached: the widget calls this function on every dropdown change, and the
    # lookup table does not depend on the selected region.
    country_codes = _load_country_codes()

    data = full_grouped[['Date', 'Country/Region', 'Deaths', 'WHO Region']]

    if region != 'all regions':
        data = data[data['WHO Region'] == region]

    data = pd.merge(data,
                    country_codes,
                    how='inner',
                    left_on='Country/Region',
                    right_on='COUNTRY')
    fig = px.choropleth(data,
                        locations='CODE',
                        color='Deaths',
                        color_continuous_scale=px.colors.sequential.Plasma,
                        labels={'CODE': 'Country', 'Deaths': 'Total deaths'},
                        title=f'Deaths from COVID by country in {region}',
                        animation_frame='Date'
                        )
    fig.show()


# Dropdown choices: the catch-all entry first, then every WHO region in sorted order.
regions = ['all regions', *np.unique(full_grouped['WHO Region']).tolist()]
interactive(
    country_death_by_date,
    full_grouped=fixed(full_grouped),
    region=regions,
)

## Most deaths from COVID happened in the Americas

In [None]:
import datetime


def show_cases_by_region(full_grouped, date):
    """Show two donut charts (confirmed cases and deaths) split by WHO region for one date.

    Parameters
    ----------
    full_grouped : pandas.DataFrame
        Must contain 'Date' (as 'YYYY-MM-DD' strings), 'WHO Region',
        'Confirmed' and 'Deaths'.
    date : datetime.date or None
        Date to display. ``None`` (an emptied date picker) and dates outside
        the range covered by ``full_grouped`` produce a text label instead of
        a chart.
    """
    # A cleared DatePicker delivers None; there is nothing to format or plot.
    if date is None:
        display(widgets.Label(value='No date selected.'))
        return

    date = date.strftime('%Y-%m-%d')
    start, end = full_grouped['Date'].min(), full_grouped['Date'].max()

    # Dates are compared as strings, which is consistent for the
    # zero-padded 'YYYY-MM-DD' format produced above.
    if date < start or date > end:
        display(widgets.Label(value=f'''No information for date: {date}.
        Available dates are from {start} to {end}.'''))
        return

    data = full_grouped[full_grouped['Date'] == date]

    # Build the figure only once the date is known to be valid.
    fig = make_subplots(
        rows=1,
        cols=2,
        specs=[[{'type': 'domain'}, {'type': 'domain'}]]
    )

    # One donut per metric, side by side.
    # NOTE(review): `pull` has six entries while one label per row of `data`
    # is passed -- confirm which slice is meant to be pulled out.
    for col, metric in enumerate(['Confirmed', 'Deaths'], start=1):
        fig.add_trace(
            go.Pie(
                labels=data['WHO Region'],
                values=data[metric],
                pull=[0, 0, 0, 0, 0, 0.3],
                hole=0.4,
                title=metric
            ),
            1, col
        )

    fig.update_layout(title_text=f'''
    Cases by WHO Region to date {date}
    ''')
    fig.show()


# Date picker preset to `end`, the last date printed in the summary cell above.
region_date_picker = widgets.DatePicker(
    value=datetime.datetime.strptime(end, '%Y-%m-%d'),
    description='Pick a date',
    disabled=False,
)
interactive(
    show_cases_by_region,
    full_grouped=fixed(full_grouped),
    date=region_date_picker,
)


## What is the ratio of the dead to the recovered across countries?

In [None]:
data = full_grouped.copy()

# Keep only the ten countries with the most confirmed cases on the latest
# date present in the data; the animation below is limited to them.
last_date = np.unique(data['Date']).max()
latest_rows = data[data['Date'] == last_date]
countries = (
    latest_rows
    .sort_values('Confirmed', ascending=False)
    .head(10)['Country/Region']
)

data = data[data['Country/Region'].isin(countries)]

# Stacked bars per country, animated over the dates.
fig = px.bar(
    data,
    x='Country/Region',
    y=['Recovered', 'Active', 'Deaths'],
    animation_frame='Date',
    title='Recovered to active cases/deaths during the first half of year 2020',
    labels={'value': 'Cases', 'variable': 'Metric'},
    custom_data=['variable', 'Country/Region', 'Date', 'value'],
)
fig.show()


## Luxembourg tested the entire population

In [None]:
def show_tested_ratio(world_data, continent):
    """Draw a treemap of test coverage (TotalTests / Population) per country.

    ``continent`` selects the rows whose 'Continent' equals it; the special
    value 'all continents' keeps every row. ``world_data`` itself is left
    untouched.
    """
    wanted_columns = ['Country/Region', 'TotalTests', 'Population', 'Continent']
    selected = world_data[wanted_columns].copy()

    if continent != 'all continents':
        selected = selected[selected['Continent'] == continent]

    # Share of the population covered by tests.
    coverage = selected.assign(
        Coverage=selected['TotalTests'] / selected['Population']
    )

    hover_lines = [
        'Country=%{customdata[0]}',
        'Coverage=%{customdata[1]:.2f}',
    ]

    fig = px.treemap(
        coverage,
        path=['Country/Region'],
        values='Coverage',
        title=f'<b>Test coverage by country in {continent}</b>',
        labels={'labels': 'Country'},
        custom_data=['Country/Region', 'Coverage'],
    )
    fig.update_traces(hovertemplate='<br>'.join(hover_lines))
    fig.show()

# Dropdown choices: the catch-all entry plus every continent present in the data.
# Missing values are dropped explicitly; slicing off the last sorted entry
# only worked while a 'nan' string happened to sort last, and would discard a
# real continent if the column had no missing values.
continents = ['all continents'] + sorted(
    world_data['Continent'].dropna().astype(str).unique()
)
interactive(show_tested_ratio,
            world_data=fixed(world_data),
            continent=continents)

## Show the start date in a given country

In [None]:
def get_start_date(full_grouped, country):
    """Return the first date on which a country has a non-zero confirmed count.

    The result is a ``(date, confirmed)`` tuple taken from the first row of
    the country whose 'Confirmed' value differs from 0, or ``None`` when
    ``full_grouped`` holds no rows for ``country``. If every 'Confirmed'
    value is 0, the country's first row is returned.
    """
    is_country = full_grouped['Country/Region'] == country
    country_rows = (
        full_grouped.loc[is_country, ['Date', 'Confirmed']]
        .reset_index(drop=True)
    )

    if country_rows.empty:
        return None

    # idxmax on a boolean series gives the position of the first True.
    first_hit = country_rows['Confirmed'].ne(0).idxmax()
    first_row = country_rows.iloc[first_hit]
    return first_row['Date'], first_row['Confirmed']
    
# Example call: show the (date, confirmed) result for Czechia as the cell output.
get_start_date(full_grouped, 'Czechia')

## Show the maximum number of new cases and its date in a given country

In [None]:
def get_max_new_cases(full_grouped, country):
    """Return the largest daily 'New cases' value of a country and its date.

    The result is a ``(new_cases, date)`` tuple for the first row holding the
    maximum, or ``None`` when ``full_grouped`` holds no rows for ``country``.
    """
    is_country = full_grouped['Country/Region'] == country
    country_rows = (
        full_grouped.loc[is_country, ['Date', 'New cases']]
        .reset_index(drop=True)
    )

    if country_rows.empty:
        return None

    peak_row = country_rows.iloc[country_rows['New cases'].idxmax()]
    return peak_row['New cases'], peak_row['Date']
    
# Example call: show the (new cases, date) result for Germany as the cell output.
get_max_new_cases(full_grouped, 'Germany')

## Show the longest increase in new cases in a given country

In [None]:
def get_longest_increase(full_grouped, country):
    df = (
        full_grouped[['Date', 'Country/Region', 'New cases']]
        [full_grouped['Country/Region'] == country]
        .drop('Country/Region', axis=1)
    )
    
    if df.empty:
        return None
    
    start, _ = get_start_date(full_grouped, country)
    end = None
    
    df.index = pd.Index([i for i in range(len(df))])
    start_idx = df.index[df['Date'] == start].tolist()[0]
    cur_len, cur_start = 1, start
    longest_len = 0
    
    for i in range(start_idx + 1, len(df) - 1):
        if (df.iloc[i]['New cases'] >= df.iloc[i+1]['New cases']):
            if cur_len > longest_len:
                longest_len = cur_len
                start = cur_start
                end = df.iloc[i]['Date']
            cur_len, cur_start = 1, df.iloc[i+1]['Date']
            continue
        cur_len += 1
                
    if end is None:
        end = df.iloc[-1]['Date']
    return longest_len, start, end
            
        
# Example call: show the (length, start date, end date) result for Czechia as the cell output.
get_longest_increase(full_grouped, 'Czechia')

## Pytest tests

In [None]:
%%run_pytest[clean]

def test_get_start_date():
    assert(get_start_date(full_grouped, 'United Kingdom') == ('2020-01-31', 2))
    assert(get_start_date(full_grouped, 'Germany') == ('2020-01-27', 1))
    assert(get_start_date(full_grouped, 'Czechia') == ('2020-03-01', 3))
    assert(get_start_date(full_grouped, '123') == None)
    
def test_get_max_new_cases():
    assert(get_max_new_cases(full_grouped, 'United Kingdom') == (5505, '2020-04-22'))
    assert(get_max_new_cases(full_grouped, 'Germany') == (6933, '2020-03-27'))
    assert(get_max_new_cases(full_grouped, 'Czechia') == (381, '2020-04-04'))
    assert(get_max_new_cases(full_grouped, '123') == None)
    
def test_get_longest_increase():
    assert(get_longest_increase(full_grouped, 'United Kingdom') == (7, '2020-03-14', '2020-03-20'))
    assert(get_longest_increase(full_grouped, 'Germany') == (6, '2020-07-12', '2020-07-17'))
    assert(get_longest_increase(full_grouped, 'Czechia') == (6, '2020-06-14', '2020-06-19'))
    assert(get_longest_increase(full_grouped, '123') == None)

In [None]:
def show_brief(full_grouped, country):
    """Display three one-line facts about a country as text labels.

    Shown in order: the first date with confirmed cases, the day with the most
    new cases, and the longest streak of rising new cases. When there is no
    data for ``country`` a single "no data" label is shown instead.
    """
    def say(text):
        # Every message is rendered as its own Label widget.
        display(widgets.Label(value=text))

    first = get_start_date(full_grouped, country)
    if first is None:
        say(f'There is no data for country {country}')
        return

    first_date, first_count = first
    noun = 'cases' if first_count != 1 else 'case'
    say(f'First confirmed {first_count} {noun} on {first_date}')

    peak_cases, peak_date = get_max_new_cases(full_grouped, country)
    say('Maximal value of {} new cases reached on {}'
        .format(peak_cases, peak_date))

    run_len, run_from, run_to = get_longest_increase(full_grouped, country)
    unit = 'days' if run_len != 1 else 'day'
    say('Longest increase in new cases from {} till {} lasted {} {}'
        .format(run_from, run_to, run_len, unit))


In [None]:
# Sorted unique country names feed the combobox; the first one is preselected.
options = list(np.unique(full_grouped['Country/Region']))

brief_caption = widgets.Label(
    value='Show brief information about COVID-19 spread in country'
)
display(brief_caption)

interactive(
    show_brief,
    full_grouped=fixed(full_grouped),
    country=widgets.Combobox(value=options[0], options=options),
)

## Visualizing COVID deadliness in USA by state and age group

In [None]:
data = pd.read_csv(
    '../input/provisional-covid-death-counts-by-sexagestate/Provisional_COVID-19_Death_Counts_by_Sex__Age__and_State.csv')

data = data[['State',
             'Sex',
             'Age group',
             'COVID-19 Deaths']]

# Drop the 'All Ages', 'United States' and 'Unknown' rows so that only
# per-state, per-age-group, per-sex rows remain. The explicit copy makes the
# column assignment below act on an independent frame instead of a slice of
# `data` (avoids pandas' SettingWithCopy ambiguity).
death_by_age = (data[(data['Age group'] != 'All Ages') &
                     (data['State'] != 'United States') &
                     (data['Sex'] != 'Unknown')]).copy()

# Missing death counts are treated as zero.
death_by_age['COVID-19 Deaths'] = death_by_age['COVID-19 Deaths'].fillna(0)


def heatmap_death_by_age(death_by_age, sex):
    """Draw a state x age-group heatmap of COVID-19 deaths for one sex.

    Parameters
    ----------
    death_by_age : pandas.DataFrame
        Must contain 'State', 'Sex', 'Age group' and 'COVID-19 Deaths'.
    sex : str
        Value of the 'Sex' column to keep.

    Notes
    -----
    When several rows share the same state and age group the first one is
    used. Combinations with no row are passed to the heatmap as ``None`` and
    left without hover text (``hoverongaps=False``).
    """
    selected = death_by_age[death_by_age['Sex'] == sex]

    # One pivot builds the whole state x age-group matrix; rows and columns
    # come out sorted, and absent combinations become NaN.
    matrix = (
        selected
        .drop_duplicates(['State', 'Age group'])
        .pivot(index='State', columns='Age group', values='COVID-19 Deaths')
    )

    death_list = [
        [None if pd.isna(value) else value for value in row]
        for row in matrix.to_numpy().tolist()
    ]

    fig = go.Figure(data=go.Heatmap(
        z=death_list,
        x=list(matrix.columns),
        y=list(matrix.index),
        hoverongaps=False))

    fig.update_layout(
        title="Deaths from COVID-19 by age group and state",
        xaxis_title="Age group",
        yaxis_title="State"
    )
    fig.show()


# Heatmap with a dropdown for switching between the two sexes.
interactive(
    heatmap_death_by_age,
    death_by_age=fixed(death_by_age),
    sex=['Male', 'Female'],
)

## References:
1. https://www.scmp.com/news/china/society/article/3074991/coronavirus-chinas-first-confirmed-covid-19-case-traced-back
2. https://www.theguardian.com/world/2020/mar/13/first-covid-19-case-happened-in-november-china-government-records-show-report
3. https://www.who.int/news/item/27-04-2020-who-timeline---covid-19