In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio

In [None]:
# Make "plotly_white" the default Plotly template.
pio.templates.default = "plotly_white"

## Data preparation
`get_columns_mapping` is a utility function for renaming columns.

In [None]:
def get_columns_mapping(dictionary):
    """Build a renaming function backed by ``dictionary``.

    The returned callable takes a name and gives back its replacement from
    ``dictionary``; a name that has no entry is returned unchanged.
    """
    return lambda name: dictionary.get(name, name)

### Data about covid
I will load data from the "[Novel Corona Virus 2019 Dataset](https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset)" because it is one of the most recently updated datasets. Other datasets are also based on data from Johns Hopkins University and should be of the same quality, but I don't want to add a "git clone" cell for the original data here, because it would run every time the notebook is executed.

In [None]:
# Read the raw Kaggle CSV and switch its headers to snake_case names.
covid_column_names = {
    "SNo": "s_no",
    "ObservationDate": "observation_date",
    "Province/State": "province_state",
    "Country/Region": "country_region",
    "Last Update": "last_update",
    "Confirmed": "confirmed",
    "Deaths": "deaths",
    "Recovered": "recovered",
}
covid_data = pd.read_csv(
    '/kaggle/input/novel-corona-virus-2019-dataset/covid_19_data.csv'
).rename(columns=get_columns_mapping(covid_column_names))

# Convert both date columns to pandas datetimes.
for date_column in ('observation_date', 'last_update'):
    covid_data[date_column] = pd.to_datetime(covid_data[date_column])

# Rows labelled 'Mainland China' are relabelled as plain 'China'.
is_mainland_china = covid_data['country_region'] == 'Mainland China'
covid_data.loc[is_mainland_china, 'country_region'] = 'China'
covid_data.head()

### Population data
I will also use the "[Population by Country - 2020](https://www.kaggle.com/tanuprabhu/population-by-country-2020)" dataset because I would like to look at the data relative to population size. Otherwise, a village of 100 people with 50 infected (50% of its population) would look better than a country where only a small percentage of the population has confirmed cases.

In [None]:
# Read the population table and switch its headers to snake_case names.
population_column_names = {
    "Country (or dependency)": "country",
    "Population (2020)": "population",
    "Yearly Change": "yearly_change",
    "Net Change": "net_change",
    "Density (P/Km²)": "density",
    "Land Area (Km²)": "land_area",
    "Migrants (net)": "migrants",
    "Fert. Rate": "fert_rate",
    "Med. Age": "med_age",
    "Urban Pop %": "urban_pop_percent",
    "World Share": "world_share",
}
population_df = pd.read_csv(
    '/kaggle/input/population-by-country-2020/population_by_country_2020.csv'
).rename(columns=get_columns_mapping(population_column_names))
population_df.head()

`population_mapping` just maps from country name to population.

In [None]:
def population_mapping(country):
    """Look up the population of ``country`` in ``population_df``.

    The COVID dataset name 'US' is translated to 'United States' before the
    lookup. Returns the first matching value of the ``population`` column,
    or ``None`` when no row of ``population_df`` has that country name.
    """
    lookup_name = 'United States' if country == 'US' else country
    matches = population_df.loc[population_df['country'] == lookup_name, 'population']
    if matches.empty:
        return None
    return matches.tolist()[0]

The situation per province or state is too difficult to grasp and requires knowledge of the anti-COVID policy of each province. So let's take a more general view and aggregate the data per country. I am interested in comparing a few countries.

In [None]:
# Sum the per-province rows into one row per (country, observation date).
covid_by_country = (
    covid_data[['observation_date', 'country_region', 'confirmed', 'deaths', 'recovered']]
    .groupby(['country_region', 'observation_date'])
    .sum()
    .reset_index()
)

# Keep only the countries that are compared in the plots. The .copy() makes
# the filtered frame independent, so the column assignments below do not
# write into a slice of the aggregated frame.
countries_to_compare = [
    'US', 'India', 'Brazil', 'France', 'China', 'Russia',
    'Spain', 'United Kingdom', 'Italy', 'Japan', 'Ukraine', 'Belarus',
]
covid_by_country = covid_by_country[
    covid_by_country['country_region'].isin(countries_to_compare)
].copy()

# sort_values returns a new frame, so the result has to be assigned back.
# Later per-country day-over-day differences rely on this row order.
covid_by_country = covid_by_country.sort_values(by=['country_region', 'observation_date'])

# Population in millions for every row, looked up once instead of once per metric.
population_millions = covid_by_country['country_region'].map(
    lambda country: population_mapping(country) / 1000000
)

# Convert the absolute counts into counts per one million inhabitants.
for metric in ('confirmed', 'deaths', 'recovered'):
    covid_by_country[metric] = covid_by_country[metric] / population_millions
covid_by_country.head()

## Plotting
### Confirmed cases
The number of cases relative to population should be a useful metric for estimating the effectiveness of the protective measures taken.

**Problem:** we cannot know the real number of infected people. However, the real number should be at least as large as the official value.

In [None]:
# Line chart of the per-1M confirmed values over time, one line per country.
px.line(
    covid_by_country,
    x='observation_date',
    y='confirmed',
    color='country_region',
    title="Confirmed cases per 1M population",
    labels={
        "observation_date": "Date",
        "confirmed": "Confirmed cases (per 1M)",
        "country_region": "Country",
    },
)

### Deaths
The number of deaths is a useful metric for estimating the efficiency of the medical system.

In [None]:
# Line chart of the per-1M death values over time, one line per country.
px.line(
    covid_by_country,
    x='observation_date',
    y='deaths',
    color='country_region',
    title="Deaths by coronavirus per 1M population",
    labels={
        "observation_date": "Date",
        "deaths": "Death cases (per 1M)",
        "country_region": "Country",
    },
)

### Active cases
It is also interesting to know the number of infected people who have not yet recovered. They are actively spreading the disease, and the number of new cases should be proportional to this value.

In [None]:
# Active cases: confirmed cases that ended neither in death nor in recovery.
covid_by_country['active_cases'] = (
    covid_by_country['confirmed']
    - covid_by_country['deaths']
    - covid_by_country['recovered']
)

In [None]:
# Line chart of the per-1M active-case values over time, one line per country.
px.line(
    covid_by_country,
    x='observation_date',
    y='active_cases',
    color='country_region',
    title="Coronavirus active cases per 1M population",
    labels={
        "observation_date": "Date",
        "active_cases": "Active cases (per 1M)",
        "country_region": "Country",
    },
)

### New confirmed cases
Let's also calculate the number of new confirmed cases per day. It shows the current dynamics of the disease's spread.

In [None]:
# New confirmed cases: difference between each row's confirmed value and the
# previous row of the same country. The first row of every country has no
# predecessor, so its difference is filled with 0.
# Computing the column in one step avoids creating it as an integer column
# and then overwriting it with float values country by country.
# NOTE: relies on the rows of each country being in date order, which holds
# because the frame was built by grouping on (country_region, observation_date).
covid_by_country['confirmed_new'] = (
    covid_by_country.groupby('country_region')['confirmed'].diff().fillna(0)
)

In [None]:
# Line chart of the per-1M new confirmed cases over time, one line per country.
px.line(
    covid_by_country,
    x='observation_date',
    y='confirmed_new',
    color='country_region',
    title="New confirmed cases per 1M population",
    labels={
        "observation_date": "Date",
        # The key must match the plotted column name for the axis label to apply.
        "confirmed_new": "New confirmed cases (per 1M)",
        "country_region": "Country",
    },
)

Negative values are caused by errors in the dataset. For example, the number of confirmed cases in Spain dropped from 236899 to 213435 on 30 April. This causes the largest negative peak on this plot.