In [None]:
from pandas import read_csv, DataFrame
from requests import get
from matplotlib import pyplot as plt
from datetime import datetime
from os.path import isfile
import numpy as np
import seaborn as sns
import geopandas as gpd

# ECDC case-distribution CSV endpoint that the snapshot is downloaded from.
data_url = "http://opendata.ecdc.europa.eu/covid19/casedistribution/csv/"
# Local cache file, one per calendar day: today's date is embedded as DD-MM-YYYY.
local_file = "snapshot-{:%d-%m-%Y}.csv".format(datetime.today())

First step: retrieve a fresh snapshot of the data.

In [None]:
# Download today's snapshot only once; later runs reuse the cached file.
if isfile(local_file):
    print("We have snapshot")
else:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    data = get(data_url, timeout=60)
    # Fail loudly on HTTP errors: otherwise an error page would be written to
    # disk and the isfile() guard would keep reusing it as "the snapshot".
    data.raise_for_status()
    with open(local_file, "wb") as out_f:
        out_f.write(data.content)
    print("Got new one")

Now we can load this dataset and take a sneak peek.

In [None]:
# Compact dtypes: categoricals for the repeated code columns and a nullable
# integer for population (presumably because it is missing for some rows).
dtypes_dict = {"geoId": "category", "countryterritoryCode": "category", "popData2018": "Int64"}
# dateRep is written day-first (DD/MM/YYYY). `dayfirst=True` lets pandas parse
# it natively; the previous per-value strptime callback went through
# `date_parser`, which is deprecated since pandas 2.0.
frame = read_csv(local_file, dtype=dtypes_dict, parse_dates=["dateRep"], dayfirst=True)
frame.head()

In [None]:
# Column dtypes after loading: check that the requested category / Int64 types
# and the parsed dateRep column took effect.
frame.dtypes

In [None]:
# Baseline memory footprint: per-column usage summed into a single number.
# NOTE(review): with the default deep=False, object (string) columns are not
# measured by content; memory_usage(deep=True) would show the real footprint.
frame.memory_usage().sum()

We can reduce memory consumption by using proper dtypes. For example, the country/territory name can be a category too.

In [None]:
# Country names repeat on many rows, so store them as a categorical as well.
frame["countriesAndTerritories"] = frame["countriesAndTerritories"].astype("category")

In [None]:
# Re-measure after turning the country name column into a categorical;
# compare with the figure printed before the conversion.
frame.memory_usage().sum()

In [None]:
%%script false --no-raise-error

# Example of manual date parsing, kept for reference only: the `%%script false`
# magic on the first line stops this cell from being executed as Python.
# NOTE(review): strptime needs string input, so this would only work while
# dateRep is still raw text — the load cell already parses it; confirm before enabling.
from datetime import datetime
# Row-wise apply: one strptime call per row, result stored in a new dateConv column.
frame['dateConv'] = frame.apply(lambda x: datetime.strptime(x.dateRep, "%d/%m/%Y"), axis=1)
frame.head()

In [None]:
# Non-null values per column — a quick check for missing data.
frame.count()

In [None]:
# Summary statistics for the frame's columns.
frame.describe()

A DataFrame supports all of the features mentioned above — for example, filtering.

In [None]:
# Per-column non-null counts over the rows that report at least 100 deaths.
frame.loc[frame["deaths"] >= 100].count()

In [None]:
# Number of distinct countries/territories present in the snapshot.
frame["countriesAndTerritories"].nunique()

Let's get a small slice of data to work with

In [None]:
# Take an independent copy of the Montenegro rows so that later edits to
# `monty` never write back into `frame`.
monty = frame.loc[frame["countriesAndTerritories"] == "Montenegro"].copy()
monty.count()

In [None]:
# First rows of the Montenegro slice, still in the order they had in `frame`.
monty.head()

In [None]:
# Put the rows in chronological order (in place) before accumulating totals.
monty.sort_values("dateRep", inplace=True)

In [None]:
# Renumber the rows 0..n-1 and discard the old labels, so that label-based
# access by position number works on this slice.
monty.reset_index(inplace=True, drop=True)

In [None]:
# Same peek again, now sorted by date and with a fresh 0-based index.
monty.head()

In [None]:
# Deliberately naive running total, filled in one row at a time.
monty["totalCases"] = 0

# Seed the first row, then carry the accumulated value forward row by row.
running_total = monty.loc[0, "cases"]
monty.loc[0, "totalCases"] = running_total
for row in range(1, len(monty)):
    running_total = monty.loc[row, "cases"] + running_total
    monty.loc[row, "totalCases"] = running_total

monty.head()

Explicit iteration is discouraged in NumPy/pandas; use the built-in methods that operate on whole Series/DataFrames instead.

In [None]:
%%script false --no-raise-error

# Vectorised equivalent of the row-by-row loop: cumulative sum of daily cases.
# Disabled by the `%%script false` magic on the first line, so totalCases
# keeps the values computed by the loop.
monty['totalCases'] = monty.cases.cumsum()

In [None]:
# Cumulative case count over time as a simple line chart.
monty.plot.line(x="dateRep", y="totalCases")

Let's make the plots a bit fancier; for that we can use the seaborn package.

In [None]:
# Apply seaborn's default styling to the plots drawn from here on.
sns.set()

In [None]:
# Cumulative and daily cases on the same axes. Each line gets an explicit
# label and the legend is drawn, so the two series can be told apart.
plt.plot("dateRep", "totalCases", data=monty, label="totalCases")
plt.plot("dateRep", "cases", data=monty, label="cases")
plt.xlabel("Report date")
plt.ylabel("Cases")
# The trailing semicolon hides the Legend repr in the cell output.
plt.legend();

Now let's compute cumulative statistics at the per-country level; for that we will use grouping.

In [None]:
# Per-country totals of cases and deaths, ordered by death toll (highest first).
grouped = (
    frame.groupby("countriesAndTerritories")[["cases", "deaths"]]
    .sum()
    .sort_values(by="deaths", ascending=False)
)

grouped.head(10)

In [None]:
# First 20 rows of `grouped`, i.e. the countries with the most deaths.
plot_data = grouped.iloc[:20]

In [None]:
# Side-by-side bars on twin y-axes: cases on the left scale, deaths on the right.
fig, ax = plt.subplots()
ax2 = ax.twinx()  # second y-axis that shares the x-axis with `ax`

width = 0.4

plot_data = grouped.iloc[:20]

# position=0 / position=1 align the two bar series on opposite sides of each
# tick so they sit next to each other instead of overlapping.
plot_data["cases"].plot.bar(ax=ax, color="blue", width=width, position=0)
plot_data["deaths"].plot.bar(ax=ax2, color="red", width=width, position=1)

ax.set_ylabel("Cases")
ax2.set_ylabel("Deaths")

plt.show()

In [None]:
# Same chart through the pandas shortcut: deaths go on a secondary y-axis.
plot_data.plot.bar(secondary_y="deaths", figsize=(12, 8))

In [None]:
# Named aggregations: output column -> (source column, reducer).
country_aggregations = {
    "totalCases": ("cases", "sum"),
    "totalDeath": ("deaths", "sum"),
    "population": ("popData2018", "first"),
    "countryCode": ("countryterritoryCode", "first"),
}
grouped2 = frame.groupby("countriesAndTerritories").agg(**country_aggregations)

grouped2.head()

In [None]:
# Deaths per 1,000 inhabitants, stored in the `lethalRate` column.
deaths_per_thousand = grouped2["totalDeath"] / grouped2["population"] * 1000
grouped2["lethalRate"] = deaths_per_thousand
grouped2.sort_values(by="lethalRate", ascending=False, inplace=True)

# The 20 countries with the highest rate.
new_plot_data = grouped2.iloc[:20]

In [None]:
# Bar chart of the rate for the top-20 slice.
new_plot_data["lethalRate"].plot.bar()

Let's make a better visualisation by building a _choropleth_ map.

In [None]:
# Country outlines used as the base layer of the map.
try:
    # Sample dataset bundled with GeoPandas. It was deprecated in 0.14 and
    # removed in 1.0, where this lookup raises AttributeError.
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
except AttributeError:
    # Fall back to the upstream Natural Earth 1:110m countries file and reshape
    # it to the lowercase columns of the old bundled dataset, because later
    # cells rely on `name` and `iso_a3`.
    natural_earth_url = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
    world = gpd.read_file(natural_earth_url).rename(
        columns={"POP_EST": "pop_est", "CONTINENT": "continent", "NAME": "name", "ISO_A3": "iso_a3"}
    )[["pop_est", "continent", "name", "iso_a3", "geometry"]]

world.head()

In [None]:
# Quick look at the plain base map before any statistics are joined to it.
world.plot(figsize=(16,14))

In [None]:
# Inner join of country shapes with the per-country statistics on the
# three-letter code; rows without a partner on the other side are dropped.
merged = world.merge(right=grouped2, how="inner", left_on="iso_a3", right_on="countryCode")
merged.head()

In [None]:
# Non-null count per column; shows how many shapes survived the inner join.
merged.count()

In [None]:
# First choropleth attempt: lethalRate split into 10 equal-width classes
# (scheme='equal_interval', k=10) and coloured with the tab10 palette.
merged.plot(column="lethalRate", cmap="tab10", figsize=(16,14), scheme='equal_interval', k=10, legend=True)

In [None]:
# Shapes whose iso_a3 is the '-99' value; these find no countryCode partner
# in the join on iso_a3.
world.loc[world["iso_a3"] == "-99"]

In [None]:
# Replace the '-99' iso_a3 placeholders with codes that exist in the ECDC data,
# so that these shapes join with grouped2.
fixes = {
    'name': ['France', 'Norway', 'Somaliland', 'Kosovo', 'N. Cyprus'],
    # Somaliland and N. Cyprus reuse the codes of Somalia and Cyprus, so they
    # are coloured with those countries' figures.
    # Kosovo is listed as 'XKX' in the ECDC countryterritoryCode column; the
    # former 'RKS' matched nothing, which left Kosovo off the map.
    # NOTE(review): confirm against frame.countryterritoryCode in the current snapshot.
    'iso_a3': ['FRA', 'NOR', 'SOM', 'XKX', 'CYP']
}
fix_frame = DataFrame(fixes)

# Align both frames on the country name: combine_first takes iso_a3 from
# fix_frame where a fix exists and every other value from `world`.
world = gpd.GeoDataFrame(fix_frame.set_index('name').combine_first(world.set_index('name')).reset_index())

# Expected to be empty now, provided the five names above were the only placeholders.
world[world.iso_a3 == '-99']

In [None]:
# Redo the inner join on the three-letter code and count the matched rows.
merged = world.merge(right=grouped2, how="inner", left_on="iso_a3", right_on="countryCode")
merged.count()

In [None]:
# Final choropleth: 10 equal-width classes of lethalRate on a sequential red scale.
ax = merged.plot(
    column="lethalRate",
    cmap="Reds",
    scheme="equal_interval",
    k=10,
    legend=True,
    figsize=(16, 10),
)

# Hide the coordinate axes, then add the title and a caption in the figure corner.
ax.set_axis_off()
ax.set_title("Scary map", loc="left", fontdict={"fontsize": 20})
ax.annotate("Are we doomed yet?", xy=(0.1, 0.1), xycoords="figure fraction", size=12)

# Re-anchor the class legend at (0.12, 0.4).
map_legend = ax.get_legend()
map_legend.set_bbox_to_anchor((0.12, 0.4))

Wes McKinney — **Python for Data Analysis**

Владимир Савельев — **Статистика и котики**

Дарелл Хафф — **Как лгать при помощи статистики**

Thank you for watching, please subscribe to our **YouTube** channel!

In [None]:
import qrcode as qr

# QR code pointing at the YouTube channel, built with the highest
# error-correction level the library offers.
channel_url = 'https://www.youtube.com/channel/UCJOEr3_V_F0V3MJPNyYR9jw'
qr.make(
    channel_url,
    error_correction=qr.constants.ERROR_CORRECT_H,
    box_size=10,
    border=4,
)