In [1]:
import pandas as pd

### Знакомимся с новыми данными: коронавирус

#### Загружаем необходимые таблицы для работы

In [2]:
# Read the COVID-19 case table from disk and preview its first rows
covid_csv_path = 'data/covid_data.csv'
covid_data = pd.read_csv(covid_csv_path)
display(covid_data.head())

Unnamed: 0,date,province/state,country,confirmed,deaths,recovered
0,01/22/2020,Anhui,China,1.0,0.0,0.0
1,01/22/2020,Beijing,China,14.0,0.0,0.0
2,01/22/2020,Chongqing,China,6.0,0.0,0.0
3,01/22/2020,Fujian,China,1.0,0.0,0.0
4,01/22/2020,Gansu,China,0.0,0.0,0.0


In [6]:
# Columns of the vaccination table that are kept for the analysis
vaccination_columns = [
    'country',
    'date',
    'total_vaccinations',
    'people_vaccinated',
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated',
    'people_fully_vaccinated_per_hundred',
    'daily_vaccinations',
    'vaccines',
]
# Read the whole CSV, then keep only the columns listed above (in that order)
vaccinations_data = pd.read_csv(
    'data/country_vaccinations.csv'
)[vaccination_columns]
display(vaccinations_data.head())

Unnamed: 0,country,date,total_vaccinations,people_vaccinated,people_vaccinated_per_hundred,people_fully_vaccinated,people_fully_vaccinated_per_hundred,daily_vaccinations,vaccines
0,Afghanistan,2021-02-22,0.0,0.0,0.0,,,,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
1,Afghanistan,2021-02-23,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
2,Afghanistan,2021-02-24,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
3,Afghanistan,2021-02-25,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."
4,Afghanistan,2021-02-26,,,,,,1367.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi..."


#### Предобработка данных

In [8]:
# Group the table by date and country name, then sum the three counters
# over all rows (regions) that share the same date and country
rows_by_date_and_country = covid_data.groupby(
    ['date', 'country'], as_index=False
)
covid_data = rows_by_date_and_country[
    ['confirmed', 'deaths', 'recovered']
].sum()
display(covid_data)

Unnamed: 0,date,country,confirmed,deaths,recovered
0,01/01/2021,Afghanistan,51526.0,2191.0,41727.0
1,01/01/2021,Albania,58316.0,1181.0,33634.0
2,01/01/2021,Algeria,99897.0,2762.0,67395.0
3,01/01/2021,Andorra,8117.0,84.0,7463.0
4,01/01/2021,Angola,17568.0,405.0,11146.0
...,...,...,...,...,...
86780,12/31/2020,Vietnam,1465.0,35.0,1325.0
86781,12/31/2020,West Bank and Gaza,138004.0,1400.0,117183.0
86782,12/31/2020,Yemen,2099.0,610.0,1394.0
86783,12/31/2020,Zambia,20725.0,388.0,18660.0


In [10]:
# Convert the date column from text to datetime.
# The format is spelled out because the values in this table are written as
# MM/DD/YYYY (e.g. 01/22/2020); an explicit format keeps month and day from
# ever being swapped by format inference.
covid_data['date'] = pd.to_datetime(covid_data['date'], format='%m/%d/%Y')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() only appends a stray "None" to the output.
covid_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86785 entries, 0 to 86784
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       86785 non-null  datetime64[ns]
 1   country    86785 non-null  object        
 2   confirmed  86785 non-null  float64       
 3   deaths     86785 non-null  float64       
 4   recovered  86785 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 3.3+ MB


None

In [12]:
# Number of people who are currently ill: confirmed cases
# without those who died or recovered
covid_data['active'] = (
    covid_data['confirmed']
    .sub(covid_data['deaths'])
    .sub(covid_data['recovered'])
)
display(covid_data.head())

Unnamed: 0,date,country,confirmed,deaths,recovered,active
0,2021-01-01,Afghanistan,51526.0,2191.0,41727.0,7608.0
1,2021-01-01,Albania,58316.0,1181.0,33634.0,23501.0
2,2021-01-01,Algeria,99897.0,2762.0,67395.0,29740.0
3,2021-01-01,Andorra,8117.0,84.0,7463.0,570.0
4,2021-01-01,Angola,17568.0,405.0,11146.0,6017.0


In [14]:
# Add day-over-day change columns for confirmed, deaths and recovered.
# Rows are ordered by country and date first so that diff() inside each
# country group subtracts the previous date's value; the first row of every
# country has no predecessor and therefore gets NaN.
covid_data = covid_data.sort_values(by=['country', 'date'])
for counter_name in ('confirmed', 'deaths', 'recovered'):
    covid_data['daily_' + counter_name] = (
        covid_data.groupby('country')[counter_name].diff()
    )
display(covid_data.head())

Unnamed: 0,date,country,confirmed,deaths,recovered,active,daily_confirmed,daily_deaths,daily_recovered
11337,2020-02-24,Afghanistan,1.0,0.0,0.0,1.0,,,
11570,2020-02-25,Afghanistan,1.0,0.0,0.0,1.0,0.0,0.0,0.0
11807,2020-02-26,Afghanistan,1.0,0.0,0.0,1.0,0.0,0.0,0.0
12051,2020-02-27,Afghanistan,1.0,0.0,0.0,1.0,0.0,0.0,0.0
12299,2020-02-28,Afghanistan,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [15]:
# Convert the date column of vaccinations_data to datetime
vaccinations_data['date'] = pd.to_datetime(vaccinations_data['date'])
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() only appends a stray "None" to the output.
vaccinations_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42795 entries, 0 to 42794
Data columns (total 9 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   country                              42795 non-null  object        
 1   date                                 42795 non-null  datetime64[ns]
 2   total_vaccinations                   23457 non-null  float64       
 3   people_vaccinated                    22371 non-null  float64       
 4   people_vaccinated_per_hundred        22371 non-null  float64       
 5   people_fully_vaccinated              19462 non-null  float64       
 6   people_fully_vaccinated_per_hundred  19462 non-null  float64       
 7   daily_vaccinations                   42558 non-null  float64       
 8   vaccines                             42795 non-null  object        
dtypes: datetime64[ns](1), float64(6), object(2)
memory usage: 2.9+ MB


None