In [8]:
#Import libraries
import pandas as pd
import numpy as np
import math
import statistics as stats
import sys
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

### Cleaning Covid-19 dataframe (From Our World in Data)
- Daily data, turn into monthly;
- Covid cases by country, continent;
- From 2020 to 2022.

In [81]:
covid = pd.read_csv('data/Covid/owid-covid-data.csv')
covid = covid.drop(labels = ['location','new_cases_smoothed','new_deaths_smoothed', 'total_cases_per_million',
        'new_cases_per_million', 'new_cases_smoothed_per_million','total_deaths_per_million', 'new_deaths_per_million',
        'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients_per_million','hosp_patients_per_million',
        'weekly_icu_admissions_per_million','weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
        'total_tests_per_thousand', 'new_tests_per_thousand','new_tests_smoothed', 'new_tests_smoothed_per_thousand',
        'tests_per_case', 'tests_units', 'people_fully_vaccinated', 'total_boosters','new_vaccinations', 
        'new_vaccinations_smoothed','total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
        'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred','new_vaccinations_smoothed_per_million',
        'new_people_vaccinated_smoothed','new_people_vaccinated_smoothed_per_hundred', 'stringency_index',
        'population_density', 'aged_65_older','aged_70_older','cardiovasc_death_rate', 'diabetes_prevalence', 
        'female_smokers','male_smokers', 'handwashing_facilities', 'hospital_beds_per_thousand','life_expectancy',
        'excess_mortality_cumulative_absolute', 'excess_mortality_cumulative',
        'excess_mortality', 'excess_mortality_cumulative_per_million', 'gdp_per_capita','extreme_poverty'], axis=1)
covid.columns = list(map(lambda x: x.lower().replace(' ', '_'), covid.columns))

In [82]:
covid['month'] = covid['date'].apply(lambda x: pd.Timestamp(x)).dt.month
covid['year'] = covid['date'].apply(lambda x: pd.Timestamp(x)).dt.year

In [83]:
eu28_countries_iso3 = ['AUT','BEL', 'BGR', 'HRV', 'CYP', 'CZE', 'DNK', 
                    'EST', 'FIN', 'FRA', 'DEU','GRC', 'HUN', 'IRL', 
                    'ITA', 'LVA', 'LTU', 'LUX', 'MLT', 'NLD', 'POL',
                    'PRT', 'ROU', 'SVK', 'SVN', 'ESP', 'SWE', 'GBR']
covid = covid[~covid['iso_code'].isin(eu28_countries_iso3)]

In [84]:
covid['vaccination_rate'] = round(covid['people_vaccinated']/covid['population'])

In [85]:
covid.head()

Unnamed: 0,iso_code,continent,date,total_cases,new_cases,total_deaths,new_deaths,icu_patients,hosp_patients,weekly_icu_admissions,weekly_hosp_admissions,positive_rate,total_vaccinations,people_vaccinated,population,median_age,human_development_index,month,year,vaccination_rate
0,AFG,Asia,2020-02-24,5.0,5.0,,,,,,,,,,39835428.0,18.6,0.511,2,2020,
1,AFG,Asia,2020-02-25,5.0,0.0,,,,,,,,,,39835428.0,18.6,0.511,2,2020,
2,AFG,Asia,2020-02-26,5.0,0.0,,,,,,,,,,39835428.0,18.6,0.511,2,2020,
3,AFG,Asia,2020-02-27,5.0,0.0,,,,,,,,,,39835428.0,18.6,0.511,2,2020,
4,AFG,Asia,2020-02-28,5.0,0.0,,,,,,,,,,39835428.0,18.6,0.511,2,2020,


In [93]:
covid_month = covid.pivot_table(index= ['continent','iso_code','year','month', 'positive_rate', 'vaccination_rate','human_development_index'], 
                                values = ['total_cases','new_cases','total_deaths', 'hosp_patients', 'total_vaccinations', 
                                          'people_vaccinated'],
                                aggfunc = {'total_cases': 'sum', 'new_cases': 'sum' ,'total_deaths': 'sum', 
                                           'hosp_patients': 'sum', 'total_vaccinations': 'sum', 'people_vaccinated': 'sum'})

In [94]:
covid_month

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,hosp_patients,new_cases,people_vaccinated,total_cases,total_deaths,total_vaccinations
continent,iso_code,year,month,positive_rate,vaccination_rate,human_development_index,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Africa,AGO,2022,1,0.1095,0.0,0.581,0.0,128.0,9713546.0,98029.0,1893.0,14588435.0
Africa,AGO,2022,1,0.1295,0.0,0.581,0.0,680.0,9287396.0,96582.0,1888.0,13944656.0
Africa,AGO,2022,2,0.0097,0.0,0.581,0.0,41.0,10427400.0,98555.0,1899.0,15902065.0
Africa,AGO,2022,2,0.0102,0.0,0.581,0.0,13.0,10591264.0,98671.0,1899.0,16259606.0
Africa,AGO,2022,2,0.0145,0.0,0.581,0.0,29.0,10241793.0,98453.0,1896.0,15505389.0
...,...,...,...,...,...,...,...,...,...,...,...,...
South America,URY,2022,4,0.0961,1.0,0.817,0.0,840.0,2975454.0,890980.0,7171.0,8082817.0
South America,URY,2022,4,0.0972,1.0,0.817,0.0,627.0,2974900.0,890140.0,7169.0,8077365.0
South America,URY,2022,4,0.0977,1.0,0.817,0.0,381.0,2974680.0,889513.0,7166.0,8068217.0
South America,URY,2022,4,0.0983,1.0,0.817,0.0,493.0,2974679.0,889132.0,7162.0,8068211.0


In [95]:
covid_month.apply(pd.Series.unique)

hosp_patients         [0.0, 7637.0, 8906.0, 4943.0, 6119.0, 3995.0, ...
new_cases             [128.0, 680.0, 41.0, 13.0, 29.0, 0.0, 23.0, 21...
people_vaccinated     [9713546.0, 9287396.0, 10427400.0, 10591264.0,...
total_cases           [98029.0, 96582.0, 98555.0, 98671.0, 98453.0, ...
total_deaths          [1893.0, 1888.0, 1899.0, 1896.0, 1895.0, 1900....
total_vaccinations    [14588435.0, 13944656.0, 15902065.0, 16259606....
dtype: object

## Creating .csv files

In [96]:
from pathlib import Path  

In [99]:
#Our World in Data - monthly covid cases: 
filepath = Path('data/Cleaned/covid_month.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
covid_month.to_csv(filepath) 