# Pre-processing Script for OWID COVID-19 Data

## Updated: 2021-03-28

Set up notebook

In [None]:
import datetime
import os
from pathlib import Path

import numpy as np
import pandas as pd

Pull data, convert the date field, and drop derived metric columns (rows with null identifiers are handled in the "Tidy data" step below)

In [None]:
# OWID COVID-19 dataset, read straight from the CSV hosted on GitHub.
url = 'https://raw.githubusercontent.com/rjanhealth/covid-19-data/master/public/data/owid-covid-data.csv'

# Malformed rows are skipped (with a warning) rather than aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour
# of `on_bad_lines`. On older pandas the unknown keyword raises TypeError at
# call time, before anything is downloaded, so we fall back to the legacy
# spelling there.
try:
    df = pd.read_csv(url, on_bad_lines='warn')
except TypeError:
    df = pd.read_csv(url, error_bad_lines=False)

# Convert the date column from text to datetime64.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Remove every column whose name contains one of these fragments.
derived_pattern = 'smoothed|hundred|thousand|million|weekly|total|stringency|per_case|positive'
derived_columns = df.filter(regex=derived_pattern).columns
df = df.drop(columns=derived_columns)

Tidy data

In [None]:
# Rows labelled 'World' or 'International' are removed, columns get shorter
# names, and any row without a country code or a continent is discarded.
aggregate_rows = df['location'].isin(['World', 'International'])
column_aliases = {
    'location': 'country',
    'new_tests': 'tests',
    'new_cases': 'cases',
    'new_vaccinations': 'vaccinations',
    'new_deaths': 'deaths',
    'iso_code': 'iso3c',
    'human_development_index': 'human_dev_idx',
}
df = (
    df.loc[~aggregate_rows]
      .rename(columns=column_aliases)
      .dropna(subset=['iso3c', 'continent'])
)

Change data types & case

In [None]:
# Cast these columns to pandas' nullable integer dtype ('Int64'), which can
# hold missing values alongside whole numbers.
nullable_int_columns = [
    'population',
    'cases',
    'tests',
    'deaths',
    'icu_patients',
    'hosp_patients',
    'people_vaccinated',
    'vaccinations',
    'people_fully_vaccinated',
]
df = df.astype({name: 'Int64' for name in nullable_int_columns})

# Upper-case every column name; the split cells below select by these names.
df.columns = df.columns.map(str.upper)

Split data

In [None]:
# Country dimension: identifier columns plus the country-level attributes.
country_columns = [
    'ISO3C', 'CONTINENT', 'COUNTRY',
    'POPULATION', 'POPULATION_DENSITY',
    'MEDIAN_AGE', 'AGED_65_OLDER', 'AGED_70_OLDER',
    'GDP_PER_CAPITA', 'EXTREME_POVERTY',
    'CARDIOVASC_DEATH_RATE', 'DIABETES_PREVALENCE',
    'FEMALE_SMOKERS', 'MALE_SMOKERS',
    'HANDWASHING_FACILITIES', 'LIFE_EXPECTANCY', 'HUMAN_DEV_IDX',
]
dim_country = df.loc[:, country_columns]

In [None]:
# The selection above has no DATE column, so identical attribute rows repeat;
# keep only the first occurrence of each distinct row.
dim_country = dim_country.drop_duplicates(keep='first')

In [None]:
# Fact table: the dated measures, keyed by DATE and ISO3C.
fact_columns = [
    'DATE', 'ISO3C',
    'CASES', 'DEATHS', 'REPRODUCTION_RATE',
    'ICU_PATIENTS', 'HOSP_PATIENTS',
    'TESTS', 'TESTS_UNITS',
    'PEOPLE_VACCINATED', 'PEOPLE_FULLY_VACCINATED', 'VACCINATIONS',
]
fact_covid = df.loc[:, fact_columns]

Export data

In [9]:
# Write the full cleaned frame and the two split tables to the output folder.
# The folder defaults to the original workstation location; set the
# OWID_OUTPUT_DIR environment variable to export elsewhere without editing
# this cell.
default_output_dir = 'C:\\Users\\vicsxk6\\OneDrive - Department of Health and Human Services. Victoria\\Documents\\Python Scripts\\other_jurisdictions\\output\\'
output_dir = Path(os.environ.get('OWID_OUTPUT_DIR', default_output_dir))

# Create the folder up front so the writes below do not fail on a missing directory.
output_dir.mkdir(parents=True, exist_ok=True)

# `path` is kept as a string ending in a separator, for any code that still
# builds file names with `path + name`.
path = str(output_dir) + os.sep

df.to_excel(output_dir / 'intl.xlsx', index=False)
dim_country.to_csv(output_dir / 'dt_country.csv', index=False)
fact_covid.to_csv(output_dir / 'ft_covid.csv', index=False)