# Libraries and loading the dataset
## Fixing some data issues

In [5]:
# !pip install numpy pandas matplotlib ipywidgets

import os
import pandas as pd

# Location of the OWID COVID-19 CSV. Defaults to the original author's local
# path; set the OWID_COVID_DATA environment variable to point somewhere else
# so the notebook can run on another machine without editing this cell.
OWID_COVID_CSV = os.environ.get(
    "OWID_COVID_DATA",
    r"/home/goncalo/Documents/cover/Illuminatti/hackathon/aiHackCovid/datasets/owid-covid-data.csv",
)

# Load the whole dataset into a DataFrame; the following cells read from this name.
owid_covid_data = pd.read_csv(OWID_COVID_CSV)


In [6]:
# Inspect the raw frame: info() prints the index range, every column name,
# its non-null count and its dtype.
owid_covid_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83862 entries, 0 to 83861
Data columns (total 59 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   iso_code                               83862 non-null  object 
 1   continent                              79805 non-null  object 
 2   location                               83862 non-null  object 
 3   date                                   83862 non-null  object 
 4   total_cases                            81878 non-null  float64
 5   new_cases                              81876 non-null  float64
 6   new_cases_smoothed                     80875 non-null  float64
 7   total_deaths                           72262 non-null  float64
 8   new_deaths                             72420 non-null  float64
 9   new_deaths_smoothed                    80875 non-null  float64
 10  total_cases_per_million                81435 non-null  float64
 11  ne

## Changing the data type of a column
The output above shows that the `date` column is stored as `object` (plain strings) rather than as a datetime type. Because `date` will often be used as the x-axis when plotting, it is worth converting the column to a proper datetime type:

In [8]:
# Build the working frame in one chain: order the rows by the ISO-formatted
# date strings, then replace the 'date' column with parsed datetime values.
df = (
    owid_covid_data
    .sort_values(by='date')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'))
)

# Confirm the new dtype of 'date' in the column summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83862 entries, 3009 to 83861
Data columns (total 59 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   iso_code                               83862 non-null  object        
 1   continent                              79805 non-null  object        
 2   location                               83862 non-null  object        
 3   date                                   83862 non-null  datetime64[ns]
 4   total_cases                            81878 non-null  float64       
 5   new_cases                              81876 non-null  float64       
 6   new_cases_smoothed                     80875 non-null  float64       
 7   total_deaths                           72262 non-null  float64       
 8   new_deaths                             72420 non-null  float64       
 9   new_deaths_smoothed                    80875 non-null  flo

## Take a look at the dataframe as a table

In [36]:
# Show the first row of the dataframe (head(n) returns the first n rows;
# pass a larger n, e.g. 10, to see more).
df.head(1)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
3009,ARG,South America,Argentina,2020-01-01,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,0.0,0.0,,,,,tests performed,,,,,,,,,,0.0,45195777.0,16.177,31.9,11.198,7.441,18933.907,0.6,191.032,5.5,16.2,27.7,,5.0,76.67,0.845


## By default the table hides some columns to save space. To display all of them, use:

In [20]:
# Lift the column display limit so wide dataframes are rendered in full
# (None means "no limit").
pd.options.display.max_columns = None

In [37]:
# tail() shows the last 5 rows by default; pass an int to change that
# (e.g. 10 for the last 10 rows). Here tail(1) shows only the last row.
df.tail(1)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
83861,ZWE,Africa,Zimbabwe,2021-04-24,38064.0,19.0,52.143,1556.0,0.0,0.571,2561.003,1.278,3.508,104.69,0.0,0.038,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14862927.0,42.729,19.6,2.822,1.882,1899.775,21.4,307.846,1.82,1.6,30.7,36.791,1.7,61.49,0.571


In this exploratory part, the goal is to find interesting relationships between the variables (columns). To do so, first read the dataset's notes and column descriptions: what are these variables, and what does each one represent?