In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/owid-covid-data.csv
/kaggle/input/US_counties.csv


# A structured Step-by-Step Process

1. Define Problem.
2. Summarize Data.
3. Prepare Data.
4. Evaluate Algorithm.
5. Improve Results.
6. Present Results.


## 1. Problem definition

The purpose of this model is to assess the risk of COVID-19 exposure in a given place, based on the number of cases and deaths reported there. The input to the model is a place (zip code), and the output is a risk score calculated within the model from the input attributes.

This dataset was collected by [Minh Son Nguyen](mailto:son.nguyen.ohiou@gmail.com).

In [None]:
# Load the per-county dataset from the Kaggle input directory
US_counties = pd.read_csv(filepath_or_buffer='../input/US_counties.csv')

In [10]:
# Load the OWID COVID dataset from the Kaggle input directory
owid_covid = pd.read_csv(filepath_or_buffer='../input/owid-covid-data.csv')

## 2. Summary statistics

Now it is time to take a look at the data. In this step we are going to look at the data in a few different ways:

*  Dimensions of the dataset.
*  Peek at the data itself.
*  Statistical summary of all attributes.
*  Breakdown of the data by the class variable.

Starting with the first dataset: **US_counties**

*  Dimensions of the dataset.

In [9]:
# Dimensions of US_counties as a (rows, columns) tuple
print(f"US_counties dimention is: {US_counties.shape}")

US_counties dimention is: (310256, 6)


The US_counties has 310256 observations and 6 features

*  Peek at the data itself.

In [5]:
# Preview the first five rows of US_counties
display(US_counties.head(5))

Unnamed: 0,date,county,state,fips,cases,deaths
0,04/13/20,Motley,Texas,48345.0,1,0
1,04/14/20,Motley,Texas,48345.0,1,0
2,04/15/20,Motley,Texas,48345.0,1,0
3,04/16/20,Motley,Texas,48345.0,1,0
4,04/17/20,Motley,Texas,48345.0,1,0


In [8]:
# Column dtypes of US_counties; print() stringifies each argument itself
print("Types of the data: ", US_counties.dtypes)

Types of the data:  date       object
county     object
state      object
fips      float64
cases       int64
deaths      int64
dtype: object


## Attributes:
The US_counties dataset has six (6) attributes, as follows:
 - date: date of the reported cases.
 - county: The US county.
 - state: The US state.
 - fips: *Federal Information Processing Standard Publication*, identify each US county to join with other tables.
 - cases: Number of cases reported per day.
 - deaths: Number of deaths reported per day.

* Statistical Summary

In [13]:
# Statistical summary of the numeric columns of US_counties.
# Use the fully qualified option key: on pandas >= 1.4 the bare 'precision'
# pattern also matches 'styler.format.precision' and raises OptionError.
pd.set_option('display.precision', 2)
print(US_counties.describe())

            fips      cases     deaths
count  307024.00  310256.00  310256.00
mean    30152.99     498.09      26.57
std     15318.26    4122.34     358.55
min      1001.00       0.00       0.00
25%     18101.00       6.00       0.00
50%     29081.00      30.00       1.00
75%     45053.00     144.00       4.00
max     56045.00  222444.00   22696.00


In [15]:
# Pearson correlation between the numeric columns only.
# The text columns (date, county, state) are dropped explicitly because
# pandas >= 2.0 no longer excludes non-numeric columns from corr() by default
# and fails on them with a ValueError.
print(US_counties.select_dtypes(include='number').corr(method='pearson'))

        fips  cases  deaths
fips    1.00  -0.03   -0.03
cases  -0.03   1.00    0.95
deaths -0.03   0.95    1.00


Deaths and cases have a strong positive relationship (Pearson correlation of 0.95).

## owid_covid dataset

In [30]:
# Dimensions of owid_covid as a (rows, columns) tuple
print(f"owid_covid dimention is: {owid_covid.shape}")

owid_covid dimention is: (29134, 34)


In [28]:
# Lift the column cap so wide frames are shown in full, then preview the first five rows
pd.set_option('display.max_columns', None)
display(owid_covid.head(5))

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,new_cases_per_million,total_deaths_per_million,new_deaths_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,tests_units,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cvd_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy
0,AFG,Asia,Afghanistan,2019-12-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,38900000.0,54.42,18.6,2.58,1.34,1803.99,,597.03,9.59,,,37.75,0.5,64.83
1,AFG,Asia,Afghanistan,2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.0,38900000.0,54.42,18.6,2.58,1.34,1803.99,,597.03,9.59,,,37.75,0.5,64.83
2,AFG,Asia,Afghanistan,2020-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.0,38900000.0,54.42,18.6,2.58,1.34,1803.99,,597.03,9.59,,,37.75,0.5,64.83
3,AFG,Asia,Afghanistan,2020-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.0,38900000.0,54.42,18.6,2.58,1.34,1803.99,,597.03,9.59,,,37.75,0.5,64.83
4,AFG,Asia,Afghanistan,2020-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.0,38900000.0,54.42,18.6,2.58,1.34,1803.99,,597.03,9.59,,,37.75,0.5,64.83


In [31]:
# Column dtypes of owid_covid; print() stringifies each argument itself
print("Types of the data: ", owid_covid.dtypes)

Types of the data:  iso_code                            object
continent                           object
location                            object
date                                object
total_cases                        float64
new_cases                          float64
total_deaths                       float64
new_deaths                         float64
total_cases_per_million            float64
new_cases_per_million              float64
total_deaths_per_million           float64
new_deaths_per_million             float64
total_tests                        float64
new_tests                          float64
total_tests_per_thousand           float64
new_tests_per_thousand             float64
new_tests_smoothed                 float64
new_tests_smoothed_per_thousand    float64
tests_units                         object
stringency_index                   float64
population                         float64
population_density                 float64
median_age                        

In [40]:
# Statistical summary of owid_covid, printed as plain text.
# NOTE(review): the printed summary shows negative minima for new_cases and
# new_deaths — presumably reporting corrections; confirm before modelling.
print(owid_covid.describe())

       total_cases  new_cases  total_deaths  new_deaths  \
count     2.89e+04   28860.00      28860.00    28860.00   
mean      3.84e+04     832.79       2203.37       38.06   
std       3.85e+05    7463.50      21305.94      336.86   
min       0.00e+00  -29726.00          0.00    -1918.00   
25%       1.60e+01       0.00          0.00        0.00   
50%       3.12e+02       4.00          6.00        0.00   
75%       3.43e+03      79.00         78.00        2.00   
max       1.20e+07  214930.00     549276.00    10489.00   

       total_cases_per_million  new_cases_per_million  \
count                 28796.00               28796.00   
mean                    898.14                  16.10   
std                    2283.24                  62.61   
min                       0.00                -437.88   
25%                       4.80                   0.00   
50%                     103.48                   0.59   
75%                     666.06                   9.01   
max         

In [37]:
# Summarise the text-valued columns (count, unique, top, freq)
from pandas import DataFrame
stats_cat = DataFrame(
    data=owid_covid,
    columns=['iso_code', 'continent', 'location', 'date', 'tests_units'],
)
stats_cat.describe()

Unnamed: 0,iso_code,continent,location,date,tests_units
count,29070,28878,29134,29134,10102
unique,211,6,212,192,5
top,NPL,Europe,Belarus,2020-06-30,tests performed
freq,192,8063,192,211,4248


In [41]:
# Pearson correlation between the numeric columns of owid_covid.
# 'display.precision' is the fully qualified key: the bare 'precision' pattern
# is ambiguous on pandas >= 1.4 and raises OptionError.
pd.set_option('display.precision', 2)
# Text columns (iso_code, continent, location, date, tests_units) are dropped
# explicitly because pandas >= 2.0 no longer excludes non-numeric columns
# from corr() by default and fails on them with a ValueError.
print(owid_covid.select_dtypes(include='number').corr(method='pearson'))

                                 total_cases  new_cases  total_deaths  \
total_cases                         1.00e+00   9.55e-01      9.84e-01   
new_cases                           9.55e-01   1.00e+00      9.45e-01   
total_deaths                        9.84e-01   9.45e-01      1.00e+00   
new_deaths                          7.59e-01   8.78e-01      7.96e-01   
total_cases_per_million             6.75e-02   5.14e-02      6.48e-02   
new_cases_per_million               4.17e-02   6.59e-02      2.98e-02   
total_deaths_per_million            8.41e-02   5.26e-02      1.32e-01   
new_deaths_per_million              3.53e-02   5.08e-02      4.58e-02   
total_tests                         9.14e-01   7.37e-01      8.10e-01   
new_tests                           9.20e-01   8.34e-01      8.08e-01   
total_tests_per_thousand            1.39e-01   5.04e-02      1.32e-01   
new_tests_per_thousand              1.19e-01   7.50e-02      1.04e-01   
new_tests_smoothed                  9.25e-01   8.24