## Feature Engineering

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

In [23]:
dataset=pd.read_csv('owid-covid-data.csv')
dataset.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,0.026,0.026,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,0.026,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,0.026,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,0.026,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,
4,AFG,Asia,Afghanistan,2020-02-28,1.0,0.0,,,,,0.026,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,


In [24]:
dataset.shape

(99105, 60)

In [25]:
## Always remember there way always be a chance of data leakage so we need to split the data first and then apply feature
## Engineering
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(dataset,dataset['total_cases'],test_size=0.1,random_state=0)

In [26]:
X_train.shape, X_test.shape

((89194, 60), (9911, 60))

In [27]:
dataset=X_train

In [29]:
dataset.to_csv('X_train.csv')

## Missing Values

In [30]:
## Let us capture all the nan values
## First lets handle Categorical features which are missing
features_nan=[feature for feature in dataset.columns if dataset[feature].isnull().sum()>1 and dataset[feature].dtypes=='O']

for feature in features_nan:
    print("{}: {}% missing values".format(feature,np.round(dataset[feature].isnull().mean(),4)))

continent: 0.0472% missing values
tests_units: 0.4612% missing values


In [31]:
## Replace missing value with a new label
def replace_cat_feature(dataset,features_nan):
    data=dataset.copy()
    data[features_nan]=data[features_nan].fillna('Missing')
    return data

dataset=replace_cat_feature(dataset,features_nan)

dataset[features_nan].isnull().sum()

continent      0
tests_units    0
dtype: int64

In [32]:
dataset.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality
37522,GIN,Africa,Guinea,2021-04-07,20554.0,44.0,92.286,131.0,1.0,0.857,1565.09,3.35,7.027,9.975,0.076,0.065,1.0,,,,,,,,,,,,,,,,,Missing,82135.0,72576.0,9559.0,,2784.0,0.63,0.55,0.07,212.0,61.11,13132790.0,51.755,19.0,3.135,1.733,1998.926,35.3,336.717,2.42,,,17.45,0.3,61.6,0.477,
54208,MWI,Africa,Malawi,2020-09-04,5608.0,15.0,12.143,175.0,0.0,0.143,293.153,0.784,0.635,9.148,0.0,0.007,0.6,,,,,,,,,375.0,46411.0,2.426,0.02,313.0,0.016,0.039,25.8,samples tested,,,,,,,,,,59.26,19129960.0,197.519,18.1,2.979,1.783,1095.042,71.4,227.349,3.94,4.4,24.7,8.704,1.3,64.26,0.483,
59073,MCO,Europe,Monaco,2020-10-07,227.0,3.0,1.286,2.0,0.0,0.0,5784.324,76.445,32.762,50.963,0.0,0.0,1.08,,,,,,,,,,,,,,,,,Missing,,,,,,,,,,62.04,39244.0,19347.5,,,,,,,5.46,,,,13.8,86.75,,
65921,OWID_NAM,Missing,North America,2021-04-19,36725513.0,85086.0,85266.571,833714.0,744.0,1256.0,62028.774,143.709,144.014,1408.129,1.257,2.121,,,,,,,,,,,,,,,,,,Missing,240667069.0,155992080.0,91559928.0,2815107.0,3996275.0,40.65,26.35,15.46,6750.0,,592072200.0,,,,,,,,,,,,,,,
18632,CHN,Asia,China,2020-01-23,641.0,93.0,,18.0,1.0,,0.445,0.065,,0.013,0.001,,3.08,,,,,,,,,,,,,,,,,Missing,,,,,,,,,,44.91,1439324000.0,147.674,38.7,10.641,5.929,15308.712,0.7,261.899,9.74,1.9,48.4,,4.34,76.91,0.761,


In [33]:
## Now lets check for numerical variables the contains missing values
numerical_with_nan=[feature for feature in dataset.columns if dataset[feature].isnull().sum()>1 and dataset[feature].dtypes!='O']

## We will print the numerical nan variables and percentage of missing values

for feature in numerical_with_nan:
    print("{}: {}% missing value".format(feature,np.around(dataset[feature].isnull().mean(),4)))

total_cases: 0.0354% missing value
new_cases: 0.0354% missing value
new_cases_smoothed: 0.0457% missing value
total_deaths: 0.1376% missing value
new_deaths: 0.136% missing value
new_deaths_smoothed: 0.0457% missing value
total_cases_per_million: 0.0405% missing value
new_cases_per_million: 0.0406% missing value
new_cases_smoothed_per_million: 0.0509% missing value
total_deaths_per_million: 0.1426% missing value
new_deaths_per_million: 0.141% missing value
new_deaths_smoothed_per_million: 0.0509% missing value
reproduction_rate: 0.2038% missing value
icu_patients: 0.9% missing value
icu_patients_per_million: 0.9% missing value
hosp_patients: 0.8754% missing value
hosp_patients_per_million: 0.8754% missing value
weekly_icu_admissions: 0.9911% missing value
weekly_icu_admissions_per_million: 0.9911% missing value
weekly_hosp_admissions: 0.9843% missing value
weekly_hosp_admissions_per_million: 0.9843% missing value
new_tests: 0.5504% missing value
total_tests: 0.5535% missing value
total

In [34]:
## Replacing the numerical Missing Values

for feature in numerical_with_nan:
    ## We will replace by using median since there are outliers
    median_value=dataset[feature].median()
    
    ## create a new feature to capture nan values
    dataset[feature+'nan']=np.where(dataset[feature].isnull(),1,0)
    dataset[feature].fillna(median_value,inplace=True)
    
dataset[numerical_with_nan].isnull().sum()
    

total_cases                              0
new_cases                                0
new_cases_smoothed                       0
total_deaths                             0
new_deaths                               0
new_deaths_smoothed                      0
total_cases_per_million                  0
new_cases_per_million                    0
new_cases_smoothed_per_million           0
total_deaths_per_million                 0
new_deaths_per_million                   0
new_deaths_smoothed_per_million          0
reproduction_rate                        0
icu_patients                             0
icu_patients_per_million                 0
hosp_patients                            0
hosp_patients_per_million                0
weekly_icu_admissions                    0
weekly_icu_admissions_per_million        0
weekly_hosp_admissions                   0
weekly_hosp_admissions_per_million       0
new_tests                                0
total_tests                              0
total_tests

In [35]:
dataset.head(50)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality,total_casesnan,new_casesnan,new_cases_smoothednan,total_deathsnan,new_deathsnan,new_deaths_smoothednan,total_cases_per_millionnan,new_cases_per_millionnan,new_cases_smoothed_per_millionnan,total_deaths_per_millionnan,new_deaths_per_millionnan,new_deaths_smoothed_per_millionnan,reproduction_ratenan,icu_patientsnan,icu_patients_per_millionnan,hosp_patientsnan,hosp_patients_per_millionnan,weekly_icu_admissionsnan,weekly_icu_admissions_per_millionnan,weekly_hosp_admissionsnan,weekly_hosp_admissions_per_millionnan,new_testsnan,total_testsnan,total_tests_per_thousandnan,new_tests_per_thousandnan,new_tests_smoothednan,new_tests_smoothed_per_thousandnan,positive_ratenan,tests_per_casenan,total_vaccinationsnan,people_vaccinatednan,people_fully_vaccinatednan,new_vaccinationsnan,new_vaccinations_smoothednan,total_vaccinations_per_hundrednan,people_vaccinated_per_hundrednan,people_fully_vaccinated_per_hundrednan,new_vaccinations_smoothed_per_millionnan,stringency_indexnan,populationnan,population_densitynan,median_agenan,aged_65_oldernan,aged_70_oldernan,gdp_per_capitanan,extreme_povertynan,cardiovasc_death_ratenan,diabetes_prevalencenan,female_smokersnan,male_smokersnan,handwashing_facilitiesnan,hospital_beds_per_thousandnan,life_expectancynan,human_development_indexnan,excess_mortalitynan
37522,GIN,Africa,Guinea,2021-04-07,20554.0,44.0,92.286,131.0,1.0,0.857,1565.09,3.35,7.027,9.975,0.076,0.065,1.0,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6571.0,0.649,0.053,18.4,Missing,82135.0,72576.0,9559.0,26039.0,2784.0,0.63,0.55,0.07,212.0,61.11,13132790.0,51.755,19.0,3.135,1.733,1998.926,35.3,336.717,2.42,6.3,31.4,17.45,0.3,61.6,0.477,7.35,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1
54208,MWI,Africa,Malawi,2020-09-04,5608.0,15.0,12.143,175.0,0.0,0.143,293.153,0.784,0.635,9.148,0.0,0.007,0.6,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,375.0,46411.0,2.426,0.02,313.0,0.016,0.039,25.8,samples tested,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,59.26,19129960.0,197.519,18.1,2.979,1.783,1095.042,71.4,227.349,3.94,4.4,24.7,8.704,1.3,64.26,0.483,7.35,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
59073,MCO,Europe,Monaco,2020-10-07,227.0,3.0,1.286,2.0,0.0,0.0,5784.324,76.445,32.762,50.963,0.0,0.0,1.08,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6571.0,0.649,0.053,18.4,Missing,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,62.04,39244.0,19347.5,29.9,6.378,3.871,12951.839,2.2,242.648,5.46,6.3,31.4,49.839,13.8,86.75,0.748,7.35,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,0,1,1,1,0,0,1,1
65921,OWID_NAM,Missing,North America,2021-04-19,36725513.0,85086.0,85266.571,833714.0,744.0,1256.0,62028.774,143.709,144.014,1408.129,1.257,2.121,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6571.0,0.649,0.053,18.4,Missing,240667069.0,155992080.0,91559928.0,2815107.0,3996275.0,40.65,26.35,15.46,6750.0,60.19,592072200.0,83.479,29.9,6.378,3.871,12951.839,2.2,242.648,7.11,6.3,31.4,49.839,2.4,74.62,0.748,7.35,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
18632,CHN,Asia,China,2020-01-23,641.0,93.0,92.857,18.0,1.0,1.429,0.445,0.065,11.203,0.013,0.001,0.159,3.08,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6571.0,0.649,0.053,18.4,Missing,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,44.91,1439324000.0,147.674,38.7,10.641,5.929,15308.712,0.7,261.899,9.74,1.9,48.4,49.839,4.34,76.91,0.761,7.35,0,0,1,0,0,1,0,0,1,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
65490,OWID_NAM,Missing,North America,2020-02-13,21.0,1.0,0.571,388.0,0.0,0.0,0.035,0.002,0.001,51.3355,0.0,0.0,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6571.0,0.649,0.053,18.4,Missing,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,60.19,592072200.0,83.479,29.9,6.378,3.871,12951.839,2.2,242.648,7.11,6.3,31.4,49.839,2.4,74.62,0.748,7.35,0,0,0,1,0,0,0,0,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1700,DZA,Africa,Algeria,2020-09-26,50914.0,160.0,184.429,1711.0,4.0,6.571,1161.067,3.649,4.206,39.018,0.091,0.15,0.79,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6571.0,0.649,0.053,18.4,Missing,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,75.93,43851040.0,17.348,29.1,6.211,3.857,13913.839,0.5,278.364,6.73,0.7,30.4,83.741,1.9,76.88,0.748,7.35,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
81895,ZAF,Africa,South Africa,2020-02-19,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,95.0,0.002,0.627,4.0,0.0,0.053,18.4,people tested,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,2.78,59308690.0,46.754,27.3,5.344,3.053,12294.876,18.9,200.38,5.52,8.1,33.2,43.993,2.32,64.13,0.709,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
96008,VAT,Europe,Vatican,2021-03-20,27.0,0.0,0.0,388.0,2.0,0.0,33374.536,0.0,0.0,51.3355,0.135,0.0,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6571.0,0.649,0.053,18.4,Missing,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,60.19,809.0,83.479,29.9,6.378,3.871,12951.839,2.2,242.648,7.11,6.3,31.4,49.839,2.4,75.12,0.748,7.35,0,0,0,1,1,0,0,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1
63449,NCL,Oceania,New Caledonia,2021-03-06,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6571.0,0.649,0.053,18.4,Missing,917280.0,642078.0,387438.0,26039.0,379.0,10.86,7.9,4.05,1328.0,60.19,285491.0,15.342,33.4,9.954,6.489,12951.839,2.2,242.648,23.36,6.3,31.4,49.839,2.4,77.55,0.748,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,0,0,0,0,0,1,1,1,0,1,1,1,1,0,1,1


## Handling Categorical Feature



In [36]:
## First, we will convert date to datetime datatype.
dataset['date']=pd.to_datetime(dataset['date'])
dataset.info()
dataset['date'].describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89194 entries, 37522 to 68268
Columns: 115 entries, iso_code to excess_mortalitynan
dtypes: datetime64[ns](1), float64(55), int32(55), object(4)
memory usage: 60.2+ MB


  dataset['date'].describe()


count                   89194
unique                    546
top       2021-05-22 00:00:00
freq                      213
first     2020-01-01 00:00:00
last      2021-06-29 00:00:00
Name: date, dtype: object

In [37]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
dataset['date_enc'] = labelencoder.fit_transform(dataset['date'])

dataset=dataset.sort_values(by='date')


In [38]:
dataset

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality,total_casesnan,new_casesnan,new_cases_smoothednan,total_deathsnan,new_deathsnan,new_deaths_smoothednan,total_cases_per_millionnan,new_cases_per_millionnan,new_cases_smoothed_per_millionnan,total_deaths_per_millionnan,new_deaths_per_millionnan,new_deaths_smoothed_per_millionnan,reproduction_ratenan,icu_patientsnan,icu_patients_per_millionnan,hosp_patientsnan,hosp_patients_per_millionnan,weekly_icu_admissionsnan,weekly_icu_admissions_per_millionnan,weekly_hosp_admissionsnan,weekly_hosp_admissions_per_millionnan,new_testsnan,total_testsnan,total_tests_per_thousandnan,new_tests_per_thousandnan,new_tests_smoothednan,new_tests_smoothed_per_thousandnan,positive_ratenan,tests_per_casenan,total_vaccinationsnan,people_vaccinatednan,people_fully_vaccinatednan,new_vaccinationsnan,new_vaccinations_smoothednan,total_vaccinations_per_hundrednan,people_vaccinated_per_hundrednan,people_fully_vaccinated_per_hundrednan,new_vaccinations_smoothed_per_millionnan,stringency_indexnan,populationnan,population_densitynan,median_agenan,aged_65_oldernan,aged_70_oldernan,gdp_per_capitanan,extreme_povertynan,cardiovasc_death_ratenan,diabetes_prevalencenan,female_smokersnan,male_smokersnan,handwashing_facilitiesnan,hospital_beds_per_thousandnan,life_expectancynan,human_development_indexnan,excess_mortalitynan,date_enc
3545,ARG,South America,Argentina,2020-01-01,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,1.0,1.0,0.000,0.000,6571.0,0.649,0.053,18.4,tests performed,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,0.00,4.519578e+07,16.177,31.9,11.198,7.441,18933.907,0.6,191.032,5.50,16.2,27.7,49.839,5.00,76.67,0.845,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
57667,MEX,North America,Mexico,2020-01-01,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,26.0,26.0,0.000,0.000,6571.0,0.649,0.053,18.4,people tested,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,0.00,1.289328e+08,66.444,29.3,6.857,4.321,17336.469,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3546,ARG,South America,Argentina,2020-01-02,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,36.0,37.0,0.001,0.001,6571.0,0.649,0.053,18.4,tests performed,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,0.00,4.519578e+07,16.177,31.9,11.198,7.441,18933.907,0.6,191.032,5.50,16.2,27.7,49.839,5.00,76.67,0.845,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1
57668,MEX,North America,Mexico,2020-01-02,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,72.0,98.0,0.001,0.001,6571.0,0.649,0.053,18.4,people tested,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,0.00,1.289328e+08,66.444,29.3,6.857,4.321,17336.469,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
3547,ARG,South America,Argentina,2020-01-03,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,4.0,41.0,0.001,0.000,6571.0,0.649,0.053,18.4,tests performed,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,0.00,4.519578e+07,16.177,31.9,11.198,7.441,18933.907,0.6,191.032,5.50,16.2,27.7,49.839,5.00,76.67,0.845,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19155,CHN,Asia,China,2021-06-29,91834.0,9.0,18.286,4636.0,0.0,0.000,63.8040,0.006,0.013,3.2210,0.000,0.000,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6571.0,0.649,0.053,18.4,Missing,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,60.19,1.439324e+09,147.674,38.7,10.641,5.929,15308.712,0.7,261.899,9.74,1.9,48.4,49.839,4.34,76.91,0.761,7.35,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,545
95396,UZB,Asia,Uzbekistan,2021-06-29,110190.0,498.0,477.571,733.0,2.0,1.857,3292.2810,14.879,14.269,21.9010,0.060,0.055,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6571.0,0.649,0.053,18.4,Missing,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,60.19,3.346920e+07,76.134,28.2,4.469,2.873,6253.104,2.2,724.417,7.57,1.3,24.7,49.839,4.00,71.72,0.720,7.35,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,545
12150,BIH,Europe,Bosnia and Herzegovina,2021-06-29,205004.0,13.0,16.857,9663.0,5.0,2.143,62485.6930,3.962,5.138,2945.3050,1.524,0.653,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6571.0,0.649,0.053,18.4,Missing,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,60.19,3.280815e+06,68.496,42.5,16.569,10.711,11713.895,0.2,329.635,10.08,30.2,47.7,97.164,3.50,77.40,0.780,7.35,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,545
19636,COL,South America,Colombia,2021-06-29,4213074.0,25880.0,30864.714,105934.0,608.0,661.714,82799.4340,508.619,606.583,2081.9180,11.949,13.005,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6571.0,0.649,0.053,18.4,Missing,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,60.19,5.088288e+07,44.223,32.2,7.646,4.312,13254.949,4.5,124.240,7.44,4.7,13.5,65.386,1.71,77.29,0.767,7.35,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,545


In [39]:
categorical_features=[feature for feature in dataset.columns if dataset[feature].dtype=='O']

In [40]:
categorical_features

['iso_code', 'continent', 'location', 'tests_units']

In [41]:
#Since, iso_code and location are same, we can drop one of the column
dataset=dataset.drop(['iso_code'], axis = 1)
dataset.head()

Unnamed: 0,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality,total_casesnan,new_casesnan,new_cases_smoothednan,total_deathsnan,new_deathsnan,new_deaths_smoothednan,total_cases_per_millionnan,new_cases_per_millionnan,new_cases_smoothed_per_millionnan,total_deaths_per_millionnan,new_deaths_per_millionnan,new_deaths_smoothed_per_millionnan,reproduction_ratenan,icu_patientsnan,icu_patients_per_millionnan,hosp_patientsnan,hosp_patients_per_millionnan,weekly_icu_admissionsnan,weekly_icu_admissions_per_millionnan,weekly_hosp_admissionsnan,weekly_hosp_admissions_per_millionnan,new_testsnan,total_testsnan,total_tests_per_thousandnan,new_tests_per_thousandnan,new_tests_smoothednan,new_tests_smoothed_per_thousandnan,positive_ratenan,tests_per_casenan,total_vaccinationsnan,people_vaccinatednan,people_fully_vaccinatednan,new_vaccinationsnan,new_vaccinations_smoothednan,total_vaccinations_per_hundrednan,people_vaccinated_per_hundrednan,people_fully_vaccinated_per_hundrednan,new_vaccinations_smoothed_per_millionnan,stringency_indexnan,populationnan,population_densitynan,median_agenan,aged_65_oldernan,aged_70_oldernan,gdp_per_capitanan,extreme_povertynan,cardiovasc_death_ratenan,diabetes_prevalencenan,female_smokersnan,male_smokersnan,handwashing_facilitiesnan,hospital_beds_per_thousandnan,life_expectancynan,human_development_indexnan,excess_mortalitynan,date_enc
3545,South America,Argentina,2020-01-01,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,1.0,1.0,0.0,0.0,6571.0,0.649,0.053,18.4,tests performed,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,0.0,45195777.0,16.177,31.9,11.198,7.441,18933.907,0.6,191.032,5.5,16.2,27.7,49.839,5.0,76.67,0.845,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
57667,North America,Mexico,2020-01-01,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,26.0,26.0,0.0,0.0,6571.0,0.649,0.053,18.4,people tested,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,0.0,128932753.0,66.444,29.3,6.857,4.321,17336.469,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3546,South America,Argentina,2020-01-02,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,36.0,37.0,0.001,0.001,6571.0,0.649,0.053,18.4,tests performed,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,0.0,45195777.0,16.177,31.9,11.198,7.441,18933.907,0.6,191.032,5.5,16.2,27.7,49.839,5.0,76.67,0.845,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1
57668,North America,Mexico,2020-01-02,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,72.0,98.0,0.001,0.001,6571.0,0.649,0.053,18.4,people tested,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,0.0,128932753.0,66.444,29.3,6.857,4.321,17336.469,2.5,152.783,13.06,6.9,21.4,87.847,1.38,75.05,0.779,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
3547,South America,Argentina,2020-01-03,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,4.0,41.0,0.001,0.0,6571.0,0.649,0.053,18.4,tests performed,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,0.0,45195777.0,16.177,31.9,11.198,7.441,18933.907,0.6,191.032,5.5,16.2,27.7,49.839,5.0,76.67,0.845,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2


In [42]:
categorical_features=[feature for feature in dataset.columns if dataset[feature].dtype=='O']
categorical_features

['continent', 'location', 'tests_units']

#### Handling rare categorical data
We will remove categorical variables that are present less than 1% of the observations. Such values are maped to Rare_val 
as they donot have any significance towards the outcome. 
For explaination: refer to the link >> https://medium.com/gett-engineering/handling-rare-categorical-values-in-pandas-d1e3f17475f0

In [43]:
for feature in categorical_features:
    temp=dataset.groupby(feature)['total_cases'].count()/len(dataset)
    temp_df=temp[temp>0.01].index
    dataset[feature]=np.where(dataset[feature].isin(temp_df),dataset[feature],'Rare_var')
    

In [44]:
dataset.head(100)

Unnamed: 0,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality,total_casesnan,new_casesnan,new_cases_smoothednan,total_deathsnan,new_deathsnan,new_deaths_smoothednan,total_cases_per_millionnan,new_cases_per_millionnan,new_cases_smoothed_per_millionnan,total_deaths_per_millionnan,new_deaths_per_millionnan,new_deaths_smoothed_per_millionnan,reproduction_ratenan,icu_patientsnan,icu_patients_per_millionnan,hosp_patientsnan,hosp_patients_per_millionnan,weekly_icu_admissionsnan,weekly_icu_admissions_per_millionnan,weekly_hosp_admissionsnan,weekly_hosp_admissions_per_millionnan,new_testsnan,total_testsnan,total_tests_per_thousandnan,new_tests_per_thousandnan,new_tests_smoothednan,new_tests_smoothed_per_thousandnan,positive_ratenan,tests_per_casenan,total_vaccinationsnan,people_vaccinatednan,people_fully_vaccinatednan,new_vaccinationsnan,new_vaccinations_smoothednan,total_vaccinations_per_hundrednan,people_vaccinated_per_hundrednan,people_fully_vaccinated_per_hundrednan,new_vaccinations_smoothed_per_millionnan,stringency_indexnan,populationnan,population_densitynan,median_agenan,aged_65_oldernan,aged_70_oldernan,gdp_per_capitanan,extreme_povertynan,cardiovasc_death_ratenan,diabetes_prevalencenan,female_smokersnan,male_smokersnan,handwashing_facilitiesnan,hospital_beds_per_thousandnan,life_expectancynan,human_development_indexnan,excess_mortalitynan,date_enc
3545,South America,Rare_var,2020-01-01,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,1.0,1.0,0.000,0.000,6571.0,0.649,0.053,18.4,tests performed,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,0.00,4.519578e+07,16.177,31.9,11.198,7.441,18933.907,0.6,191.032,5.50,16.200,27.700,49.839,5.000,76.67,0.845,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
57667,North America,Rare_var,2020-01-01,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,26.0,26.0,0.000,0.000,6571.0,0.649,0.053,18.4,people tested,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,0.00,1.289328e+08,66.444,29.3,6.857,4.321,17336.469,2.5,152.783,13.06,6.900,21.400,87.847,1.380,75.05,0.779,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3546,South America,Rare_var,2020-01-02,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,36.0,37.0,0.001,0.001,6571.0,0.649,0.053,18.4,tests performed,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,0.00,4.519578e+07,16.177,31.9,11.198,7.441,18933.907,0.6,191.032,5.50,16.200,27.700,49.839,5.000,76.67,0.845,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1
57668,North America,Rare_var,2020-01-02,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,72.0,98.0,0.001,0.001,6571.0,0.649,0.053,18.4,people tested,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,0.00,1.289328e+08,66.444,29.3,6.857,4.321,17336.469,2.5,152.783,13.06,6.900,21.400,87.847,1.380,75.05,0.779,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
3547,South America,Rare_var,2020-01-03,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,4.0,41.0,0.001,0.000,6571.0,0.649,0.053,18.4,tests performed,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,0.00,4.519578e+07,16.177,31.9,11.198,7.441,18933.907,0.6,191.032,5.50,16.200,27.700,49.839,5.000,76.67,0.845,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79582,Asia,Rare_var,2020-01-23,1.0,1.0,92.857,388.0,2.0,1.429,0.1710,0.171,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6571.0,0.649,0.053,18.4,Missing,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,25.00,5.850343e+06,7915.731,42.4,12.922,7.049,85535.383,2.2,92.243,10.99,5.200,28.300,49.839,2.400,83.62,0.938,7.35,0,0,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,22
3567,South America,Rare_var,2020-01-23,13469.0,74.0,92.857,388.0,2.0,1.429,1788.1095,8.442,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6.0,0.000,0.053,18.4,tests performed,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,11.11,4.519578e+07,16.177,31.9,11.198,7.441,18933.907,0.6,191.032,5.50,16.200,27.700,49.839,5.000,76.67,0.845,7.35,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,22
97199,Missing,Rare_var,2020-01-23,655.0,98.0,92.857,18.0,1.0,1.429,0.0840,0.013,11.203,0.0020,0.000,0.159,3.12,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6571.0,0.649,0.053,18.4,Missing,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,60.19,7.794799e+09,58.045,30.9,8.696,5.355,15469.207,10.0,233.070,8.51,6.434,34.635,60.130,2.705,72.58,0.737,7.35,0,0,1,0,0,1,0,0,1,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,22
39486,Asia,Rare_var,2020-01-23,2.0,2.0,92.857,388.0,2.0,1.429,0.2670,0.267,11.203,51.3355,0.135,0.159,1.01,184.0,17.4475,666.0,82.207,51.539,9.009,322.4435,41.77,6200.0,827053.0,72.113,0.627,6571.0,0.649,0.053,18.4,Missing,917280.0,642078.0,387438.0,26039.0,6969.0,10.86,7.9,4.05,1677.0,13.89,7.496988e+06,7039.714,44.8,16.303,10.158,56054.920,2.2,242.648,8.33,6.300,31.400,49.839,2.400,84.86,0.949,7.35,0,0,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1,1,0,1,1,1,1,0,0,1,22
