## This notebook imputes the missing values using Missing Indicators

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.impute import MissingIndicator

#### Choose whether the imputation should be done on the data set with daily or weekly granularity


In [2]:
# Base paths (without the ".csv" extension) for the input data set and for the
# imputed output files. Plain forward slashes are used so the paths resolve on
# every OS and contain no invalid "\C" / "\M" escape sequences.
INPUT = 'output/CompleteCovid'
OUTPUT = 'output/MIICovidImputed'

In [3]:
# Load the complete data set; the first CSV column becomes the row index.
# The path uses a forward slash only (the previous 'output\/...' form embedded a
# literal backslash, which only resolves on Windows).
df = pd.read_csv('output/CompleteCovid.csv', index_col=0)

# Training data: rows from the first label through '2020-02-26 02:00:00'
# (label slicing with .loc is inclusive on both ends). Columns that are
# entirely missing inside this window are dropped.
df_train = df.loc['2020-01-22 09:00:00':'2020-02-26 02:00:00'].dropna(how='all', axis=1)

# Test data: every row from '2020-02-26 02:30:00' onwards, restricted to the
# columns that survived in the training set so both frames share a schema.
df_test = df.loc['2020-02-26 02:30:00':, df_train.columns]

# The following cells operate on `df`, so point it at the training split.
df = df_train

In [4]:
# Transformer that flags NaN entries; features="all" requests one mask column
# for every input feature.
indicator = MissingIndicator(
    features="all",
    missing_values=np.nan,
)

In [5]:
# Fit the indicator on the training frame and compute its mask, then reuse the
# fitted indicator to compute the mask of the test frame.
mask_all = indicator.fit(df).transform(df)
mask_test_all = indicator.transform(df_test)

In [6]:
# Wrap the raw mask arrays in DataFrames that reuse the index and column
# labels of the frames they were computed from.
df_masks = pd.DataFrame(mask_all, index=df.index, columns=df.columns)
df_test_masks = pd.DataFrame(
    mask_test_all,
    index=df_test.index,
    columns=df_test.columns,
)

In [7]:
# Prefix every mask column with 'MI_' so the indicator columns cannot collide
# with the original feature columns once the frames are joined.
df_masks, df_test_masks = (
    mask_frame.add_prefix('MI_') for mask_frame in (df_masks, df_test_masks)
)

In [8]:
# Place the indicator columns next to the original features (column-wise
# concatenation; the default outer join aligns rows on the index).
df_final = pd.concat([df, df_masks], axis='columns')
df_test_final = pd.concat([df_test, df_test_masks], axis='columns')

In [9]:
# Preview the first five rows of the features + indicator frame.
df_final.head(n=5)

Unnamed: 0_level_0,Asia_confirmed_cases,Asia_deaths,Asia_recovered,Europe_confirmed_cases,Europe_deaths,Europe_recovered,Americas_confirmed_cases,Americas_deaths,Americas_recovered,Oceania_confirmed_cases,...,MI_Europe_recovered,MI_Americas_confirmed_cases,MI_Americas_deaths,MI_Americas_recovered,MI_Oceania_confirmed_cases,MI_Oceania_deaths,MI_Oceania_recovered,MI_Africa_confirmed_cases,MI_Africa_deaths,MI_Africa_recovered
update_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-22 09:00:00,270.0,6.0,0.0,,,,,,,,...,True,True,True,True,True,True,True,True,True,True
2020-01-22 09:30:00,,,,,,,,,,,...,True,True,True,True,True,True,True,True,True,True
2020-01-22 10:00:00,,,,,,,,,,,...,True,True,True,True,True,True,True,True,True,True
2020-01-22 10:30:00,,,,,,,,,,,...,True,True,True,True,True,True,True,True,True,True
2020-01-22 11:00:00,,,,,,,,,,,...,True,True,True,True,True,True,True,True,True,True


In [10]:
# Preview the first five rows of the training frame (before any filling).
df.head(n=5)

Unnamed: 0_level_0,Asia_confirmed_cases,Asia_deaths,Asia_recovered,Europe_confirmed_cases,Europe_deaths,Europe_recovered,Americas_confirmed_cases,Americas_deaths,Americas_recovered,Oceania_confirmed_cases,Oceania_deaths,Oceania_recovered,Africa_confirmed_cases,Africa_deaths,Africa_recovered
update_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-01-22 09:00:00,270.0,6.0,0.0,,,,,,,,,,,,
2020-01-22 09:30:00,,,,,,,,,,,,,,,
2020-01-22 10:00:00,,,,,,,,,,,,,,,
2020-01-22 10:30:00,,,,,,,,,,,,,,,
2020-01-22 11:00:00,,,,,,,,,,,,,,,


In [11]:
# Replace every remaining missing value with 0 in both splits; the 'MI_'
# columns record which entries were originally missing.
df_final, df_test_final = df_final.fillna(value=0), df_test_final.fillna(value=0)
df_final.head()

Unnamed: 0_level_0,Asia_confirmed_cases,Asia_deaths,Asia_recovered,Europe_confirmed_cases,Europe_deaths,Europe_recovered,Americas_confirmed_cases,Americas_deaths,Americas_recovered,Oceania_confirmed_cases,...,MI_Europe_recovered,MI_Americas_confirmed_cases,MI_Americas_deaths,MI_Americas_recovered,MI_Oceania_confirmed_cases,MI_Oceania_deaths,MI_Oceania_recovered,MI_Africa_confirmed_cases,MI_Africa_deaths,MI_Africa_recovered
update_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-22 09:00:00,270.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,True,True,True,True,True,True,True,True,True
2020-01-22 09:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,True,True,True,True,True,True,True,True,True
2020-01-22 10:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,True,True,True,True,True,True,True,True,True
2020-01-22 10:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,True,True,True,True,True,True,True,True,True
2020-01-22 11:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,True,True,True,True,True,True,True,True,True


In [12]:
# Persist both imputed splits as CSV, keeping the index as the first column.
df_final.to_csv(f'{OUTPUT}_training.csv', index=True)
df_test_final.to_csv(f'{OUTPUT}_test.csv', index=True)