In [6]:
from imputationLibrary import decompose, forwardFilling, hotDeck, meanImputation, movingAverage, splineInterpolation, randomSampleImputation, nature
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal
from sklearn import preprocessing
from datetime import datetime, timedelta
from tsmoothie.smoother import *

In [7]:
def plot_ac(df, name):
    """Plot the sample autocorrelation of a series with approximate 95% bounds.

    Parameters
    ----------
    df : pandas.Series
        One-dimensional series to analyse. Missing values are replaced by 0
        before the correlation is computed.
    name : str
        Label prepended to the plot title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    values = np.asarray(df.fillna(0), dtype=float)
    n_obs = len(values)

    # Standardise the series as a single feature column. The previous code
    # wrapped the series in a list, producing a (1, n) array; with the default
    # axis=0 every column then holds exactly one sample, so the "standardised"
    # data (and therefore the whole correlation) was identically zero.
    normalized = preprocessing.scale(values.reshape(-1, 1)).ravel()

    # Divide by n so that lag 0 equals 1 for a non-constant series; only on
    # that scale are the +/- 2/sqrt(n) reference lines meaningful.
    corr = signal.correlate(normalized, normalized, mode='full') / n_obs

    # mode='full' returns 2n-1 values, one per lag from -(n-1) to n-1.
    lags = np.arange(-(n_obs - 1), n_obs)

    plt.plot(lags, corr, 'o-', markersize=2)
    bound = 2 / np.sqrt(n_obs)
    plt.axhline(bound, ls=':')
    plt.axhline(-bound, ls=':')
    plt.title(name + ' auto-correlation')
    plt.show()

In [8]:
def decompose_and_plot(ts, flag_plot = False):
    """Additively decompose ``ts`` and optionally display diagnostics.

    The decomposition is done by ``decompose.additive(ts, period)``; ``period``
    is not a parameter, it is read from the notebook namespace and must be
    defined before this function is called.

    Parameters
    ----------
    ts
        Series handed to ``decompose.additive``.
    flag_plot : bool, optional
        When truthy, plot the decomposition, print the ``nature`` checks run on
        the residual and draw the autocorrelation of each component.

    Returns
    -------
    tuple
        ``(resid, trend, seasonal)`` taken from the decomposition result.
    """
    parts = decompose.additive(ts, period)

    if flag_plot:
        parts.plot()
        plt.show()

        # Each check is evaluated lazily, in the listed order, right before
        # its label is printed.
        residual_checks = (
            ("Resid is white noise? ", lambda: nature.isWhiteNoise(parts.resid)),
            ("Resid is seasonal noise? ", lambda: nature.isSeasonal(parts.resid)),
            ("Resid is trended noise? ", lambda: nature.isTrended(parts.resid, period)),
            ("Resid is seasonal and trended noise? ",
             lambda: nature.isTrendedAndSeasonal(parts.resid, period)),
        )
        for label, run_check in residual_checks:
            print(label, run_check())

        for attribute, title in (("resid", "Resid"), ("trend", "Trend"), ("seasonal", "Seasonal")):
            plot_ac(getattr(parts, attribute), title)

    return parts.resid, parts.trend, parts.seasonal
    

In [9]:
# --- Load the data and split it chronologically ------------------------------
# Forward slashes are accepted on every platform. The previous raw literal
# 'output\/...' contained a backslash, which POSIX systems treat as part of
# the directory name.
df = pd.read_csv('output/CompleteCovid.csv', index_col=0, parse_dates=True)
# Training window; columns without a single observation in it are dropped.
df_train = df.loc['2020-01-22 09:00:00':'2020-02-26 02:00:00'].dropna(how='all', axis=1)
# Test window, restricted to the columns kept for training.
df_test = df.loc['2020-02-26 02:30:00':, df_train.columns]
period=30  # read from the notebook namespace by decompose_and_plot

# Remember where values were missing so those positions can be blanked again
# after the decomposition (which is run on zero-filled data).
df_nan = df_train.isna()
df_nan_test = df_test.isna()

decomp_type_dict_train = {}
decomp_type_dict_test = {}


def _decompose_columns(frame):
    """Decompose every column of ``frame``; return (resid, trend, seasonal) frames."""
    resid_by_col, trend_by_col, seasonal_by_col = {}, {}, {}
    for col in frame.columns:
        resid_by_col[col], trend_by_col[col], seasonal_by_col[col] = decompose_and_plot(frame.loc[:, col])
    return pd.DataFrame(resid_by_col), pd.DataFrame(trend_by_col), pd.DataFrame(seasonal_by_col)


def _new_kalman_smoother():
    """Build the smoother configuration shared by the train and test splits."""
    return KalmanSmoother(component='level_longseason',
                          component_noise={'level':0.1, 'longseason':0.1},
                          n_longseasons=365)


# --- Decompose the zero-filled series ----------------------------------------
df_zero_filled = df_train.fillna(0)
df_zero_filled_test = df_test.fillna(0)

df_decomposed_resid, df_decomposed_trend, df_decomposed_seasonal = _decompose_columns(df_zero_filled)
df_decomposed_resid_test, df_decomposed_trend_test, df_decomposed_seasonal_test = _decompose_columns(df_zero_filled_test)

# --- Blank out the originally-missing positions in every component -----------
# `mask` replaces entries where the condition is True with NaN by default.
df_final_resid = df_decomposed_resid.mask(df_nan)
df_final_trend = df_decomposed_trend.mask(df_nan)
df_final_seasonal = df_decomposed_seasonal.mask(df_nan)

df_final_resid_test = df_decomposed_resid_test.mask(df_nan_test)
df_final_trend_test = df_decomposed_trend_test.mask(df_nan_test)
df_final_seasonal_test = df_decomposed_seasonal_test.mask(df_nan_test)

# --- Impute each component with its own method, then recombine ---------------
df_white_noise_train, df_white_noise_test =  meanImputation.input(df_final_resid, df_final_resid_test)
df_seasonal_train, df_seasonal_test = splineInterpolation.input(df_final_seasonal, df_final_seasonal_test)
df_trended_train, df_trended_test = forwardFilling.input(df_final_trend, df_final_trend_test)

df_final_train = df_white_noise_train + df_seasonal_train + df_trended_train
df_final_test = df_white_noise_test + df_seasonal_test + df_trended_test

### USE KALMAN FILTER TO SMOOTH ALL DATA (ONLY VISUALIZATION PURPOSE) ###
# NOTE(review): despite the heading, the smoothed values are what gets written
# to the CSV files below - confirm that this is intended.
# NOTE(review): the test-split preview saved with this notebook shows
# magnitudes up to ~1e191, i.e. the smoother appears to have diverged on the
# test data. Verify the smoother settings (e.g. n_longseasons=365) before
# relying on the test CSV.

smoother_train = _new_kalman_smoother()
smoother_train.smooth(df_final_train.T)  # transposed: one series per row

smoother_test = _new_kalman_smoother()
smoother_test.smooth(df_final_test.T)

df_filled = pd.DataFrame(data = smoother_train.smooth_data.T, index = df_train.index, columns= df_train.columns)
df_filled_test = pd.DataFrame(data = smoother_test.smooth_data.T, index = df_test.index, columns= df_test.columns)


df_filled.to_csv('output/CompleteCovidTrainAddDecompKalman_0.csv', index = True)
df_filled_test.to_csv('output/CompleteCovidTestAddDecompKalman_0.csv', index = True)

In [10]:
# Preview the first rows of the smoothed training frame rather than rendering
# the whole frame.
df_filled.head(10)

Unnamed: 0_level_0,Asia_confirmed_cases,Asia_deaths,Asia_recovered,Europe_confirmed_cases,Europe_deaths,Europe_recovered,Americas_confirmed_cases,Americas_deaths,Americas_recovered,Oceania_confirmed_cases,Oceania_deaths,Oceania_recovered,Africa_confirmed_cases,Africa_deaths,Africa_recovered
update_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-01-22 09:00:00,1542.936596,38.158827,215.101408,144.357833,5.981772,-31.786030,5.779440,0.0,0.237501,7.094856,0.0,0.518457,0.161525,0.0,0.0
2020-01-22 09:30:00,1703.445829,42.537796,238.381401,143.603925,5.944032,-31.520518,5.792284,0.0,0.238825,7.104483,0.0,0.519433,0.163107,0.0,0.0
2020-01-22 10:00:00,1937.957893,48.899111,272.064638,142.298620,5.881309,-31.100935,5.798041,0.0,0.240176,7.102840,0.0,0.519689,0.164676,0.0,0.0
2020-01-22 10:30:00,2197.287096,55.927338,308.977277,140.650049,5.803061,-30.585890,5.799039,0.0,0.241510,7.094071,0.0,0.519488,0.166229,0.0,0.0
2020-01-22 11:00:00,2451.135505,62.811800,344.719278,138.791776,5.715355,-30.012988,5.796636,0.0,0.242791,7.080835,0.0,0.518999,0.167768,0.0,0.0
2020-01-22 11:30:00,2681.291343,69.064884,376.665566,136.809490,5.622086,-29.406351,5.791489,0.0,0.243983,7.064839,0.0,0.518330,0.169290,0.0,0.0
2020-01-22 12:00:00,2877.239060,74.404799,403.321356,134.758139,5.525749,-28.781443,5.783686,0.0,0.245049,7.047178,0.0,0.517551,0.170797,0.0,0.0
2020-01-22 12:30:00,3033.305123,78.679413,423.903032,132.672934,5.427943,-28.148171,5.772767,0.0,0.245933,7.028555,0.0,0.516707,0.172286,0.0,0.0
2020-01-22 13:00:00,3146.769974,81.815955,438.060925,130.576416,5.329693,-27.512870,5.757627,0.0,0.246560,7.009419,0.0,0.515826,0.173759,0.0,0.0
2020-01-22 13:30:00,3216.569437,83.786561,445.688682,128.482987,5.231651,-26.879585,5.736298,0.0,0.246812,6.990061,0.0,0.514927,0.175213,0.0,0.0


In [11]:
# Preview the first rows of the smoothed test frame rather than rendering the
# whole frame.
df_filled_test.head(10)

Unnamed: 0_level_0,Asia_confirmed_cases,Asia_deaths,Asia_recovered,Europe_confirmed_cases,Europe_deaths,Europe_recovered,Americas_confirmed_cases,Americas_deaths,Americas_recovered,Oceania_confirmed_cases,Oceania_deaths,Oceania_recovered,Africa_confirmed_cases,Africa_deaths,Africa_recovered
update_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-02-26 02:30:00,7.493694e+191,2.509729e+190,1.363150e+191,1.750138e+130,4.661306e+128,4.402857e+128,7.794696e+118,2.878830e+116,-1.174943e+117,-5.645962e+58,1.307888e+56,-9.749620e+57,1.593788e+52,-7.411339e+49,-1.080838e+51
2020-02-26 03:00:00,4.681814e+191,1.567996e+190,8.516512e+190,1.112609e+130,2.963314e+128,2.799011e+128,4.976142e+118,1.837848e+116,-7.500850e+116,-3.604064e+58,8.348821e+55,-6.223608e+57,1.017991e+52,-4.733801e+49,-6.903571e+50
2020-02-26 03:30:00,2.875224e+191,9.629472e+189,5.230212e+190,7.032269e+129,1.872969e+128,1.769121e+128,3.166491e+118,1.169486e+116,-4.773050e+116,-2.293061e+58,5.311879e+55,-3.959727e+57,6.483066e+51,-3.014717e+49,-4.396534e+50
2020-02-26 04:00:00,1.714057e+191,5.740584e+189,3.117977e+190,4.403249e+129,1.172758e+128,1.107734e+128,2.004597e+118,7.403615e+115,-3.021654e+116,-1.451322e+58,3.361989e+55,-2.506186e+57,4.109557e+51,-1.911002e+49,-2.786923e+50
2020-02-26 04:30:00,9.672549e+190,3.239453e+189,1.759497e+190,2.714663e+129,7.230214e+127,6.829330e+127,1.258580e+118,4.648339e+115,-1.897137e+116,-9.108630e+57,2.110016e+55,-1.572906e+57,2.585663e+51,-1.202370e+49,-1.753484e+50
2020-02-26 05:00:00,4.864635e+190,1.629225e+189,8.849075e+189,1.629834e+129,4.340888e+127,4.100204e+127,7.795599e+117,2.879163e+115,-1.175080e+116,-5.638279e+57,1.306108e+55,-9.736353e+56,1.607227e+51,-7.473835e+48,-1.089952e+50
2020-02-26 05:30:00,1.764295e+190,5.908836e+188,3.209362e+189,9.325928e+128,2.483861e+127,2.346141e+127,4.719430e+117,1.743036e+115,-7.113893e+115,-3.409653e+57,7.898466e+54,-5.887892e+56,9.789611e+50,-4.552308e+48,-6.638889e+49
2020-02-26 06:00:00,-2.400633e+189,-8.040009e+187,-4.366901e+188,4.841446e+128,1.289467e+127,1.217972e+127,2.743528e+117,1.013273e+115,-4.135492e+115,-1.978108e+57,4.582291e+54,-3.415857e+56,5.754749e+50,-2.676040e+48,-3.902621e+49
2020-02-26 06:30:00,-1.541123e+190,-5.161406e+188,-2.803398e+189,1.953730e+128,5.203550e+126,4.915036e+126,1.473812e+117,5.443258e+114,-2.221569e+115,-1.058155e+57,2.451217e+54,-1.827254e+56,3.162600e+50,-1.470654e+48,-2.144738e+49
2020-02-26 07:00:00,-2.391015e+190,-8.007797e+188,-4.349405e+189,9.058766e+126,2.412705e+125,2.278931e+125,6.572501e+116,2.427434e+114,-9.907143e+114,-4.664857e+56,1.080615e+54,-8.055418e+55,1.496239e+50,-6.957722e+47,-1.014684e+49
