In [1]:
import stelarImputation as tsi

## Gap Generation

In [2]:
# Location of the input CSV; later cells pass this path as `ground_truth`.
df_input = '../datasets/example_input_type2.csv'

# Each setting is declared once as a plain variable and then gathered into
# `parameters`, so the dict and the unpacked names can never drift apart.
time_column = 'time'
datetime_format = '%Y-%m-%d'
sep = ','
header = 0
preprocessing = False
index = False

# Options forwarded to the gap generator as `train_params`.
# NOTE(review): the exact meaning of each key is defined by stelarImputation;
# confirm against the library documentation before tuning.
train_params = {
    "gap_type": "no_overlap",
    "miss_perc": 0.1,
    "gap_length": 100,
    "max_gap_length": 10,
    "max_gap_count": 5,
}

# Complete configuration for this section, in the same key order as before.
parameters = {
    'time_column': time_column,
    'datetime_format': datetime_format,
    'sep': sep,
    'header': header,
    'preprocessing': preprocessing,
    'index': index,
    'train_params': train_params,
}

In [3]:
# Run gap generation on the input file using the settings from the
# configuration cell above; the returned frame is kept as `df_missing`.
df_missing = tsi.run_gap_generation(
    ground_truth=df_input,
    train_params=train_params,
    time_column=time_column,
    datetime_format=datetime_format,
    header=header,
    sep=sep,
    preprocessing=preprocessing,
    index=index,
)

# Total number of null cells: per-column counts first, then their sum.
missing = df_missing.isna().sum().sum()
print(f"Missing values count: {missing}")

# Bare last expression so the notebook renders the frame with rich display.
df_missing

Missing values count: 480


Unnamed: 0,time,3i Group PLC_035999.txt,Admiral Group_036346.txt,Anglo American PLC_035918.txt,Antofagasta PLC_028149.txt,Ashtead Group_028090.txt,Associated British Foods PLC_035919.txt,Astrazeneca PLC_035998.txt,Aviva PLC_035907.txt,Barclays PLC_035976.txt,...,Standard Chartered PLC_035959.txt,Standard Life Aberdeen Plc_036365.txt,Taylor Wimpey PLC_036366.txt,Tesco PLC_035966.txt,TUI AG_02821N.txt,Unilever PLC_035922.txt,United Utilities Group PLC_036341.txt,Vodafone Group PLC_035943.txt,Whitbread PLC_035895.txt,WPP PLC_035947.txt
0,2017-01-02,,-1.347635,-1.294240,-2.295047,-1.199292,0.108603,-1.456547,-0.317386,1.288425,...,-0.841939,0.109476,-1.712247,0.012647,-0.987098,-2.869013,0.733771,-0.007477,-0.898946,1.592327
1,2017-01-03,,-1.189826,-1.286110,-2.136391,-1.136860,-0.002687,-1.453486,-0.134716,1.719911,...,-0.641924,0.235964,-1.596214,-0.013695,-1.006700,-2.931364,0.710001,0.100982,-0.881773,1.624480
2,2017-01-04,,-1.325091,-1.365373,-2.120526,-1.155222,-0.333285,-1.376940,-0.145305,1.878207,...,-0.565862,0.226663,-1.253914,-0.038155,-0.967495,-2.886828,0.638689,0.241340,-0.713477,1.578037
3,2017-01-05,,-1.336363,-1.355211,-2.035909,-1.122170,-0.087792,-1.226910,-0.222080,1.753102,...,-0.465855,-0.054214,-0.789779,-0.280875,-0.947893,-2.845854,0.724263,0.400838,-0.469618,1.642343
4,2017-01-06,,-1.313819,-1.395858,-2.094083,-1.155222,-0.185989,-1.225889,-0.237964,1.890973,...,-0.389793,-0.039333,-0.841994,-0.263941,-0.977297,-2.863669,0.695738,0.481651,-0.332233,1.678069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516,2018-12-25,-1.509627,0.405175,1.060066,-1.271187,-0.967925,-2.068105,1.653256,-3.340717,-2.628650,...,-1.847647,-2.137541,-2.842126,-0.621435,-1.151267,0.237849,-0.878827,-1.936340,1.656414,-1.858805
517,2018-12-26,-1.509627,0.405175,1.060066,-1.271187,-0.967925,-2.068105,1.653256,-3.340717,-2.628650,...,-1.847647,-2.137541,-2.842126,-0.621435,-1.151267,0.237849,-0.878827,-1.936340,1.656414,-1.858805
518,2018-12-27,-1.310175,0.433355,0.958448,-1.406574,-1.175421,-2.195762,1.161321,-3.541919,-2.662352,...,-1.995545,-2.152421,-2.898693,-0.638369,-1.325238,-0.063217,-1.002434,-2.063088,1.501856,-1.929542
519,2018-12-28,-1.027002,0.861694,1.121038,-1.173878,-0.978943,-2.100838,1.434845,-3.224231,-2.455035,...,-1.723694,-1.996172,-2.675328,-0.542410,-1.180671,0.097114,-0.802761,-1.932938,1.766322,-1.848087


## Imputation

In [4]:
# Each setting is declared once as a plain variable and then gathered into
# `parameters`, so the dict and the unpacked names can never drift apart.
time_column = 'time'
datetime_format = '%Y-%m-%d'
sep = ','
header = 0
# NOTE(review): `is_multivariate` and `areaVStime` are interpreted by
# stelarImputation; confirm their meaning in the library documentation.
is_multivariate = False
areaVStime = 0
preprocessing = False
index = False

# Algorithms to run; `params` holds one hyperparameter dict per algorithm,
# keyed by the same names and listed in the same order.
algorithms = ["SoftImpute", "IterativeSVD", "SVT", "TimesNet"]
params = {
    "SoftImpute": {"max_rank": 5},
    "IterativeSVD": {"rank": 3},
    "SVT": {"tauScale": 0.7},
    "TimesNet": {
        "n_layers": 2,
        "top_k": 3,
        "d_model": 56,
        "d_ffn": 56,
        "n_kernels": 1,
        "dropout": 0.05,
        "apply_nonstationary_norm": False,
        "batch_size": 32,
        "epochs": 50,
        "num_workers": 0,
    },
}

# Complete configuration for this section, in the same key order as before.
parameters = {
    'time_column': time_column,
    'datetime_format': datetime_format,
    'sep': sep,
    'header': header,
    'is_multivariate': is_multivariate,
    'areaVStime': areaVStime,
    'preprocessing': preprocessing,
    'index': index,
    "algorithms": algorithms,
    "params": params,
}

In [7]:
# Impute the gapped frame with every configured algorithm. The result is
# indexed by algorithm name in the next cell (e.g. 'SoftImpute').
dict_of_imputed_dfs = tsi.run_imputation(
    missing=df_missing,
    algorithms=algorithms,
    params=params,
    time_column=time_column,
    datetime_format=datetime_format,
    header=header,
    sep=sep,
    is_multivariate=is_multivariate,
    areaVStime=areaVStime,
    preprocessing=preprocessing,
    index=index,
)

freeing copy memory @ 0x1dcfb6a0
freeing copy memory @ 0x1dd5d1e0
freeing copy memory @ 0x1ddbed20
2024-04-18 16:01:56 [INFO]: No given device, using default device: cuda
2024-04-18 16:01:56 [INFO]: TimesNet initialized with the given hyperparameters, the number of trainable parameters: 13,105
2024-04-18 16:01:56 [INFO]: Epoch 001 - training loss: 1.4196
2024-04-18 16:01:56 [INFO]: Epoch 002 - training loss: 0.6113
2024-04-18 16:01:56 [INFO]: Epoch 003 - training loss: 0.3391
2024-04-18 16:01:56 [INFO]: Epoch 004 - training loss: 0.3217
2024-04-18 16:01:56 [INFO]: Epoch 005 - training loss: 0.3105
2024-04-18 16:01:56 [INFO]: Epoch 006 - training loss: 0.2794
2024-04-18 16:01:56 [INFO]: Epoch 007 - training loss: 0.2406
2024-04-18 16:01:57 [INFO]: Epoch 008 - training loss: 0.2041
2024-04-18 16:01:57 [INFO]: Epoch 009 - training loss: 0.1992
2024-04-18 16:01:57 [INFO]: Epoch 010 - training loss: 0.2112
2024-04-18 16:01:57 [INFO]: Epoch 011 - training loss: 0.1943
2024-04-18 16:01:57 [IN

In [8]:
# Pick the SoftImpute output from the per-algorithm results.
imputed_df = dict_of_imputed_dfs['SoftImpute']

# Total number of null cells left after imputation: per-column counts, then their sum.
missing = imputed_df.isna().sum().sum()
print(f"Missing values count: {missing}")

# Bare last expression so the notebook renders the frame with rich display.
imputed_df

Missing values count: 0


Unnamed: 0,time,3i Group PLC_035999.txt,Admiral Group_036346.txt,Anglo American PLC_035918.txt,Antofagasta PLC_028149.txt,Ashtead Group_028090.txt,Associated British Foods PLC_035919.txt,Astrazeneca PLC_035998.txt,Aviva PLC_035907.txt,Barclays PLC_035976.txt,...,Standard Chartered PLC_035959.txt,Standard Life Aberdeen Plc_036365.txt,Taylor Wimpey PLC_036366.txt,Tesco PLC_035966.txt,TUI AG_02821N.txt,Unilever PLC_035922.txt,United Utilities Group PLC_036341.txt,Vodafone Group PLC_035943.txt,Whitbread PLC_035895.txt,WPP PLC_035947.txt
0,2017-01-02,-2.603107,-1.347635,-1.294240,-2.295047,-1.199292,0.108603,-1.456547,-0.317386,1.288425,...,-0.841939,0.109476,-1.712247,0.012647,-0.987098,-2.869013,0.733771,-0.007477,-0.898946,1.592327
1,2017-01-03,-2.581500,-1.189826,-1.286110,-2.136391,-1.136860,-0.002687,-1.453486,-0.134716,1.719911,...,-0.641924,0.235964,-1.596214,-0.013695,-1.006700,-2.931364,0.710001,0.100982,-0.881773,1.624480
2,2017-01-04,-2.526286,-1.325091,-1.365373,-2.120526,-1.155222,-0.333285,-1.376940,-0.145305,1.878207,...,-0.565862,0.226663,-1.253914,-0.038155,-0.967495,-2.886828,0.638689,0.241340,-0.713477,1.578037
3,2017-01-05,-2.510187,-1.336363,-1.355211,-2.035909,-1.122170,-0.087792,-1.226910,-0.222080,1.753102,...,-0.465855,-0.054214,-0.789779,-0.280875,-0.947893,-2.845854,0.724263,0.400838,-0.469618,1.642343
4,2017-01-06,-2.494425,-1.313819,-1.395858,-2.094083,-1.155222,-0.185989,-1.225889,-0.237964,1.890973,...,-0.389793,-0.039333,-0.841994,-0.263941,-0.977297,-2.863669,0.695738,0.481651,-0.332233,1.678069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516,2018-12-25,-1.509627,0.405175,1.060066,-1.271187,-0.967925,-2.068105,1.653256,-3.340717,-2.628650,...,-1.847647,-2.137541,-2.842126,-0.621435,-1.151267,0.237849,-0.878827,-1.936340,1.656414,-1.858805
517,2018-12-26,-1.509627,0.405175,1.060066,-1.271187,-0.967925,-2.068105,1.653256,-3.340717,-2.628650,...,-1.847647,-2.137541,-2.842126,-0.621435,-1.151267,0.237849,-0.878827,-1.936340,1.656414,-1.858805
518,2018-12-27,-1.310175,0.433355,0.958448,-1.406574,-1.175421,-2.195762,1.161321,-3.541919,-2.662352,...,-1.995545,-2.152421,-2.898693,-0.638369,-1.325238,-0.063217,-1.002434,-2.063088,1.501856,-1.929542
519,2018-12-28,-1.027002,0.861694,1.121038,-1.173878,-0.978943,-2.100838,1.434845,-3.224231,-2.455035,...,-1.723694,-1.996172,-2.675328,-0.542410,-1.180671,0.097114,-0.802761,-1.932938,1.766322,-1.848087


## Train Ensemble Model

In [9]:
# Each setting is declared once as a plain variable and then gathered into
# `parameters`, so the dict and the unpacked names can never drift apart.
time_column = 'time'
datetime_format = '%Y-%m-%d'
sep = ','
header = 0
# NOTE(review): `is_multivariate` and `areaVStime` are interpreted by
# stelarImputation; confirm their meaning in the library documentation.
is_multivariate = False
areaVStime = 0
preprocessing = False
index = False

# Base algorithms for the ensemble; `params` holds one hyperparameter dict
# per algorithm, keyed by the same names and listed in the same order.
algorithms = ["SoftImpute", "IterativeSVD", "SVT", "TimesNet"]
params = {
    "SoftImpute": {"max_rank": 5},
    "IterativeSVD": {"rank": 3},
    "SVT": {"tauScale": 0.7},
    "TimesNet": {
        "n_layers": 2,
        "top_k": 3,
        "d_model": 56,
        "d_ffn": 56,
        "n_kernels": 1,
        "dropout": 0.05,
        "apply_nonstationary_norm": False,
        "batch_size": 32,
        "epochs": 50,
        "num_workers": 0,
    },
}

# Training options: the gap settings used in the "Gap Generation" section
# plus four extra keys (smooth / window / order / normalize).
# NOTE(review): the exact meaning of each key is defined by stelarImputation.
train_params = {
    "smooth": False,
    "window": 2,
    "order": 1,
    "normalize": False,
    "gap_type": "no_overlap",
    "miss_perc": 0.1,
    "gap_length": 100,
    "max_gap_length": 10,
    "max_gap_count": 5,
}

# Complete configuration for this section, in the same key order as before.
parameters = {
    'time_column': time_column,
    'datetime_format': datetime_format,
    'sep': sep,
    'header': header,
    'is_multivariate': is_multivariate,
    'areaVStime': areaVStime,
    'preprocessing': preprocessing,
    'index': index,
    "algorithms": algorithms,
    "params": params,
    'train_params': train_params,
}

In [10]:
# Train the ensemble on the input file. The call yields a pair: the fitted
# model (reused by the ensemble-imputation section) and the metrics.
model, metrics = tsi.train_ensemble(
    ground_truth=df_input,
    algorithms=algorithms,
    params=params,
    train_params=train_params,
    time_column=time_column,
    datetime_format=datetime_format,
    header=header,
    sep=sep,
    is_multivariate=is_multivariate,
    areaVStime=areaVStime,
    preprocessing=preprocessing,
    index=index,
)

# Bare last expression so the notebook displays the metrics.
metrics

freeing copy memory @ 0x1e077180
freeing copy memory @ 0x1e0d8c60
freeing copy memory @ 0x1e0d8c60
2024-04-18 16:02:37 [INFO]: No given device, using default device: cuda
2024-04-18 16:02:37 [INFO]: TimesNet initialized with the given hyperparameters, the number of trainable parameters: 13,105
2024-04-18 16:02:37 [INFO]: Epoch 001 - training loss: 0.5664
2024-04-18 16:02:37 [INFO]: Epoch 002 - training loss: 0.3004
2024-04-18 16:02:37 [INFO]: Epoch 003 - training loss: 0.2838
2024-04-18 16:02:37 [INFO]: Epoch 004 - training loss: 0.2533
2024-04-18 16:02:37 [INFO]: Epoch 005 - training loss: 0.2143
2024-04-18 16:02:37 [INFO]: Epoch 006 - training loss: 0.2007
2024-04-18 16:02:37 [INFO]: Epoch 007 - training loss: 0.2015
2024-04-18 16:02:38 [INFO]: Epoch 008 - training loss: 0.1905
2024-04-18 16:02:38 [INFO]: Epoch 009 - training loss: 0.1725
2024-04-18 16:02:38 [INFO]: Epoch 010 - training loss: 0.1661
2024-04-18 16:02:38 [INFO]: Epoch 011 - training loss: 0.1543
2024-04-18 16:02:38 [IN

{'SoftImpute': {'mae': 0.3219266070325954,
  'mse': 0.18658043936819818,
  'rmse': 0.4319495796597077,
  'r2': 0.7920643014497039,
  'euclidean_distance': 207.33579823665974},
 'IterativeSVD': {'mae': 0.4263449182479925,
  'mse': 0.29746515592730716,
  'rmse': 0.5454036632873923,
  'r2': 0.6684881587718029,
  'euclidean_distance': 261.7937583779483},
 'SVT': {'mae': 0.1767880887466014,
  'mse': 0.07526445645365355,
  'rmse': 0.27434368309413204,
  'r2': 0.9161210715244661,
  'euclidean_distance': 131.68496788518337},
 'TimesNet': {'mae': 0.5354786291961865,
  'mse': 0.5444717148289551,
  'rmse': 0.7378832663971687,
  'r2': 0.39321020602584444,
  'euclidean_distance': 354.183967870641},
 'Ensemble_Model': {'mae': 0.13919305939084572,
  'mse': 0.05264141138764481,
  'rmse': 0.22943716217658552,
  'r2': 0.9413334608567272,
  'euclidean_distance': 110.12983784476106}}

## Imputation with Ensemble Model

In [11]:
# These values mirror the configuration used in the "Imputation" section.
# Each setting is declared once and then gathered into `parameters`.
time_column = 'time'
datetime_format = '%Y-%m-%d'
sep = ','
header = 0
# NOTE(review): `is_multivariate` and `areaVStime` are interpreted by
# stelarImputation; confirm their meaning in the library documentation.
is_multivariate = False
areaVStime = 0
preprocessing = False
index = False

# Base algorithms and their hyperparameters, keyed by algorithm name
# and listed in the same order.
algorithms = ["SoftImpute", "IterativeSVD", "SVT", "TimesNet"]
params = {
    "SoftImpute": {"max_rank": 5},
    "IterativeSVD": {"rank": 3},
    "SVT": {"tauScale": 0.7},
    "TimesNet": {
        "n_layers": 2,
        "top_k": 3,
        "d_model": 56,
        "d_ffn": 56,
        "n_kernels": 1,
        "dropout": 0.05,
        "apply_nonstationary_norm": False,
        "batch_size": 32,
        "epochs": 50,
        "num_workers": 0,
    },
}

# Complete configuration for this section, in the same key order as before.
parameters = {
    'time_column': time_column,
    'datetime_format': datetime_format,
    'sep': sep,
    'header': header,
    'is_multivariate': is_multivariate,
    'areaVStime': areaVStime,
    'preprocessing': preprocessing,
    'index': index,
    "algorithms": algorithms,
    "params": params,
}

In [14]:
# Impute the gapped frame again, this time passing the `model` returned by
# `tsi.train_ensemble` together with the same algorithm configuration.
model_imputed_df = tsi.run_imputation_ensemble(
    missing=df_missing,
    algorithms=algorithms,
    params=params,
    model=model,
    time_column=time_column,
    datetime_format=datetime_format,
    header=header,
    sep=sep,
    is_multivariate=is_multivariate,
    areaVStime=areaVStime,
    preprocessing=preprocessing,
    index=index,
)

freeing copy memory @ 0x1ddbed20
freeing copy memory @ 0x1de20860
freeing copy memory @ 0x1de20860
2024-04-18 16:04:14 [INFO]: No given device, using default device: cuda
2024-04-18 16:04:14 [INFO]: TimesNet initialized with the given hyperparameters, the number of trainable parameters: 13,105
2024-04-18 16:04:14 [INFO]: Epoch 001 - training loss: 1.3716
2024-04-18 16:04:14 [INFO]: Epoch 002 - training loss: 0.6072
2024-04-18 16:04:14 [INFO]: Epoch 003 - training loss: 0.3212
2024-04-18 16:04:15 [INFO]: Epoch 004 - training loss: 0.2769
2024-04-18 16:04:15 [INFO]: Epoch 005 - training loss: 0.2696
2024-04-18 16:04:15 [INFO]: Epoch 006 - training loss: 0.2499
2024-04-18 16:04:15 [INFO]: Epoch 007 - training loss: 0.2221
2024-04-18 16:04:15 [INFO]: Epoch 008 - training loss: 0.1962
2024-04-18 16:04:15 [INFO]: Epoch 009 - training loss: 0.1941
2024-04-18 16:04:15 [INFO]: Epoch 010 - training loss: 0.1895
2024-04-18 16:04:15 [INFO]: Epoch 011 - training loss: 0.1755
2024-04-18 16:04:15 [IN

In [15]:
# Total number of null cells left in the ensemble output: per-column counts, then their sum.
missing = model_imputed_df.isna().sum().sum()
print(f"Missing values count: {missing}")

# Bare last expression so the notebook renders the frame with rich display.
model_imputed_df

Missing values count: 0


Unnamed: 0,time,3i Group PLC_035999.txt,Admiral Group_036346.txt,Anglo American PLC_035918.txt,Antofagasta PLC_028149.txt,Ashtead Group_028090.txt,Associated British Foods PLC_035919.txt,Astrazeneca PLC_035998.txt,Aviva PLC_035907.txt,Barclays PLC_035976.txt,...,Standard Chartered PLC_035959.txt,Standard Life Aberdeen Plc_036365.txt,Taylor Wimpey PLC_036366.txt,Tesco PLC_035966.txt,TUI AG_02821N.txt,Unilever PLC_035922.txt,United Utilities Group PLC_036341.txt,Vodafone Group PLC_035943.txt,Whitbread PLC_035895.txt,WPP PLC_035947.txt
0,2017-01-02,-0.272915,-1.347635,-1.294240,-2.295047,-1.199292,0.108603,-1.456547,-0.317386,1.288425,...,-0.841939,0.109476,-1.712247,0.012647,-0.987098,-2.869013,0.733771,-0.007477,-0.898946,1.592327
1,2017-01-03,-0.170053,-1.189826,-1.286110,-2.136391,-1.136860,-0.002687,-1.453486,-0.134716,1.719911,...,-0.641924,0.235964,-1.596214,-0.013695,-1.006700,-2.931364,0.710001,0.100982,-0.881773,1.624480
2,2017-01-04,-0.105915,-1.325091,-1.365373,-2.120526,-1.155222,-0.333285,-1.376940,-0.145305,1.878207,...,-0.565862,0.226663,-1.253914,-0.038155,-0.967495,-2.886828,0.638689,0.241340,-0.713477,1.578037
3,2017-01-05,-0.168553,-1.336363,-1.355211,-2.035909,-1.122170,-0.087792,-1.226910,-0.222080,1.753102,...,-0.465855,-0.054214,-0.789779,-0.280875,-0.947893,-2.845854,0.724263,0.400838,-0.469618,1.642343
4,2017-01-06,-1.251226,-1.313819,-1.395858,-2.094083,-1.155222,-0.185989,-1.225889,-0.237964,1.890973,...,-0.389793,-0.039333,-0.841994,-0.263941,-0.977297,-2.863669,0.695738,0.481651,-0.332233,1.678069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516,2018-12-25,-1.509627,0.405175,1.060066,-1.271187,-0.967925,-2.068105,1.653256,-3.340717,-2.628650,...,-1.847647,-2.137541,-2.842126,-0.621435,-1.151267,0.237849,-0.878827,-1.936340,1.656414,-1.858805
517,2018-12-26,-1.509627,0.405175,1.060066,-1.271187,-0.967925,-2.068105,1.653256,-3.340717,-2.628650,...,-1.847647,-2.137541,-2.842126,-0.621435,-1.151267,0.237849,-0.878827,-1.936340,1.656414,-1.858805
518,2018-12-27,-1.310175,0.433355,0.958448,-1.406574,-1.175421,-2.195762,1.161321,-3.541919,-2.662352,...,-1.995545,-2.152421,-2.898693,-0.638369,-1.325238,-0.063217,-1.002434,-2.063088,1.501856,-1.929542
519,2018-12-28,-1.027002,0.861694,1.121038,-1.173878,-0.978943,-2.100838,1.434845,-3.224231,-2.455035,...,-1.723694,-1.996172,-2.675328,-0.542410,-1.180671,0.097114,-0.802761,-1.932938,1.766322,-1.848087
