In [1]:
import stelarImputation as tsi

## Gap Generation

In [2]:
# Location of the ground-truth CSV. It is kept as a path string and handed to
# the library as-is in the following cells.
df_input = "../datasets/example_input_type1.csv"

# Every setting for the gap-generation step lives in this one dictionary.
parameters = {
    # Identifier column and timestamp layout; both match the column labels of
    # the table displayed after gap generation.
    "dimension_column": "Dimension",
    "datetime_format": "%Y-%m-%d %H:%M:%S",
    # Coordinate columns.
    "spatial_x_column": "Spatial_X",
    "spatial_y_column": "Spatial_Y",
    # Presumably CSV-reading options (delimiter / header row) -- confirm
    # against the stelarImputation documentation.
    "sep": ",",
    "header": 0,
    # Library switches, forwarded unchanged.
    "preprocessing": True,
    "index": False,
    # Gap-generation settings; their exact meaning is defined by the library.
    "train_params": {
        "gap_type": "no_overlap",
        "miss_perc": 0.1,
        "gap_length": 100,
        "max_gap_length": 10,
        "max_gap_count": 5,
    },
}

# Pull each entry out into its own variable so the next cell can pass them
# straight through as keyword arguments.
dimension_column = parameters["dimension_column"]
datetime_format = parameters["datetime_format"]
spatial_x_column = parameters["spatial_x_column"]
spatial_y_column = parameters["spatial_y_column"]
sep = parameters["sep"]
header = parameters["header"]
preprocessing = parameters["preprocessing"]
index = parameters["index"]
train_params = parameters["train_params"]

In [3]:
# Run the library's gap-generation step on the ground-truth file, using the
# settings unpacked in the previous cell.
df_missing = tsi.run_gap_generation(
    ground_truth=df_input,
    train_params=train_params,
    dimension_column=dimension_column,
    datetime_format=datetime_format,
    spatial_x_column=spatial_x_column,
    spatial_y_column=spatial_y_column,
    header=header,
    sep=sep,
    preprocessing=preprocessing,
    index=index,
)

# Number of null cells in the whole frame: per-column counts, then their total.
missing = df_missing.isna().sum().sum()
print(f"Missing values count: {missing}")

# Last expression of the cell -> rendered as the cell's output.
df_missing

Missing values count: 480


Unnamed: 0,Dimension,Spatial_X,Spatial_Y,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2018-12-18 00:00:00,2018-12-19 00:00:00,2018-12-20 00:00:00,2018-12-21 00:00:00,2018-12-24 00:00:00,2018-12-25 00:00:00,2018-12-26 00:00:00,2018-12-27 00:00:00,2018-12-28 00:00:00,2018-12-31 00:00:00
0,3i Group PLC_035999,96,0,,,,,,-1.887600,-2.004563,...,-1.356960,-1.273239,-1.297863,-1.098411,-1.509627,-1.509627,-1.509627,-1.310175,-1.027002,-1.297863
1,Admiral Group_036346,97,1,-1.347635,-1.189826,-1.325091,-1.336363,-1.313819,,,...,0.129009,0.151553,0.326270,0.343178,0.405175,0.405175,0.405175,0.433355,0.861694,1.132225
2,Anglo American PLC_035918,98,2,-1.294240,-1.286110,-1.365373,-1.355211,-1.395858,-1.316596,-0.979221,...,0.951131,1.116160,0.913735,1.110469,1.060066,1.060066,1.060066,0.958448,1.121038,1.095023
3,Antofagasta PLC_028149,99,3,-2.295047,-2.136391,-2.120526,-2.035909,-2.094083,-1.988312,-1.787348,...,-1.290226,-1.120993,-1.427728,-1.300803,-1.271187,-1.271187,-1.271187,-1.406574,-1.173878,-1.150609
4,Ashtead Group_028090,100,4,-1.199292,-1.136860,-1.155222,-1.122170,-1.155222,-1.188275,-1.122170,...,-0.920183,-0.918347,-1.098299,-1.120334,-0.967925,-0.967925,-0.967925,-1.175421,-0.978943,-0.989960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,Unilever PLC_035922,187,91,-2.869013,-2.931364,-2.886828,-2.845854,-2.863669,-2.683742,-2.717589,...,0.399962,0.417777,0.378584,0.373240,0.237849,0.237849,0.237849,-0.063217,0.097114,0.038326
92,United Utilities Group PLC_036341,188,92,0.733771,0.710001,0.638689,0.724263,0.695738,0.705247,0.633935,...,-0.762826,-0.705777,-0.606891,-0.519415,-0.878827,-0.878827,-0.878827,-1.002434,-0.802761,-0.833187
93,Vodafone Group PLC_035943,189,93,-0.007477,0.100982,0.241340,0.400838,0.481651,0.337039,0.434864,...,-1.793430,-1.725377,-1.704111,-1.841067,-1.936340,-1.936340,-1.936340,-2.063088,-1.932938,-2.004393
94,Whitbread PLC_035895,190,94,-0.898946,-0.881773,-0.713477,-0.469618,-0.332233,-0.593265,0.045575,...,1.405687,1.467510,1.481249,1.453772,1.656414,1.656414,1.656414,1.501856,1.766322,1.859057


## Imputation

In [4]:
# Settings for the imputation step: the same column / format entries as in the
# Gap Generation section, plus the algorithms to run and their hyperparameters.
parameters = {
    "dimension_column": "Dimension",
    "datetime_format": "%Y-%m-%d %H:%M:%S",
    "spatial_x_column": "Spatial_X",
    "spatial_y_column": "Spatial_Y",
    "sep": ",",
    "header": 0,
    # Library switches, forwarded unchanged; see the stelarImputation docs for
    # their meaning.
    "is_multivariate": False,
    "areaVStime": 0,
    "preprocessing": True,
    "index": False,
    # Algorithms to run.
    "algorithms": ["SoftImpute", "IterativeSVD", "SVT", "TimesNet"],
    # Hyperparameters, one entry per algorithm listed above.
    "params": {
        "SoftImpute": {"max_rank": 5},
        "IterativeSVD": {"rank": 3},
        "SVT": {"tauScale": 0.7},
        "TimesNet": {
            "n_layers": 2,
            "top_k": 3,
            "d_model": 56,
            "d_ffn": 56,
            "n_kernels": 1,
            "dropout": 0.05,
            "apply_nonstationary_norm": False,
            "batch_size": 32,
            "epochs": 50,
            "num_workers": 0,
        },
    },
}

# Expose each entry as a plain variable for the keyword arguments of the call
# in the next cell.
dimension_column = parameters["dimension_column"]
datetime_format = parameters["datetime_format"]
spatial_x_column = parameters["spatial_x_column"]
spatial_y_column = parameters["spatial_y_column"]
sep = parameters["sep"]
header = parameters["header"]
is_multivariate = parameters["is_multivariate"]
areaVStime = parameters["areaVStime"]
preprocessing = parameters["preprocessing"]
index = parameters["index"]
algorithms = parameters["algorithms"]
params = parameters["params"]

In [5]:
# Run every configured algorithm on the gapped frame produced in the Gap
# Generation section. The result is indexed by algorithm name in the next cell.
dict_of_imputed_dfs = tsi.run_imputation(
    missing=df_missing,
    algorithms=algorithms,
    params=params,
    dimension_column=dimension_column,
    datetime_format=datetime_format,
    spatial_x_column=spatial_x_column,
    spatial_y_column=spatial_y_column,
    header=header,
    sep=sep,
    is_multivariate=is_multivariate,
    areaVStime=areaVStime,
    preprocessing=preprocessing,
    index=index,
)

freeing copy memory @ 0x8890a20
freeing copy memory @ 0x8798100
freeing copy memory @ 0x89bbf80
2024-04-18 15:58:15 [INFO]: No given device, using default device: cuda
2024-04-18 15:58:16 [INFO]: TimesNet initialized with the given hyperparameters, the number of trainable parameters: 13,105
2024-04-18 15:58:16 [INFO]: Epoch 001 - training loss: 0.5469
2024-04-18 15:58:16 [INFO]: Epoch 002 - training loss: 0.3120
2024-04-18 15:58:16 [INFO]: Epoch 003 - training loss: 0.3056
2024-04-18 15:58:16 [INFO]: Epoch 004 - training loss: 0.2839
2024-04-18 15:58:16 [INFO]: Epoch 005 - training loss: 0.2314
2024-04-18 15:58:16 [INFO]: Epoch 006 - training loss: 0.2028
2024-04-18 15:58:16 [INFO]: Epoch 007 - training loss: 0.1999
2024-04-18 15:58:16 [INFO]: Epoch 008 - training loss: 0.1892
2024-04-18 15:58:16 [INFO]: Epoch 009 - training loss: 0.1705
2024-04-18 15:58:16 [INFO]: Epoch 010 - training loss: 0.1595
2024-04-18 15:58:16 [INFO]: Epoch 011 - training loss: 0.1528
2024-04-18 15:58:16 [INFO]

In [6]:
# Look at one algorithm's result: the entry stored under the "SoftImpute" key.
imputed_df = dict_of_imputed_dfs["SoftImpute"]

# Number of null cells left in the frame: per-column counts, then their total.
missing = imputed_df.isna().sum().sum()
print(f"Missing values count: {missing}")

# Last expression of the cell -> rendered as the cell's output.
imputed_df

Missing values count: 0


Unnamed: 0,Dimension,Spatial_X,Spatial_Y,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2018-12-18 00:00:00,2018-12-19 00:00:00,2018-12-20 00:00:00,2018-12-21 00:00:00,2018-12-24 00:00:00,2018-12-25 00:00:00,2018-12-26 00:00:00,2018-12-27 00:00:00,2018-12-28 00:00:00,2018-12-31 00:00:00
0,3i Group PLC_035999,96,0,-2.603107,-2.581500,-2.526286,-2.510187,-2.494425,-1.887600,-2.004563,...,-1.356960,-1.273239,-1.297863,-1.098411,-1.509627,-1.509627,-1.509627,-1.310175,-1.027002,-1.297863
1,Admiral Group_036346,97,1,-1.347635,-1.189826,-1.325091,-1.336363,-1.313819,-1.308421,-1.268704,...,0.129009,0.151553,0.326270,0.343178,0.405175,0.405175,0.405175,0.433355,0.861694,1.132225
2,Anglo American PLC_035918,98,2,-1.294240,-1.286110,-1.365373,-1.355211,-1.395858,-1.316596,-0.979221,...,0.951131,1.116160,0.913735,1.110469,1.060066,1.060066,1.060066,0.958448,1.121038,1.095023
3,Antofagasta PLC_028149,99,3,-2.295047,-2.136391,-2.120526,-2.035909,-2.094083,-1.988312,-1.787348,...,-1.290226,-1.120993,-1.427728,-1.300803,-1.271187,-1.271187,-1.271187,-1.406574,-1.173878,-1.150609
4,Ashtead Group_028090,100,4,-1.199292,-1.136860,-1.155222,-1.122170,-1.155222,-1.188275,-1.122170,...,-0.920183,-0.918347,-1.098299,-1.120334,-0.967925,-0.967925,-0.967925,-1.175421,-0.978943,-0.989960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,Unilever PLC_035922,187,91,-2.869013,-2.931364,-2.886828,-2.845854,-2.863669,-2.683742,-2.717589,...,0.399962,0.417777,0.378584,0.373240,0.237849,0.237849,0.237849,-0.063217,0.097114,0.038326
92,United Utilities Group PLC_036341,188,92,0.733771,0.710001,0.638689,0.724263,0.695738,0.705247,0.633935,...,-0.762826,-0.705777,-0.606891,-0.519415,-0.878827,-0.878827,-0.878827,-1.002434,-0.802761,-0.833187
93,Vodafone Group PLC_035943,189,93,-0.007477,0.100982,0.241340,0.400838,0.481651,0.337039,0.434864,...,-1.793430,-1.725377,-1.704111,-1.841067,-1.936340,-1.936340,-1.936340,-2.063088,-1.932938,-2.004393
94,Whitbread PLC_035895,190,94,-0.898946,-0.881773,-0.713477,-0.469618,-0.332233,-0.593265,0.045575,...,1.405687,1.467510,1.481249,1.453772,1.656414,1.656414,1.656414,1.501856,1.766322,1.859057


## Train Ensemble Model

In [7]:
# Settings for training the ensemble model: the imputation settings from the
# previous section plus a "train_params" entry.
parameters = {
    "dimension_column": "Dimension",
    "datetime_format": "%Y-%m-%d %H:%M:%S",
    "spatial_x_column": "Spatial_X",
    "spatial_y_column": "Spatial_Y",
    "sep": ",",
    "header": 0,
    # Library switches, forwarded unchanged; see the stelarImputation docs for
    # their meaning.
    "is_multivariate": False,
    "areaVStime": 0,
    "preprocessing": True,
    "index": False,
    # Algorithms to run.
    "algorithms": ["SoftImpute", "IterativeSVD", "SVT", "TimesNet"],
    # Hyperparameters, one entry per algorithm listed above.
    "params": {
        "SoftImpute": {"max_rank": 5},
        "IterativeSVD": {"rank": 3},
        "SVT": {"tauScale": 0.7},
        "TimesNet": {
            "n_layers": 2,
            "top_k": 3,
            "d_model": 56,
            "d_ffn": 56,
            "n_kernels": 1,
            "dropout": 0.05,
            "apply_nonstationary_norm": False,
            "batch_size": 32,
            "epochs": 50,
            "num_workers": 0,
        },
    },
    # Training settings. The last five keys carry the same values as the
    # "train_params" used in the Gap Generation section; the first four are
    # additional library options -- consult the stelarImputation docs.
    "train_params": {
        "smooth": False,
        "window": 2,
        "order": 1,
        "normalize": False,
        "gap_type": "no_overlap",
        "miss_perc": 0.1,
        "gap_length": 100,
        "max_gap_length": 10,
        "max_gap_count": 5,
    },
}

# Expose each entry as a plain variable for the keyword arguments of the call
# in the next cell.
dimension_column = parameters["dimension_column"]
datetime_format = parameters["datetime_format"]
spatial_x_column = parameters["spatial_x_column"]
spatial_y_column = parameters["spatial_y_column"]
sep = parameters["sep"]
header = parameters["header"]
is_multivariate = parameters["is_multivariate"]
areaVStime = parameters["areaVStime"]
preprocessing = parameters["preprocessing"]
index = parameters["index"]
algorithms = parameters["algorithms"]
params = parameters["params"]
train_params = parameters["train_params"]

In [8]:
# Train the ensemble on the ground-truth file. The call returns the trained
# model together with a metrics object, which is displayed below.
model, metrics = tsi.train_ensemble(
    ground_truth=df_input,
    algorithms=algorithms,
    params=params,
    train_params=train_params,
    dimension_column=dimension_column,
    datetime_format=datetime_format,
    spatial_x_column=spatial_x_column,
    spatial_y_column=spatial_y_column,
    header=header,
    sep=sep,
    is_multivariate=is_multivariate,
    areaVStime=areaVStime,
    preprocessing=preprocessing,
    index=index,
)

# Last expression of the cell -> rendered as the cell's output.
metrics

freeing copy memory @ 0x1c9d5de0
freeing copy memory @ 0x1c9d5de0
freeing copy memory @ 0x1c9d5de0
2024-04-18 15:58:26 [INFO]: No given device, using default device: cuda
2024-04-18 15:58:26 [INFO]: TimesNet initialized with the given hyperparameters, the number of trainable parameters: 13,105
2024-04-18 15:58:27 [INFO]: Epoch 001 - training loss: 0.8405
2024-04-18 15:58:27 [INFO]: Epoch 002 - training loss: 0.3585
2024-04-18 15:58:27 [INFO]: Epoch 003 - training loss: 0.3235
2024-04-18 15:58:27 [INFO]: Epoch 004 - training loss: 0.3226
2024-04-18 15:58:27 [INFO]: Epoch 005 - training loss: 0.2568
2024-04-18 15:58:27 [INFO]: Epoch 006 - training loss: 0.2209
2024-04-18 15:58:27 [INFO]: Epoch 007 - training loss: 0.2166
2024-04-18 15:58:27 [INFO]: Epoch 008 - training loss: 0.2110
2024-04-18 15:58:27 [INFO]: Epoch 009 - training loss: 0.1884
2024-04-18 15:58:27 [INFO]: Epoch 010 - training loss: 0.1751
2024-04-18 15:58:27 [INFO]: Epoch 011 - training loss: 0.1629
2024-04-18 15:58:27 [IN

{'SoftImpute': {'mae': 0.3219266070325954,
  'mse': 0.18658043936819813,
  'rmse': 0.43194957965970765,
  'r2': 0.792064301449704,
  'euclidean_distance': 207.3357982366597},
 'IterativeSVD': {'mae': 0.42643071248629255,
  'mse': 0.29755081878255524,
  'rmse': 0.5454821892441175,
  'r2': 0.6683926912849316,
  'euclidean_distance': 261.8314508371764},
 'SVT': {'mae': 0.17825238352526018,
  'mse': 0.08049457273010083,
  'rmse': 0.2837156547145413,
  'r2': 0.9102923368236316,
  'euclidean_distance': 136.18351426297986},
 'TimesNet': {'mae': 0.534439375265742,
  'mse': 0.5310527978483749,
  'rmse': 0.7287336947392887,
  'r2': 0.4081649991734746,
  'euclidean_distance': 349.79217347485866},
 'Ensemble_Model': {'mae': 0.14089275544057492,
  'mse': 0.04989781316643831,
  'rmse': 0.2233781841775027,
  'r2': 0.9443910804796616,
  'euclidean_distance': 107.2215284052013}}

## Imputation with Ensemble Model

In [9]:
# Settings for imputing with the trained ensemble. The values are the same as
# in the Imputation section; they are repeated here so this section can be
# read on its own.
parameters = {
    "dimension_column": "Dimension",
    "datetime_format": "%Y-%m-%d %H:%M:%S",
    "spatial_x_column": "Spatial_X",
    "spatial_y_column": "Spatial_Y",
    "sep": ",",
    "header": 0,
    # Library switches, forwarded unchanged; see the stelarImputation docs for
    # their meaning.
    "is_multivariate": False,
    "areaVStime": 0,
    "preprocessing": True,
    "index": False,
    # Algorithms to run.
    "algorithms": ["SoftImpute", "IterativeSVD", "SVT", "TimesNet"],
    # Hyperparameters, one entry per algorithm listed above.
    "params": {
        "SoftImpute": {"max_rank": 5},
        "IterativeSVD": {"rank": 3},
        "SVT": {"tauScale": 0.7},
        "TimesNet": {
            "n_layers": 2,
            "top_k": 3,
            "d_model": 56,
            "d_ffn": 56,
            "n_kernels": 1,
            "dropout": 0.05,
            "apply_nonstationary_norm": False,
            "batch_size": 32,
            "epochs": 50,
            "num_workers": 0,
        },
    },
}

# Expose each entry as a plain variable for the keyword arguments of the call
# in the next cell.
dimension_column = parameters["dimension_column"]
datetime_format = parameters["datetime_format"]
spatial_x_column = parameters["spatial_x_column"]
spatial_y_column = parameters["spatial_y_column"]
sep = parameters["sep"]
header = parameters["header"]
is_multivariate = parameters["is_multivariate"]
areaVStime = parameters["areaVStime"]
preprocessing = parameters["preprocessing"]
index = parameters["index"]
algorithms = parameters["algorithms"]
params = parameters["params"]

In [10]:
# Impute the gapped frame again, this time passing the ensemble model trained
# in the previous section alongside the per-algorithm settings.
model_imputed_df = tsi.run_imputation_ensemble(
    missing=df_missing,
    algorithms=algorithms,
    params=params,
    model=model,
    dimension_column=dimension_column,
    datetime_format=datetime_format,
    spatial_x_column=spatial_x_column,
    spatial_y_column=spatial_y_column,
    header=header,
    sep=sep,
    is_multivariate=is_multivariate,
    areaVStime=areaVStime,
    preprocessing=preprocessing,
    index=index,
)

freeing copy memory @ 0x1cf2aba0
freeing copy memory @ 0x1c6c2040
freeing copy memory @ 0x1c6c2040
2024-04-18 15:58:43 [INFO]: No given device, using default device: cuda
2024-04-18 15:58:43 [INFO]: TimesNet initialized with the given hyperparameters, the number of trainable parameters: 13,105
2024-04-18 15:58:44 [INFO]: Epoch 001 - training loss: 0.4638
2024-04-18 15:58:44 [INFO]: Epoch 002 - training loss: 0.3006
2024-04-18 15:58:44 [INFO]: Epoch 003 - training loss: 0.2673
2024-04-18 15:58:44 [INFO]: Epoch 004 - training loss: 0.2240
2024-04-18 15:58:44 [INFO]: Epoch 005 - training loss: 0.1988
2024-04-18 15:58:44 [INFO]: Epoch 006 - training loss: 0.1823
2024-04-18 15:58:44 [INFO]: Epoch 007 - training loss: 0.1784
2024-04-18 15:58:44 [INFO]: Epoch 008 - training loss: 0.1639
2024-04-18 15:58:44 [INFO]: Epoch 009 - training loss: 0.1537
2024-04-18 15:58:44 [INFO]: Epoch 010 - training loss: 0.1424
2024-04-18 15:58:44 [INFO]: Epoch 011 - training loss: 0.1413
2024-04-18 15:58:44 [IN

In [12]:
# Number of null cells left after ensemble imputation: per-column counts, then
# their total.
missing = model_imputed_df.isna().sum().sum()
print(f"Missing values count: {missing}")

# Last expression of the cell -> rendered as the cell's output.
model_imputed_df

Missing values count: 0


Unnamed: 0,Dimension,Spatial_X,Spatial_Y,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2018-12-18 00:00:00,2018-12-19 00:00:00,2018-12-20 00:00:00,2018-12-21 00:00:00,2018-12-24 00:00:00,2018-12-25 00:00:00,2018-12-26 00:00:00,2018-12-27 00:00:00,2018-12-28 00:00:00,2018-12-31 00:00:00
0,3i Group PLC_035999,96,0,-0.356507,-0.274725,-0.320673,-0.326436,-1.283419,-1.887600,-2.004563,...,-1.356960,-1.273239,-1.297863,-1.098411,-1.509627,-1.509627,-1.509627,-1.310175,-1.027002,-1.297863
1,Admiral Group_036346,97,1,-1.347635,-1.189826,-1.325091,-1.336363,-1.313819,-1.121690,-1.223969,...,0.129009,0.151553,0.326270,0.343178,0.405175,0.405175,0.405175,0.433355,0.861694,1.132225
2,Anglo American PLC_035918,98,2,-1.294240,-1.286110,-1.365373,-1.355211,-1.395858,-1.316596,-0.979221,...,0.951131,1.116160,0.913735,1.110469,1.060066,1.060066,1.060066,0.958448,1.121038,1.095023
3,Antofagasta PLC_028149,99,3,-2.295047,-2.136391,-2.120526,-2.035909,-2.094083,-1.988312,-1.787348,...,-1.290226,-1.120993,-1.427728,-1.300803,-1.271187,-1.271187,-1.271187,-1.406574,-1.173878,-1.150609
4,Ashtead Group_028090,100,4,-1.199292,-1.136860,-1.155222,-1.122170,-1.155222,-1.188275,-1.122170,...,-0.920183,-0.918347,-1.098299,-1.120334,-0.967925,-0.967925,-0.967925,-1.175421,-0.978943,-0.989960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,Unilever PLC_035922,187,91,-2.869013,-2.931364,-2.886828,-2.845854,-2.863669,-2.683742,-2.717589,...,0.399962,0.417777,0.378584,0.373240,0.237849,0.237849,0.237849,-0.063217,0.097114,0.038326
92,United Utilities Group PLC_036341,188,92,0.733771,0.710001,0.638689,0.724263,0.695738,0.705247,0.633935,...,-0.762826,-0.705777,-0.606891,-0.519415,-0.878827,-0.878827,-0.878827,-1.002434,-0.802761,-0.833187
93,Vodafone Group PLC_035943,189,93,-0.007477,0.100982,0.241340,0.400838,0.481651,0.337039,0.434864,...,-1.793430,-1.725377,-1.704111,-1.841067,-1.936340,-1.936340,-1.936340,-2.063088,-1.932938,-2.004393
94,Whitbread PLC_035895,190,94,-0.898946,-0.881773,-0.713477,-0.469618,-0.332233,-0.593265,0.045575,...,1.405687,1.467510,1.481249,1.453772,1.656414,1.656414,1.656414,1.501856,1.766322,1.859057
