In [1]:
from pycaret.regression import load_model, predict_model

In [2]:
import re
import pandas as pd

def correct_lgbm_names(df) -> pd.DataFrame:
    """Return a copy of ``df`` whose column names are acceptable to LightGBM.

    Two LightGBM complaints are addressed:

    * "Do not support special JSON characters in feature name": every
      character outside ``[A-Za-z0-9_]`` is stripped from each column name.
    * "Feature appears more than one time": when stripping makes a name equal
      to the stripped name of an earlier column, the later column gets its
      positional index appended as ``_<i>``.

    Args:
        df: DataFrame with string column names.

    Returns:
        A new DataFrame with renamed columns; ``df`` itself is not modified.
    """
    # Change columns names ([LightGBM] Do not support special JSON characters in feature name.)
    sanitized = {col: re.sub(r"[^A-Za-z0-9_]+", "", col) for col in df.columns}

    # [LightGBM] Feature appears more than one time.
    # Track the sanitized names already seen so each lookup is O(1) rather than
    # re-scanning a list slice. The suffix stays the column position ``i`` so
    # the generated names are the same as before.
    seen = set()
    new_names = {}
    for i, (col, new_col) in enumerate(sanitized.items()):
        new_names[col] = f"{new_col}_{i}" if new_col in seen else new_col
        seen.add(new_col)

    # Rename the frame that was passed in (previously this used the notebook
    # global ``features``, ignoring the ``df`` argument).
    return df.rename(columns=new_names)
    

def correct_features(original_df, model_features):
    """Rename columns for LightGBM and back-fill columns the model expects.

    The frame is first passed through ``correct_lgbm_names``. Then every name
    in ``model_features`` that is not a column of the renamed frame is printed
    and, unless it is ``'label'``, added as a new column filled with ``None``.

    Args:
        original_df: feature DataFrame to align with the model.
        model_features: iterable of column names the trained pipeline expects.

    Returns:
        The renamed DataFrame, including any back-filled columns.
    """
    corrected_df = correct_lgbm_names(original_df)

    for feat in model_features:
        # Already present after renaming: nothing to do.
        if feat in corrected_df:
            continue

        # Report every expected-but-missing name, 'label' included.
        print(feat)

        # 'label' is reported but never created.
        # NOTE(review): presumably it is the training target - confirm.
        if feat == 'label':
            continue

        # Missing columns (for some reason absent, but required by the trained
        # model's pipeline) default to None.
        corrected_df[feat] = None

    return corrected_df

In [3]:
from src.bgc_providers.ohio_bgc_provider import OhioBgcProvider
# Instantiate the provider and fetch the frame it exposes via tsfresh_dataframe().
test_provider = OhioBgcProvider()
test_dataframe = test_provider.tsfresh_dataframe()
# Keep only the first 6 rows as the mock "last measurements" fed to the featurizer below.
mock_last_measurements = test_dataframe.head(6)
# Last expression of the cell: display the mock frame.
mock_last_measurements

Unnamed: 0,date_time,mock_date,time_of_day,part_of_day,time,bg_value,id
0,2021-12-07 01:17:00,2021-12-07,01:17:00,late night,0.0,101,a
1,2021-12-07 01:22:00,2021-12-07,01:22:00,late night,0.08,98,a
2,2021-12-07 01:27:00,2021-12-07,01:27:00,late night,0.17,104,a
3,2021-12-07 01:32:00,2021-12-07,01:32:00,late night,0.25,112,a
4,2021-12-07 01:37:00,2021-12-07,01:37:00,late night,0.33,120,a
5,2021-12-07 01:42:00,2021-12-07,01:42:00,late night,0.42,127,a


In [4]:
# Preview the start of the full provider frame (row slice 0:20) for comparison with the 6-row mock.
test_dataframe[0:20]

Unnamed: 0,date_time,mock_date,time_of_day,part_of_day,time,bg_value,id
0,2021-12-07 01:17:00,2021-12-07,01:17:00,late night,0.0,101,a
1,2021-12-07 01:22:00,2021-12-07,01:22:00,late night,0.08,98,a
2,2021-12-07 01:27:00,2021-12-07,01:27:00,late night,0.17,104,a
3,2021-12-07 01:32:00,2021-12-07,01:32:00,late night,0.25,112,a
4,2021-12-07 01:37:00,2021-12-07,01:37:00,late night,0.33,120,a
5,2021-12-07 01:42:00,2021-12-07,01:42:00,late night,0.42,127,a
6,2021-12-07 01:47:00,2021-12-07,01:47:00,late night,0.5,135,a
7,2021-12-07 01:52:00,2021-12-07,01:52:00,late night,0.58,142,a
8,2021-12-07 01:57:00,2021-12-07,01:57:00,late night,0.67,140,a
9,2021-12-07 02:02:00,2021-12-07,02:02:00,late night,0.75,145,a


In [5]:
from src.featurizers.tsfresh import TsfreshFeaturizer

def featurize_stream_df(stream_df, window, horizon):
    """Compute a tsfresh feature frame from the tail of a measurement stream.

    Args:
        stream_df: DataFrame of measurements; only its last ``window`` rows
            are handed to the featurizer.
        window: number of trailing rows to use; also passed to the featurizer.
        horizon: passed through to the featurizer unchanged.

    Returns:
        The featurizer's ``feature_dataframe`` after
        ``create_feature_dataframe()`` has run.
    """
    latest_rows = stream_df.tail(window)
    tsfresh_featurizer = TsfreshFeaturizer(
        latest_rows,
        window,
        horizon,
        plot_chunks=False,
        minimal_features=False,
    )
    # Force a single chunk before building features.
    # NOTE(review): presumably so exactly one feature row is produced - confirm
    # against TsfreshFeaturizer.
    tsfresh_featurizer.chunks = 1
    tsfresh_featurizer.create_feature_dataframe()
    return tsfresh_featurizer.feature_dataframe
    
# Featurize the 6 mock measurements; the two positional 6s are window and horizon respectively.
features = featurize_stream_df(mock_last_measurements, 6, 6)
# Last expression of the cell: display the resulting feature frame.
features

100%|██████████| 1/1 [00:09<00:00,  9.43s/it]


Unnamed: 0,bg_value__variance_larger_than_standard_deviation,bg_value__has_duplicate_max,bg_value__has_duplicate_min,bg_value__has_duplicate,bg_value__sum_values,bg_value__abs_energy,bg_value__mean_abs_change,bg_value__mean_change,bg_value__mean_second_derivative_central,bg_value__median,...,bg_value__permutation_entropy__dimension_7__tau_1,bg_value__query_similarity_count__query_None__threshold_0.0,bg_value__mean_n_absolute_max__number_of_maxima_7,start,end,start_time,end_time,start_time_of_day,end_time_of_day,part_of_day
0,1.0,0.0,0.0,0.0,662.0,73694.0,6.4,5.2,1.25,108.0,...,,,,0,5,0.0,0.42,01:17:00,01:42:00,late night


In [6]:
# Load the saved transformation pipeline and model via PyCaret's load_model.
model = load_model('models/559_6_6_1_LGBMRegressor_8ef033d3-ac4b-47ba-8231-fb49991f8b7f')
# Feature names recorded on the loaded object; passed to correct_features() in the next cell.
saved_model_features = model.feature_names_in_

Transformation Pipeline and Model Successfully Loaded


In [7]:
# Align the computed features with the model's expected column names, then predict.
# correct_features() prints each expected name that was missing from `features` (see output below).
predict_model(model, correct_features(features, saved_model_features))

bg_value__value_count__value_1_217
bg_value__number_crossing_m__m_1_250
label


Unnamed: 0,bg_value__variance_larger_than_standard_deviation,bg_value__has_duplicate_max,bg_value__has_duplicate_min,bg_value__has_duplicate,bg_value__sum_values,bg_value__abs_energy,bg_value__mean_abs_change,bg_value__mean_change,bg_value__mean_second_derivative_central,bg_value__median,...,start,end,start_time,end_time,start_time_of_day,end_time_of_day,part_of_day,bg_value__value_count__value_1_217,bg_value__number_crossing_m__m_1_250,prediction_label
0,1.0,0.0,0.0,0.0,662.0,73694.0,6.4,5.2,1.25,108.0,...,0,5,0.0,0.42,01:17:00,01:42:00,late night,,,147.286874


# Old: superseded inline version of the steps above (kept commented out for reference)

In [8]:
# from src.featurizers.tsfresh import TsfreshFeaturizer
# test_features = TsfreshFeaturizer(mock_last_measurements, 6, 6, plot_chunks=False, minimal_features=False)
# test_features.chunks = 1
# test_features.chunks

In [9]:
# test_features.create_feature_dataframe()
# features = test_features.feature_dataframe
# features

In [10]:
# import re
# new_names = {col: re.sub(r"[^A-Za-z0-9_]+", "", col) for col in features.columns}
# new_n_list = list(new_names.values())
# # [LightGBM] Feature appears more than one time.
# new_names = {
#     col: f"{new_col}_{i}" if new_col in new_n_list[:i] else new_col
#     for i, (col, new_col) in enumerate(new_names.items())
# }
# correct_features = features.rename(columns=new_names)
# # print(correct_features)
# for feat in saved_model_features:
#     found_in_model = feat in correct_features
#     if not found_in_model:
#         print(feat)
#         if feat != 'label':
#             correct_features[feat] = None