In [2]:
import numpy as np
import pandas as pd
import nannyml as nml
from sklearn.datasets import fetch_california_housing

  from .autonotebook import tqdm as notebook_tqdm


In [39]:
# Experiment configuration.
# `dataset`, `models[0]`, `n_simulations` and `n_prod` are interpolated into the
# results file path when the comparison table is loaded.
dataset = 'avocados'
freq = 'W'  # presumably a pandas weekly frequency alias — TODO confirm

# Sizes of the train / test / production windows.
# NOTE(review): units are not shown here; presumably periods of `freq` — confirm.
min_n_train, n_test, n_prod = 52, 12, 24

# Number of simulated runs and of retrainings.
n_simulations, n_retrainings = 3000, 3

# Regressor names; only the first entry is used to pick the results file.
models = [
    'LGBMRegressor',
    'ElasticNet',
    'RandomForestRegressor',
    'MLPRegressor',
]

In [53]:
# Load the stored performance-estimation comparison for the first configured
# model; the file name encodes the dataset, model and simulation settings.
pe_comparison_path = f'../results/performance_estimation/{dataset}/pe_comparison_{dataset}_{models[0]}_{n_simulations}_simulations_{n_prod}_prod_.parquet'
pe_comparison = pd.read_parquet(pe_comparison_path)

In [41]:
# Flag every row whose estimated metric (`value`) or realized metric
# (`realized`) exceeds the degradation threshold, then keep only the columns
# needed for the alert analysis.
degradation_threshold = 0.15

# `.assign` builds a new frame instead of mutating the loaded one column by
# column, so re-running this cell always starts from the same input columns.
pe_comparison = pe_comparison.assign(
    upper_threshold=degradation_threshold,
    # A comparison already yields booleans; wrapping it in
    # np.where(cond, True, False) adds nothing.
    estimated_alert=pe_comparison['value'] > degradation_threshold,
    realized_alert=pe_comparison['realized'] > degradation_threshold,
)[['realized', 'value', 'upper_threshold', 'estimated_alert', 'realized_alert', 'simulation_id']]

In [42]:
# Preview the first rows of the alert table.
pe_comparison.head(n=5)

Unnamed: 0,realized,value,upper_threshold,estimated_alert,realized_alert,simulation_id
0,0.029868,0.133711,0.15,False,False,0
1,0.118612,0.133711,0.15,False,False,0
2,0.071403,0.133711,0.15,False,False,0
3,0.036387,0.133711,0.15,False,False,0
4,0.067065,0.133711,0.15,False,False,0


In [43]:
# Count rows with and without a realized alert.
pe_comparison.loc[:, 'realized_alert'].value_counts()

False    15521
True      3285
Name: realized_alert, dtype: int64

In [44]:
# Count rows with and without an estimated alert.
pe_comparison.loc[:, 'estimated_alert'].value_counts()

False    17465
True      1341
Name: estimated_alert, dtype: int64

In [45]:
def _percentage(part, whole):
    """Return ``100 * part / whole`` rounded to one decimal, or NaN when ``whole`` is 0."""
    if whole == 0:
        # Without this guard the summary raises ZeroDivisionError when,
        # for example, no model degraded at all.
        return float('nan')
    return np.round(100 * part / whole, 1)


# --- Row level: split rows by whether a realized alert was raised ---
# `realized_alert` is a boolean column, so it can be used directly as a mask
# (no `== True` / `== False` comparison needed).
positive_alerts = pe_comparison[pe_comparison['realized_alert']]
negative_alerts = pe_comparison[~pe_comparison['realized_alert']]

# Rows where the estimated alert agrees with the realized one. The vectorised
# `.sum()` replaces the builtin `sum()`, which iterates element by element.
tp_alerts = (positive_alerts['estimated_alert'] == positive_alerts['realized_alert']).sum()
tn_alerts = (negative_alerts['estimated_alert'] == negative_alerts['realized_alert']).sum()

# --- Model level: alerts are summed per `simulation_id`; each id is reported
# as one "model" in the messages below ---
degradations_per_model = (
    pe_comparison
    .groupby(['simulation_id'])[['realized_alert', 'estimated_alert']]
    .sum()
    .reset_index()
)
# A model counts as degraded when it has at least one realized alert.
num_degradated_models = len(degradations_per_model[degradations_per_model['realized_alert'] > 0])

# True positive at model level: at least one realized AND at least one
# estimated alert for the same simulation_id (not necessarily on the same row).
degradations_per_model['is_true_positive'] = (
    (degradations_per_model['realized_alert'] > 0)
    & (degradations_per_model['estimated_alert'] > 0)
)
tp_alerts_per_model = degradations_per_model[degradations_per_model['is_true_positive']]

# Summary report. Message texts are unchanged.
print(f"{num_degradated_models} out of {len(degradations_per_model)} models ({_percentage(num_degradated_models, len(degradations_per_model))} %) degradaded")
print(f"NannyML estimated at least a degradation in {len(tp_alerts_per_model)} out of the {num_degradated_models} ({_percentage(len(tp_alerts_per_model), num_degradated_models)} %) models that degradated")
print(f"There were {len(positive_alerts)} degradations alerts")
print(f"NannyML estimated {tp_alerts} out of {len(positive_alerts)} ({_percentage(tp_alerts, len(positive_alerts))} %) of the degradation alerts correctly")
print(f"NannyML estimated {tp_alerts + tn_alerts} ({_percentage(tp_alerts + tn_alerts, len(pe_comparison))} %) of the time the behaivoir of the performance correctly")

1945 out of 3000 models (64.8 %) degradaded
NannyML estimated at least a degradation in 350 out of the 1945 (18.0 %) models that degradated
There were 3285 degradations alerts
NannyML estimated 394 out of 3285 (12.0 %) of the degradation alerts correctly
NannyML estimated 14968 (79.6 %) of the time the behaivoir of the performance correctly


1945 out of 3000 models (64.8 %) degradaded
NannyML estimated at least a degradation in 350 out of the 1945 (18.0 %) models that degradated
There were 3285 degradations alerts
NannyML estimated 394 out of 3285 (12.0 %) of the degradation alerts correctly
NannyML estimated 14968 (79.6 %) of the time the behaivoir of the performance correctly