In [1]:
import numpy as np
import pandas as pd
import nannyml as nml
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_absolute_percentage_error

  from .autonotebook import tqdm as notebook_tqdm


In [49]:
# Experiment set up
dataset = 'taxi'
n_train = 8800 # one year
n_test = 2200
n_prod = 4400
n_simulations = 1500
metric = mean_absolute_percentage_error
freq = 'D'
chunk_period='M'
models = ['LGBMRegressor', 'ElasticNet', 'RandomForestRegressor']

In [50]:
errors_df = pd.read_parquet(f'../results/aging/{dataset}/aging_{dataset}_{models[1]}_{n_simulations}_simulations_{n_prod}_prod.parquet')
d_errors_df = pd.read_parquet(f'../results/aging/{dataset}/aging_{dataset}_{models[1]}_{n_simulations}_simulations_{n_prod}_prod_{freq}.parquet')

pe_comparison = pd.read_parquet(f'../results/performance_estimation/{dataset}/pe_comparison_{dataset}_{models[1]}_{n_simulations}_simulations_{n_prod}_prod_{n_prod}_chunk{chunk_period}_save.parquet')
comparable_model_ids = pe_comparison['simulation_id'].drop_duplicates().values
errors_df = errors_df[errors_df['simulation_id'].isin(comparable_model_ids)]
d_errors_df = d_errors_df[d_errors_df['simulation_id'].isin(comparable_model_ids)]


In [51]:
# errors_df = errors_df[errors_df['model_age'] <= 80]
# d_errors_df = d_errors_df[d_errors_df['model_age'] <= 80]

test_errors_df = errors_df[errors_df['partition'] == 'test']
test_mape = test_errors_df.groupby('simulation_id').apply(lambda group: mean_absolute_percentage_error(group.y, group.y_pred))
test_mape = pd.DataFrame(test_mape, columns=['test_mape']).reset_index()

d_errors_df = pd.merge(d_errors_df, test_mape, how='left', on='simulation_id')
d_errors_df = d_errors_df[d_errors_df['test_mape'] <= 0.1]

valid_models = d_errors_df['simulation_id'].drop_duplicates()

d_errors_df['degradation'] = d_errors_df['error'] > 0.15

degradations_per_model = d_errors_df.groupby(['simulation_id'])[['degradation']].sum().reset_index()
num_degradated_models = len(degradations_per_model[degradations_per_model['degradation'] > 5])

In [52]:
pe_comparison = pe_comparison[pe_comparison['simulation_id'].isin(valid_models.values)]

In [53]:
degradation_threshold = 0.15
pe_comparison['upper_threshold'] = degradation_threshold
pe_comparison['estimated_alert'] = np.where(pe_comparison['value'] > degradation_threshold, True, False)
pe_comparison['realized_alert'] = np.where(pe_comparison['realized'] > degradation_threshold, True, False)

pe_comparison = pe_comparison[['realized', 'value', 'upper_threshold', 'estimated_alert', 'realized_alert', 'simulation_id']]

In [54]:
pe_comparison.head()

Unnamed: 0,realized,value,upper_threshold,estimated_alert,realized_alert,simulation_id
0,63.073522,66.852624,0.15,True,True,17
1,89.616447,76.461179,0.15,True,True,17
2,126.604712,170.80644,0.15,True,True,17
3,116.242359,155.648244,0.15,True,True,17
4,94.31852,239.90846,0.15,True,True,17


In [55]:
pe_comparison['realized_alert'].value_counts()

True     1168
False      25
Name: realized_alert, dtype: int64

In [56]:
pe_comparison['estimated_alert'].value_counts()

True     1180
False      13
Name: estimated_alert, dtype: int64

In [57]:
positive_alerts = pe_comparison[pe_comparison['realized_alert'] == True]
negative_alerts = pe_comparison[pe_comparison['realized_alert'] == False]

tp_alerts = sum(positive_alerts['estimated_alert'] == positive_alerts['realized_alert'])
tn_alerts = sum(negative_alerts['estimated_alert'] == negative_alerts['realized_alert'])
fp_alerts = sum(negative_alerts['estimated_alert'] == True)
fn_alerts = sum(positive_alerts['estimated_alert'] == False)

# degradations_per_model = pe_comparison.groupby(['simulation_id'])[['realized_alert']].sum().reset_index()
# num_degradated_models = len(degradations_per_model[degradations_per_model['realized_alert'] > 5])

# print(f"{num_degradated_models} out of {len(degradations_per_model)} models ({np.round(100 * num_degradated_models / len(degradations_per_model), 1)} %) degradaded")

degradations_per_model = pe_comparison.groupby(['simulation_id'])[['realized_alert', 'estimated_alert']].sum().reset_index()

degradations_per_model['is_true_positive'] = np.where((degradations_per_model['realized_alert'] > 0) & (degradations_per_model['estimated_alert'] > 0), True, False)
tp_alerts_per_model = degradations_per_model[degradations_per_model['is_true_positive'] == True]

print(f"{num_degradated_models} out of {len(degradations_per_model)} models ({np.round(100 * num_degradated_models / len(degradations_per_model), 1)} %) degradaded")
print(f"NannyML estimated at least a degradation in {len(tp_alerts_per_model)} out of the {num_degradated_models} ({np.round(100 * len(tp_alerts_per_model) / num_degradated_models, 1)} %) models that degradated")
print(f"There were {len(positive_alerts)} degradations alerts")
print(f"NannyML estimated {tp_alerts} out of {len(positive_alerts)} ({np.round(tp_alerts/len(positive_alerts) * 100, 1)} %) of the degradation alerts correctly")
print(f"NannyML estimated {tp_alerts + tn_alerts} / {len(pe_comparison)}  ({np.round((tp_alerts + tn_alerts) / len(pe_comparison) * 100, 1)} %) of the time the behaivoir of the performance correctly")

170 out of 170 models (100.0 %) degradaded
NannyML estimated at least a degradation in 170 out of the 170 (100.0 %) models that degradated
There were 1168 degradations alerts
NannyML estimated 1161 out of 1168 (99.4 %) of the degradation alerts correctly
NannyML estimated 1167 / 1193  (97.8 %) of the time the behaivoir of the performance correctly


In [58]:
2*tp_alerts / (2*tp_alerts + fp_alerts + fn_alerts)


0.9889267461669506

In [59]:
(tp_alerts + tn_alerts) / (tp_alerts + tn_alerts + fp_alerts + fn_alerts)

0.9782062028499581

In [60]:
tpr = tp_alerts / (tp_alerts + fn_alerts)

print(tpr)

0.9940068493150684


In [61]:
tnr = tn_alerts / (tn_alerts + fp_alerts)
print(tpr)

0.9940068493150684


In [62]:
tp_alerts / (tp_alerts + fp_alerts)

0.9838983050847457