In [127]:
import numpy as np
import pandas as pd
import nannyml as nml
from sklearn.datasets import fetch_california_housing

In [158]:
def evaluate_nannyml(aging_df, metric, chunk_period, features=None):
    """Estimate and calculate performance with NannyML for every simulation.

    For each distinct ``simulation_id`` in ``aging_df`` a ``nml.DLE`` estimator
    and a ``nml.PerformanceCalculator`` are fitted on the rows whose
    ``partition`` is ``'reference'`` and then applied to the rows whose
    ``partition`` is ``'prod'``.

    Parameters
    ----------
    aging_df : pandas.DataFrame
        Simulation results. The code reads the columns ``simulation_id``,
        ``partition``, ``y``, ``y_pred`` and ``timestamp``; the index is used
        to join the feature columns.
    metric : str
        Metric name forwarded to NannyML (``metrics=[metric]``) and used to
        select the matching column group from the estimator's result frame.
    chunk_period : str
        Chunking period forwarded unchanged to both NannyML objects.
    features : pandas.DataFrame, optional
        Feature table that is left-joined to ``aging_df`` on the index; all of
        its columns are passed as ``feature_column_names``. When omitted, the
        module-level ``data`` frame is used, which keeps existing calls working.

    Returns
    -------
    tuple
        ``(comparison, pe_results, realized_results)`` where ``comparison`` is
        the concatenation of the per-simulation analysis-period result frames
        with the extra columns ``estimated_alert``, ``realized_alert`` and
        ``simulation_id``; ``pe_results`` and ``realized_results`` are dicts
        mapping each simulation id to the estimator result and the calculator
        result respectively.

    Raises
    ------
    ValueError
        If ``aging_df`` contains no simulation ids.
    """
    if features is None:
        # Backward-compatible fallback: the notebook keeps the features in the
        # global `data`; pass `features` explicitly to avoid that hidden state.
        features = data

    feature_columns = features.columns.tolist()
    nml_data = aging_df.merge(features, left_index=True, right_index=True, how='left')

    comparison_results = []
    pe_results = {}
    realized_results = {}

    for simulation_id in aging_df['simulation_id'].unique():
        simulation_df = nml_data[nml_data['simulation_id'] == simulation_id]

        # original reference set (used for fitting) and prod set (analysed)
        reference_df = simulation_df[simulation_df['partition'] == 'reference']
        analysis_df = simulation_df[simulation_df['partition'] == 'prod']

        # fit DLE from NannyML
        estimator = nml.DLE(
            feature_column_names=feature_columns,
            y_pred='y_pred',
            y_true='y',
            timestamp_column_name='timestamp',
            metrics=[metric],
            chunk_period=chunk_period,
            tune_hyperparameters=False
        )
        estimator.fit(reference_df)

        # performance estimation results
        pe_result = estimator.estimate(analysis_df)

        # performance calculator results
        calculator = nml.PerformanceCalculator(
            y_pred='y_pred',
            y_true='y',
            timestamp_column_name='timestamp',
            metrics=[metric],
            chunk_period=chunk_period,
            problem_type='regression'
        ).fit(reference_df)
        realized_result = calculator.calculate(analysis_df)

        pe_results[simulation_id] = pe_result
        realized_results[simulation_id] = realized_result

        # Take an explicit copy of the metric's column group so the columns
        # added below are written to an independent frame instead of a slice
        # of the result table (avoids chained-assignment warnings/surprises).
        comparison_result = pe_result.filter(period='analysis').to_df()[metric].copy()
        comparison_result['estimated_alert'] = comparison_result['alert']
        # NOTE(review): only the upper threshold is checked for the realized
        # value; confirm whether values below `lower_threshold` should count too.
        comparison_result['realized_alert'] = (
            comparison_result['realized'] > comparison_result['upper_threshold']
        )
        comparison_result['simulation_id'] = simulation_id
        comparison_results.append(comparison_result)

    if not comparison_results:
        # pd.concat([]) would raise an opaque ValueError; fail with a clear one.
        raise ValueError("aging_df contains no simulations to evaluate")

    return pd.concat(comparison_results), pe_results, realized_results

In [186]:
# California-housing features and target, returned as pandas objects
data, target = fetch_california_housing(return_X_y=True, as_frame=True)

# Build an hourly DatetimeIndex with one timestamp per row and share it
# between the features and the target so they stay aligned.
timestamp = pd.date_range(start='1/1/2018', freq='H', periods=data.shape[0])
data.index = target.index = timestamp

# Aging results previously written to parquet
aging_df = pd.read_parquet(
    '../results/aging/cal_house/aging_cal_house_100_simulations_3000_prod.parquet'
)

In [187]:
# Preview the first rows of the aging results to check the available columns
aging_df.head()

Unnamed: 0,y,y_pred,partition,timestamp,model_age,is_model_valid,simulation_id
2018-02-22 21:00:00,1.254,1.455663,train,2018-02-22 21:00:00,-313,True,0
2018-02-22 22:00:00,1.514,1.415912,train,2018-02-22 22:00:00,-313,True,0
2018-02-22 23:00:00,2.631,2.138654,train,2018-02-22 23:00:00,-313,True,0
2018-02-23 00:00:00,1.871,2.083668,train,2018-02-23 00:00:00,-313,True,0
2018-02-23 01:00:00,1.567,1.228171,train,2018-02-23 01:00:00,-313,True,0


In [188]:
# Run the NannyML evaluation over every simulation in `aging_df` using MAPE and
# chunk_period='M'. Returns the combined comparison table plus two dicts keyed
# by simulation id (estimated results and realized results).
pe_comparison, pe_result, realized_result = evaluate_nannyml(aging_df, metric='mape', chunk_period='M')


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider 


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way o


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider 


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way o


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider 


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using cat


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


Using categorical_feature in Dataset.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider 

In [191]:
# Plot the realized performance (PerformanceCalculator result) of simulation 99
realized_result[99].plot()

In [192]:
# Plot the DLE-estimated performance of simulation 99
pe_result[99].plot()

In [193]:
# Compare the estimated result with the realized one for simulation 99 and plot the comparison
pe_result[99].compare(realized_result[99]).plot()

In [194]:
# Preview the combined comparison table (estimated vs realized alerts per row)
pe_comparison.head()

Unnamed: 0,sampling_error,realized,value,upper_confidence_boundary,lower_confidence_boundary,upper_threshold,lower_threshold,alert,estimated_alert,realized_alert,simulation_id
0,0.00984,0.275513,0.228618,0.25814,0.199097,0.433767,0.0,False,False,False,0
1,0.009449,0.266924,0.485109,0.513456,0.456761,0.433767,0.0,True,True,False,0
2,0.009605,0.263685,0.447021,0.475837,0.418205,0.433767,0.0,True,True,False,0
3,0.009449,0.369532,0.430803,0.45915,0.402456,0.433767,0.0,False,False,False,0
4,0.025034,0.447805,0.394978,0.470079,0.319877,0.433767,0.0,False,False,True,0


In [195]:
# Count rows with and without a realized alert across all simulations
pe_comparison['realized_alert'].value_counts()

False    432
True      59
Name: realized_alert, dtype: int64

In [235]:
# Split the comparison rows by whether the realized metric raised an alert
positive_alerts = pe_comparison[pe_comparison['realized_alert'].eq(True)]
negative_alerts = pe_comparison[pe_comparison['realized_alert'].eq(False)]

# Rows where the estimated alert agrees with the realized one, per group
tp_alerts = positive_alerts['estimated_alert'].eq(positive_alerts['realized_alert']).sum()
tn_alerts = negative_alerts['estimated_alert'].eq(negative_alerts['realized_alert']).sum()

# Number of realized / estimated alerts per simulated model
degradations_per_model = (
    pe_comparison
    .groupby(['simulation_id'])[['realized_alert', 'estimated_alert']]
    .sum()
    .reset_index()
)
num_degradated_models = int((degradations_per_model['realized_alert'] > 0).sum())

# A model counts as a true positive when it has at least one realized alert
# and at least one estimated alert (not necessarily in the same row).
degradations_per_model['is_true_positive'] = (
    degradations_per_model['realized_alert'].gt(0)
    & degradations_per_model['estimated_alert'].gt(0)
)
tp_alerts_per_model = degradations_per_model[degradations_per_model['is_true_positive']]

print(f"{num_degradated_models} out of {len(degradations_per_model)} models ({np.round(100 * num_degradated_models / len(degradations_per_model), 1)} %) degradaded")
print(f"NannyML estimated at least a degradation in {len(tp_alerts_per_model)} out of the {num_degradated_models} ({np.round(100 * len(tp_alerts_per_model) / num_degradated_models, 1)} %) models that degradated")
print(f"There were {len(positive_alerts)} degradations alerts")
print(f"NannyML estimated {tp_alerts} out of {len(positive_alerts)} ({np.round(tp_alerts/len(positive_alerts) * 100, 1)} %) of the degradation alerts correctly")
print(f"NannyML estimated {tp_alerts + tn_alerts} ({np.round((tp_alerts + tn_alerts) / len(pe_comparison) * 100, 1)} %) of the time the behaivoir of the performance correctly")

25 out of 100 models (25.0 %) degradaded
NannyML estimated at least a degradation in 20 out of the 25 (80.0 %) models that degradated
There were 59 degradations alerts
NannyML estimated 27 out of 59 (45.8 %) of the degradation alerts correctly
NannyML estimated 398 (81.1 %) of the time the behaivoir of the performance correctly


In [233]:
# Show the per-simulation rows flagged as true positives
# (at least one realized alert and at least one estimated alert)
tp_alerts_per_model

Unnamed: 0,simulation_id,realized_alert,estimated_alert,is_true_positive
0,0,1,2,True
3,3,1,2,True
15,15,1,4,True
19,19,1,2,True
22,22,6,1,True
40,40,2,4,True
41,41,5,3,True
43,43,6,2,True
47,47,1,4,True
54,54,1,3,True
