In [1]:
import pandas as pd
import numpy as np
import nannyml as nml
import nannyml_premium as nml_premium
from nannyml_premium.performance_estimation.confidence_based import MCBPE
from nannyml_premium.concept_shift.rcs import ClassificationConceptShiftEstimator

In [2]:
# Read the occupancy dataset; the first CSV column is used as the index.
raw_csv_path = 'data/occupacy_data.csv'
df_raw = pd.read_csv(raw_csv_path, index_col=0)

# Preview the first rows.
df_raw.head()

Unnamed: 0,predicted,measured,prob_predicted,co2,temperature,time
0,0.0,0.0,2e-06,406.449058,22.149338,2021-03-30 00:12:00
1,0.0,0.0,5e-06,406.389182,22.151172,2021-03-30 00:15:00
2,0.0,0.0,1.6e-05,406.584225,22.151124,2021-03-30 00:18:00
3,0.0,0.0,7.1e-05,407.031691,22.150188,2021-03-30 00:21:00
4,0.0,0.0,0.000374,407.353326,22.148271,2021-03-30 00:24:00


In [3]:
# Column names shared by every calculator / estimator cell below.
timestamp = 'time'                 # passed as timestamp_column_name
target = 'measured'                # passed as y_true
y_pred = 'predicted'               # passed as y_pred
y_pred_proba = 'prob_predicted'    # passed as y_pred_proba
features = ['co2', 'temperature']  # input variables checked for drift

Directly taken from the paper:

"The data drift analysis encompasses the input variables co2 and room temperature, alongside the target variable measured

Consequently, the dataset is divided into two parts: training data, which includes records from March 30, 2021, at 12:12 AM to May 8, 2021, at 11:57 PM, and test data, which spans from May 9, 2021, at 12:00 AM to July 11, 2021, at 11:57 PM."

In [4]:
# Split the raw frame into three time windows by comparing the 'time' column
# against fixed cutoffs (all bounds are inclusive).
# NOTE(review): the paper quoted above starts its test period on May 9, 2021,
# while the reference window here starts on 2021-06-09 — confirm this is
# intentional; rows between the two cutoffs (if any) belong to no split.
time_col = df_raw['time']

df_train = df_raw[time_col <= '2021-05-08 23:57:00']
df_test = df_raw[time_col.between('2021-06-09 00:00:00', '2021-06-25 12:03:00')]
df_prod = df_raw[time_col >= '2021-06-25 12:06:00']

# Report the size and time span of each split, with a blank gap between splits.
splits = {'train': df_train, 'test': df_test, 'prod': df_prod}
for position, (split_name, split_df) in enumerate(splits.items()):
    if position > 0:
        print('\n')
    print(f'{split_name} set')
    print(f'lenght: {len(split_df)}')
    print(f'from: {split_df.time.iloc[0]} to {split_df.time.iloc[-1]}')

train set
lenght: 16091
from: 2021-03-30 00:12:00 to 2021-05-02 23:57:00


test set
lenght: 7854
from: 2021-06-09 00:12:00 to 2021-06-25 12:03:00


prod set
lenght: 7854
from: 2021-06-25 12:06:00 to 2021-07-11 23:57:00


# Red flag 1: using training data as reference data

## Fit on training data

In [5]:
# Realized performance on the production window, with the calculator fitted
# on the training split (the practice this section flags as a red flag).
performance_calc = nml.PerformanceCalculator(
    problem_type='classification_binary',
    y_true=target,
    y_pred=y_pred,
    y_pred_proba=y_pred_proba,
    metrics=['roc_auc', 'accuracy', 'f1', 'recall'],
    chunk_size=1964,
)
# Keep whatever fit() returns bound to the calculator name.
performance_calc = performance_calc.fit(df_train)

realized_results = performance_calc.calculate(df_prod)

realized_results.plot()



## Fit on test data

In [6]:
# Same realized-performance calculation, but fitted on the test split so that
# it serves as the reference period.
performance_calc = nml.PerformanceCalculator(
    problem_type='classification_binary',
    y_true=target,
    y_pred=y_pred,
    y_pred_proba=y_pred_proba,
    metrics=['roc_auc', 'accuracy', 'f1', 'recall'],
    chunk_size=1964,
)
# Keep whatever fit() returns bound to the calculator name.
performance_calc = performance_calc.fit(df_test)

realized_results = performance_calc.calculate(df_prod)

realized_results.plot()


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.



# Red flag 2: not providing the chunk size of their experiments

## Using a small chunk size

In [7]:
# Univariate drift on the two features plus the predicted label, using
# 500-row chunks. The predicted label is treated as categorical (chi2);
# continuous columns use Kolmogorov-Smirnov.
univariate_drift_calc = nml.UnivariateDriftCalculator(
    column_names=[*features, y_pred],
    treat_as_categorical=y_pred,
    continuous_methods=['kolmogorov_smirnov'],
    categorical_methods=['chi2'],
    timestamp_column_name=timestamp,
    chunk_size=500,
)

# Reference = test split, analysis = production split.
univariate_drift_calc.fit(df_test)
univariate_drift_results = univariate_drift_calc.calculate(df_prod)

univariate_drift_results.plot()


In [8]:
# Show and export the temperature-only drift plot for the 500-row chunking.
temperature_drift = univariate_drift_results.filter(column_names='temperature')
fig = temperature_drift.plot()
fig.show()
fig.write_image('small_chunk_temperature.svg')

## Using a big chunk size

In [9]:
# Same univariate drift setup as the small-chunk run, but with 1964-row chunks.
univariate_drift_calc = nml.UnivariateDriftCalculator(
    column_names=[*features, y_pred],
    treat_as_categorical=y_pred,
    continuous_methods=['kolmogorov_smirnov'],
    categorical_methods=['chi2'],
    timestamp_column_name=timestamp,
    chunk_size=1964,
)

# Reference = test split, analysis = production split.
univariate_drift_calc.fit(df_test)
univariate_drift_results = univariate_drift_calc.calculate(df_prod)

univariate_drift_results.plot()



The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.



In [10]:
# Show and export the temperature-only drift plot for the 1964-row chunking.
temperature_drift = univariate_drift_results.filter(column_names='temperature')
fig = temperature_drift.plot()
fig.show()
fig.write_image('big_chunk_temperature.svg')

# Red flag 3: comparing only univariate drift methods

In [11]:
# Multivariate drift over the feature columns via data reconstruction,
# fitted on the test split and evaluated on the production split.
multivariate_drift_calc = nml.DataReconstructionDriftCalculator(
    chunk_size=1964,
    timestamp_column_name=timestamp,
    column_names=features,
)

multivariate_drift_calc.fit(df_test)
multivariate_drift_results = multivariate_drift_calc.calculate(df_prod)

multivariate_drift_results.plot()


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.



# Red flag 4: focusing only on data drift and not on model performance

## Performance estimation

### CBPE

In [12]:
# Estimate performance on the production split with CBPE, using the test
# split as reference.
cbpe = nml.CBPE(
    problem_type='classification_binary',
    y_true=target,
    y_pred=y_pred,
    y_pred_proba=y_pred_proba,
    timestamp_column_name=timestamp,
    metrics=['roc_auc', 'accuracy', 'f1', 'recall'],
    chunk_size=1964,
)

cbpe.fit(df_test)
cbpe_results = cbpe.estimate(df_prod)

cbpe_results.plot()



The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.



### PAPE

In [13]:
# Estimate F1 on the production split with the estimator this section calls
# PAPE (the class imported for it is MCBPE), using the test split as reference.
pape = MCBPE(
    problem_type='classification_binary',
    feature_column_names=features,
    y_true=target,
    y_pred=y_pred,
    y_pred_proba=y_pred_proba,
    timestamp_column_name=timestamp,
    metrics=['f1'],
    chunk_size=1964,
)

pape.fit(df_test)
pape_results = pape.estimate(df_prod)

# Display the estimate and export it as SVG.
fig = pape_results.plot()
fig.show()
fig.write_image('mcbpe_results.svg')


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.



## Realized vs. estimated performance

In [14]:
# Realized performance on the production split (test split as reference),
# this time with the timestamp column supplied. The result is what the
# comparison cells below plot against the estimates.
performance_calc = nml.PerformanceCalculator(
    problem_type='classification_binary',
    y_true=target,
    y_pred=y_pred,
    y_pred_proba=y_pred_proba,
    timestamp_column_name=timestamp,
    metrics=['roc_auc', 'accuracy', 'f1', 'recall'],
    chunk_size=1964,
)
# Keep whatever fit() returns bound to the calculator name.
performance_calc = performance_calc.fit(df_test)

realized_results = performance_calc.calculate(df_prod)


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.


The resulting number of chunks is too low. Please consider splitting your data in a different way or continue at your own risk.



In [15]:
# CBPE-estimated vs. realized accuracy in one figure.
estimated_accuracy = cbpe_results.filter(metrics='accuracy')
realized_accuracy = realized_results.filter(metrics='accuracy')
estimated_accuracy.compare(realized_accuracy).plot()

In [16]:
# CBPE-estimated vs. realized ROC AUC in one figure.
estimated_roc_auc = cbpe_results.filter(metrics='roc_auc')
realized_roc_auc = realized_results.filter(metrics='roc_auc')
estimated_roc_auc.compare(realized_roc_auc).plot()

In [17]:
# CBPE-estimated vs. realized F1: display the comparison and export it as SVG.
fig = cbpe_results.filter(metrics='f1').compare(realized_results.filter(metrics='f1')).plot()
# Show the figure inline, as the other export cells in this notebook do.
fig.show()
# Use a CBPE-specific filename: the PAPE comparison cell further down writes
# 'realized_f1_vs_estimated_f1.svg', and sharing that name would let it
# silently overwrite this export.
fig.write_image('realized_f1_vs_cbpe_estimated_f1.svg')

In [18]:
# CBPE-estimated vs. realized recall in one figure.
estimated_recall = cbpe_results.filter(metrics='recall')
realized_recall = realized_results.filter(metrics='recall')
estimated_recall.compare(realized_recall).plot()

In [21]:
# PAPE-estimated vs. realized F1: display the comparison and export it as SVG.
estimated_f1_pape = pape_results.filter(metrics='f1')
realized_f1 = realized_results.filter(metrics='f1')
fig = estimated_f1_pape.compare(realized_f1).plot()
fig.show()
fig.write_image('realized_f1_vs_estimated_f1.svg')

In [22]:
# Realized F1 on its own: display the plot and export it as SVG.
realized_f1_only = realized_results.filter(metrics='f1')
fig = realized_f1_only.plot()
fig.show()
fig.write_image('f1_realized_performance.svg')