In [1]:
import pandas
import numpy
import json

from modelop.monitors.drift import *
from modelop.monitors.performance import *
from modelop.monitors.volumetrics import *
from modelop.schema.infer import infer_schema, set_monitoring_parameters, autoset_parameters

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Read the scored baseline and sample datasets. Both files are JSON Lines
# (one record per line), so they share the same read options.
json_lines_options = {'orient': 'records', 'lines': True}
df_baseline = pandas.read_json('df_baseline_scored.json', **json_lines_options)
df_sample = pandas.read_json('df_sample_scored.json', **json_lines_options)

In [3]:
# Infer one schema per dataset. Each result is later indexed by 'fields' and
# serialized with json.dump under the same .avsc name passed here.
df_baseline_schema = infer_schema(
    data=df_baseline,
    schema_name='df_baseline_scored_input_schema.avsc',
)
df_sample_schema = infer_schema(
    data=df_sample,
    schema_name='df_sample_scored_input_schema.avsc',
)

In [4]:
# Render the inferred sample schema fields as a table for visual inspection.
pandas.DataFrame(data=df_sample_schema['fields'])

Unnamed: 0,name,type,dataClass,role,protectedClass,driftCandidate,specialValues,scoringOptional
0,Id,int,numerical,non-predictor,False,False,[],False
1,prediction,float,numerical,score,False,True,[],True
2,ground_truth,int,numerical,label,False,True,[],True
3,eOverallQual_TotalSF,int,numerical,predictor,False,True,[],False
4,OverallQual,int,numerical,predictor,False,True,[],False
5,eTotalSF,int,numerical,predictor,False,True,[],False
6,GrLivArea,int,numerical,predictor,False,True,[],False
7,ExterQual,int,numerical,predictor,False,True,[],False
8,KitchenQual,int,numerical,predictor,False,True,[],False
9,GarageCars,int,numerical,predictor,False,True,[],False


In [5]:
# Persist both inferred schemas to disk as JSON indented by two spaces.
schemas_by_path = (
    ('df_baseline_scored_input_schema.avsc', df_baseline_schema),
    ('df_sample_scored_input_schema.avsc', df_sample_schema),
)
for schema_path, schema in schemas_by_path:
    with open(schema_path, 'w') as schema_file:
        json.dump(schema, schema_file, indent=2)

In [6]:
# Sanity check: the field definitions inferred from the baseline and from the
# sample must be identical (the cell displays True when they are).
baseline_fields = pandas.DataFrame(df_baseline_schema["fields"])
sample_fields = pandas.DataFrame(df_sample_schema["fields"])
baseline_fields.equals(sample_fields)

True

In [7]:
# Index the sample schema fields by column name, then derive the monitoring
# parameters (column groupings, score/label columns) from that table.
fields_df = pandas.DataFrame(df_sample_schema['fields']).set_index('name')
monitoring_parameters = set_monitoring_parameters(fields_df)

# Display the derived parameters as the cell output.
monitoring_parameters

{'categorical_columns': ['eHasGarage',
  'eHasRemodeling',
  'eHasFireplace',
  'eHasBsmt'],
 'numerical_columns': ['eOverallQual_TotalSF',
  'OverallQual',
  'eTotalSF',
  'GrLivArea',
  'ExterQual',
  'KitchenQual',
  'GarageCars',
  'eTotalBathrooms',
  'BsmtQual',
  'GarageArea',
  'TotalBsmtSF',
  'GarageFinish',
  'YearBuilt',
  'TotRmsAbvGrd',
  'FireplaceQu',
  'MasVnrArea'],
 'score_column': 'prediction',
 'label_column': 'ground_truth',
 'label_type': 'numerical',
 'protected_classes': []}

In [8]:
# Replace missing values in both datasets before drift analysis:
# categorical columns get the string 'None', numerical columns get 0.
# The frames are updated in place, baseline first and then sample.
categorical_cols = monitoring_parameters['categorical_columns']
numerical_cols = monitoring_parameters['numerical_columns']

for frame in (df_baseline, df_sample):
    frame[categorical_cols] = frame[categorical_cols].fillna('None')
    frame[numerical_cols] = frame[numerical_cols].fillna(0)

In [9]:
# Build the feature drift detector that compares the sample frame against the
# baseline frame over the categorical and numerical monitoring columns.
drift_columns = {
    'categorical_columns': monitoring_parameters['categorical_columns'],
    'numerical_columns': monitoring_parameters['numerical_columns'],
}
drift_detector = DriftDetector(
    df_baseline=df_baseline,
    df_sample=df_sample,
    **drift_columns,
)

In [10]:
# Report drift results below the significance level for the Epps-Singleton
# ('es') and Kolmogorov-Smirnov ('ks') metrics. Nothing is printed under a
# metric's title when no entry falls below the threshold.
# NOTE(review): every entry of the result dict is compared to the threshold,
# not only keys ending in '_p-value' — confirm that is intended.
metric_titles = {
    'es': 'Epps-Singleton Metric',
    'ks': 'Kolmogorov–Smirnov Metric',
}
significance_level = 0.05

for metric in ['es', 'ks']:
    print(metric_titles[metric])
    drift_results = drift_detector.calculate_drift(pre_defined_metric=metric)
    for k, v in drift_results.items():
        try:
            is_significant = bool(v < significance_level)
        except (TypeError, ValueError):
            # Best-effort skip of entries that cannot be compared with a
            # number (e.g. None, strings, containers). Unlike a bare
            # `except`, unrelated errors and KeyboardInterrupt still surface.
            continue
        if is_significant:
            print(k, v)
    print()

Epps-Singleton Metric
GarageArea_p-value 0.013373830102258634
TotRmsAbvGrd_p-value 0.0017577018866932223

Kolmogorov–Smirnov Metric





In [11]:
# Compute per-feature drift with the Jensen-Shannon metric and display it.
# NOTE(review): the metric name is upper-case here but lower-case
# ('jensen-shannon') in the concept drift cell — confirm the library treats
# the name case-insensitively.
jensen_shannon_drift = drift_detector.calculate_drift(pre_defined_metric='JENSEN-SHANNON')
jensen_shannon_drift

{'ExterQual': 0.13634383666952524,
 'GarageFinish': 0.11421990299228807,
 'GarageCars': 0.1093811625276817,
 'KitchenQual': 0.10799633697399472,
 'FireplaceQu': 0.0964847121285537,
 'BsmtQual': 0.08892449695149877,
 'MasVnrArea': 0.08791022889490084,
 'TotRmsAbvGrd': 0.08053003634870205,
 'GarageArea': 0.07036139860062982,
 'GrLivArea': 0.06794925946554,
 'eTotalBathrooms': 0.0675311228593119,
 'eTotalSF': 0.0613509643555278,
 'eOverallQual_TotalSF': 0.061188281726520334,
 'TotalBsmtSF': 0.06015696924954054,
 'OverallQual': 0.05888480132380598,
 'YearBuilt': 0.05000413342455589,
 'eHasBsmt': 0.02473055285614587,
 'eHasRemodeling': 0.012734709801091572,
 'eHasGarage': 0.00804141644719935,
 'eHasFireplace': 0.0}

In [12]:
# Build the concept drift detector. The target column is the score column
# from the monitoring parameters, and the label type is passed alongside it.
concept_drift_settings = {
    'target_column': monitoring_parameters['score_column'],
    'label_type': monitoring_parameters['label_type'],
}
concept_drift_detector = ConceptDriftDetector(
    df_baseline=df_baseline,
    df_sample=df_sample,
    **concept_drift_settings,
)

In [13]:
# Compute concept drift on the target column with the Jensen-Shannon metric
# and display the result.
jensen_shannon_concept_drift = concept_drift_detector.calculate_concept_drift(
    pre_defined_metric='jensen-shannon'
)
jensen_shannon_concept_drift

{'prediction': 0.05964543710432606}

In [14]:
# Report concept drift results below the significance level for the
# Epps-Singleton ('es') and Kolmogorov-Smirnov ('ks') metrics. If no entry
# falls below the threshold for a metric, an explicit "not detected" line is
# printed instead.
metric_titles = {
    'es': 'Epps-Singleton Metric',
    'ks': 'Kolmogorov–Smirnov Metric',
}
significance_level = 0.05

for metric in ['es', 'ks']:
    print(metric_titles[metric])
    print()

    concept_drift_results = concept_drift_detector.calculate_concept_drift(
        pre_defined_metric=metric
    )

    # Number of entries flagged as significant for the current metric.
    count = 0
    for k, v in concept_drift_results.items():
        try:
            is_significant = bool(v < significance_level)
        except (TypeError, ValueError):
            # Best-effort skip of entries that cannot be compared with a
            # number (e.g. None, strings, containers). Unlike a bare
            # `except`, unrelated errors and KeyboardInterrupt still surface.
            continue
        if is_significant:
            count += 1
            print(k, v)

    if count == 0:
        print('\tConcept drift not detected')
    print()

Epps-Singleton Metric

	Concept drift not detected

Kolmogorov–Smirnov Metric

	Concept drift not detected

