In [1]:
import pandas
import numpy
import json

from modelop.monitors.drift import *
from modelop.monitors.performance import *
from modelop.monitors.volumetrics import *
from modelop.schema.infer import infer_schema, set_monitoring_parameters, autoset_parameters

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Load the baseline and sample data sets. Both files are read as JSON Lines
# (one record object per line), baseline first, then sample.
df_baseline, df_sample = (
    pandas.read_json(json_path, orient='records', lines=True)
    for json_path in ('df_baseline_scored.json', 'df_sample_scored.json')
)

In [3]:
# Infer a schema for each data set; the schema name passed in is the .avsc
# file name under which the schema is written to disk further down.
df_baseline_schema = infer_schema(
    data=df_baseline,
    schema_name='df_baseline_scored_input_schema.avsc',
)
df_sample_schema = infer_schema(
    data=df_sample,
    schema_name='df_sample_scored_input_schema.avsc',
)

In [4]:
# Tabulate the inferred sample schema: one row per entry of the schema's
# 'fields' list, one column per field attribute (name, type, role, ...).
pandas.DataFrame(df_sample_schema['fields'])

Unnamed: 0,name,type,dataClass,role,protectedClass,driftCandidate,specialValues,scoringOptional
0,FullBath,int,numerical,predictor,False,True,[],False
1,1stFlrSF,int,numerical,predictor,False,True,[],False
2,TotalBsmtSF,int,numerical,predictor,False,True,[],False
3,BsmtQual,string,categorical,predictor,False,True,[],False
4,GarageArea,int,numerical,predictor,False,True,[],False
5,GarageCars,int,numerical,predictor,False,True,[],False
6,KitchenQual,string,categorical,predictor,False,True,[],False
7,ExterQual,string,categorical,predictor,False,True,[],False
8,GrLivArea,int,numerical,predictor,False,True,[],False
9,OverallQual,int,numerical,predictor,False,True,[],False


In [5]:
# Persist both inferred schemas as indented JSON, baseline first, then sample.
for avsc_path, schema_doc in (
    ('df_baseline_scored_input_schema.avsc', df_baseline_schema),
    ('df_sample_scored_input_schema.avsc', df_sample_schema),
):
    with open(avsc_path, 'w') as fp:
        json.dump(schema_doc, fp, indent=2)

In [6]:
# Check that baseline and sample schemas match: both 'fields' lists are turned
# into DataFrames and compared with DataFrame.equals, so True means the two
# field tables are identical (same shape, same elements).
pandas.DataFrame(df_baseline_schema["fields"]).equals(pandas.DataFrame(df_sample_schema["fields"]))

True

In [7]:
# Field table of the sample schema, keyed by field name (chained instead of
# mutating the frame in place, so re-running the cell is harmless).
fields_df = pandas.DataFrame(df_sample_schema['fields']).set_index('name')

# Derive the monitoring configuration (column groups, score/label columns)
# from the field table.
monitoring_parameters = set_monitoring_parameters(fields_df)

monitoring_parameters

{'categorical_columns': ['BsmtQual', 'KitchenQual', 'ExterQual'],
 'numerical_columns': ['FullBath',
  '1stFlrSF',
  'TotalBsmtSF',
  'GarageArea',
  'GarageCars',
  'GrLivArea',
  'OverallQual'],
 'score_column': 'prediction',
 'label_column': 'ground_truth',
 'label_type': 'numerical',
 'protected_classes': []}

In [8]:
# Replace missing values in both data sets: the string 'None' for categorical
# columns and 0 for numerical columns. Baseline is processed before sample.
for frame in (df_baseline, df_sample):
    cat_cols = monitoring_parameters['categorical_columns']
    frame[cat_cols] = frame[cat_cols].fillna('None')

    num_cols = monitoring_parameters['numerical_columns']
    frame[num_cols] = frame[num_cols].fillna(0)

In [9]:
# Drift detector over the baseline/sample pair, restricted to the categorical
# and numerical columns listed in the monitoring parameters.
drift_detector = DriftDetector(
    df_baseline=df_baseline,
    df_sample=df_sample,
    categorical_columns=monitoring_parameters['categorical_columns'],
    numerical_columns=monitoring_parameters['numerical_columns'],
)

In [10]:
# For each p-value based drift metric, print a heading followed by every
# result entry whose value is below 0.05 (the recorded output shows keys such
# as 'FullBath_p-value'), then a blank separator line.
for metric, heading in (
    ('es', 'Epps-Singleton Metric'),
    ('ks', 'Kolmogorov–Smirnov Metric'),
):
    print(heading)
    drift_results = drift_detector.calculate_drift(pre_defined_metric=metric)
    for result_key, result_value in drift_results.items():
        try:
            below_threshold = bool(result_value < 0.05)
        except (TypeError, ValueError):
            # Best-effort filter: entries that cannot be compared with a float
            # (e.g. None, strings, array-like values) are skipped. Only these
            # two exception types are caught, so genuine failures and
            # KeyboardInterrupt are no longer silently swallowed.
            continue
        if below_threshold:
            print(result_key, result_value)
    print()

Epps-Singleton Metric
FullBath_p-value 0.020239243951419635
OverallQual_p-value 0.0387916619344517

Kolmogorov–Smirnov Metric





In [11]:
# Jensen-Shannon drift metric for every monitored column; the recorded output
# below lists one value per column, from largest to smallest.
drift_detector.calculate_drift(pre_defined_metric='JENSEN-SHANNON')

{'FullBath': 0.11957787972603248,
 'GarageCars': 0.11749926193277065,
 'OverallQual': 0.07036324532994655,
 'GrLivArea': 0.0700550951578258,
 '1stFlrSF': 0.0646760461686575,
 'GarageArea': 0.060563379308929396,
 'TotalBsmtSF': 0.05988656608945518,
 'KitchenQual': 0.05657490759994198,
 'BsmtQual': 0.04502488720532775,
 'ExterQual': 0.031005711671486153}

In [12]:
# Concept-drift detector over the same baseline/sample pair.
# NOTE(review): the target column is the score column ('prediction'), not the
# label column ('ground_truth') -- confirm that this is intended.
concept_drift_detector = ConceptDriftDetector(
    df_baseline=df_baseline,
    df_sample=df_sample,
    target_column=monitoring_parameters['score_column'],
    label_type=monitoring_parameters['label_type'],
)

In [13]:
# Jensen-Shannon metric for the detector's target column; the recorded output
# is a single-entry dict keyed by that column ('prediction').
concept_drift_detector.calculate_concept_drift(pre_defined_metric='jensen-shannon')

{'prediction': 0.05832835603654477}

In [15]:
# For each p-value based metric, print a heading, then every concept-drift
# result whose value is below 0.05; if none qualifies, say so explicitly.
for metric, heading in (
    ('es', 'Epps-Singleton Metric'),
    ('ks', 'Kolmogorov–Smirnov Metric'),
):
    print(heading)
    print()
    concept_results = concept_drift_detector.calculate_concept_drift(pre_defined_metric=metric)
    count = 0  # number of entries reported for this metric
    for result_key, result_value in concept_results.items():
        try:
            below_threshold = bool(result_value < 0.05)
        except (TypeError, ValueError):
            # Best-effort filter: entries that cannot be compared with a float
            # (e.g. None, strings, array-like values) are skipped. Only these
            # two exception types are caught, so genuine failures and
            # KeyboardInterrupt are no longer silently swallowed.
            continue
        if below_threshold:
            count += 1
            print(result_key, result_value)
    if count == 0:
        print('\tConcept drift not detected')
    print()

Epps-Singleton Metric

	Concept drift not detected

Kolmogorov–Smirnov Metric

	Concept drift not detected

