In [10]:
import pandas
import numpy
import json

from modelop.monitors.drift import *
from modelop.monitors.performance import *
from modelop.monitors.volumetrics import *
from modelop.schema.infer import infer_schema, set_monitoring_parameters, autoset_parameters

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [7]:
df_baseline = pandas.read_json('df_baseline_scored.json', orient='records', lines=True)
df_sample = pandas.read_json('df_sample_scored.json', orient='records', lines=True)

In [8]:
df_baseline_schema = infer_schema(data=df_baseline, schema_name='df_baseline_scored_input_schema.avsc')
df_sample_schema = infer_schema(data=df_sample, schema_name='df_sample_scored_input_schema.avsc')

In [9]:
pandas.DataFrame(df_sample_schema['fields'])

Unnamed: 0,name,type,dataClass,role,protectedClass,driftCandidate,specialValues,scoringOptional
0,Id,int,numerical,non-predictor,False,False,[],False
1,MSSubClass,int,numerical,predictor,False,True,[],False
2,MSZoning,string,categorical,predictor,False,True,[],False
3,LotFrontage,float,numerical,predictor,False,True,[],False
4,LotArea,int,numerical,predictor,False,True,[],False
...,...,...,...,...,...,...,...,...
77,YrSold,int,numerical,predictor,False,True,[],False
78,SaleType,string,categorical,predictor,False,True,[],False
79,SaleCondition,string,categorical,predictor,False,True,[],False
80,ground_truth,int,numerical,label,False,True,[],True


In [11]:
with open('df_baseline_scored_input_schema.avsc', 'w') as fp:
    json.dump(df_baseline_schema, fp, indent=2)

with open('df_sample_scored_input_schema.avsc', 'w') as fp:
    json.dump(df_sample_schema, fp, indent=2)

In [13]:
# Check that baseline and sample schemas match
pandas.DataFrame(df_baseline_schema["fields"]).equals(pandas.DataFrame(df_sample_schema["fields"]))

True

In [15]:
fields_df = pandas.DataFrame(df_sample_schema['fields'])
fields_df.set_index('name', inplace=True)

monitoring_parameters = set_monitoring_parameters(fields_df)

monitoring_parameters

{'categorical_columns': ['MSZoning',
  'Street',
  'Alley',
  'LotShape',
  'LandContour',
  'Utilities',
  'LotConfig',
  'LandSlope',
  'Neighborhood',
  'Condition1',
  'Condition2',
  'BldgType',
  'HouseStyle',
  'RoofStyle',
  'RoofMatl',
  'Exterior1st',
  'Exterior2nd',
  'MasVnrType',
  'ExterQual',
  'ExterCond',
  'Foundation',
  'BsmtQual',
  'BsmtCond',
  'BsmtExposure',
  'BsmtFinType1',
  'BsmtFinType2',
  'Heating',
  'HeatingQC',
  'CentralAir',
  'Electrical',
  'KitchenQual',
  'Functional',
  'FireplaceQu',
  'GarageType',
  'GarageFinish',
  'GarageQual',
  'GarageCond',
  'PavedDrive',
  'PoolQC',
  'Fence',
  'MiscFeature',
  'SaleType',
  'SaleCondition'],
 'numerical_columns': ['MSSubClass',
  'LotFrontage',
  'LotArea',
  'OverallQual',
  'OverallCond',
  'YearBuilt',
  'YearRemodAdd',
  'MasVnrArea',
  'BsmtFinSF1',
  'BsmtFinSF2',
  'BsmtUnfSF',
  'TotalBsmtSF',
  '1stFlrSF',
  '2ndFlrSF',
  'LowQualFinSF',
  'GrLivArea',
  'BsmtFullBath',
  'BsmtHalfBath',


In [21]:
df_baseline[monitoring_parameters['categorical_columns']] = df_baseline[monitoring_parameters['categorical_columns']].fillna('None')
df_baseline[monitoring_parameters['numerical_columns']] = df_baseline[monitoring_parameters['numerical_columns']].fillna(0)

df_sample[monitoring_parameters['categorical_columns']] = df_sample[monitoring_parameters['categorical_columns']].fillna('None')
df_sample[monitoring_parameters['numerical_columns']] = df_sample[monitoring_parameters['numerical_columns']].fillna(0)

In [22]:
drift_detector=DriftDetector(
    df_baseline=df_baseline, 
    df_sample=df_sample,
    categorical_columns=monitoring_parameters['categorical_columns'], 
    numerical_columns=monitoring_parameters['numerical_columns'], 
)

In [34]:
for metric in ['es', 'ks']:
    if metric == 'es':
        print('Epps-Singleton Metric')
    else:
        print('Kolmogorov–Smirnov Metric')
    for k,v in drift_detector.calculate_drift(pre_defined_metric=metric).items():
        try:
            if v < 0.05:
                print(k, v)
        except:
            pass
    print()

Epps-Singleton Metric
WoodDeckSF_p-value 0.014113806099201716

Kolmogorov–Smirnov Metric



In [36]:
drift_detector.calculate_drift(pre_defined_metric='JENSEN-SHANNON')

{'MiscVal': 0.3917582007697687,
 'PoolArea': 0.2756345089885898,
 '3SsnPorch': 0.220750897996971,
 'GarageYrBlt': 0.2102038397072132,
 'BsmtHalfBath': 0.19359625874639183,
 'ScreenPorch': 0.17807331367473203,
 'GarageCars': 0.17703675682296918,
 'BsmtFinSF2': 0.1726661196638598,
 'HalfBath': 0.16625531893474405,
 'BedroomAbvGr': 0.16208064260930485,
 'BsmtFullBath': 0.159646446947822,
 'KitchenAbvGr': 0.15701312233606285,
 'FullBath': 0.15295129683160816,
 'Fireplaces': 0.1459740816669563,
 'Exterior2nd': 0.11306230001363142,
 'MasVnrArea': 0.10020167185284548,
 'WoodDeckSF': 0.09958986314252749,
 'Neighborhood': 0.0995047402201817,
 'Exterior1st': 0.09921128424654123,
 'LowQualFinSF': 0.09347954725695384,
 'OverallCond': 0.09250524486140568,
 'TotalBsmtSF': 0.08329504854456402,
 'LotFrontage': 0.08255120321949831,
 'BsmtFinSF1': 0.08021137338468619,
 'Condition1': 0.07993391820509156,
 'EnclosedPorch': 0.07923173344709654,
 '2ndFlrSF': 0.077714614234592,
 'GarageType': 0.0759153026796

In [38]:
concept_drift_detector=ConceptDriftDetector(
    df_baseline=df_baseline, 
    df_sample=df_sample, 
    target_column=monitoring_parameters['score_column'],
    label_type=monitoring_parameters['label_type']
)

In [39]:
concept_drift_detector.calculate_concept_drift(pre_defined_metric='jensen-shannon')

{'prediction': 0.054137022648067856}

In [43]:
for metric in ['es', 'ks']:
    if metric == 'es':
        print('Epps-Singleton Metric')
    else:
        print('Kolmogorov–Smirnov Metric')
    for k,v in concept_drift_detector.calculate_concept_drift(pre_defined_metric=metric).items():
        try:
            if v < 0.05:
                print(k, v)
        except:
            pass
    print()

Epps-Singleton Metric


AssertionError: pre_defined_metric should be one of ['jensen-shannon', 'ks', 'es'].