In [2]:
# ! pip install alibi-detect
# ! pip install alibi
# ! pip install snowflake-connector-python
# ! pip install snowflake-sqlalchemy
# ! pip install xgboost

In [2]:
# importing the required libraries
import pandas as pd
import numpy as np

import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.metrics import root_mean_squared_error,mean_absolute_error

import alibi
from alibi_detect.cd import ChiSquareDrift, TabularDrift
from alibi_detect.saving import save_detector, load_detector

import sqlalchemy
import snowflake.connector
from sqlalchemy import create_engine, text
from snowflake.sqlalchemy import *

import time
import pytz
tz_ny = pytz.timezone("Asia/Kolkata")

from creds import ACCOUNT,USERNAME,PASSWORD

import warnings
warnings.filterwarnings('ignore')

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
# Creating the connection engine (way 1)
engine = create_engine(URL(
        account=ACCOUNT,
        user= USERNAME,
        password= PASSWORD,
        role="ACCOUNTADMIN",
        warehouse="COMPUTE_WH",
        database="HEALTH_DB",
        schema="PUBLIC"
    ))


## Data drift detector

Let's build a detector that detects data drift

In [4]:
# defining the query to fetch data from the snowflake database
query="""

    SELECT CASE_ID,
           COALESCE(HOSPITAL_CODE,0) AS HOSPITAL_CODE,
           COALESCE(HOSPITAL_TYPE_CODE,'None') AS HOSPITAL_TYPE_CODE,
           COALESCE(CITY_CODE_HOSPITAL,0) AS CITY_CODE_HOSPITAL,
           COALESCE(HOSPITAL_REGION_CODE,'None') AS HOSPITAL_REGION_CODE,
           COALESCE(AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,0) AS AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,
           COALESCE(DEPARTMENT,'None') AS DEPARTMENT,
           COALESCE(WARD_TYPE,'None') AS WARD_TYPE,
           COALESCE(WARD_FACILITY_CODE,'None') AS WARD_FACILITY_CODE,
           COALESCE(BED_GRADE,0) AS BED_GRADE,
           PATIENTID,
           COALESCE(CITY_CODE_PATIENT,0) AS CITY_CODE_PATIENT,
           COALESCE(TYPE_OF_ADMISSION,'None') AS TYPE_OF_ADMISSION,
           COALESCE(SEVERITY_OF_ILLNESS,'Minor') AS SEVERITY_OF_ILLNESS,
           COALESCE(VISITORS_WITH_PATIENT,0) AS VISITORS_WITH_PATIENT,
           COALESCE(AGE,'None') AS AGE,
           COALESCE(ADMISSION_DEPOSIT,0) AS ADMISSION_DEPOSIT,
           ADMISSION_DATE,
           DISCHARGE_DATE

    FROM HEALTH_DB.PUBLIC.HEALTH_DATA

"""

In [5]:
# Connecting to the DB and fetching the training data from snowflake
with engine.connect() as conn:
    result = conn.execute(text(query))
    data = pd.DataFrame(result.fetchall())
    data.columns = result.keys()
    data.columns = [col.upper() for col in data.columns.tolist()]
    
    
# defining numerical,id and categorical columns
num_cols = ["AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL","ADMISSION_DEPOSIT","VISITORS_WITH_PATIENT"]
id_cols = ["CASE_ID","PATIENTID","ADMISSION_DATE","DISCHARGE_DATE"]
cat_cols = [col for col in data.columns if col not in num_cols+id_cols]

cat_cols

['HOSPITAL_CODE',
 'HOSPITAL_TYPE_CODE',
 'CITY_CODE_HOSPITAL',
 'HOSPITAL_REGION_CODE',
 'DEPARTMENT',
 'WARD_TYPE',
 'WARD_FACILITY_CODE',
 'BED_GRADE',
 'CITY_CODE_PATIENT',
 'TYPE_OF_ADMISSION',
 'SEVERITY_OF_ILLNESS',
 'AGE']

In [6]:
# filtering the data to include only cat and num cols
df = data[num_cols + cat_cols]
df.head()

Unnamed: 0,AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,ADMISSION_DEPOSIT,VISITORS_WITH_PATIENT,HOSPITAL_CODE,HOSPITAL_TYPE_CODE,CITY_CODE_HOSPITAL,HOSPITAL_REGION_CODE,DEPARTMENT,WARD_TYPE,WARD_FACILITY_CODE,BED_GRADE,CITY_CODE_PATIENT,TYPE_OF_ADMISSION,SEVERITY_OF_ILLNESS,AGE
0,3,4911,2,8,c,3,Z,radiotherapy,R,F,2,7,Emergency,Extreme,51-60
1,2,4745,2,10,e,1,X,anesthesia,S,E,2,7,Trauma,Extreme,51-60
2,2,7272,2,26,b,2,Y,radiotherapy,R,D,2,7,Trauma,Extreme,51-60
3,2,5558,2,26,b,2,Y,radiotherapy,S,D,2,7,Trauma,Extreme,51-60
4,2,4449,2,23,a,6,X,anesthesia,S,F,2,7,Trauma,Extreme,51-60


In [7]:
df.columns

Index(['AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL', 'ADMISSION_DEPOSIT',
       'VISITORS_WITH_PATIENT', 'HOSPITAL_CODE', 'HOSPITAL_TYPE_CODE',
       'CITY_CODE_HOSPITAL', 'HOSPITAL_REGION_CODE', 'DEPARTMENT', 'WARD_TYPE',
       'WARD_FACILITY_CODE', 'BED_GRADE', 'CITY_CODE_PATIENT',
       'TYPE_OF_ADMISSION', 'SEVERITY_OF_ILLNESS', 'AGE'],
      dtype='object')

In [8]:
# defining the indices of categorical columns
cat_indices = list(range(3,15))

# defining the no of categories present in each cateforical feature
cats_per_feature = {i:None for i in cat_indices}

print(cat_indices)
print(cats_per_feature)

[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
{3: None, 4: None, 5: None, 6: None, 7: None, 8: None, 9: None, 10: None, 11: None, 12: None, 13: None, 14: None}


In [9]:
# initializing the detector
detector = TabularDrift(df.values,p_val=.05,categories_per_feature=cats_per_feature)
detector.get_config()

{'name': 'TabularDrift',
 'meta': {'version': '0.12.0'},
 'x_ref': array([[3, 4911, 2, ..., 'Emergency', 'Extreme', '51-60'],
        [2, 4745, 2, ..., 'Trauma', 'Extreme', '51-60'],
        [2, 7272, 2, ..., 'Trauma', 'Extreme', '51-60'],
        ...,
        [3, 4241, 15, ..., 'Emergency', 'Moderate', '41-50'],
        [4, 3036, 4, ..., 'Emergency', 'Extreme', '41-50'],
        [4, 4326, 4, ..., 'Trauma', 'Extreme', '31-40']], dtype=object),
 'p_val': 0.05,
 'categories_per_feature': {3: None,
  4: None,
  5: None,
  6: None,
  7: None,
  8: None,
  9: None,
  10: None,
  11: None,
  12: None,
  13: None,
  14: None},
 'x_ref_preprocessed': False,
 'preprocess_at_init': True,
 'update_x_ref': None,
 'preprocess_fn': None,
 'correction': 'bonferroni',
 'alternative': 'two-sided',
 'n_features': None,
 'input_shape': None,
 'data_type': None}

In [10]:
# saving the detector model
with open('artifacts/drift_detector.pkl','wb') as f:
    pickle.dump(detector,f)

# loading the detector model from pickle file
with open('artifacts/drift_detector.pkl','rb') as f:
    detector = pickle.load(f)

### Testing Drift detector

Let's test this drift detector by passing our train data with added noise 

In [12]:
# adding the noise by introducing a new category (100) in the hospital code feature
test = df.copy()
test.loc[:1000,'HOSPITAL_CODE']=100

In [13]:
# predicting whether there's a drift
prediction = detector.predict(test.values,drift_type='feature')
prediction

{'data': {'is_drift': array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  'distance': array([   0.    ,    0.    ,    0.    , 1003.3416,    0.    ,    0.    ,
            0.    ,    0.    ,    0.    ,    0.    ,    0.    ,    0.    ,
            0.    ,    0.    ,    0.    ], dtype=float32),
  'p_val': array([1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        dtype=float32),
  'threshold': 0.05},
 'meta': {'name': 'TabularDrift',
  'online': False,
  'data_type': None,
  'version': '0.12.0',
  'detector_type': 'drift'}}

In [14]:
# printing details
labels=["No","Yes"]

for i in range(detector.n_features):
    # determining the type of test done for the feature i
    stat = 'Chi2' if i in list(cat_cols) else 'KS'
    # getting the name of the feature
    fname = test.columns.tolist()[i]
    # finding if drift is happened at the feature
    is_drift = prediction['data']['is_drift'][i]
    # getting the test-statistic and pvalue for the test
    test_stat,p_value = prediction['data']['distance'][i], prediction['data']['p_val'][i]
    print(f"{fname} \t\t--Drift? {labels[is_drift]} --{stat} {test_stat}-- p-value:{p_value}")


AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL 		--Drift? No --KS 0.0-- p-value:1.0
ADMISSION_DEPOSIT 		--Drift? No --KS 0.0-- p-value:1.0
VISITORS_WITH_PATIENT 		--Drift? No --KS 0.0-- p-value:1.0
HOSPITAL_CODE 		--Drift? Yes --KS 1003.3416137695312-- p-value:0.0
HOSPITAL_TYPE_CODE 		--Drift? No --KS 0.0-- p-value:1.0
CITY_CODE_HOSPITAL 		--Drift? No --KS 0.0-- p-value:1.0
HOSPITAL_REGION_CODE 		--Drift? No --KS 0.0-- p-value:1.0
DEPARTMENT 		--Drift? No --KS 0.0-- p-value:1.0
WARD_TYPE 		--Drift? No --KS 0.0-- p-value:1.0
WARD_FACILITY_CODE 		--Drift? No --KS 0.0-- p-value:1.0
BED_GRADE 		--Drift? No --KS 0.0-- p-value:1.0
CITY_CODE_PATIENT 		--Drift? No --KS 0.0-- p-value:1.0
TYPE_OF_ADMISSION 		--Drift? No --KS 0.0-- p-value:1.0
SEVERITY_OF_ILLNESS 		--Drift? No --KS 0.0-- p-value:1.0
AGE 		--Drift? No --KS 0.0-- p-value:1.0


In [15]:
# defining a logging data frame
log_df = pd.DataFrame()

# log_df['Time Period'] = str(df['ADMISSION_DATE'].min()) + 'to' + str(df['ADMISSION_DATE'].max())
log_df['Time Period'] = ["testing"]*len(test.columns.tolist())
log_df['Total Records'] = test.shape[0]
log_df['Features'] = test.columns.tolist()
log_df["is_Drift"] = prediction['data']['is_drift']
log_df['Test'] = log_df['Features'].apply(lambda x:'Chi2' if x in cat_cols else "KS")
log_df['Test stat'] = np.round(prediction['data']['distance'])
log_df['P value'] = np.round(prediction['data']['p_val'])

In [16]:
log_df

Unnamed: 0,Time Period,Total Records,Features,is_Drift,Test,Test stat,P value
0,testing,236704,AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,0,KS,0.0,1.0
1,testing,236704,ADMISSION_DEPOSIT,0,KS,0.0,1.0
2,testing,236704,VISITORS_WITH_PATIENT,0,KS,0.0,1.0
3,testing,236704,HOSPITAL_CODE,1,Chi2,1003.0,0.0
4,testing,236704,HOSPITAL_TYPE_CODE,0,Chi2,0.0,1.0
5,testing,236704,CITY_CODE_HOSPITAL,0,Chi2,0.0,1.0
6,testing,236704,HOSPITAL_REGION_CODE,0,Chi2,0.0,1.0
7,testing,236704,DEPARTMENT,0,Chi2,0.0,1.0
8,testing,236704,WARD_TYPE,0,Chi2,0.0,1.0
9,testing,236704,WARD_FACILITY_CODE,0,Chi2,0.0,1.0


We can see that the model has detected the drift in the Hospital code feature where we explicitly added noise, therefore the model is working fine

### Data Drift checking function

In [17]:
# %%writefile utils.py -a

""" DATA DRIFT DETECOR UTILITY FUNCTIONS"""


# defining a function which returns the query given the batch id for data drift detection
def get_data_drift_query(x):
    
    """ x : if the x is 1, then the function pulls last one week's data """
    
    query=f"""

    SELECT CASE_ID,
           COALESCE(HOSPITAL_CODE,0) AS HOSPITAL_CODE,
           COALESCE(HOSPITAL_TYPE_CODE,'None') AS HOSPITAL_TYPE_CODE,
           COALESCE(CITY_CODE_HOSPITAL,0) AS CITY_CODE_HOSPITAL,
           COALESCE(HOSPITAL_REGION_CODE,'None') AS HOSPITAL_REGION_CODE,
           COALESCE(AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,0) AS AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,
           COALESCE(DEPARTMENT,'None') AS DEPARTMENT,
           COALESCE(WARD_TYPE,'None') AS WARD_TYPE,
           COALESCE(WARD_FACILITY_CODE,'None') AS WARD_FACILITY_CODE,
           COALESCE(BED_GRADE,0) AS BED_GRADE,
           PATIENTID,
           COALESCE(CITY_CODE_PATIENT,0) AS CITY_CODE_PATIENT,
           COALESCE(TYPE_OF_ADMISSION,'None') AS TYPE_OF_ADMISSION,
           COALESCE(SEVERITY_OF_ILLNESS,'Minor') AS SEVERITY_OF_ILLNESS,
           COALESCE(VISITORS_WITH_PATIENT,0) AS VISITORS_WITH_PATIENT,
           COALESCE(AGE,'None') AS AGE,
           COALESCE(ADMISSION_DEPOSIT,0) AS ADMISSION_DEPOSIT,
           ADMISSION_DATE,
           DISCHARGE_DATE

    FROM HEALTH_DB.PUBLIC.PREDICTION_LOGGING
    WHERE ADMISSION_DATE>=CURRENT_DATE-579+{x*7} 

    """ 
    return query

Appending to utils.py


In [18]:
# %%writefile utils.py -a


# defining a function which pulls recent data and check if it is drifted
def data_drift_monitor(batch_id):
    
    """ batch_id : if the batch_id is 1, then the function pulls last one week's data """
    
    # defining the query
    query = get_data_drift_query(batch_id)
    

    # creating the snowflake engine
    engine = create_engine(URL(
            account=ACCOUNT,
            user= USERNAME,
            password= PASSWORD,
            role="ACCOUNTADMIN",
            warehouse="COMPUTE_WH",
            database="HEALTH_DB",
            schema="PUBLIC"
        ))
    
    # connecting to the engine
    with engine.connect() as conn:
        result = conn.execute(text(query))
        batch = pd.DataFrame(result.fetchall())
        batch.columns = result.keys()
        batch.columns = [col.upper() for col in batch.columns.tolist()]
        
    # defining numerical,id and categorical columns
    num_cols = ["AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL","ADMISSION_DEPOSIT","VISITORS_WITH_PATIENT"]
    id_cols = ["CASE_ID","PATIENTID","ADMISSION_DATE","DISCHARGE_DATE"]
    cat_cols = [col for col in batch.columns if col not in num_cols+id_cols]
    
    # filtering the data to include only cat and num cols
    batch_df = batch[num_cols + cat_cols]
    
    # loading the detector model from pickle file
    with open('artifacts/drift_detector.pkl','rb') as f:
        detector = pickle.load(f)
    
    # predicting whether there's a drift
    prediction = detector.predict(batch_df.values,drift_type='feature')
    
    # printing details
    labels=["No","Yes"]

    for i in range(detector.n_features):
        # determining the type of test done for the feature i
        stat = 'Chi2' if i in list(cat_cols) else 'KS'
        # getting the name of the feature
        fname = batch_df.columns.tolist()[i]
        # finding if drift is happened at the feature
        is_drift = prediction['data']['is_drift'][i]
        # getting the test-statistic and pvalue for the test
        test_stat,p_value = prediction['data']['distance'][i], prediction['data']['p_val'][i]
        print(f"{fname} \t\t--Drift? {labels[is_drift]} --{stat} {test_stat}-- p-value:{p_value}\n\n\n")
        
    
    # defining a logging data frame
    log_df = pd.DataFrame()

    log_df['Time Period'] = [str(batch['ADMISSION_DATE'].min()) + ' to ' + str(batch['ADMISSION_DATE'].max())] * len(batch_df.columns.tolist())
    log_df['Total Records'] = batch_df.shape[0]
    log_df['Features'] = batch_df.columns.tolist()
    log_df["is_Drift"] = prediction['data']['is_drift']
    log_df['Test'] = log_df['Features'].apply(lambda x:'Chi2' if x in cat_cols else "KS")
    log_df['Test stat'] = np.round(prediction['data']['distance'])
    log_df['P value'] = np.round(prediction['data']['p_val'])

    
    return log_df
    

Appending to utils.py


#### Checking for Data drift

In [4]:
from utils import data_drift_monitor

# testing for data drift 
data_drift_monitor(0)

AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL 		--Drift? Yes --KS 0.03185763210058212-- p-value:0.0001613584317965433



ADMISSION_DEPOSIT 		--Drift? Yes --KS 0.020330514758825302-- p-value:0.04286612942814827



VISITORS_WITH_PATIENT 		--Drift? No --KS 0.014468704350292683-- p-value:0.28401073813438416



HOSPITAL_CODE 		--Drift? Yes --KS 96.4312515258789-- p-value:1.235563473755974e-08



HOSPITAL_TYPE_CODE 		--Drift? Yes --KS 24.409318923950195-- p-value:0.000439027528045699



CITY_CODE_HOSPITAL 		--Drift? Yes --KS 34.03654479980469-- p-value:0.00018208383698947728



HOSPITAL_REGION_CODE 		--Drift? No --KS 0.3227606415748596-- p-value:0.8509683609008789



DEPARTMENT 		--Drift? Yes --KS 27.982528686523438-- p-value:1.257504482055083e-05



WARD_TYPE 		--Drift? Yes --KS 35.699493408203125-- p-value:1.090751993615413e-06



WARD_FACILITY_CODE 		--Drift? No --KS 10.119813919067383-- p-value:0.07191070169210434



BED_GRADE 		--Drift? Yes --KS 11.477134704589844-- p-value:0.021693985909223557





Unnamed: 0,Time Period,Total Records,Features,is_Drift,Test,Test stat,P value
0,2022-12-02 to 2022-12-02,4724,AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,1,KS,0.0,0.0
1,2022-12-02 to 2022-12-02,4724,ADMISSION_DEPOSIT,1,KS,0.0,0.0
2,2022-12-02 to 2022-12-02,4724,VISITORS_WITH_PATIENT,0,KS,0.0,0.0
3,2022-12-02 to 2022-12-02,4724,HOSPITAL_CODE,1,Chi2,96.0,0.0
4,2022-12-02 to 2022-12-02,4724,HOSPITAL_TYPE_CODE,1,Chi2,24.0,0.0
5,2022-12-02 to 2022-12-02,4724,CITY_CODE_HOSPITAL,1,Chi2,34.0,0.0
6,2022-12-02 to 2022-12-02,4724,HOSPITAL_REGION_CODE,0,Chi2,0.0,1.0
7,2022-12-02 to 2022-12-02,4724,DEPARTMENT,1,Chi2,28.0,0.0
8,2022-12-02 to 2022-12-02,4724,WARD_TYPE,1,Chi2,36.0,0.0
9,2022-12-02 to 2022-12-02,4724,WARD_FACILITY_CODE,0,Chi2,10.0,0.0


##  Model drift detector

Let's build a detector that detects model drift

In [5]:
# %%writefile utils.py -a

""" MODEL DRIFT DETECOR UTILITY FUNCTIONS"""

# defining a function which pulls data from snowflake for model drift detection
def get_model_drift_query(x):
    
    """ x : if the x is 1, then the function pulls last one week's data """
    
    query=f"""

    SELECT CASE_ID,
           COALESCE(HOSPITAL_CODE,0) AS HOSPITAL_CODE,
           COALESCE(HOSPITAL_TYPE_CODE,'None') AS HOSPITAL_TYPE_CODE,
           COALESCE(CITY_CODE_HOSPITAL,0) AS CITY_CODE_HOSPITAL,
           COALESCE(HOSPITAL_REGION_CODE,'None') AS HOSPITAL_REGION_CODE,
           COALESCE(AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,0) AS AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,
           COALESCE(DEPARTMENT,'None') AS DEPARTMENT,
           COALESCE(WARD_TYPE,'None') AS WARD_TYPE,
           COALESCE(WARD_FACILITY_CODE,'None') AS WARD_FACILITY_CODE,
           COALESCE(BED_GRADE,0) AS BED_GRADE,
           PATIENTID,
           COALESCE(CITY_CODE_PATIENT,0) AS CITY_CODE_PATIENT,
           COALESCE(TYPE_OF_ADMISSION,'None') AS TYPE_OF_ADMISSION,
           COALESCE(SEVERITY_OF_ILLNESS,'Minor') AS SEVERITY_OF_ILLNESS,
           COALESCE(VISITORS_WITH_PATIENT,0) AS VISITORS_WITH_PATIENT,
           COALESCE(AGE,'None') AS AGE,
           COALESCE(ADMISSION_DEPOSIT,0) AS ADMISSION_DEPOSIT,
           ADMISSION_DATE,
           DISCHARGE_DATE,
           LOS,
           LOS_PREDICTED

    FROM HEALTH_DB.PUBLIC.PREDICTION_LOGGING
    WHERE ADMISSION_DATE>=CURRENT_DATE-580+{x*7} 

    """ 
    return query

Appending to utils.py


In [6]:
# %%writefile utils.py -a


# defining a function that checks whether the model is drifted by comparing the performance metrics on both train and new data
def check_model_drfit(ref_metric_dict,curr_metric_dict,type="regression",tol=0.1):
    
    """ref_metric_dict   : dictionary containing the performance metrics of model on train data
       curr_metric_dict  : dictionary containing the performance metrics of model on new unseen data
       type              : type of the problem (classification/regression)
       tolerance         : the minimum percentage difference between train and test metrics to decide the drift
       
       Returns floating values representing metrics change and a boolean variable is_model_drifted"""
    
    if type=="classification":
        
        # finding the deviation in the classification metrics
        precision_change = abs((curr_metric_dict['Precision']-ref_metric_dict['Precision'])/ref_metric_dict['Precision'])
        recall_change = abs((curr_metric_dict['Recall']-ref_metric_dict['Recall'])/ref_metric_dict['Recall'])
        roc_auc_change = abs((curr_metric_dict['Roc-Auc']-ref_metric_dict['Roc-Auc'])/ref_metric_dict['Roc-Auc'])
        
        # checking how many metrics are deviated beyond the tolerance threshold
        counter = 0
        for i in [precision_change,recall_change,roc_auc_change]:
            if i > tol:
                counter+=1
        
        if counter>0:
            print(f"ALERT ! There's a model drift")
            print("Change in Precision: "+ str(round(100*precision_change,2)) + "%" )
            print("Change in Recall: "+ str(round(100*recall_change,2)) + "%" )
            print("Change in Roc-Auc: "+ str(round(100*roc_auc_change,2)) + "%" )
            return 1,precision_change,recall_change,roc_auc_change
        else:
            print("There is no Model drift.")
            return 0,precision_change,recall_change,roc_auc_change
        
    
    elif type=="regression":
        
        # finding the deviation in the regression metrics
        rmse_change = abs((curr_metric_dict['RMSE']-ref_metric_dict['RMSE'])/ref_metric_dict['RMSE'])
        mae_change = abs((curr_metric_dict['MAE']-ref_metric_dict['MAE'])/ref_metric_dict['MAE'])
        
        # checking how many metrics are deviated beyond the tolerance threshold
        counter = 0
        for i in [rmse_change,mae_change]:
            if i > tol:
                counter+=1
        
        if counter>0:
            print(f"ALERT ! There's a model drift")
            print("Change in RMSE: "+ str(round(100*rmse_change,2)) + "%" )
            print("Change in MAE: "+ str(round(100*mae_change,2)) + "%" )
            return 1,rmse_change,mae_change
        else:
            print("There is no Model drift.")
            return 0,rmse_change,mae_change
        
            

Appending to utils.py


In [7]:
# %%writefile utils.py -a


# defining a function that checks if the model is drifted
def model_drift_monitor(batch_id):
    
    """ batch_id : if the batch_id is 1, then the function pulls last one week's data """
    
    # defining the query
    query = get_data_drift_query(batch_id)
    
    # creating the snowflake engine
    engine = create_engine(URL(
            account=ACCOUNT,
            user= USERNAME,
            password= PASSWORD,
            role="ACCOUNTADMIN",
            warehouse="COMPUTE_WH",
            database="HEALTH_DB",
            schema="PUBLIC"
        ))
    
    # connecting to the engine
    with engine.connect() as conn:
        result = conn.execute(text(query))
        batch = pd.DataFrame(result.fetchall())
        batch.columns = result.keys()
        batch.columns = [col.upper() for col in batch.columns.tolist()]
        
        
    # getting the actual LOS and predicted los
    actual = batch['LOS']
    predicted = batch['LOS_PREDICTED']
    
    # computing the metrics
    rmse = root_mean_squared_error(actual,predicted)
    mae = mean_absolute_error(actual,predicted)
    
    # storing the metrics in a dictionary for detecting model drift
    scoring_ref_metrics = dict()
    scoring_ref_metrics['RMSE'] = rmse
    scoring_ref_metrics['MAE'] = mae
          
          
    # loading the model_ref_metrics which conatains the metrics of model on train data
    with open('artifacts/model_ref_metrics.pkl','rb') as f:
          model_ref_metrics = pickle.load(f)
          
          
    # calling the check_model_drfit to compare the performance metrics
    model_drift,rmse_change,mae_change = check_model_drfit(model_ref_metrics,scoring_ref_metrics,type="regression",tol=0.1)
          
    # creating a log in the form of dictionary
    log = dict()
    log['Time-Period'] = str(batch['ADMISSION_DATE'].min()) + ' to ' + str(batch['ADMISSION_DATE'].max())
    log['Total records'] = batch.shape[0]
    log['Scoring metrics'] = scoring_ref_metrics
    log['Training metrics'] = model_ref_metrics
    log['Model Drift '] = model_drift
    log['RMSE change'] = rmse_change
    log['MAE change'] = mae_change
          
    return log

Appending to utils.py


#### Checking for Model drift

In [1]:
from utils import model_drift_monitor
model_drift_monitor(0)

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


There is no Model drift.


{'Time-Period': '2022-12-02 to 2022-12-02',
 'Total records': 4724,
 'Scoring metrics': {'RMSE': 14.513505054207606, 'MAE': 10.757832345469941},
 'Training metrics': {'RMSE': 13.616544450377615, 'MAE': 10.055131527065848},
 'Model Drift ': 0,
 'RMSE change': 0.06587285100847416,
 'MAE change': 0.06988479628661264}