In [1]:
import numpy as np
import pandas as pd 
import math as math
import sys as sys
from evidently.test_suite import TestSuite
from evidently.test_preset import DataStabilityTestPreset
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
from evidently.options import ColorOptions

color_scheme = ColorOptions(
    primary_color = "#5a86ad",
    fill_color = "#fff4f2",
    zero_line_color = "#016795",
    current_data_color = "#c292a1",
    reference_data_color = "#017b92"
)

path1 = "/Users/pooja/Desktop/GitHub/mrdc_model_monitoring/data/"
path2 = "/Users/pooja/Desktop/GitHub/mrdc_model_monitoring/reports/"

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_ref = pd.read_csv(path1 + 'mrdc_training.csv')
df_ref.columns = df_ref.columns.str.lower()
df_ref.shape

(371825, 32)

In [3]:
df_ref[:2]

Unnamed: 0,mrdc_id,created_status,total_amount,predictions,status,ach_c_median_past30d,ach_c_std_past30d,ach_d_avg_past10d,ach_d_median_past30d,amount,...,mrdc_c_median_past10by30d,od_count_past30d,past10by30d_between200and1000_ratio,past10by30d_check_ratio,past10by30d_returned_check_ratio,pd_avg_past10d,rejected_past10by30d_between200and1000_ratio,returned_past30d_avg_check_amount,rn_past30d_avg_check_amount,past10by30d_lessthan200_ratio
0,2cdf193d-4844-4eb6-891b-80f28f39c682,2022-01-01,15000,944.605947,deposited,0.0,0.0,0.0,0.0,150.0,...,0.0,0,0.0,0.2,0.0,0.0,0.0,0.0,75.0,0.2
1,77a413c3-e7b8-4d98-9e3e-38cafd052870,2022-01-01,50000,804.125965,deposited,0.0,0.0,0.0,0.0,500.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
def load_dataset(month):
    df = pd.read_csv(path1 + "monthly/"+ month + ".csv")
    df.columns = df.columns.str.lower()
    return df

In [5]:
features = ['ach_c_median_past30d', 
            'ach_c_std_past30d',
            'ach_d_avg_past10d',
            'ach_d_median_past30d', 
            'amount', 
            'avg_running_balance_past30d',
            'card_txn_std_past10by30d', 
            'card_txn_std_past10d',
            'credit_txn_avg_past10by30d', 
            'credit_txn_avg_past10d',
            'credit_txn_count_past10by30d', 
            'credit_txn_std_past30d',
            'debit_txn_count_past10by30d', 
            'ein_ssn', 
            'is_between1000and5000',
            'is_between200and1000', 
            'mrdc_c_avg_past30d',
            'mrdc_c_median_past10by30d', 
            'od_count_past30d',
            'past10by30d_between200and1000_ratio', 
            'past10by30d_check_ratio',
            'past10by30d_returned_check_ratio', 
            'pd_avg_past10d',
            'rejected_past10by30d_between200and1000_ratio',
            'returned_past30d_avg_check_amount', 
            'rn_past30d_avg_check_amount',
            'past10by30d_lessthan200_ratio']

df_ref = df_ref[features]


In [6]:
# data stability report 
def data_stability(df_cur, month): 
    df_cur = df_cur[features]
    data_stability= TestSuite(tests=[
        DataStabilityTestPreset(),
    ],options=[color_scheme])
    data_stability.run(current_data=df_cur, reference_data=df_ref, column_mapping=None)
    data_stability.save_html(path2 + "mrdc_data_drift/data_stability_" + month + ".html")
    return data_stability

In [7]:
# data drift report 
def data_drift(df_cur,month): 
        data_drift_report = Report(metrics=[
                # DataDriftPreset(stattest="ks", stattest_threshold=0.35),  
                DataDriftPreset(stattest="psi", stattest_threshold=0.25),],options=[color_scheme])
        data_drift_report.run(current_data=df_cur, reference_data = df_ref, column_mapping=None)
        data_drift_report.save_html(path2 + "mrdc_data_drift/data_drift_" + month + ".html")
        return data_drift_report

In [8]:
# months = ['sept_23', 'oct_23', 'nov_23', 'dec_23', 'jan_24']

# for i in range(len(months)):
#     df = load_dataset(months[i])
#     df.fillna(0) 
#     data_stability(df, months[i])
#     data_drift(df, months[i])


In [9]:
sept = load_dataset('sept_23')


In [10]:
drift_cols = ['ach_d_avg_past10d', 'card_txn_std_past10by30d', 'card_txn_std_past10d', 'credit_txn_avg_past10by30d',
               'credit_txn_avg_past10d', 'credit_txn_count_past10by30d', 'debit_txn_count_past10by30d', 'mrdc_c_median_past10by30d',
                'pd_avg_past10d' ]

for i in range(len(drift_cols)): 
    print(sept[drift_cols[i]].value_counts())

ach_d_avg_past10d
0    18270
Name: count, dtype: int64
card_txn_std_past10by30d
0    18270
Name: count, dtype: int64
card_txn_std_past10d
0    18270
Name: count, dtype: int64
credit_txn_avg_past10by30d
0    18270
Name: count, dtype: int64
credit_txn_avg_past10d
0    18270
Name: count, dtype: int64
credit_txn_count_past10by30d
0    18270
Name: count, dtype: int64
debit_txn_count_past10by30d
0    18270
Name: count, dtype: int64
mrdc_c_median_past10by30d
0    18270
Name: count, dtype: int64
pd_avg_past10d
0    18270
Name: count, dtype: int64


##### Using the bins of the model 

In [11]:
# psi calculation feature level 
def calculate_psi(df_ref, df_cur, features): 
    name = features
    score_file_old = df_ref.copy()
    score_file_new = df_cur.copy()
    l = []
    bins = [0, 100, 200, 300, 500, 700, 800, 850, 900, 950, 1000]
    
    for m in range(len(name)):
        pred_score = pd.DataFrame()
        pred_score["y_predict"] = score_file_old[name[m]]

        Ndiv = len(bins) - 1
        N = len(pred_score)
        bin_obs_1 = [int(N/Ndiv)] * Ndiv
        bin_obs_2 = [1]*(N%Ndiv) + [0]*(Ndiv-(N%Ndiv))

        dN = [bin_obs_1[i] + bin_obs_2[i] for i in range(Ndiv)]

        pred_score["sr_no"] = pred_score.index.tolist() 

        pred_score_sorted = pred_score.sort_values(by=["y_predict", "sr_no"], ascending=False).reset_index().drop(columns=["sr_no"])

        istart = 0
        iend = 0
        pred_min_list = []
        pred_max_list = []
        for i in range(Ndiv):
            iend = iend + dN[i]
            pred_min_list.append(min(pred_score_sorted.iloc[istart:iend, pred_score_sorted.columns.get_loc("y_predict")]))
            pred_max_list.append(max(pred_score_sorted.iloc[istart:iend, pred_score_sorted.columns.get_loc("y_predict")]))
            istart = iend

        pred_min_list[Ndiv-1] = float("-inf")

        for i in range(Ndiv-1):
            pred_max_list[i+1] = pred_min_list[i]

        pred_max_list[0] = float("inf")

        pred_score_new = pd.DataFrame()
        pred_score_new["y_predict"] = score_file_new[name[m]]
        pred_score_new = pred_score_new.fillna(-99999)
        pred_score_new = pred_score_new[(pred_score_new.y_predict != -99999)]

        obs_new_vs_old = pd.DataFrame(data=np.NaN, index=range(Ndiv), columns=["bin", "min(inclusive)", "max(exclusive)", "new_obs", "old_obs", "PSI"])

        for i in range(Ndiv):
            obs_new_vs_old.iloc[i, obs_new_vs_old.columns.get_loc("bin")] = f"{bins[i]}-{bins[i+1]}"
            obs_new_vs_old.iloc[i, obs_new_vs_old.columns.get_loc("min(inclusive)")] = bins[i]
            obs_new_vs_old.iloc[i, obs_new_vs_old.columns.get_loc("max(exclusive)")] = bins[i+1]
            obs_new_vs_old.iloc[i, obs_new_vs_old.columns.get_loc("old_obs")] = ((pred_score_sorted["y_predict"] >= pred_min_list[i]) & (pred_score_sorted["y_predict"] < pred_max_list[i])).sum()
            obs_new_vs_old.iloc[i, obs_new_vs_old.columns.get_loc("new_obs")] = ((pred_score_new["y_predict"] >= pred_min_list[i]) & (pred_score_new["y_predict"] < pred_max_list[i])).sum()
        
        total_old = obs_new_vs_old["old_obs"].sum()
        total_new = obs_new_vs_old["new_obs"].sum()

        for i in range(Ndiv):
            old_obs_i = obs_new_vs_old.iloc[i, obs_new_vs_old.columns.get_loc("old_obs")]
            new_obs_i = obs_new_vs_old.iloc[i, obs_new_vs_old.columns.get_loc("new_obs")]
            psi_value = (old_obs_i / total_old - new_obs_i / total_new) * math.log((old_obs_i / total_old) / (new_obs_i / total_new)) if new_obs_i != 0 else 0
            obs_new_vs_old.iloc[i, obs_new_vs_old.columns.get_loc("PSI")] = psi_value

        l.append(obs_new_vs_old["PSI"].sum())
        #print(name[m],"\n",obs_new_vs_old, "\n\n\n")
    return l


In [12]:
psi_table  = pd.DataFrame()
months = ['sept_23', 'oct_23', 'nov_23', 'dec_23', 'jan_24']
for i in range(len(months)): 
    # print("-"*75)
    df = load_dataset(months[i])
    psi_values = calculate_psi(df_ref, df, features)
    psi_table[months[i]] = psi_values



In [None]:
# psi_table.to_csv(path2 + 'mrdc_data_drift/psi_table.csv')

##### Bin = 10

In [None]:
#PSI calculation
import pandas as pd
import numpy as np
import math as math
import sys as sys
import logging


def calculate_psi(df_ref, df_cur, features):
    score_file_old = df_ref.copy()
    score_file_new = df_cur.copy()
    name = features 
    l = []
    for m in range(len(name)):
        bin_count_input = 10

        pred_score=pd.DataFrame()
        pred_score["y_predict"]=score_file_old[name[m]]


        Ndiv=int(bin_count_input)
        N=len(pred_score)
        bin_obs_1=[int(N/Ndiv)] * Ndiv
        bin_obs_2=[1]*(N%Ndiv) + [0]*(Ndiv-(N%Ndiv))

        dN=[]
        for i in range(Ndiv):
            dN.append(bin_obs_1[i] + bin_obs_2[i])

        pred_score["sr_no"] = pred_score.index.tolist()  #0 indexed sr_no

        pred_score_sorted = pred_score.sort_values(by=["y_predict","sr_no"],ascending=False).reset_index().drop(columns=["sr_no"])

        istart=0
        iend=0
        pred_min_list=[]
        pred_max_list=[]
        for i in range(Ndiv):
            iend=iend+dN[i]
            pred_min_list.append(min(pred_score_sorted.iloc[istart:iend,pred_score_sorted.columns.get_loc("y_predict")]))
            pred_max_list.append(max(pred_score_sorted.iloc[istart:iend,pred_score_sorted.columns.get_loc("y_predict")]))
            istart=iend

        pred_min_list[Ndiv-1]=float("-inf")

        for i in range(Ndiv-1):
            pred_max_list[i+1]=pred_min_list[i]

        pred_max_list[0]=float("inf")

        pred_score_new=pd.DataFrame()

        pred_score_new["y_predict"]=score_file_new[name[m]]
        pred_score_new = pred_score_new.fillna(-99999)
        pred_score_new = pred_score_new[(pred_score_new.y_predict != -99999)]

        obs_new_vs_old=pd.DataFrame(data=np.NaN,index=range(Ndiv),columns=["bin","min(inclusive)","max(exclusive)","new_obs","old_obs","PSI"])

        for i in range(Ndiv):
            obs_new_vs_old.iloc[i,obs_new_vs_old.columns.get_loc("bin")]=str(int(i+1))
            obs_new_vs_old.iloc[i,obs_new_vs_old.columns.get_loc("min(inclusive)")]=pred_min_list[i]
            obs_new_vs_old.iloc[i,obs_new_vs_old.columns.get_loc("max(exclusive)")]=pred_max_list[i]
            obs_new_vs_old.iloc[i,obs_new_vs_old.columns.get_loc("old_obs")]=int(pred_score_sorted["y_predict"][(pred_score_sorted["y_predict"]>=pred_min_list[i]) & (pred_score_sorted["y_predict"]<pred_max_list[i])].count())
            obs_new_vs_old.iloc[i,obs_new_vs_old.columns.get_loc("new_obs")]=int(pred_score_new["y_predict"][(pred_score_new["y_predict"]>=pred_min_list[i]) & (pred_score_new["y_predict"]<pred_max_list[i])].count())

        total_old=obs_new_vs_old.iloc[:,obs_new_vs_old.columns.get_loc("old_obs")].sum()
        total_new=obs_new_vs_old.iloc[:,obs_new_vs_old.columns.get_loc("new_obs")].sum()
        

        for i in range(Ndiv):
            obs_new_vs_old.iloc[i,obs_new_vs_old.columns.get_loc("PSI")]=(obs_new_vs_old.iloc[i,obs_new_vs_old.columns.get_loc("old_obs")]/total_old-obs_new_vs_old.iloc[i,obs_new_vs_old.columns.get_loc("new_obs")]/total_new)*math.log((obs_new_vs_old.iloc[i,obs_new_vs_old.columns.get_loc("old_obs")]/total_old)/(obs_new_vs_old.iloc[i,obs_new_vs_old.columns.get_loc("new_obs")]/total_new))
        l.append(obs_new_vs_old["PSI"].sum())
    # print(l)
    return l

In [None]:
psi_table  = pd.DataFrame()
feature = ['ach_c_median_past30d', 
            'ach_c_std_past30d',
            'ach_d_avg_past10d',
            'ach_d_median_past30d', 
            'amount', 
            'avg_running_balance_past30d',
            'card_txn_std_past10by30d', 
            'card_txn_std_past10d',
            'credit_txn_avg_past10by30d', 
            'credit_txn_avg_past10d',
            'credit_txn_count_past10by30d', 
            'credit_txn_std_past30d',
            'debit_txn_count_past10by30d', 
            'ein_ssn', 
            'is_between1000and5000',
            'is_between200and1000', 
            'mrdc_c_avg_past30d',
            'mrdc_c_median_past10by30d', 
            'od_count_past30d',
            'past10by30d_between200and1000_ratio', 
            'past10by30d_check_ratio',
            'past10by30d_returned_check_ratio', 
            'pd_avg_past10d',
            'rejected_past10by30d_between200and1000_ratio',
            'returned_past30d_avg_check_amount', 
            'rn_past30d_avg_check_amount',
            'past10by30d_lessthan200_ratio']


months = ['sept_23', 'oct_23', 'nov_23', 'dec_23', 'jan_24']
for i in range(len(months)): 
    print("-"*75)
    df = load_dataset(months[i])
    psi_values = calculate_psi(df_ref, df, feature)
    psi_table[months[i]] = psi_values



---------------------------------------------------------------------------
[0.02168102571198926, 0.024367524696898464, inf, 0.037771092750431556, 0.003316121261713057, 0.022142355070301743, inf, inf, inf, inf, inf, 0.027984156593441008, inf, 0.015217070633027172, 0.0005458638593262771, 0.00018727047530324654, 0.008803724891569324, inf, 0.0, 0.005434620193087573, 0.010473259678896727, 0.0, inf, 0.0, 0.0, 0.003100383623616351, 0.005219476317360309]
---------------------------------------------------------------------------
[0.02650057194515871, 0.03034924282579933, inf, 0.03257612174391759, 0.0015655194410951103, 0.018488309932279567, inf, inf, inf, inf, inf, 0.03313549087257664, inf, 0.017843224212422865, 0.00012838135274165062, 5.230682026127185e-09, 0.008285360393961548, inf, 0.0, 0.003446840756825485, 0.011829981106141717, 0.0, inf, 0.0, 0.0, 0.0019081847357900654, 0.0029795574317451187]
---------------------------------------------------------------------------
[0.05610263818854441

In [None]:
psi_table.index = feature 
psi_table 

Unnamed: 0,sept_23,oct_23,nov_23,dec_23,jan_24
ach_c_median_past30d,0.021681,0.02650057,0.056103,0.054952,0.049768
ach_c_std_past30d,0.024368,0.03034924,0.05808,0.057361,0.051782
ach_d_avg_past10d,inf,inf,inf,inf,inf
ach_d_median_past30d,0.037771,0.03257612,0.079529,0.072835,0.075604
amount,0.003316,0.001565519,0.010847,0.007002,0.008195
avg_running_balance_past30d,0.022142,0.01848831,0.091118,0.092691,0.082229
card_txn_std_past10by30d,inf,inf,inf,inf,inf
card_txn_std_past10d,inf,inf,inf,inf,inf
credit_txn_avg_past10by30d,inf,inf,inf,inf,inf
credit_txn_avg_past10d,inf,inf,inf,inf,inf


In [None]:
# psi_table.to_csv(path2 + "mrdc_data_drift/psi_table2.csv")