In [1]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import cloudpickle as pkl
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score
from IPython.display import display, HTML, Image

from column_utils import *

from pathlib import Path
import re
import yaml

In [2]:
# Identify the model artefact under evaluation; model_name doubles as its folder name.
target = 'personal_loan'
model_date = '20241204'
version = 1
algo = 'xgb'
model_name = f'{target}_{algo}_{model_date}_v{version}'

# NOTE(review): unpickling can execute arbitrary code — only load artefacts
# that come from a trusted location.
model_path = f'{target}/{model_name}/calibrated_model.pkl'
with open(model_path, 'rb') as pickle_handle:
    model = pkl.load(pickle_handle)

# Project configuration; only the product window configured for this target is read here.
with open('config-mukto.yaml', 'r') as config_handle:
    config = yaml.safe_load(config_handle)

product_window = config['product_window'][target]

In [3]:
# Gather every regular file (sub-directories are skipped) in the target's snapshot folder.
snapshot_location = f'{target}/snapshots'

filepaths = [
    str(entry)
    for entry in Path(snapshot_location).iterdir()
    if entry.is_file()
]

In [4]:
# Order the paths lexicographically (the filenames embed an ISO date), then peek at the first.
filepaths.sort()
filepaths[0]

'personal_loan/snapshots/model_base_2024-09-05_personal_loan.csv'

In [5]:
# Score the already-trained model on every snapshot file and collect, per snapshot:
#   * gini (2 * AUC - 1) on five stratified random subsets plus their average
#   * the number of distinct predicted probabilities and the number of rows
# Results end up in `gini_snapshot_df` (fold level) and `prob_snapshot_df` (snapshot level).
pattern = r"model_base_(\d{4}-\d{2}-\d{2})_.*\.csv"

# --- Loop-invariant inputs: resolved once instead of once per snapshot ---
features = pd.read_csv(f'{target}/{model_name}/features.csv')['0'].tolist()

apply_window = product_window != 'skip'
drop_columns = None
rename_columns = None
drop_all = False
if apply_window:
    # NOTE(review): assumes the column_utils helpers depend only on their
    # arguments (no per-call state) — confirm before relying on the hoist.
    drop_columns = get_drop_column_list(target, product_window)
    drop_all = drop_columns == 'all'
    if not drop_all:
        rename_columns = get_update_column_dict(target, product_window)

# An integer random_state makes every split() call reproduce the same shuffles
# for a given dataset size, so one splitter can be shared by all snapshots.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

gini_frames = []      # one small frame (5 folds + average) per snapshot
proba_records = []    # one dict per snapshot

for filepath in filepaths:
    date_match = re.search(pattern, filepath)
    if date_match is None:
        # Stray files in the snapshot folder (e.g. OS metadata) used to crash
        # the whole run with AttributeError; skip them visibly instead.
        print(f'Skipping non-snapshot file: {filepath}')
        continue
    date = date_match.group(1)

    data = pd.read_csv(filepath, low_memory=False)
    if apply_window:
        if drop_all:
            if target == 'payments':
                products = ['bbps', 'upi']
                data = data.loc[:, ~data.columns.str.contains(r'homepage_(' + '|'.join(products) + r')_', regex=True)]
        else:
            data = data.drop(columns=drop_columns)
            data = data.rename(columns=rename_columns)

    x = data[features].copy()
    y = data['target'].copy()

    # Predict once per snapshot and slice per fold instead of re-running the
    # model on each subset; assumes predictions are row-independent.
    positive_proba = model.predict_proba(x)[:, 1]
    y_true = y.to_numpy()

    # NOTE(review): each "fold" is scored on the 80% partition (`train_index`);
    # the 20% `test_index` partition is unused. Kept as-is so reported numbers
    # do not change — confirm this is the intended evaluation subset.
    auc_scores = [
        roc_auc_score(y_true[train_index], positive_proba[train_index])
        for train_index, _test_index in sss.split(x, y)
    ]
    gini_scores = [2 * auc - 1 for auc in auc_scores]

    gini_df = pd.DataFrame({
        'snapshot_date': [date] * len(gini_scores),
        'fold': [f'Fold {i+1}' for i in range(len(gini_scores))],
        'gini': gini_scores,
    })
    average_row = {
        'snapshot_date': date,
        'fold': 'Average',
        'gini': gini_df.gini.mean(),
    }
    gini_df = pd.concat([gini_df, pd.DataFrame([average_row])], ignore_index=True)
    gini_frames.append(gini_df)

    proba_data = {
        'snapshot_date': date,
        'unique_proba': np.unique(positive_proba).shape[0],
        'dataset_size': y.shape[0],
    }
    proba_records.append(proba_data)
    proba_df = pd.DataFrame([proba_data])

    display(HTML(f"<h2>Snapshot of {date}</h2><h3>Gini</h3>{gini_df.to_html(index=False)}<h3>Unique Probabilities</h3>{proba_df.to_html(index=False)}"))

# Assemble the accumulated pieces once (avoids quadratic concat-in-a-loop).
# An empty DataFrame is kept as the no-snapshot result, as before.
gini_snapshot_df = pd.concat(gini_frames, ignore_index=True) if gini_frames else pd.DataFrame()
prob_snapshot_df = pd.DataFrame(proba_records)

snapshot_date,fold,gini
2024-09-05,Fold 1,0.441741
2024-09-05,Fold 2,0.444269
2024-09-05,Fold 3,0.441709
2024-09-05,Fold 4,0.441628
2024-09-05,Fold 5,0.440903
2024-09-05,Average,0.44205

snapshot_date,unique_proba,dataset_size
2024-09-05,175,485413


snapshot_date,fold,gini
2024-09-12,Fold 1,0.534937
2024-09-12,Fold 2,0.523676
2024-09-12,Fold 3,0.527569
2024-09-12,Fold 4,0.532068
2024-09-12,Fold 5,0.533457
2024-09-12,Average,0.530341

snapshot_date,unique_proba,dataset_size
2024-09-12,191,564746


snapshot_date,fold,gini
2024-09-19,Fold 1,0.644461
2024-09-19,Fold 2,0.644073
2024-09-19,Fold 3,0.641867
2024-09-19,Fold 4,0.643161
2024-09-19,Fold 5,0.640999
2024-09-19,Average,0.642912

snapshot_date,unique_proba,dataset_size
2024-09-19,208,659607


snapshot_date,fold,gini
2024-09-26,Fold 1,0.691924
2024-09-26,Fold 2,0.694406
2024-09-26,Fold 3,0.696422
2024-09-26,Fold 4,0.688986
2024-09-26,Fold 5,0.691084
2024-09-26,Average,0.692564

snapshot_date,unique_proba,dataset_size
2024-09-26,191,601295


snapshot_date,fold,gini
2024-10-17,Fold 1,0.603829
2024-10-17,Fold 2,0.618779
2024-10-17,Fold 3,0.60301
2024-10-17,Fold 4,0.616303
2024-10-17,Fold 5,0.59767
2024-10-17,Average,0.607918

snapshot_date,unique_proba,dataset_size
2024-10-17,186,381538


In [6]:
# One row per snapshot: count of distinct predicted probabilities and number of rows scored.
prob_snapshot_df

Unnamed: 0,snapshot_date,unique_proba,dataset_size
0,2024-09-05,175,485413
1,2024-09-12,191,564746
2,2024-09-19,208,659607
3,2024-09-26,191,601295
4,2024-10-17,186,381538


In [7]:
# Show only the per-snapshot 'Average' rows, restricted to date and gini.
is_average = gini_snapshot_df['fold'] == 'Average'
gini_snapshot_df.loc[is_average, ['snapshot_date', 'gini']]

Unnamed: 0,snapshot_date,gini
5,2024-09-05,0.44205
11,2024-09-12,0.530341
17,2024-09-19,0.642912
23,2024-09-26,0.692564
29,2024-10-17,0.607918


In [8]:
# Independent copy of the 'Average' gini rows; this is the base of the snapshot-level summary.
overall_snapshot_df = (
    gini_snapshot_df
    .loc[gini_snapshot_df['fold'] == 'Average', ['snapshot_date', 'gini']]
    .copy()
)
overall_snapshot_df

Unnamed: 0,snapshot_date,gini
5,2024-09-05,0.44205
11,2024-09-12,0.530341
17,2024-09-19,0.642912
23,2024-09-26,0.692564
29,2024-10-17,0.607918


In [9]:
# Snapshot-level summary: average gini joined with the probability statistics.
# The left side is rebuilt from `gini_snapshot_df` rather than from
# `overall_snapshot_df` itself, so re-running this cell is idempotent. Merging
# onto the already-merged frame would produce duplicated `_x` / `_y` columns.
overall_snapshot_df = pd.merge(
    gini_snapshot_df.loc[
        gini_snapshot_df['fold'] == 'Average', ['snapshot_date', 'gini']
    ],
    prob_snapshot_df,
    on='snapshot_date',
    how='inner',
)
overall_snapshot_df

Unnamed: 0,snapshot_date,gini,unique_proba,dataset_size
0,2024-09-05,0.44205,175,485413
1,2024-09-12,0.530341,191,564746
2,2024-09-19,0.642912,208,659607
3,2024-09-26,0.692564,191,601295
4,2024-10-17,0.607918,186,381538


In [10]:
# Re-display the snapshot-level summary (gini, unique probabilities, dataset size) before export.
overall_snapshot_df

Unnamed: 0,snapshot_date,gini,unique_proba,dataset_size
0,2024-09-05,0.44205,175,485413
1,2024-09-12,0.530341,191,564746
2,2024-09-19,0.642912,208,659607
3,2024-09-26,0.692564,191,601295
4,2024-10-17,0.607918,186,381538


In [11]:
# Persist the snapshot-level summary to the working directory, without the index column.
overall_snapshot_df.to_csv(path_or_buf="Snapshot-Wise-Gini.csv", index=False)

In [13]:
# Fold-level gini rows, each annotated with its snapshot's probability statistics,
# saved inside the model's own folder.
# NOTE(review): unlike the summary export, the DataFrame index is written as the
# first CSV column here (no index=False) — confirm this is intended.
snapshot_export = gini_snapshot_df.merge(
    prob_snapshot_df, how='left', on='snapshot_date'
)
snapshot_export.to_csv(f'{target}/{model_name}/snapshot_wise_data.csv')