## Read Raw Data Files

In [1]:
import pandas as pd
# Relative paths to the pickled classifier and to the cleaned train/test loan
# extracts that are read in the cells below.
fp_model_loc = "../../../models/WOE_calibrated_classifier.pkl"
fptrain = "../../../data/cleaned_sba_7a_loans_train.parquet"
fptest = "../../../data/cleaned_sba_7a_loans_test.parquet"

In [2]:
# Read both splits, then stack them row-wise into one frame for scoring.
# NOTE(review): the concat keeps each split's own row labels, so `df` may carry
# duplicate index values — confirm nothing downstream aligns on the index.
df_train, df_test = (pd.read_parquet(split_path) for split_path in (fptrain, fptest))
df = pd.concat([df_train, df_test])

## Load the Model

In [3]:
import pickle
# SECURITY: unpickling executes arbitrary code embedded in the file; only load
# model files that come from a trusted source.
with open(fp_model_loc, mode="rb") as model_file:
    clf = pickle.load(model_file)

In [4]:
# Display the unpickled object's repr to confirm what was loaded.
clf

In [5]:
# Predictors are every column except the target ("LoanStatus") and
# "SBAGuaranteedApproval", which is also held out of the feature set.
excluded_columns = ["LoanStatus", "SBAGuaranteedApproval"]
preds = [column for column in df.columns.tolist() if column not in excluded_columns]

In [6]:
# Target column and feature matrix (the predictor columns selected above in `preds`).
df_Y = df["LoanStatus"]
df_X = df[preds]

In [7]:
# Sanity check: (rows, predictor columns) of the frame that will be scored.
df_X.shape

(7838, 6)

## Score Charge Off Probabilities

In [8]:
# Score every loan and keep the second probability column.
# NOTE(review): assumes column 1 of predict_proba is the charge-off class
# (LoanStatus == 1) — confirm against clf.classes_.
class_probabilities = clf.predict_proba(df_X)
charge_off_prob = class_probabilities[:, 1]

In [9]:
import matplotlib.pyplot as plt
import numpy as np

# First pass: cut the predicted probabilities into NUM_BINS quantile bins,
# labelled "p-1" through "p-<NUM_BINS>" in ascending order of probability.
NUM_BINS = 10
plabels = [f"p-{bin_number}" for bin_number in range(1, NUM_BINS + 1)]
bin_prob = pd.qcut(charge_off_prob, q=NUM_BINS, labels=plabels)

## Rebin for Annotation

While the decile binning scheme (where the probability range is binned into deciles) is good for calibration, it is not readily interpretable: because of the data imbalance, many bins have very low counts. To make the model easy to interpret and apply, we need to rebin so that we have a small number of bins, with bin counts that make the scheme easy to apply in practice. The approach I have used here is to rebin such that there are _similar_ (not exactly the same) counts in each bin. I started with 4 bins, looked at the results, then merged bins 3 and 4 together. This gives a set of three bins with similar counts that can be easily interpreted and applied. So one way to circumvent the thresholding problem we discussed in the modeling notebook is to check which bin a new loan (received in 2024) falls into, based on the predicted probability from the model. We then know its quality based on the bin profile. The bin profiles for each of the three bins are developed below.

In [10]:
# Pair each loan's predicted probability and quantile bin with its observed
# outcome, then count outcomes within each bin.
df_probs = pd.DataFrame({"prob": charge_off_prob, "bin": bin_prob, "LoanStatus": df_Y})
# "bin" is categorical: pass `observed` explicitly so pandas does not emit its
# FutureWarning about the changing default, and today's behaviour stays pinned.
df_probs.groupby("bin", observed=False)["LoanStatus"].value_counts()

  df_probs.groupby("bin")["LoanStatus"].value_counts()


bin   LoanStatus
p-1   0             1109
      1               12
p-2   0              557
      1                8
p-3   0              775
      1               20
p-4   0              937
      1               51
p-5   0              964
      1               67
p-6   0              451
      1               41
p-7   0              810
      1              101
p-8   0              347
      1               58
p-9   0              584
      1              188
p-10  0              410
      1              348
Name: count, dtype: int64

Note that there are too many bins with low counts, so we try 4 bins next.

In [11]:
import matplotlib.pyplot as plt
import numpy as np

# Coarser pass: the same quantile binning with four bins, labelled "p-1" .. "p-4"
# in ascending order of probability.
NUM_BINS = 4
plabels = [f"p-{bin_number}" for bin_number in range(1, NUM_BINS + 1)]
bin_prob = pd.qcut(charge_off_prob, q=NUM_BINS, labels=plabels)

In [12]:
# Rebuild the probability/bin/outcome frame with the four-bin assignment and
# count outcomes within each bin.
df_probs = pd.DataFrame({"prob": charge_off_prob, "bin": bin_prob, "LoanStatus": df_Y})
# "bin" is categorical: pass `observed` explicitly so pandas does not emit its
# FutureWarning about the changing default, and today's behaviour stays pinned.
df_probs.groupby("bin", observed=False)["LoanStatus"].value_counts()

  df_probs.groupby("bin")["LoanStatus"].value_counts()


bin  LoanStatus
p-1  0             2159
     1               31
p-2  0             2183
     1              127
p-3  0             1261
     1              142
p-4  0             1341
     1              594
Name: count, dtype: int64

If we merge p-3 and p-4 together, we get a set of bin profiles with similar counts.

In [13]:
def label_bins(x):
    """Translate a quantile-bin label into a credit-quality label.

    "p-1" maps to "GOOD"; "p-3" and "p-4" are merged into "POOR"; every other
    value (including "p-2") maps to "AVERAGE".
    """
    if x == "p-1":
        return "GOOD"
    if x in ("p-3", "p-4"):
        return "POOR"
    return "AVERAGE"


In [14]:
# Add a credit-quality column by mapping each loan's bin label through label_bins.
df_probs["credit_quality"] = df_probs["bin"].apply(label_bins)

## Rebinned Loan Quality
The rebinned loan quality now has three bins with a similar number of loans in each bin. We can now annotate each loan with this quality label. In `LoanStatus`, 1 represents a charge off, so the ordering of the bins from best to worst credit quality is GOOD, AVERAGE, POOR.

In [15]:
# Outcome counts (LoanStatus 0 vs 1) within each credit-quality label.
df_probs.groupby("credit_quality")["LoanStatus"].value_counts()

credit_quality  LoanStatus
AVERAGE         0             2183
                1              127
GOOD            0             2159
                1               31
POOR            0             2602
                1              736
Name: count, dtype: int64

### Note: 
_It is possible to interpret this with borrower data, but I choose not to do this in this notebook. The goal here is to illustrate modeling features that we can use with KMDS and not credit quality_.

### KMDS Logging for Data Annotation

In [16]:
from kmds.ontology.kmds_ontology import *
from kmds.tagging.tag_types import DataRepresentationTags


In [17]:
from kmds.utils.load_utils import *
RP_DEP_KB = "../imbalanced_cost_based_learning/WOE_modeling_7a_charge_offs.xml"

# Load the knowledge base this notebook depends on and collect the workflows it
# defines. Both names are initialised first so that a failed load leaves them
# defined instead of raising NameError in the check below or in a later cell.
onto2 = None
workflow_instances = []
try:
    onto2 = get_ontology(RP_DEP_KB).load()
    workflow_instances = Workflow.instances()
except Exception as e:
    # Stay best-effort (no re-raise), but report the underlying cause as well.
    print("Error opening KB, check if KB exists and permissions are right")
    print(f"{type(e).__name__}: {e}")
# Check the ontology loaded in this cell (`onto2`), not the separate `onto` object.
if onto2 is None:
    print("Could not load Ontology, check the file and try again.")



In [18]:
# Create the workflow object under which this notebook's annotation findings are logged.
kaw = KnowledgeApplicationWorkflow("7a_credit_quality_data_annotation")

In [19]:
# Declare the first workflow returned by Workflow.instances() as this workflow's dependency.
kaw.depends_on = [workflow_instances[0]]

In [20]:
# Show the name of the linked workflow to confirm the dependency is the expected one.
kaw.depends_on[0].name

'WOE_modeling_7a_charge_offs'

In [25]:
from kmds.tagging.tag_types import DataRepresentationTags
# Findings to log, in the order they are sequenced in the knowledge base.
# Multi-line findings are written as adjacent string literals with an explicit
# space at each join: a backslash-newline inside a single literal glues the last
# word of one line to the first word of the next.
# NOTE(review): the last finding announces 4 subsets but enumerates only 3 —
# confirm the intended wording.
dr_findings = [
    "The 7a WOE based Logistic Regression model is used to score the 2023 7a lender dataset",
    "The decile binning scheme that was used for probability calibration yielded some bins with low counts, so rebinning was done.",
    (
        "A quartile binning scheme and subsequent adjustment yielded bins with similar counts - 3 bins seems like a good choice "
        "for annotation. The bins are ordered in terms of proportion of chargeoffs in the bin. We have three bins - GOOD, AVERAGE and POOR"
    ),
    (
        "We can partition 2023 loans into 4 subsets; (1) Gold Quality Loans (2) Outliers: These are loans "
        "with too few instances for us to apply supervised learning because we cannot evaluate generalization on them. "
        "If you had to model these, unsupervised learning may be good choice (3) Loans Scored by the WOE Logistic regression model."
    ),
]

dr_obs_list = []
dr_observation_count = 1  # 1-based sequence number of the next finding
for finding_text in dr_findings:
    observation = DataRepresentationObservation(namespace=onto)
    observation.finding = finding_text
    observation.finding_sequence = dr_observation_count
    # NOTE(review): the tag is stored on `model_selection_observation_type` although
    # the object is a DataRepresentationObservation — confirm the attribute name.
    observation.model_selection_observation_type = DataRepresentationTags.FEATURE_ENGG_OBSERVATION.value
    dr_obs_list.append(observation)
    dr_observation_count += 1

# Keep the individual names available for any cell that refers to them.
dr1, dr2, dr3, dr4 = dr_obs_list

In [27]:
# Attach the logged observations to the workflow, then write the `onto` ontology
# to KNOWLEDGE_BASE in RDF/XML format.
KNOWLEDGE_BASE = "WOE_data_annotation_7a_chargeoffs.xml"
kaw.has_data_representation_observations = dr_obs_list
onto.save(file=KNOWLEDGE_BASE, format="rdfxml")