In [1]:
# Feature matrices for the train and test splits, stored as .npz archives and
# read back through load_sparse_csr(). Paths are relative, so they resolve
# against the kernel's working directory.
fp_feat_train = "../../../data/sba_7a_loans_train_feat.npz"
fp_feat_test =  "../../../data/sba_7a_loans_test_feat.npz"
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix


In [2]:
def load_sparse_csr(filename):
    """Load a CSR matrix saved as an ``.npz`` archive into a sparse DataFrame.

    The archive must contain the arrays ``data``, ``indices``, ``indptr`` and
    ``shape``, i.e. the components needed to rebuild a
    ``scipy.sparse.csr_matrix``.

    Parameters
    ----------
    filename : str, path-like or file object
        Anything accepted by ``numpy.load``.

    Returns
    -------
    pandas.DataFrame
        Sparse-backed frame built from the matrix with
        ``pd.DataFrame.sparse.from_spmatrix``.

    Raises
    ------
    KeyError
        If one of the four expected arrays is missing from the archive.
    """
    # An .npz archive stays open until it is closed explicitly; the context
    # manager releases the file handle once the arrays have been read.
    with np.load(filename) as loader:
        m = csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                       shape=loader['shape'])
    return pd.DataFrame.sparse.from_spmatrix(m)

In [3]:
# Label files for the train and test splits, stored as parquet and read with
# pd.read_parquet() in the cells below.
fp_label_train = "../../../data/sba_7a_loans_train_labels.parquet"
fp_label_test = "../../../data/sba_7a_loans_test_labels.parquet"

In [4]:
# Training features as a sparse-backed DataFrame; labels are read from parquet
# and flattened to a 1-D array with .values.ravel().
df_feat_train = load_sparse_csr(fp_feat_train)
y_label_train = pd.read_parquet(fp_label_train).values.ravel()

In [5]:
import imblearn

from imblearn.ensemble import BalancedRandomForestClassifier

# Bagging candidate: a balanced random forest fitted on the training split.
# n_estimators=100 is the value recorded as a modelling choice in the
# knowledge-base cell further down; random_state=0 fixes the seed.
# NOTE(review): sampling_strategy="all", replacement=True and bootstrap=False
# are passed explicitly rather than left at their defaults — presumably to pin
# the resampling behaviour across imblearn versions; confirm against the
# installed release.
brf = BalancedRandomForestClassifier(
    n_estimators=100, random_state=0, sampling_strategy="all", replacement=True,
    bootstrap=False,
)
brf.fit(df_feat_train, y_label_train)

In [6]:
# In-sample predictions: made on the same rows the forest was fitted on, so
# any metric computed from y_pred is a training-set figure.
y_pred = brf.predict(df_feat_train)

In [7]:
from sklearn.metrics import balanced_accuracy_score

In [8]:
# NOTE(review): target_names is not referenced anywhere else in the visible
# notebook; presumably these are display labels for the two classes — confirm
# the label encoding before using them in a report.
target_names = ['PIF', 'CHGOFF']
# Balanced accuracy of the bagging model on the training split (in-sample).
balanced_accuracy_score(y_label_train, y_pred)

0.959411596686661

In [9]:
from imblearn.metrics import sensitivity_score
# Training-set sensitivity of the bagging model. The output recorded for this
# cell (1.0) is well above the test-set sensitivity printed by a later cell
# (about 0.53), so read this as an in-sample figure only.
sensitivity_score(y_label_train, y_pred, average='binary')

1.0

In [10]:
# Held-out evaluation of the bagging model: load the test labels, predict on
# the test features, then report balanced accuracy and sensitivity.
y_label_test = np.ravel(pd.read_parquet(fp_label_test).values)
y_pred_test = brf.predict(load_sparse_csr(fp_feat_test))
ba_bag, sc_bag = (
    balanced_accuracy_score(y_label_test, y_pred_test),
    sensitivity_score(y_label_test, y_pred_test, average='binary'),
)
print("Balanced accuracy = {ba} , sensitivity = {s}".format(ba=ba_bag, s=sc_bag))

Balanced accuracy = 0.7194237031715712 , sensitivity = 0.5340314136125655


In [11]:
from imblearn.ensemble import RUSBoostClassifier
# Boosting candidate: RUSBoost with 200 estimators (the value recorded as a
# modelling choice in the knowledge-base cell further down) and a fixed seed.
# NOTE(review): algorithm='SAMME.R' has been deprecated in recent scikit-learn
# AdaBoost releases — confirm it is still accepted by the installed
# imblearn / scikit-learn versions before re-running.
rusboost = RUSBoostClassifier(n_estimators=200, algorithm='SAMME.R',
                              random_state=0)
rusboost.fit(df_feat_train, y_label_train)

In [12]:
# Held-out evaluation of the boosting model, mirroring the bagging evaluation:
# test labels and features are re-read here so the cell does not depend on
# variables left behind by the earlier evaluation cell.
y_label_test = np.ravel(pd.read_parquet(fp_label_test).values)
y_pred_test = rusboost.predict(load_sparse_csr(fp_feat_test))
ba_boost, sc_boost = (
    balanced_accuracy_score(y_label_test, y_pred_test),
    sensitivity_score(y_label_test, y_pred_test, average='binary'),
)
print("Balanced accuracy = {ba} , sensitivity = {s}".format(ba=ba_boost, s=sc_boost))

Balanced accuracy = 0.5282130207354582 , sensitivity = 0.9319371727748691


In [13]:
from kmds.tagging.tag_types import ModelSelectionTags, ModellingChoiceTags
from owlready2 import *
from kmds.utils.load_utils import *
#from utils.path_utils import *
# Knowledge-base file: loaded with load_kb() below and written back to the
# same path by onto2.save() at the end of the notebook.
KNOWLEDGE_BASE = "sba_7a_loan_chargeoff_modelling.xml"

In [14]:
# Load the existing knowledge base and look up its Workflow instances.
onto2 = load_kb(KNOWLEDGE_BASE)
with onto2:
    insts = Workflow.instances()
# The first instance is taken as the workflow to annotate; the indexing
# assumes the loaded file contains at least one Workflow.
the_workflow_instance = insts[0]
# Show the instances as the cell output.
insts

[sba_7a_loan_chargeoff_modelling.xml.sba_7a_loan_chargeoff_modelling]

In [15]:
def _add_observation(obs_list, observation_cls, type_attr, type_value, finding):
    """Create one observation in ``onto2`` and append it to ``obs_list``.

    Parameters
    ----------
    obs_list : list
        List the new observation is appended to. Its current length also
        determines ``finding_sequence`` (1 for the first entry, 2 for the
        second, ...), so sequence numbers cannot drift out of step.
    observation_cls : class
        Observation class to instantiate (called with ``namespace=onto2``).
    type_attr : str
        Name of the attribute that carries the observation type tag.
    type_value
        Value stored in ``type_attr``.
    finding : str
        Text of the finding.

    Returns
    -------
    The newly created observation.
    """
    observation = observation_cls(namespace=onto2)
    observation.finding = finding
    observation.finding_sequence = len(obs_list) + 1
    setattr(observation, type_attr, type_value)
    obs_list.append(observation)
    return observation


def _add_model_selection(finding):
    """Record a model-selection observation in ``ms_obs_list``."""
    return _add_observation(
        ms_obs_list,
        ModelSelectionObservation,
        "model_selection_observation_type",
        ModelSelectionTags.MODEL_SELECTION_OBSERVATION.value,
        finding,
    )


def _add_modelling_choice(finding):
    """Record a modelling-choice observation in ``mc_obs_list``."""
    return _add_observation(
        mc_obs_list,
        ModellingChoiceObservation,
        "modelling_choice_observation_type",
        ModellingChoiceTags.MODELLING_CHOICE_OBSERVATION.value,
        finding,
    )


ms_obs_list = []
mc_obs_list = []

# Observations are created in the same order as before: the overall
# model-selection statement, the three hyper-parameter choices, then the
# per-model test metrics.
ms1 = _add_model_selection(
    "Bagging and Boosting approaches to model development are evaluated."
)

mc1 = _add_modelling_choice(
    "The feature dimension is 1024, this is a hyper-parameter choice."
)
mc2 = _add_modelling_choice(
    "Bagging uses 100 estimators, this is a hyper-parameter choice."
)
mc3 = _add_modelling_choice(
    "Boosting uses 200 estimators, this is a hyper-parameter choice."
)

ms2 = _add_model_selection(
    "Bagging has balanced accuracy: {ba:.2f}, sensitivity: {s:.2f}".format(ba=ba_bag, s=sc_bag)
)
# Previously this observation was also bound to the name ms2, shadowing the
# bagging observation above.
ms3 = _add_model_selection(
    "Boosting has balanced accuracy: {ba:.2f}, sensitivity: {s:.2f}".format(ba=ba_boost, s=sc_boost)
)

# Plain assignment: whatever the workflow instance previously held in these
# two attributes is replaced by the lists built above.
the_workflow_instance.has_modeling_choice_observations = mc_obs_list
the_workflow_instance.has_model_selection_observations = ms_obs_list

# Write the updated knowledge base back to the file it was loaded from.
onto2.save(file=KNOWLEDGE_BASE, format="rdfxml")
