In [6]:
import pickle
from sklearn.tree import export_text
import pandas as pd
import numpy as np

In [7]:
# Restore the previously pickled model object from disk and bind it to `clf`.
# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so only load this file if it comes from a trusted source.
filename = 'model.pkl'
# The context manager closes the file handle even if unpickling raises.
with open(filename, 'rb') as file:
    clf = pickle.load(file)

In [8]:
# Report the size of the loaded tree: depth on the first line, leaf count on the second.
print(clf.get_depth(), clf.get_n_leaves(), sep='\n')

30
18253


In [9]:
# Pair each feature name stored on the model with its importance score
# and print one "name: score" line per feature.
for name, value in zip(clf.features, clf.feature_importances_):
    print('{}: {}'.format(name, value))

Debt Consolidation Loan: 0.007820864512607559
Changed_Credit_Limit: 0.07541078234808375
Annual_Income: 0.07014154376595008
Num_of_Loan: 0.022213565731604785
Credit_History_Age: 0.20521106958206323
Num_of_Delayed_Payment: 0.05939904587009031
Num_Bank_Accounts: 0.029854014570083683
Delay_from_due_date: 0.06795467907001519
Num_Credit_Inquiries: 0.03913662963232784
Num_Credit_Card: 0.03380598949826663
Outstanding_Debt: 0.18442786335654007
Interest_Rate: 0.0629668363167222
Credit_Mix: 0.14165711574564469


In [10]:
# Read the test set, keep only the columns listed in clf.features
# (in that order), and convert the selection to a NumPy array.
df = pd.read_csv("./test.csv")
X = np.array(df[clf.features])

In [16]:
def get_decision_path(clf, X_test, sample_ids=None):
    """Print the chain of split rules a fitted decision tree applies to samples.

    For every selected row of ``X_test`` this prints one line per internal
    (non-leaf) node on the row's path through the tree, showing the split
    feature, the row's value for it, and how that value compares with the
    node's threshold. The predicted label is printed after the rules.

    Parameters
    ----------
    clf : fitted tree estimator
        Must expose ``tree_.feature``, ``tree_.threshold``, ``decision_path``,
        ``apply`` and ``predict``, plus two extra attributes read here:
        ``clf.features`` (feature names, indexed by column position) and
        ``clf.labels`` (label names, indexed by the value ``predict`` returns).
    X_test : array-like of shape (n_samples, n_features)
        Samples to explain. It is passed unchanged to the estimator; an array
        conversion of it is used to look up individual values, so a DataFrame
        or nested list works as well as an ndarray.
    sample_ids : iterable of int, optional
        Row positions to report. Defaults to every row of ``X_test``, which
        can produce a very large amount of output for a big test set.

    Raises
    ------
    IndexError
        If an entry of ``sample_ids`` is not a valid row position.
    """
    split_feature = clf.tree_.feature
    split_threshold = clf.tree_.threshold
    node_indicator = clf.decision_path(X_test)
    leaf_id = clf.apply(X_test)
    classes = clf.predict(X_test)

    # Positional (row, column) lookups below need an array, not a DataFrame/list.
    values = np.asarray(X_test)
    n_samples = values.shape[0]
    if sample_ids is None:
        sample_ids = range(n_samples)

    for sample_id in sample_ids:
        # Reject negative/out-of-range ids explicitly: a negative id would
        # otherwise silently select the wrong slice of the path indicator.
        if not 0 <= sample_id < n_samples:
            raise IndexError(
                f"sample_id {sample_id} is out of range for {n_samples} samples"
            )

        # indptr delimits the slice of indices holding the ids of the nodes
        # this sample passes through.
        node_index = node_indicator.indices[
            node_indicator.indptr[sample_id] : node_indicator.indptr[sample_id + 1]
        ]

        print("Rules used to predict sample {id}:\n".format(id=sample_id))
        for node_id in node_index:
            # The leaf the sample ends in has no split rule to report.
            if leaf_id[sample_id] == node_id:
                continue

            feature_idx = split_feature[node_id]
            value = values[sample_id, feature_idx]
            # Report which side of this node's threshold the sample's value is on.
            threshold_sign = "<=" if value <= split_threshold[node_id] else ">"

            print(
                "decision node {node} : (X_test[{sample}, {feature}] = {value}) "
                "{inequality} {threshold})".format(
                    node=node_id,
                    sample=sample_id,
                    feature=clf.features[feature_idx],
                    value=value,
                    inequality=threshold_sign,
                    threshold=split_threshold[node_id],
                )
            )
        print(f'Sample {sample_id} is {clf.labels[classes[sample_id]]}')
        print("*"*50)
        print()

In [17]:
# Print the decision rules and predicted label for every row of the test matrix.
get_decision_path(clf=clf, X_test=X)

Rules used to predict sample 0:

decision node 0 : (X_test[0, Credit_Mix] = 1.0) <= 1.5)
decision node 1 : (X_test[0, Outstanding_Debt] = 758.44) <= 1255.469970703125)
decision node 2 : (X_test[0, Annual_Income] = 25546.26) <= 93708.23828125)
decision node 3 : (X_test[0, Num_Credit_Inquiries] = 5.0) > 4.5)
decision node 2703 : (X_test[0, Interest_Rate] = 14.0) <= 21.0)
decision node 2704 : (X_test[0, Changed_Credit_Limit] = 7.83) <= 8.21500015258789)
decision node 2705 : (X_test[0, Credit_Mix] = 1.0) > 0.5)
decision node 2711 : (X_test[0, Num_of_Loan] = 5.0) <= 5.5)
decision node 2712 : (X_test[0, Annual_Income] = 25546.26) > 9870.66748046875)
decision node 2740 : (X_test[0, Delay_from_due_date] = 16.0) > 5.5)
decision node 2764 : (X_test[0, Num_Bank_Accounts] = 8.0) > 6.5)
decision node 3236 : (X_test[0, Outstanding_Debt] = 758.44) <= 1009.5150146484375)
decision node 3237 : (X_test[0, Changed_Credit_Limit] = 7.83) > 7.329999923706055)
decision node 3371 : (X_test[0, Num_of_Delayed_Pa