In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_text, export_graphviz
from sklearn.preprocessing import LabelEncoder
import graphviz

In [2]:
# Load the German Credit Data
df = pd.read_csv("German Credit Data.csv")

In [3]:
# Encode categorical features using LabelEncoder
df_encoded = df.copy()
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [4]:
# Define features and target (target: "status" where 1 = Good Credit, 0 = Bad Credit)
X = df_encoded.drop(columns=["status"])
y = df_encoded["status"]

In [5]:
# Split data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
 

In [6]:
# Set up hyperparameter grid for grid search
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": np.arange(2, 11)
}

In [7]:
# Initialize Decision Tree Classifier and GridSearchCV with 10-fold CV using ROC AUC score
dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(dt, param_grid, cv=10, scoring="roc_auc", n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=42),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10])},
             scoring='roc_auc')

In [8]:
# Best hyperparameters and ROC AUC score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters for German Credit Data:", best_params)
print("Best ROC AUC Score (CV):", best_score)

Best Parameters for German Credit Data: {'criterion': 'entropy', 'max_depth': 3}
Best ROC AUC Score (CV): 0.7207217261904761


In [9]:
# Fit the best estimator on the training set
best_dt = grid_search.best_estimator_
best_dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=42)

In [10]:
# --- Visualization using graphviz ---
# Generate DOT data for the tree visualization
dot_data = export_graphviz(
    best_dt, 
    out_file=None,
    feature_names=X.columns,
    class_names=['Bad Credit', 'Good Credit'],
    filled=True, rounded=True,
    special_characters=True
)
graph = graphviz.Source(dot_data)

In [11]:
# Display text representation of the decision rules
tree_rules = export_text(best_dt, feature_names=list(X.columns))
print("\nText Representation of Decision Rules:\n")
print(tree_rules)


Text Representation of Decision Rules:

|--- checkin_acc <= 1.50
|   |--- duration <= 22.50
|   |   |--- credit_history <= 1.50
|   |   |   |--- class: 1
|   |   |--- credit_history >  1.50
|   |   |   |--- class: 0
|   |--- duration >  22.50
|   |   |--- savings_acc <= 2.50
|   |   |   |--- class: 1
|   |   |--- savings_acc >  2.50
|   |   |   |--- class: 0
|--- checkin_acc >  1.50
|   |--- inst_plans <= 0.50
|   |   |--- amount <= 3741.00
|   |   |   |--- class: 0
|   |   |--- amount >  3741.00
|   |   |   |--- class: 1
|   |--- inst_plans >  0.50
|   |   |--- present_emp_since <= 2.50
|   |   |   |--- class: 0
|   |   |--- present_emp_since >  2.50
|   |   |   |--- class: 0



In [12]:
 
# Generate DOT data as a string (instead of writing to file)
dot_data = export_graphviz(
    best_dt, 
    out_file=None,  # This tells the function to return a string
    feature_names=X.columns,
    class_names=['Bad Credit', 'Good Credit'],
    filled=True, rounded=True,
    special_characters=True
)
print(dot_data)

digraph Tree {
node [shape=box, style="filled, rounded", color="black", fontname="helvetica"] ;
edge [fontname="helvetica"] ;
0 [label=<checkin_acc &le; 1.5<br/>entropy = 0.881<br/>samples = 800<br/>value = [560, 240]<br/>class = Bad Credit>, fillcolor="#f0b78e"] ;
1 [label=<duration &le; 22.5<br/>entropy = 0.988<br/>samples = 433<br/>value = [244, 189]<br/>class = Bad Credit>, fillcolor="#f9e3d2"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label=<credit_history &le; 1.5<br/>entropy = 0.928<br/>samples = 242<br/>value = [159, 83]<br/>class = Bad Credit>, fillcolor="#f3c3a0"] ;
1 -> 2 ;
3 [label=<entropy = 0.863<br/>samples = 21<br/>value = [6, 15]<br/>class = Good Credit>, fillcolor="#88c4ef"] ;
2 -> 3 ;
4 [label=<entropy = 0.89<br/>samples = 221<br/>value = [153, 68]<br/>class = Bad Credit>, fillcolor="#f1b991"] ;
2 -> 4 ;
5 [label=<savings_acc &le; 2.5<br/>entropy = 0.991<br/>samples = 191<br/>value = [85, 106]<br/>class = Good Credit>, fillcolor="#d8ecfa"] ;


<img src="Tree.png" width = 600 height=500>