In [1]:
#importing Useful DataStructures
import pandas as pd
import numpy as np
from scipy.stats import uniform

#importing plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
# from prettytable import PrettyTable

#importing Misc Libraries
import os
import gc
import pickle
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime

#for 100% jupyter notebook cell width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import label_binarize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score

In [3]:
# Đường dẫn tới file .pkl
data_path = '/kaggle/input/data-final/train_data_final.pkl'
test_path = '/kaggle/input/data-final/test_data_final.pkl'
# Mở file .pkl và đọc dữ liệu
with open(data_path, 'rb') as file:
    data = pickle.load(file)

with open(test_path, 'rb') as file:
    test = pickle.load(file)

In [4]:
data.drop(labels="Unnamed: 0",axis=1,inplace=True)
test.drop(labels="Unnamed: 0",axis=1,inplace=True)

In [5]:
y = data[["TARGET"]]
X = data.drop(columns=["TARGET"])
# Tạm thời del data và test để tối ưu RAM
del data

In [6]:
empty_columns = []
for col in X.columns:
    if len(X[col].unique()) <=1:
        empty_columns.append(col)
    
print(f"There are {len(empty_columns)} columns with just 1 unique value")
print("Removing these from dataset")
X = X.drop(empty_columns, axis = 1)
test = test.drop(empty_columns, axis = 1)

There are 23 columns with just 1 unique value
Removing these from dataset


In [7]:
#removing the SK_ID_CURR from training and test data
X = X.drop(['SK_ID_CURR'], axis = 1)
skid_test = test.pop('SK_ID_CURR')

In [8]:
#replacing nan values with 0
X[np.isnan(X)] = 0
test[np.isnan(test)] = 0

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
import numpy as np

class DecisionTreePruner:
    def __init__(self, X_train, X_test, y_train, y_test, multiclass=False):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.best_model = None
        self.best_score = 0
        self.multiclass = multiclass
        self.base_tree = None
        self.best_ccp = 0

        # If multiclass, binarize labels for ROC-AUC
        if self.multiclass:
            self.y_train_bin = label_binarize(y_train, classes=np.unique(y_train))
            self.y_test_bin = label_binarize(y_test, classes=np.unique(y_test))
        else:
            self.y_train_bin = y_train
            self.y_test_bin = y_test

    def evaluate_model(self, model):
        # Get predicted probabilities for the positive class (class 1)
        y_prob = model.predict_proba(self.X_test)
    
        # Calculate ROC-AUC score for binary classification
        score = roc_auc_score(self.y_test, y_prob[:, 1])
        return score

    def cost_complexity_pruning(self):
        path = self.base_tree.cost_complexity_pruning_path(self.X_train, self.y_train)
        ccp_alphas = path.ccp_alphas
        ccp_alphas_subset = np.linspace(ccp_alphas.min(), ccp_alphas.max(), num=30)
        print(f"ccp alpha in range ({ccp_alphas.min()},{ccp_alphas.max()})")
        print("==============================================")
        epoch = 1
        N = len(ccp_alphas)
        
        best_tree = None
        best_score = 0
        
        for alpha in ccp_alphas_subset:
            tree = DecisionTreeClassifier(random_state=42, ccp_alpha=alpha)
            tree.fit(self.X_train, self.y_train)
            score = self.evaluate_model(tree)
            
            if score > best_score:
                best_score = score
                best_tree = tree
                self.best_ccp = alpha

            print(f"Done Epoch {epoch} in {N}")
            epoch += 1
        return best_tree, best_score

    def reduced_error_pruning(self, max_depth_range):
        best_tree = None
        best_score = 0
        
        for max_depth in max_depth_range:
            tree = DecisionTreeClassifier(random_state=42, max_depth=max_depth)
            tree.fit(self.X_train, self.y_train)
            score = self.evaluate_model(tree)
            
            if score > best_score:
                best_score = score
                best_tree = tree
        
        return best_tree, best_score

    def minimum_impurity_decrease(self, impurity_range):
        best_tree = None
        best_score = 0
        
        for min_impurity_decrease in impurity_range:
            tree = DecisionTreeClassifier(random_state=42, min_impurity_decrease=min_impurity_decrease)
            tree.fit(self.X_train, self.y_train)
            score = self.evaluate_model(tree)
            
            if score > best_score:
                best_score = score
                best_tree = tree
        
        return best_tree, best_score

    def minimum_leaf_size(self, min_samples_leaf_range):
        best_tree = None
        best_score = 0
        
        for min_samples_leaf in min_samples_leaf_range:
            tree = DecisionTreeClassifier(random_state=42, min_samples_leaf=min_samples_leaf)
            tree.fit(self.X_train, self.y_train)
            score = self.evaluate_model(tree)
            
            if score > best_score:
                best_score = score
                best_tree = tree
        
        return best_tree, best_score

    def run_all_pruning_methods(self):
        # Train the base tree
        self.base_tree = DecisionTreeClassifier(random_state=42)
        self.base_tree.fit(self.X_train, self.y_train)
        
        self.best_model, self.best_score = self.cost_complexity_pruning()
        print(f"Best CCP ROC-AUC: {self.best_score:.4f}")
        
        tree, score = self.reduced_error_pruning(range(1, 10))
        if score > self.best_score:
            self.best_model = tree
            self.best_score = score
        print(f"Best Reduced Error ROC-AUC: {score:.4f}")
        
        tree, score = self.minimum_impurity_decrease([0.0, 0.01, 0.02, 0.05])
        if score > self.best_score:
            self.best_model = tree
            self.best_score = score
        print(f"Best Minimum Impurity Decrease ROC-AUC: {score:.4f}")
        
        tree, score = self.minimum_leaf_size(range(1, 20))
        if score > self.best_score:
            self.best_model = tree
            self.best_score = score
        print(f"Best Minimum Leaf Size ROC-AUC: {score:.4f}")
        
        print(f"Overall Best ROC-AUC: {self.best_score:.4f}")

In [10]:

    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # # Initialize and run the pruner for multiclass
    # pruner = DecisionTreePruner(X_train, X_test, y_train, y_test, multiclass=True)
    # pruner.run_all_pruning_methods()

    # The final best model is stored in `pruner.best_model`

In [11]:
# Khởi tạo class DecisionTreePruner
pruner = DecisionTreePruner(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    multiclass=False  # Bài toán nhị phân
)

pruner.base_tree = DecisionTreeClassifier(random_state=42)
pruner.base_tree.fit(pruner.X_train, pruner.y_train)

# Chạy Cost-Complexity Pruning
best_model, best_score = pruner.cost_complexity_pruning()

# Hiển thị kết quả
print(f"Best CCP: {pruner.best_ccp}")
print(f"Best CCP model: {best_model}")
print(f"Best ROC-AUC score: {best_score:.4f}")

ccp alpha in range (0.0,0.007397956008650508)
Done Epoch 1 in 3555
Done Epoch 2 in 3555
Done Epoch 3 in 3555
Done Epoch 4 in 3555
Done Epoch 5 in 3555
Done Epoch 6 in 3555
Done Epoch 7 in 3555
Done Epoch 8 in 3555
Done Epoch 9 in 3555
Done Epoch 10 in 3555
Done Epoch 11 in 3555
Done Epoch 12 in 3555
Done Epoch 13 in 3555
Done Epoch 14 in 3555
Done Epoch 15 in 3555
Done Epoch 16 in 3555
Done Epoch 17 in 3555
Done Epoch 18 in 3555
Done Epoch 19 in 3555
Done Epoch 20 in 3555
Done Epoch 21 in 3555
Done Epoch 22 in 3555
Done Epoch 23 in 3555
Done Epoch 24 in 3555
Done Epoch 25 in 3555
Done Epoch 26 in 3555
Done Epoch 27 in 3555
Done Epoch 28 in 3555
Done Epoch 29 in 3555
Done Epoch 30 in 3555
Best CCP: 0.0002551019313327761
Best CCP model: DecisionTreeClassifier(ccp_alpha=0.0002551019313327761, random_state=42)
Best ROC-AUC score: 0.7320


In [12]:
pd.DataFrame({'SK_ID_CURR': skid_test, 'TARGET' : best_model.predict_proba(test)[:,1]}).to_csv('submission_DT_.csv',index = False)
print('Successfully submitted to Home Credit Default Risk')

Successfully submitted to Home Credit Default Risk


In [13]:
with open('model.pkl','wb') as f:
    pickle.dump(pruner.best_model,f)