In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from scipy.stats.mstats import winsorize
from copy import deepcopy
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GridSearchCV
from sklearn.utils.validation import check_array, check_X_y

In [None]:
# Path to the preprocessed loan dataset (comma-separated CSV).
DATASET_PATH = 'loan_data_processed.csv'

# Comma is pandas' default separator, so it does not need to be spelled out.
df = pd.read_csv(DATASET_PATH)

# Preview the first five rows (the default for head()).
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,...,home_MORTGAGE,home_OTHER,home_OWN,home_RENT,intent_DEBTCONSOLIDATION,intent_EDUCATION,intent_HOMEIMPROVEMENT,intent_MEDICAL,intent_PERSONAL,intent_VENTURE
0,-1.118481,0.0,1.500068,-0.106043,-1.026337,2.455128,1.792355,0.31,-0.794462,-1.564522,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-1.118481,0.0,-1.283563,-1.271863,-1.026337,-1.264998,0.063765,0.08,-1.085922,-2.04087,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.505518,0.0,-1.283563,-1.271863,-0.423164,-0.67316,0.679086,0.31,-0.794462,0.037741,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.91416,0.0,0.572191,0.102745,-1.026337,2.455128,1.518483,0.31,-1.085922,0.903829,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.709839,1.0,1.500068,-0.261545,-0.82528,2.455128,1.177034,0.31,-0.503002,-1.023217,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

# Per-column count of values that are still missing after the cleanup.
missing_stats = df.isna().sum()
missing_stats

Unnamed: 0,0
person_age,0
person_gender,0
person_education,0
person_income,0
person_emp_exp,0
loan_amnt,0
loan_int_rate,0
loan_percent_income,0
cb_person_cred_hist_length,0
credit_score,0


In [None]:
# Target vector and feature matrix (every column except the target).
y = df['loan_status']
X = df.drop(columns='loan_status')

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

# Plain NumPy versions of each split.
X_train_np, X_test_np = X_train.values, X_test.values
y_train_np, y_test_np = y_train.values, y_test.values

X_train: (13281, 21), y_train: (13281,)
X_test: (3321, 21), y_test: (3321,)


In [None]:
class GBMClassifier(BaseEstimator, ClassifierMixin):
    """Binary gradient-boosting classifier built from regression trees.

    The model keeps a raw additive score ``F(x)`` that starts at 0 (i.e. an
    initial probability of 0.5) and maps it to a probability with a sigmoid.
    At every boosting round a ``DecisionTreeRegressor`` is fitted to the
    residuals ``y - p`` and each leaf value is then replaced by the ratio
    ``sum(y - p) / sum(p * (1 - p))`` over the samples in that leaf.

    Parameters
    ----------
    learning_rate : float, default=0.1
        Factor applied to every tree's output before it is added to the score.
    n_estimators : int, default=100
        Number of boosting rounds (trees).
    max_depth : int, default=3
        Maximum depth of each regression tree.
    random_state : int, default=0
        Seed forwarded to every ``DecisionTreeRegressor``.

    Attributes
    ----------
    classes_ : ndarray of shape (2,)
        Sorted class labels seen in ``fit``; ``classes_[1]`` is the class whose
        probability is modelled by the sigmoid of the raw score.
    trees : list of DecisionTreeRegressor
        Fitted trees, in boosting order.
    """

    def __init__(self, learning_rate=0.1, n_estimators=100, max_depth=3, random_state=0):
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state

    def _sigmoid(self, x):
        """Logistic function; the input is clipped to [-250, 250] so that
        ``np.exp`` cannot overflow."""
        return 1 / (1 + np.exp(-np.clip(x, -250, 250)))

    def _update_leaf_values(self, tree, X, y, probabilities):
        """Overwrite the leaf values of ``tree`` in place.

        For every leaf reached by ``X`` the stored value becomes
        ``sum(y - p) / (sum(p * (1 - p)) + 1e-12)`` computed over the samples
        that fall into that leaf; the small constant guards against division
        by zero.
        """
        leaf_indexes = tree.apply(X)
        gradients = y - probabilities
        hessians = probabilities * (1 - probabilities)
        for leaf in np.unique(leaf_indexes):
            indices = np.where(leaf_indexes == leaf)[0]
            numerator = np.sum(gradients[indices])
            denominator = np.sum(hessians[indices]) + 1e-12
            tree.tree_.value[leaf, 0, 0] = numerator / denominator

    def fit(self, X, y):
        """Fit the boosted ensemble.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Class labels; exactly two distinct values are required.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two distinct classes.
        """
        X, y = check_X_y(X, y)

        # Encode the labels as 0/1 by their sorted order so that arbitrary
        # binary labels work; for labels that already are 0/1 this is a no-op.
        classes, y_encoded = np.unique(y, return_inverse=True)
        if len(classes) != 2:
            raise ValueError(
                "GBMClassifier supports binary classification only; "
                f"got {len(classes)} distinct class(es) in y."
            )
        self.classes_ = classes
        y_encoded = y_encoded.astype(np.float64)

        self.trees = []
        predictions = np.zeros(len(y_encoded))

        for _ in range(self.n_estimators):
            probabilities = self._sigmoid(predictions)
            # The tree structure is learned on the current residuals.
            target = y_encoded - probabilities

            tree = DecisionTreeRegressor(
                criterion='friedman_mse',
                max_depth=self.max_depth,
                random_state=self.random_state,
            )
            tree.fit(X, target)
            # Leaf values are replaced before the tree is used for the update.
            self._update_leaf_values(tree, X, y_encoded, probabilities)
            self.trees.append(tree)
            predictions += self.learning_rate * tree.predict(X)

        return self

    def predict_proba(self, X):
        """Return class probabilities.

        Returns
        -------
        ndarray of shape (n_samples, 2)
            Column 0 is the probability of ``classes_[0]`` and column 1 the
            probability of ``classes_[1]``.
        """
        X = check_array(X)
        predictions = np.zeros(X.shape[0])
        for tree in self.trees:
            predictions += self.learning_rate * tree.predict(X)

        proba_1 = self._sigmoid(predictions)
        return np.column_stack([1 - proba_1, proba_1])

    def predict(self, X):
        """Predict class labels, choosing ``classes_[1]`` when its probability
        is strictly greater than 0.5 and ``classes_[0]`` otherwise."""
        # predict_proba validates X, so it is not validated a second time here.
        is_positive = self.predict_proba(X)[:, 1] > 0.5
        return self.classes_[is_positive.astype(int)]

In [None]:
# Baseline model with hand-picked hyper-parameters.
gbm = GBMClassifier(learning_rate=0.1, n_estimators=100, max_depth=3, random_state=42)
gbm.fit(X_train, y_train)

predictions = gbm.predict(X_test)
accuracy = accuracy_score(y_test_np, predictions)
cm = confusion_matrix(y_test_np, predictions)

# Flatten the confusion matrix into its four cells.
TN, FP, FN, TP = cm.ravel()

# Each ratio falls back to 0 when its denominator is not positive.
predicted_positive = TP + FP
if predicted_positive > 0:
    precision = TP / predicted_positive
else:
    precision = 0

actual_positive = TP + FN
if actual_positive > 0:
    recall = TP / actual_positive
else:
    recall = 0

if (precision + recall) > 0:
    f = 2 * (precision * recall) / (precision + recall)
else:
    f = 0

print(f"TP={TP}, TN={TN}, FP={FP}, FN={FN}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F: {f:.4f}")

TP=573, TN=2490, FP=80, FN=178
Accuracy: 0.9223
Precision: 0.8775
Recall: 0.7630
F: 0.8162


In [None]:
# Candidate hyper-parameters: 3 x 3 x 3 = 27 combinations.
param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.5],
    max_depth=[2, 3, 4],
)

# Exhaustive search scored by accuracy with 3-fold cross-validation.
model = GBMClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print(f"\nЛучшие параметры: {grid_search.best_params_}")
print(f"Лучшая точность на кросс-валидации: {grid_search.best_score_:.4f}")

# Evaluate the best estimator found by the search on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(f"Точность на тесте: {accuracy_score(y_test, y_pred):.4f}")

Fitting 3 folds for each of 27 candidates, totalling 81 fits

Лучшие параметры: {'learning_rate': 0.5, 'max_depth': 2, 'n_estimators': 150}
Лучшая точность на кросс-валидации: 0.9301
Точность на тесте: 0.9256


`mean_test_score` — средняя точность на кросс-валидации.

`std_test_score` — стандартное отклонение точности по фолдам; показывает, насколько стабильно работает модель.

In [None]:
# Do not truncate the `params` dictionaries when the table is printed.
pd.set_option('display.max_colwidth', None)

# Keep only the summary columns and order the candidates from best to worst rank.
columns_to_show = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
results_df = (
    pd.DataFrame(grid_search.cv_results_)
    .loc[:, columns_to_show]
    .sort_values(by='rank_test_score')
)
print(results_df)

                                                          params  \
23   {'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 150}   
22   {'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 100}   
26   {'learning_rate': 0.5, 'max_depth': 4, 'n_estimators': 150}   
25   {'learning_rate': 0.5, 'max_depth': 4, 'n_estimators': 100}   
24    {'learning_rate': 0.5, 'max_depth': 4, 'n_estimators': 50}   
20   {'learning_rate': 0.5, 'max_depth': 2, 'n_estimators': 150}   
21    {'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 50}   
17   {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150}   
19   {'learning_rate': 0.5, 'max_depth': 2, 'n_estimators': 100}   
16   {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}   
14   {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150}   
13   {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}   
18    {'learning_rate': 0.5, 'max_depth': 2, 'n_estimators': 50}   
15    {'learning_rate': 0.1, 'max_depth': 4, 'n_