In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### EDA и построение графиков

In [3]:
# Global plot styling for every figure in the notebook: seaborn's "whitegrid"
# theme is applied first, then the default figure size is set on top of it.
sns.set(style="whitegrid")
plt.rc('figure', figsize=(10, 6))

In [None]:
# Read the labelled training data and the unlabelled test data, then report
# the (rows, columns) shape of each frame.
training_set, testing_set = (
    pd.read_csv(csv_path) for csv_path in ('train_c.csv', 'test_c.csv')
)
for split_label, frame in (("Training", training_set), ("Testing", testing_set)):
    print(f"{split_label} set size: {frame.shape}")

Размер обучающей выборки: (11017, 35)
Размер тестовой выборки: (5000, 35)


In [5]:
# Column dtypes and non-null counts of the training frame. The frame loaded
# above is named `training_set`; `train_df` is never defined in this notebook.
training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11017 entries, 0 to 11016
Data columns (total 35 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ApplicationDate             10487 non-null  object 
 1   Age                         10487 non-null  float64
 2   AnnualIncome                10487 non-null  float64
 3   CreditScore                 9986 non-null   float64
 4   LoanAmount                  9986 non-null   float64
 5   LoanDuration                10487 non-null  float64
 6   MaritalStatus               10487 non-null  object 
 7   NumberOfDependents          10487 non-null  float64
 8   HomeOwnershipStatus         10487 non-null  object 
 9   MonthlyDebtPayments         9986 non-null   float64
 10  CreditCardUtilizationRate   10487 non-null  float64
 11  NumberOfOpenCreditLines     10487 non-null  float64
 12  NumberOfCreditInquiries     10487 non-null  float64
 13  DebtToIncomeRatio           104

In [6]:
# First rows of the training frame. The frame loaded above is named
# `training_set`; `train_df` is never defined in this notebook.
training_set.head()

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,LoanAmount,LoanDuration,MaritalStatus,NumberOfDependents,HomeOwnershipStatus,MonthlyDebtPayments,...,JobTenure,EmploymentStatus,EducationLevel,Experience,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved
0,2010-06-26,27.0,66829.0,549.0,17290.0,60.0,Divorced,1.0,Rent,1095.0,...,4.0,Employed,Associate,4.0,35067.0,0.25779,0.251465,508.97023,0.288013,0.0
1,1996-09-23,55.0,172147.0,850.0,16110.0,36.0,Widowed,1.0,Mortgage,211.0,...,2.0,Employed,High School,33.0,27001.0,0.08611,0.093173,514.675859,0.050585,1.0
2,2015-01-19,51.0,300000.0,850.0,38436.0,36.0,Married,0.0,Mortgage,546.0,...,3.0,Employed,Bachelor,28.0,278382.0,0.108436,0.115443,1268.276385,0.072571,1.0
3,1981-05-12,25.0,34683.0,847.0,19186.0,48.0,Married,0.0,Other,153.0,...,3.0,Employed,High School,0.0,9224.0,0.100686,0.112822,498.505187,0.225415,1.0
4,1995-05-07,55.0,300000.0,850.0,30437.0,48.0,Single,2.0,Rent,562.0,...,5.0,Employed,Bachelor,31.0,4502.0,0.110437,0.089037,756.035156,0.052721,1.0


In [None]:
# --- Target balance ---------------------------------------------------------
plt.figure(figsize=(6, 4))
sns.countplot(x='LoanApproved', data=training_set)
plt.title('Distribution of target variable (LoanApproved)')
plt.show()

print(training_set['LoanApproved'].value_counts(normalize=True))

# --- Pairwise correlations between numeric columns --------------------------
numeric_columns = training_set.select_dtypes(include=[np.number]).columns
correlation_matrix = training_set[numeric_columns].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', linewidths=0.5)
plt.title('Feature correlation matrix')
plt.show()

# --- Income vs. requested amount, coloured by the decision ------------------
plt.figure(figsize=(10, 6))
sns.scatterplot(data=training_set, x='AnnualIncome', y='LoanAmount', hue='LoanApproved', alpha=0.6)
plt.title('Loan amount vs annual income')
plt.show()

# --- Credit score by decision -----------------------------------------------
plt.figure(figsize=(10, 6))
sns.boxplot(data=training_set, x='LoanApproved', y='CreditScore')
plt.title('Credit score distribution for approved and rejected loans')
plt.show()

# Correlation of each numeric feature with the target, sorted once. The
# target's own entry (always 1.0) is dropped so it does not occupy a slot in
# the "top features" list.
target_correlations = (
    correlation_matrix['LoanApproved']
    .drop(labels='LoanApproved')
    .sort_values(ascending=False)
)

print("Top features correlated with LoanApproved:")
print(target_correlations.head(10))
print("\nBottom features correlated with LoanApproved:")
print(target_correlations.tail(5))

Классы целевой переменной сбалансированы, это видно из графика распределения.
Сильная положительная корреляция с целевой переменной наблюдается у признаков MonthlyIncome и AnnualIncome: чем выше доход, тем выше вероятность одобрения кредита. Сильная отрицательная корреляция — у признаков BaseInterestRate и InterestRate: чем выше ставка, тем вероятнее отказ.
Основными факторами успеха являются доход и кредитный рейтинг. Данные готовы к обработке.

### Подготовка данных

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Rows without a target value cannot be used for supervised training.
train_set_clean = training_set.dropna(subset=['LoanApproved']).copy()
train_set_clean['LoanApproved'] = train_set_clean['LoanApproved'].astype(int)

print(f"Training set size after removing NaN in target: {train_set_clean.shape}")

target_values = train_set_clean['LoanApproved']
n_labelled_rows = len(train_set_clean)

# Train and test features are stacked so both go through the same
# preprocessing; the first `n_labelled_rows` rows are the training features.
combined_data = pd.concat([train_set_clean.drop(columns=['LoanApproved']), testing_set], axis=0)

categorical_features = combined_data.select_dtypes(include=['object']).columns
numerical_features = combined_data.select_dtypes(exclude=['object']).columns

# Choose the train/validation partition BEFORE fitting any preprocessing so
# that imputation statistics are learned from the training partition only and
# nothing from the validation or test rows leaks into them.
fit_positions, val_positions, y_train_split, y_val_split = train_test_split(
    np.arange(n_labelled_rows), target_values,
    test_size=0.2, random_state=42, stratify=target_values
)
fit_rows = combined_data.iloc[fit_positions]

# keep_empty_features=True keeps the column count stable even if a column has
# no observed value in the training partition (e.g. a column that exists only
# in the test file), so the assignment back into the frame cannot mis-shape.
categorical_imputer = SimpleImputer(strategy='most_frequent', keep_empty_features=True)
categorical_imputer.fit(fit_rows[categorical_features])
combined_data[categorical_features] = categorical_imputer.transform(combined_data[categorical_features])

numerical_imputer = SimpleImputer(strategy='median', keep_empty_features=True)
numerical_imputer.fit(fit_rows[numerical_features])
combined_data[numerical_features] = numerical_imputer.transform(combined_data[numerical_features])

# LabelEncoder cannot encode categories it has not seen, so the encoders are
# fitted on every row; this only exposes the set of category names.
label_encoders = {}
for column in categorical_features:
    encoder = LabelEncoder()
    combined_data[column] = encoder.fit_transform(combined_data[column].astype(str))
    label_encoders[column] = encoder

X_features = combined_data.iloc[:n_labelled_rows]
X_test_final = combined_data.iloc[n_labelled_rows:]
assert len(X_features) == len(target_values), "features and target are misaligned"

X_train_split = X_features.iloc[fit_positions]
X_val_split = X_features.iloc[val_positions]

print("Data prepared for model training.")
print(f"X_train shape: {X_train_split.shape}")
print(f"y_train distribution:\n{y_train_split.value_counts(normalize=True)}")

### Реализация метрик (+доп)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, auc, precision_recall_curve

class ClassificationMetrics:
    """Binary-classification metrics implemented with NumPy only.

    Every method accepts any 1-D array-like (list, tuple, ndarray, pandas
    Series). Label ``1`` is the positive class. Threshold metrics take hard
    predictions; the AUC metrics take scores where higher means "more
    positive".
    """

    @staticmethod
    def _paired_arrays(first, second):
        """Convert two array-likes to flat ndarrays and check equal length.

        Raises:
            ValueError: if the two inputs differ in length.
        """
        first_arr = np.asarray(first).ravel()
        second_arr = np.asarray(second).ravel()
        if first_arr.shape != second_arr.shape:
            raise ValueError(
                f"Inputs must have the same length, got {first_arr.size} and {second_arr.size}"
            )
        return first_arr, second_arr

    @staticmethod
    def accuracy(true_labels, predicted_labels):
        """Share of samples whose predicted label equals the true label.

        Returns 0.0 for empty input.
        """
        y_true, y_pred = ClassificationMetrics._paired_arrays(true_labels, predicted_labels)
        if y_true.size == 0:
            return 0.0
        return np.sum(y_true == y_pred) / y_true.size

    @staticmethod
    def precision(true_labels, predicted_labels):
        """TP / (TP + FP); 0 when nothing was predicted positive."""
        y_true, y_pred = ClassificationMetrics._paired_arrays(true_labels, predicted_labels)
        true_positives = np.sum((y_true == 1) & (y_pred == 1))
        false_positives = np.sum((y_true == 0) & (y_pred == 1))
        predicted_positive = true_positives + false_positives
        return true_positives / predicted_positive if predicted_positive > 0 else 0

    @staticmethod
    def recall(true_labels, predicted_labels):
        """TP / (TP + FN); 0 when there are no positive samples."""
        y_true, y_pred = ClassificationMetrics._paired_arrays(true_labels, predicted_labels)
        true_positives = np.sum((y_true == 1) & (y_pred == 1))
        false_negatives = np.sum((y_true == 1) & (y_pred == 0))
        actual_positive = true_positives + false_negatives
        return true_positives / actual_positive if actual_positive > 0 else 0

    @staticmethod
    def f1_score(true_labels, predicted_labels):
        """Harmonic mean of precision and recall; 0 when both are 0."""
        p = ClassificationMetrics.precision(true_labels, predicted_labels)
        r = ClassificationMetrics.recall(true_labels, predicted_labels)
        return 2 * (p * r) / (p + r) if (p + r) > 0 else 0

    @staticmethod
    def roc_auc_score(true_labels, predicted_probabilities):
        """Area under the ROC curve.

        Computed from average ranks (the Mann-Whitney U statistic): the
        probability that a random positive is scored above a random negative,
        with tied scores counted as one half. Only samples labelled 0 or 1
        take part. Returns 0.5 when either class is absent.
        """
        y_true, scores = ClassificationMetrics._paired_arrays(true_labels, predicted_probabilities)
        is_positive = y_true == 1
        is_negative = y_true == 0
        n_positive = int(np.sum(is_positive))
        n_negative = int(np.sum(is_negative))

        if n_positive == 0 or n_negative == 0:
            return 0.5

        considered = is_positive | is_negative
        _, inverse, counts = np.unique(
            scores[considered], return_inverse=True, return_counts=True
        )
        # Average 1-based rank of each distinct score: the last rank in its
        # tie group minus half the group's extra members.
        average_rank = np.cumsum(counts) - (counts - 1) / 2.0
        ranks = average_rank[np.ravel(inverse)]

        positive_rank_sum = np.sum(ranks[is_positive[considered]])
        u_statistic = positive_rank_sum - n_positive * (n_positive + 1) / 2.0
        return float(u_statistic / (n_positive * n_negative))

    @staticmethod
    def pr_auc_score(true_labels, predicted_probabilities):
        """Area under the precision-recall curve (trapezoidal rule).

        The curve starts at (recall=0, precision=1) and has one point per
        distinct score, so samples sharing a score enter together and the
        result does not depend on their input order. Every label other than 1
        counts as negative. Returns 0.0 when there are no positive samples.
        """
        y_true, scores = ClassificationMetrics._paired_arrays(true_labels, predicted_probabilities)
        positive_count = int(np.sum(y_true == 1))

        if positive_count == 0:
            return 0.0

        descending = np.argsort(-scores, kind="stable")
        sorted_scores = scores[descending]
        is_hit = y_true[descending] == 1

        # Index of the last sample in each group of equal scores.
        group_ends = np.r_[np.flatnonzero(np.diff(sorted_scores)), sorted_scores.size - 1]
        true_positives = np.cumsum(is_hit)[group_ends]
        predicted_positive = group_ends + 1

        precision_array = np.r_[1.0, true_positives / predicted_positive]
        recall_array = np.r_[0.0, true_positives / positive_count]

        # Recall is non-decreasing by construction, so no re-sorting is needed.
        widths = np.diff(recall_array)
        heights = (precision_array[1:] + precision_array[:-1]) / 2.0
        return float(np.sum(widths * heights))

# Small hand-made example used to compare the custom metrics with sklearn.
test_true = np.array([1, 0, 1, 1, 0, 1, 0, 0, 1, 1])
test_predicted = np.array([1, 0, 1, 0, 0, 1, 0, 0, 1, 1])
test_probabilities = np.array([0.9, 0.1, 0.8, 0.75, 0.2, 0.85, 0.3, 0.15, 0.95, 0.7])

precision_curve, recall_curve, _ = precision_recall_curve(test_true, test_probabilities)
sklearn_pr_auc = auc(recall_curve, precision_curve)

# (printed label, custom value, sklearn reference value)
metric_comparisons = [
    ("Accuracy:  ", ClassificationMetrics.accuracy(test_true, test_predicted), accuracy_score(test_true, test_predicted)),
    ("Precision: ", ClassificationMetrics.precision(test_true, test_predicted), precision_score(test_true, test_predicted)),
    ("Recall:    ", ClassificationMetrics.recall(test_true, test_predicted), recall_score(test_true, test_predicted)),
    ("F1-score:  ", ClassificationMetrics.f1_score(test_true, test_predicted), f1_score(test_true, test_predicted)),
    ("ROC-AUC:   ", ClassificationMetrics.roc_auc_score(test_true, test_probabilities), roc_auc_score(test_true, test_probabilities)),
    ("PR-AUC:    ", ClassificationMetrics.pr_auc_score(test_true, test_probabilities), sklearn_pr_auc),
]

print("--- Comparison of custom metrics with sklearn ---")
mismatched_metrics = []
for label, custom_value, sklearn_value in metric_comparisons:
    print(f"{label}Custom={custom_value:.4f}, Sklearn={sklearn_value:.4f}")
    if not np.isclose(custom_value, sklearn_value):
        mismatched_metrics.append(label.strip())

# The conclusion below is only printed when it is actually true.
assert not mismatched_metrics, f"Custom metrics differ from sklearn: {mismatched_metrics}"

print("\nConclusion: Custom metric implementations match sklearn reference implementations.")

### Бэггинг

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.base import clone

class MyBaggingClassifier:
    """Bootstrap-aggregating classifier that averages member probabilities.

    Args:
        base_estimator: unfitted classifier that is cloned for every ensemble
            member; a default ``DecisionTreeClassifier`` is used when ``None``.
        n_estimators: number of ensemble members.
        random_state: seed for the bootstrap index generator.

    Attributes set by ``fit``:
        models: the fitted ensemble members.
        classes_: sorted unique labels seen in ``y``; the columns of
            ``predict_proba`` follow this order.
    """

    def __init__(self, base_estimator=None, n_estimators=10, random_state=42):
        # Compare with None explicitly: truth-testing an estimator calls its
        # __len__ when it defines one, which can raise or evaluate to False.
        self.base_estimator = base_estimator if base_estimator is not None else DecisionTreeClassifier()
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.models = []
        self.classes_ = None

    def fit(self, X, y):
        """Fit ``n_estimators`` clones, each on a bootstrap resample of (X, y).

        Raises:
            ValueError: if X and y have a different number of rows.
        """
        X_arr = np.asarray(X)
        y_arr = np.asarray(y)
        n_samples = X_arr.shape[0]
        if y_arr.shape[0] != n_samples:
            raise ValueError(
                f"X and y must have the same number of rows, got {n_samples} and {y_arr.shape[0]}"
            )

        self.models = []
        self.classes_ = np.unique(y_arr)
        rng = np.random.default_rng(self.random_state)

        for _ in range(self.n_estimators):
            # Sample row indices with replacement (one bootstrap replicate).
            indices = rng.choice(n_samples, size=n_samples, replace=True)
            model = clone(self.base_estimator)
            model.fit(X_arr[indices], y_arr[indices])
            self.models.append(model)

        return self

    def predict_proba(self, X):
        """Mean of the members' class probabilities, columns in ``classes_`` order.

        Raises:
            RuntimeError: if the ensemble has not been fitted.
        """
        if not self.models:
            raise RuntimeError("MyBaggingClassifier must be fitted before predicting.")

        X_arr = np.asarray(X)
        probas = []
        for model in self.models:
            member_proba = np.asarray(model.predict_proba(X_arr), dtype=float)
            member_classes = getattr(model, "classes_", None)
            if (
                self.classes_ is not None
                and member_classes is not None
                and member_proba.shape[1] != len(self.classes_)
            ):
                # A bootstrap sample can miss a class entirely; place the
                # member's columns at the right positions and leave the
                # missing classes at probability 0.
                aligned = np.zeros((member_proba.shape[0], len(self.classes_)))
                aligned[:, np.searchsorted(self.classes_, member_classes)] = member_proba
                member_proba = aligned
            probas.append(member_proba)

        return np.mean(probas, axis=0)

    def predict(self, X):
        """Class label with the highest averaged probability for each row."""
        best_columns = np.argmax(self.predict_proba(X), axis=1)
        if self.classes_ is None:
            return best_columns
        return self.classes_[best_columns]

# Train the custom bagging ensemble and sklearn's BaggingClassifier with the
# same settings and compare them on the validation split. The split produced
# by the data-preparation cell is named X_train_split / X_val_split /
# y_train_split / y_val_split.
print("Обучаем MyBaggingClassifier...")
my_bagging = MyBaggingClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=10),
    n_estimators=50,
    random_state=42
)
my_bagging.fit(X_train_split, y_train_split)

y_pred_my = my_bagging.predict(X_val_split)
y_prob_my = my_bagging.predict_proba(X_val_split)[:, 1]

print("Обучаем sklearn BaggingClassifier...")
sk_bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=10),
    n_estimators=50,
    random_state=42
)
sk_bagging.fit(X_train_split, y_train_split)

y_pred_sk = sk_bagging.predict(X_val_split)
y_prob_sk = sk_bagging.predict_proba(X_val_split)[:, 1]

print("\n--- Результаты ---")
print(f"My Bagging ROC-AUC: {roc_auc_score(y_val_split, y_prob_my):.4f}")
print(f"Sklearn Bagging ROC-AUC: {roc_auc_score(y_val_split, y_prob_sk):.4f}")

print(f"My Bagging Accuracy: {accuracy_score(y_val_split, y_pred_my):.4f}")
print(f"Sklearn Bagging Accuracy: {accuracy_score(y_val_split, y_pred_sk):.4f}")


Реализованный алгоритм бэггинга показывает метрики, почти идентичные метрикам реализации из sklearn.

### Градиентный бустинг

In [None]:
from sklearn.tree import DecisionTreeRegressor

class MyGradientBoostingClassifier:
    """Gradient boosting for binary (0/1) targets with log-loss.

    The model keeps a raw score (log-odds) per sample. Each round fits a
    regression tree to the residuals ``y - sigmoid(score)`` and adds
    ``learning_rate * tree.predict(X)`` to the score.

    Args:
        n_estimators: number of boosting rounds (trees).
        learning_rate: shrinkage applied to every tree's contribution.
        max_depth: maximum depth of each regression tree.
        random_state: base seed; tree ``i`` uses ``random_state + i``.
            ``None`` leaves every tree unseeded.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.random_state = random_state
        self.models = []
        self.initial_prediction = None

    def _sigmoid(self, z):
        """Logistic function 1 / (1 + exp(-z))."""
        # Clipping keeps exp() inside the float64 range so extreme scores do
        # not raise overflow warnings; values within +/-500 are unaffected.
        z = np.clip(z, -500.0, 500.0)
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        """Fit the boosted ensemble on features ``X`` and 0/1 labels ``y``."""
        self.models = []
        X_arr = np.array(X)
        y_arr = np.array(y)

        # Start from the log-odds of the positive rate; the clip keeps the
        # logarithm finite when only one class is present.
        mean_y = np.clip(np.mean(y_arr), 1e-10, 1 - 1e-10)
        self.initial_prediction = np.log(mean_y / (1 - mean_y))

        current_preds = np.full(y_arr.shape, self.initial_prediction, dtype=float)

        for i in range(self.n_estimators):
            residuals = y_arr - self._sigmoid(current_preds)

            tree_seed = None if self.random_state is None else self.random_state + i
            tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=tree_seed)
            tree.fit(X_arr, residuals)

            current_preds += self.learning_rate * tree.predict(X_arr)
            self.models.append(tree)

        return self

    def predict_proba(self, X):
        """Return an (n_samples, 2) array with columns [P(y=0), P(y=1)].

        Raises:
            RuntimeError: if the model has not been fitted.
        """
        if self.initial_prediction is None:
            raise RuntimeError("MyGradientBoostingClassifier must be fitted before predicting.")

        X_arr = np.array(X)
        preds = np.full(X_arr.shape[0], self.initial_prediction, dtype=float)

        for tree in self.models:
            preds += self.learning_rate * tree.predict(X_arr)

        probas = self._sigmoid(preds)
        return np.vstack([1 - probas, probas]).T

    def predict(self, X):
        """Predict 1 where P(y=1) is strictly greater than 0.5, else 0."""
        probas = self.predict_proba(X)[:, 1]
        return (probas > 0.5).astype(int)

from sklearn.ensemble import GradientBoostingClassifier

# Train the custom gradient boosting and sklearn's GradientBoostingClassifier
# with the same hyperparameters and compare them on the validation split. The
# split produced by the data-preparation cell is named X_train_split /
# X_val_split / y_train_split / y_val_split.
print("Обучаем MyGradientBoostingClassifier...")
my_gb = MyGradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)
my_gb.fit(X_train_split, y_train_split)

y_pred_my_gb = my_gb.predict(X_val_split)
y_prob_my_gb = my_gb.predict_proba(X_val_split)[:, 1]

print("Обучаем sklearn GradientBoostingClassifier...")
sk_gb = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)
sk_gb.fit(X_train_split, y_train_split)

y_pred_sk_gb = sk_gb.predict(X_val_split)
y_prob_sk_gb = sk_gb.predict_proba(X_val_split)[:, 1]

print("\n--- Результаты Бустинга ---")
print(f"My GB ROC-AUC: {roc_auc_score(y_val_split, y_prob_my_gb):.4f}")
print(f"Sklearn GB ROC-AUC: {roc_auc_score(y_val_split, y_prob_sk_gb):.4f}")

print(f"My GB Accuracy: {accuracy_score(y_val_split, y_pred_my_gb):.4f}")
print(f"Sklearn GB Accuracy: {accuracy_score(y_val_split, y_pred_sk_gb):.4f}")


Реализованный класс градиентного бустинга показывает высокую точность. Алгоритм корректно обучается, уменьшая ошибку LogLoss на каждом шаге.

### Сравнение библиотек

In [None]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

print("\n--- Comparison of Boosting Libraries ---")

# One entry per library, all with default hyperparameters and the same seed.
# `use_label_encoder` is no longer passed to XGBClassifier: the argument is
# deprecated and the target is already integer-encoded.
boosting_models = {
    "LightGBM": lgb.LGBMClassifier(random_state=42, verbose=-1),
    "XGBoost": xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
    "CatBoost": cb.CatBoostClassifier(random_state=42, verbose=0),
}

# Fit each model on the training split and score it on the validation split.
validation_probabilities = {}
for library_name, library_model in boosting_models.items():
    print(f"Training {library_name}")
    library_model.fit(X_train_split, y_train_split)
    validation_probabilities[library_name] = library_model.predict_proba(X_val_split)[:, 1]
    print(f"{library_name} ROC-AUC: {roc_auc_score(y_val_split, validation_probabilities[library_name]):.4f}")

# Keep the per-library names available for any later cell that uses them.
lightgbm_model = boosting_models["LightGBM"]
xgboost_model = boosting_models["XGBoost"]
catboost_model = boosting_models["CatBoost"]
predicted_probabilities_lgb = validation_probabilities["LightGBM"]
predicted_probabilities_xgb = validation_probabilities["XGBoost"]
predicted_probabilities_cb = validation_probabilities["CatBoost"]

Сравнение показало, что все библиотеки дают очень высокий результат. Лучший результат показал CatBoost, поэтому выберем его для дальнейших шагов.

### Подбор гиперпараметров с Optuna

In [None]:
import optuna
from optuna.pruners import MedianPruner

def objective(trial):
    """Optuna objective: validation ROC-AUC of a CatBoost model.

    Samples iterations, learning_rate, depth and l2_leaf_reg from the trial,
    fits a classifier on the training split and returns its ROC-AUC on the
    validation split (the study maximises this value).
    """
    # The suggest_* calls are kept in this order so the sampler sees the
    # parameters in the same sequence.
    n_iterations = trial.suggest_int('iterations', 100, 500)
    step_size = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    tree_depth = trial.suggest_int('depth', 3, 10)
    l2_penalty = trial.suggest_float('l2_leaf_reg', 1, 10)

    candidate = cb.CatBoostClassifier(
        iterations=n_iterations,
        learning_rate=step_size,
        depth=tree_depth,
        l2_leaf_reg=l2_penalty,
        random_state=42,
        verbose=0,
        allow_writing_files=False,
    )
    candidate.fit(X_train_split, y_train_split)

    validation_scores = candidate.predict_proba(X_val_split)[:, 1]
    return roc_auc_score(y_val_split, validation_scores)

print("\n--- Running Optuna for CatBoost ---")
# A seeded sampler makes the hyperparameter search reproducible across runs.
# NOTE(review): `objective` never calls trial.report(), so MedianPruner has no
# intermediate values to act on and does not prune anything here.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=MedianPruner(),
)
study.optimize(objective, n_trials=10, show_progress_bar=True)

print('\n Optimization completed!')
print('Best trial:')
best_trial = study.best_trial
print(f'  ROC-AUC: {best_trial.value:.4f}')
print('  Parameters: ')
for key, value in best_trial.params.items():
    print(f'    {key}: {value}')

# The final model is refit on all labelled rows (train + validation).
X_combined_train = pd.concat([X_train_split, X_val_split])
y_combined_train = pd.concat([y_train_split, y_val_split])

# Work on a copy so the trial's own parameter record is left untouched, and
# use the same fixed settings as during the search.
optimized_parameters = dict(best_trial.params)
optimized_parameters['random_state'] = 42
optimized_parameters['verbose'] = 0
optimized_parameters['allow_writing_files'] = False

print("\n Training final model on combined data")
final_model = cb.CatBoostClassifier(**optimized_parameters)
final_model.fit(X_combined_train, y_combined_train)

print(" Final model ready for submission!")

In [None]:
# `final_model` was refit on train + validation rows, so scoring it on
# X_val_split would grade it on data it has already seen. The tuned
# hyperparameters are therefore evaluated with a model fitted on the training
# split only.
# NOTE(review): the validation split was also used to pick the
# hyperparameters, so these numbers are still somewhat optimistic.
holdout_model = cb.CatBoostClassifier(**optimized_parameters)
holdout_model.fit(X_train_split, y_train_split)

final_predictions = holdout_model.predict(X_val_split)
final_probabilities = holdout_model.predict_proba(X_val_split)[:, 1]

y_val_array = y_val_split.values

print("\n" + "="*60)
print("FINAL METRICS OF BEST MODEL")
print("="*60)
print(f"Accuracy:  {ClassificationMetrics.accuracy(y_val_array, final_predictions):.4f}")
print(f"Precision: {ClassificationMetrics.precision(y_val_array, final_predictions):.4f}")
print(f"Recall:    {ClassificationMetrics.recall(y_val_array, final_predictions):.4f}")
print(f"F1-score:  {ClassificationMetrics.f1_score(y_val_array, final_predictions):.4f}")
print(f"ROC-AUC:   {ClassificationMetrics.roc_auc_score(y_val_array, final_probabilities):.4f}")
print(f"PR-AUC:    {ClassificationMetrics.pr_auc_score(y_val_array, final_probabilities):.4f}")
print(f"\nComparison with sklearn:")
print(f"Sklearn ROC-AUC: {roc_auc_score(y_val_split, final_probabilities):.4f}")
from sklearn.metrics import precision_recall_curve, auc
precision_curve, recall_curve, _ = precision_recall_curve(y_val_split, final_probabilities)
sklearn_pr_auc = auc(recall_curve, precision_curve)
print(f"Sklearn PR-AUC:  {sklearn_pr_auc:.4f}")
print("="*60)

In [None]:
# Score the test rows with the final model and write the submission file:
# one row per test sample with the predicted probability of approval.
test_predictions = final_model.predict_proba(X_test_final)[:, 1]

# NOTE(review): IDs are generated as 0..n-1 in row order - confirm this
# matches the identifiers expected for the test file.
submission_columns = {
    'ID': range(len(test_predictions)),
    'LoanApproved': test_predictions,
}
submission_file = pd.DataFrame(submission_columns)
submission_file.to_csv('submission_lab2.csv', index=False)

print(f"Submission file size: {submission_file.shape}")
print("\nPrediction examples:")
print(submission_file.head(10))