# Подготовка данных

In [2]:
import pandas as pd
import numpy as np

import optuna
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

### Загружаем датасет

In [3]:
full_path = "loan_data.csv"

df = pd.read_csv(full_path)
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


### Предобработка датасета

In [4]:
num_cols = df.select_dtypes("number").columns[:-1]
cat_cols = df.select_dtypes("object").columns

scaler = StandardScaler()
le = LabelEncoder()

df[num_cols] = scaler.fit_transform(df[num_cols])

for col in cat_cols:
    df[col] = le.fit_transform(df[col])

df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,-0.953538,0,4,-0.10409,-0.892284,3,4.024953,4,1.683039,4.016394,-0.739109,-1.419814,0,1
1,-1.118963,0,3,-0.846005,-0.892284,2,-1.359209,1,0.044782,-0.684829,-0.996863,-2.549975,1,0
2,-0.457264,0,3,-0.844065,-0.397517,0,-0.6466,3,0.625557,3.443074,-0.739109,0.047412,0,1
3,-0.788113,0,1,-0.007039,-0.892284,3,4.024953,3,1.417829,3.443074,-0.996863,0.840507,0,1
4,-0.622689,1,4,-0.176371,-0.727362,3,4.024953,3,1.095549,4.47505,-0.481354,-0.92413,0,1


In [5]:
X = df.drop(columns='loan_status').values
y = df["loan_status"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Обучение моделей

### Подготовка библиотек

In [6]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

def get_metrics(y_true, y_pred):
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision_macro": precision_score(y_true, y_pred, average="macro"),
        "recall_macro": recall_score(y_true, y_pred, average="macro"),
        "f1_macro": f1_score(y_true, y_pred, average="macro")
    }

def mlflow_save_model_metrics(model_name, model, best_params):
    with mlflow.start_run(run_name=model_name):

        mlflow.log_params(best_params)

        model = model(**best_params)
        model.fit(X_train, y_train)
        pred = model.predict(X_test)

        metrics = get_metrics(y_test, pred)
        mlflow.log_metrics(metrics)

        mlflow.sklearn.log_model(model, artifact_path=model_name)

        print(f"Логирование модели {model_name} завершено.")

mlflow.set_tracking_uri("file:///C:\\Users\\Михаил\\Proga\\ml\\artsofte\\mlruns")
mlflow.set_experiment("loan status classification")

  return FileStore(store_uri, store_uri)


<Experiment: artifact_location='file:///C:\\Users\\Михаил\\Proga\\ml\\artsofte\\mlruns/167519363609596686', creation_time=1765188905273, experiment_id='167519363609596686', last_update_time=1765188905273, lifecycle_stage='active', name='loan status classification', tags={}>

### LogisticRegression

In [7]:
def objective_logreg(trial, X, y):
    params = {
        "C": trial.suggest_float("C", 1e-4, 10.0, log=True),
        "penalty": trial.suggest_categorical("penalty", ["l2"]),
        "solver": trial.suggest_categorical("solver", ["lbfgs", "liblinear"]),
        "max_iter": 1000
    }

    model = LogisticRegression(**params)
    score = cross_val_score(model, X, y, cv=3, scoring="accuracy").mean()
    return score

study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective_logreg(trial, X_train, y_train), n_trials=50)

print(f"Лучшие параметры: {study.best_params}")
print(f"Лучшая accuracy: {study.best_value}")

mlflow_save_model_metrics("LogisticRegression", LogisticRegression, study.best_params)



[I 2025-12-08 17:39:56,014] A new study created in memory with name: no-name-de3761fe-313b-463e-9aef-9e617ef992cf
[I 2025-12-08 17:39:56,151] Trial 0 finished with value: 0.892388888888889 and parameters: {'C': 1.8403216761015833, 'penalty': 'l2', 'solver': 'lbfgs'}. Best is trial 0 with value: 0.892388888888889.
[I 2025-12-08 17:39:56,229] Trial 1 finished with value: 0.8670555555555555 and parameters: {'C': 0.0007173668583505726, 'penalty': 'l2', 'solver': 'lbfgs'}. Best is trial 0 with value: 0.892388888888889.
[I 2025-12-08 17:39:56,307] Trial 2 finished with value: 0.8453888888888889 and parameters: {'C': 0.0003269717747671665, 'penalty': 'l2', 'solver': 'liblinear'}. Best is trial 0 with value: 0.892388888888889.
[I 2025-12-08 17:39:56,403] Trial 3 finished with value: 0.8841111111111112 and parameters: {'C': 0.002081914378378003, 'penalty': 'l2', 'solver': 'liblinear'}. Best is trial 0 with value: 0.892388888888889.
[I 2025-12-08 17:39:56,471] Trial 4 finished with value: 0.7855

Лучшие параметры: {'C': 0.07501847408607885, 'penalty': 'l2', 'solver': 'lbfgs'}
Лучшая accuracy: 0.8927222222222223




Логирование модели LogisticRegression завершено.


### RandomForest

In [None]:
def objective_rf(trial, X, y):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_depth": trial.suggest_int("max_depth", 2, 30),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
    }

    model = RandomForestClassifier(**params)

    score = cross_val_score(model, X, y, cv=3, scoring="accuracy").mean()
    return score

study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective_rf(trial, X_train, y_train), n_trials=50)

print(f"Лучшие параметры: {study.best_params}")
print(f"Лучшая accuracy: {study.best_value}")

mlflow_save_model_metrics("RandomForest", RandomForestClassifier, study.best_params)

[I 2025-12-08 17:40:09,984] A new study created in memory with name: no-name-db6af92e-387e-46f0-85bc-a68a524ad95d
[I 2025-12-08 17:40:20,704] Trial 0 finished with value: 0.9267500000000001 and parameters: {'n_estimators': 155, 'max_depth': 22, 'min_samples_split': 16, 'min_samples_leaf': 7, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 0.9267500000000001.
[I 2025-12-08 17:40:43,221] Trial 1 finished with value: 0.9274722222222223 and parameters: {'n_estimators': 312, 'max_depth': 19, 'min_samples_split': 3, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 1 with value: 0.9274722222222223.
[I 2025-12-08 17:41:08,407] Trial 2 finished with value: 0.9268888888888888 and parameters: {'n_estimators': 354, 'max_depth': 16, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'log2', 'bootstrap': False}. Best is trial 1 with value: 0.9274722222222223.
[I 2025-12-08 17:41:32,285] Trial 3 finished with value: 0.9261944444444

Лучшие параметры: {'n_estimators': 328, 'max_depth': 21, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False}
Лучшая accuracy: 0.9284722222222221




Логирование модели RandomForest завершено.


### Gradient Boosting

In [9]:
def objective_gb(trial, X, y):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 400),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
    }

    model = GradientBoostingClassifier(**params)

    score = cross_val_score(model, X, y, cv=3, scoring="accuracy").mean()
    return score

study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective_gb(trial, X_train, y_train), n_trials=50)

print(f"Лучшие параметры: {study.best_params}")
print(f"Лучшая accuracy: {study.best_value}")

mlflow_save_model_metrics("GradBoost", GradientBoostingClassifier, study.best_params)

[I 2025-12-08 17:55:47,985] A new study created in memory with name: no-name-d268302f-38fb-4bcd-82ce-53490993ae8e
[I 2025-12-08 17:56:10,627] Trial 0 finished with value: 0.924 and parameters: {'n_estimators': 178, 'learning_rate': 0.4909350960852229, 'max_depth': 7, 'subsample': 0.5720592363581373, 'min_samples_split': 7, 'min_samples_leaf': 17}. Best is trial 0 with value: 0.924.
[I 2025-12-08 17:56:25,111] Trial 1 finished with value: 0.928 and parameters: {'n_estimators': 182, 'learning_rate': 0.09612191814117353, 'max_depth': 3, 'subsample': 0.8330933125417772, 'min_samples_split': 3, 'min_samples_leaf': 5}. Best is trial 1 with value: 0.928.
[I 2025-12-08 17:56:31,785] Trial 2 finished with value: 0.9269166666666667 and parameters: {'n_estimators': 92, 'learning_rate': 0.09980299976150092, 'max_depth': 4, 'subsample': 0.5513557481292535, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.928.
[I 2025-12-08 17:56:42,817] Trial 3 finished with value: 0.9

Лучшие параметры: {'n_estimators': 254, 'learning_rate': 0.09679919387889922, 'max_depth': 7, 'subsample': 0.8907630149846777, 'min_samples_split': 5, 'min_samples_leaf': 19}
Лучшая accuracy: 0.9358888888888889




Логирование модели GradBoost завершено.


### KNN

In [10]:
def objective_knn(trial, X, y):
    params = {
        "n_neighbors": trial.suggest_int("n_neighbors", 1, 50),
        "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
        "p": trial.suggest_int("p", 1, 2),  # 1=manhattan, 2=euclidean
        "leaf_size": trial.suggest_int("leaf_size", 10, 100),
    }

    model = KNeighborsClassifier(**params)

    score = cross_val_score(model, X, y, cv=3, scoring="accuracy").mean()
    return score

study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective_knn(trial, X_train, y_train), n_trials=50)

print(f"Лучшие параметры: {study.best_params}")
print(f"Лучшая accuracy: {study.best_value}")

mlflow_save_model_metrics("KNN", KNeighborsClassifier, study.best_params)


[I 2025-12-08 18:34:41,683] A new study created in memory with name: no-name-7bce7e81-3526-4dd5-9538-d42ea6dd0220
[I 2025-12-08 18:34:59,675] Trial 0 finished with value: 0.8989722222222222 and parameters: {'n_neighbors': 25, 'weights': 'distance', 'p': 1, 'leaf_size': 49}. Best is trial 0 with value: 0.8989722222222222.
[I 2025-12-08 18:35:16,882] Trial 1 finished with value: 0.8996666666666666 and parameters: {'n_neighbors': 16, 'weights': 'distance', 'p': 1, 'leaf_size': 90}. Best is trial 1 with value: 0.8996666666666666.
[I 2025-12-08 18:35:30,301] Trial 2 finished with value: 0.8879444444444444 and parameters: {'n_neighbors': 35, 'weights': 'distance', 'p': 2, 'leaf_size': 16}. Best is trial 1 with value: 0.8996666666666666.
[I 2025-12-08 18:35:44,285] Trial 3 finished with value: 0.88475 and parameters: {'n_neighbors': 47, 'weights': 'distance', 'p': 2, 'leaf_size': 94}. Best is trial 1 with value: 0.8996666666666666.
[I 2025-12-08 18:36:06,322] Trial 4 finished with value: 0.89

Лучшие параметры: {'n_neighbors': 11, 'weights': 'distance', 'p': 1, 'leaf_size': 10}
Лучшая accuracy: 0.8998333333333334




Логирование модели KNN завершено.


### SVC

In [12]:
def objective_svc(trial, X, y):
    params = {
        "C": trial.suggest_float("C", 1e-4, 100.0, log=True),
        "kernel": trial.suggest_categorical("kernel", ["linear", "rbf", "poly"]),
        "gamma": trial.suggest_categorical("gamma", ["scale", "auto"]),
        "degree": trial.suggest_int("degree", 2, 5),
        "probability": True
    }

    model = SVC(**params)

    score = cross_val_score(model, X, y, cv=3, scoring="accuracy").mean()
    return score

study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective_svc(trial, X_train, y_train), n_trials=5)

print(f"Лучшие параметры: {study.best_params}")
print(f"Лучшая accuracy: {study.best_value}")

mlflow_save_model_metrics("SVC", SVC, study.best_params)


[I 2025-12-08 19:24:24,378] A new study created in memory with name: no-name-317c71f6-9ef9-4797-88e4-611cd0dc45d6
[I 2025-12-08 19:27:03,386] Trial 0 finished with value: 0.7778333333333333 and parameters: {'C': 0.00024185217586091245, 'kernel': 'rbf', 'gamma': 'auto', 'degree': 4}. Best is trial 0 with value: 0.7778333333333333.
[I 2025-12-08 19:28:09,918] Trial 1 finished with value: 0.8939722222222222 and parameters: {'C': 0.08254049571596378, 'kernel': 'poly', 'gamma': 'scale', 'degree': 3}. Best is trial 1 with value: 0.8939722222222222.
[I 2025-12-08 19:29:41,111] Trial 2 finished with value: 0.7871666666666667 and parameters: {'C': 0.00014686743147630395, 'kernel': 'linear', 'gamma': 'auto', 'degree': 5}. Best is trial 1 with value: 0.8939722222222222.
[I 2025-12-08 19:32:02,791] Trial 3 finished with value: 0.9067777777777778 and parameters: {'C': 30.256852919878668, 'kernel': 'rbf', 'gamma': 'auto', 'degree': 2}. Best is trial 3 with value: 0.9067777777777778.
[I 2025-12-08 19

Лучшие параметры: {'C': 30.256852919878668, 'kernel': 'rbf', 'gamma': 'auto', 'degree': 2}
Лучшая accuracy: 0.9067777777777778




Логирование модели SVC завершено.
