In [1]:
import wandb
from wandb.integration.catboost import WandbCallback
import catboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import optuna


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!poetry run wandb login


[34m[1mwandb[0m: Currently logged in as: [33mtalverinat[0m ([33mloko-bank[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
import os

os.environ["WANDB_NOTEBOOK_NAME"] = "../notebooks/modeling_catboost_D36.ipynb"


In [4]:
SEED = 2023
TEST_SIZE = 0.2

NUM_EPOCHES = 5
N_SPLITS = 4
N_REPEATS = 10
N_TRIALS = 200


In [5]:
df = pd.read_parquet("../data/data.parquet")


In [6]:
X = df.drop(["date", "D_12", "D_24", "D_36", "D_48", "D"], axis=1)
y = df["D_36"]


In [7]:
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, shuffle=True, stratify=y
)


In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_temp,
    y_train_temp,
    test_size=TEST_SIZE,
    random_state=SEED,
    shuffle=True,
    stratify=y_train_temp,
)


In [9]:
# Инициализируйте wandb с вашим ключом авторизации
settings = wandb.Settings(job_source="artifact")
wandb.init(
    project="modeling first run",
    config={
        "learning_rate": 0.01,
        "depth": 6,
        "iterations": 100,
    },
)

# Создайте объект DMatrix для обучающего и тестового набора данных
dtrain = catboost.Pool(X_train, label=y_train)
dtest = catboost.Pool(X_valid, label=y_valid)


# Задайте параметры модели
params = {
    "learning_rate": wandb.config.learning_rate,
    "depth": wandb.config.depth,
    "iterations": wandb.config.iterations,
}


[34m[1mwandb[0m: Currently logged in as: [33mtalverinat[0m ([33mloko-bank[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
model = catboost.CatBoostClassifier(**params)


In [11]:
model.fit(dtrain, eval_set=dtest, early_stopping_rounds=5, callbacks=[WandbCallback()])


0:	learn: 0.6633966	test: 0.6635001	best: 0.6635001 (0)	total: 471ms	remaining: 46.6s
1:	learn: 0.6347677	test: 0.6348592	best: 0.6348592 (1)	total: 710ms	remaining: 34.8s
2:	learn: 0.6073910	test: 0.6075012	best: 0.6075012 (2)	total: 858ms	remaining: 27.7s
3:	learn: 0.5811437	test: 0.5812835	best: 0.5812835 (3)	total: 1.01s	remaining: 24.4s
4:	learn: 0.5559421	test: 0.5560757	best: 0.5560757 (4)	total: 1.19s	remaining: 22.7s
5:	learn: 0.5316630	test: 0.5317875	best: 0.5317875 (5)	total: 1.37s	remaining: 21.5s
6:	learn: 0.5087051	test: 0.5088230	best: 0.5088230 (6)	total: 1.52s	remaining: 20.2s
7:	learn: 0.4866777	test: 0.4867712	best: 0.4867712 (7)	total: 1.65s	remaining: 19s
8:	learn: 0.4655135	test: 0.4656022	best: 0.4656022 (8)	total: 1.8s	remaining: 18.2s
9:	learn: 0.4452490	test: 0.4453186	best: 0.4453186 (9)	total: 1.98s	remaining: 17.8s
10:	learn: 0.4261143	test: 0.4261880	best: 0.4261880 (10)	total: 2.15s	remaining: 17.4s
11:	learn: 0.4077216	test: 0.4077872	best: 0.4077872 (1

<catboost.core.CatBoostClassifier at 0x126be4c10>

In [12]:
# Сделайте предсказания
y_pred = model.predict_proba(dtest)[:, 1]

# Оцените точность модели
gini_score = 2 * roc_auc_score(y_valid, y_pred) - 1

# Залогируйте метрики
wandb.log({"gini": gini_score})


In [13]:
# Функция, которую Optuna будет оптимизировать (Objective function)
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
    }

    # Подготовка данных
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    train_data = catboost.Pool(data=X_train, label=y_train)
    validation_data = catboost.Pool(data=X_val, label=y_val)

    # Создание модели с параметрами из Optuna
    model = catboost.CatBoostClassifier(**params, custom_metric=['AUC'])
    

    # Обучение модели с callback для логирования в wandb
    model.fit(train_data, eval_set=validation_data, verbose=False, callbacks=[WandbCallback()])

    # Вычисление метрики для оптимизации (AUC)
    predictions = model.predict_proba(X_val)[:, 1]
    y_pred = model.predict_proba(dtest)[:, 1]

    # Оцените точность модели
    gini_score = 2 * roc_auc_score(y_valid, y_pred) - 1

    # Залогируйте метрики
    wandb.log({"gini": gini_score})

    
    return gini_score


In [14]:
# Запуск оптимизации с помощью Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1)

# Получение лучших гиперпараметров
best_params = study.best_params
print(f"Лучшие параметры: {best_params}")


[I 2023-11-21 20:41:32,573] A new study created in memory with name: no-name-f758930b-69f3-4aee-9bd8-f6794afce8cc
[I 2023-11-21 20:43:53,617] Trial 0 finished with value: 0.9046089111027458 and parameters: {'iterations': 522, 'learning_rate': 0.10537870093309225, 'depth': 5, 'l2_leaf_reg': 0.23206919458499523}. Best is trial 0 with value: 0.9046089111027458.


Лучшие параметры: {'iterations': 522, 'learning_rate': 0.10537870093309225, 'depth': 5, 'l2_leaf_reg': 0.23206919458499523}


In [15]:
model = catboost.CatBoostClassifier(**best_params)


In [16]:
model.fit(dtrain, eval_set=dtest, early_stopping_rounds=5, callbacks=[WandbCallback()])


0:	learn: 0.4250564	test: 0.4250588	best: 0.4250588 (0)	total: 168ms	remaining: 1m 27s
1:	learn: 0.2631964	test: 0.2632327	best: 0.2632327 (1)	total: 463ms	remaining: 2m
2:	learn: 0.1681584	test: 0.1682596	best: 0.1682596 (2)	total: 604ms	remaining: 1m 44s
3:	learn: 0.1120373	test: 0.1121949	best: 0.1121949 (3)	total: 818ms	remaining: 1m 45s
4:	learn: 0.0772897	test: 0.0774704	best: 0.0774704 (4)	total: 1.03s	remaining: 1m 47s
5:	learn: 0.0567635	test: 0.0569271	best: 0.0569271 (5)	total: 1.65s	remaining: 2m 22s
6:	learn: 0.0438301	test: 0.0439655	best: 0.0439655 (6)	total: 2.16s	remaining: 2m 38s
7:	learn: 0.0354011	test: 0.0355157	best: 0.0355157 (7)	total: 2.53s	remaining: 2m 42s
8:	learn: 0.0298814	test: 0.0299879	best: 0.0299879 (8)	total: 2.77s	remaining: 2m 37s
9:	learn: 0.0257918	test: 0.0259220	best: 0.0259220 (9)	total: 3.15s	remaining: 2m 41s
10:	learn: 0.0232057	test: 0.0233333	best: 0.0233333 (10)	total: 3.52s	remaining: 2m 43s
11:	learn: 0.0212527	test: 0.0213854	best: 0.

<catboost.core.CatBoostClassifier at 0x12712b8e0>

In [17]:
# Сделайте предсказания
y_pred = model.predict_proba(dtest)[:, 1]

# Оцените точность модели
gini_score = 2 * roc_auc_score(y_valid, y_pred) - 1

# Залогируйте метрики
wandb.log({"gini": gini_score})


In [18]:
import pickle

# save the iris classification model as a pickle file
model_pkl_file = "../models/catboost_D36.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(model, file)
