In [18]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
%%shell
pip install catboost
pip install optuna





In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    log_loss
)
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold
)

from catboost import CatBoostClassifier
import optuna
from optuna.samplers import TPESampler

from scipy.stats import uniform, randint

In [21]:
base_path = '/content/drive/MyDrive/LG-Aimers/phase2'
df_train = pd.read_csv(f'{base_path}/train.csv')
df_test = pd.read_csv(f'{base_path}/submission.csv')

In [22]:
# bant_submit 전처리
df_train['bant_submit'] = df_train['bant_submit'].apply(
    lambda x: 0 if x == 0 else (1 if x == 0.25 else (2 if x == 0.5 else (3 if x == 0.75 else 4)))
)
df_test['bant_submit'] = df_test['bant_submit'].apply(
    lambda x: 0 if x == 0 else (1 if x == 0.25 else (2 if x == 0.5 else (3 if x == 0.75 else 4)))
)

# 레이블 인코딩할 칼럼들
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

# customer_job 전처리
df_all['customer_job'].replace(to_replace=['systems engineer','system engineer'], value="engineering", inplace=True)
df_all['customer_job'].replace(to_replace=['information technology', 'information_technology','it - information technology',  'it', 'information technology\u200b',], value="IT", inplace=True)
df_all['customer_job'].replace(to_replace=['human resources','human_resources','hr',], value="administration", inplace=True)
df_all['customer_job'].replace(to_replace=['director it', 'it director','director,it','director of it',"i'm directing it",'it dairector',], value="IT director", inplace=True)
df_all['customer_job'].replace(to_replace=['research and developement','research & development'], value="research and development", inplace=True)
df_all['customer_job'].replace(to_replace=['other', 'nothing','otro','others', 'otros', 'n.a','sonstiges', 'altro','egyéb','autres'], value="other", inplace=True)
df_all['customer_job'].replace(to_replace=['sales', 'sale','vendite'], value="sales", inplace=True)
df_all['customer_job'].replace(to_replace=['media and communication','media and communications', 'media_e_comunicazione','média_és_kommunikáció','medien_und_kommunikation','medios_de_comunicación','media_and_communication'],value='media and communication', inplace=True)
df_all['customer_job'].replace(to_replace=['healthcare services', 'healthcare_services'], value='healthcare services', inplace=True)
df_all['customer_job'].replace(to_replace=['pénzügy','finanzen','finanzas'], value='finance', inplace=True)
df_all['customer_job'].replace(to_replace=['arts and design','arts_and_design','art and design','művészet_és_design', 'arte_e_design','kreation und design','kreation_und_design','arte y diseño'], value='art and design', inplace=True)
df_all['customer_job'].replace(to_replace=['military and protective services','military_and_protective_services'], value='military', inplace=True)

# product category 전처리
df_all['product_category'].replace(to_replace=['etc.','outros', 'other','autre', 'khác','آخر','not specified',], value="other", inplace=True)
df_all['product_category'].replace(to_replace=['חימום','aquecimento',], value="heating", inplace=True)
df_all['product_category'].replace(to_replace=['one:quick series','lg one:quick','lg one:quick series', 'one quick works''onequick series','one quick:flex','one:quick flex','one:quick'], value="one quick", inplace=True)
df_all['product_category'].replace(to_replace=['מזגנים למקום מגורים','ar condicionado residencial','เครื่องปรับอากาศเผื่อที่อยู่อาศัย', 'aire acondicionado residencial','climatiseur résidentiel'], value="residential air conditioner", inplace=True)
df_all['product_category'].replace(to_replace=['酒店電視'], value="hotel tv", inplace=True)
df_all['product_category'].replace(to_replace=['軟體'], value="software", inplace=True)
df_all['product_category'].replace(to_replace=['multi-split (plusieurs pièces)'], value="multi-split", inplace=True)

# customer_type 전처리
df_all['customer_type'].replace(to_replace=['End-Customer', 'End Customer',], value="end customer", inplace=True)
df_all['customer_type'].replace(to_replace=['Other','Others'], value="other", inplace=True)
df_all['customer_type'].replace(to_replace=['Software/Solution Provider', 'Software / Solution Provider'], value="software/solution provider", inplace=True)

special_characters = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']

def remove_special_characters(text):
    for char in special_characters:
        text = text.replace(char, '')
    return text

for col in label_columns:
    df_all[col] = df_all[col].apply(
        lambda x: x if pd.isna(x) else remove_special_characters(x.replace(' ', '')).upper()
    )

# inquiry_type 전처리
df_all['inquiry_type'] = df_all['inquiry_type'].replace('OTHER', 'ETC').replace('OTHERS', 'ETC')

# expected_timeline 전처리

def preprocess_timeline(timeline):
    # 빈도수가 너무 높아 변형하지 않을 값
    allowed_fixed = [
        'LESSTHAN3MONTHS', '3MONTHS6MONTHS', '6MONTHS9MONTHS', '9MONTHS1YEAR', 'MORETHANAYEAR',
        'LESSTHAN6MONTHS'
    ]
    allowed_date = [
        # 기간이 올바르게 정해진 값
        'DAY', 'MONTH', 'YEAR',
        'JAN', 'FEB', 'MAR', 'APRIL', 'MAY', 'JUNE', 'JULY', 'AUGUST', 'SEPTEMBER', 'OCTOBER', 'NOVEMBER', 'DECEMBER'
    ]
    # BEINGFOLLOWEDUP으로 통합할 값
    allowed_fu = [
        'FOLLOWEDUP', 'FOLLOWINGUP', 'FUUNDERPROGRESS', 'FOLLOWUP'
    ]
    # DROPPED로 통합할 값
    allowed_dropped = [
        'DROP', 'CLOSING', 'PROBLEM', 'ISSUE'
    ]
    # NOTCONNECTED로 통합할 값
    allowed_connect = [
        'SWITCHEDOFF', 'NORESPON', 'NOTANSWER', 'NOTCONNECT', 'NOTREACH', 'NOTRESPON',
        'NTRESPON', 'NTANSWER', 'NTCON', 'NTREACH'
    ]
    # NOREQ로 통합할 값
    allowed_no_req = [
        # 기한을 요구하지 않은 값
        'NOTHAVINGANYREQUIRE', 'NOTINTEREST', 'NOREQ', 'NOINTEREST',
        'NOTREQ'
    ]
    # DISCUSSWITHCLI로 통합할 값
    allowed_discussed = [
        'DISCUSSEDWITHCLI', 'DISCUSSEDWITHTHE'
    ]
    for word in allowed_fixed:
        if word in timeline:
            return timeline
    for word in allowed_date:
        if word in timeline:
            return 'DATESPECIFIED'
    for word in allowed_fu:
        if word in timeline:
            return 'FOLLOWEDUP'
    for word in allowed_dropped:
        if word in timeline:
            return 'DROPPED'
    for word in allowed_connect:
        if word in timeline:
            return 'NOTCONNECTED'
    for word in allowed_no_req:
        if word in timeline:
            return 'NOREQ'
    for word in allowed_discussed:
        if word in timeline:
            return 'DISCUSSEDWITHCLI'
    return timeline

df_all['expected_timeline'] = df_all['expected_timeline'].apply(
    lambda x: x if pd.isna(x) else preprocess_timeline(x)
)

# business_subarea 전처리
df_all['business_subarea'] = df_all['business_subarea'].replace(
    'OTHERSTORES', 'OTHERS').replace(
    'MANUFACTURINGFACTORYPLANT', 'MANUFACTURING').replace(
    'GENERALHOSPITAL', 'HOSPITAL').replace(
    'PHARMACEUTICAL', 'PHARMACY').replace(
    'ELECTRONICSTELCO', 'TELECOMMUNICATION')

def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

# label encoding
for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

for col in label_columns:
    df_train[col] = df_all.iloc[: len(df_train)][col]
    df_test[col] = df_all.iloc[len(df_train) :][col]

In [23]:
used_features = [
    'bant_submit',
    'customer_country',
    'business_unit',
    'com_reg_ver_win_rate',
    'customer_idx',
    'customer_type',
    'enterprise',
    'historical_existing_cnt',
    'id_strategic_ver',
    'it_strategic_ver',
    'idit_strategic_ver',
    'customer_job',
    'lead_desc_length',
    'inquiry_type',
    'product_category',
    'product_subcategory',
    'product_modelname',
    'customer_country.1',
    'customer_position',
    'response_corporate',
    'expected_timeline',
    'ver_cus',
    'ver_pro',
    'ver_win_rate_x',
    'ver_win_ratio_per_bu',
    'business_area',
    'business_subarea',
    'lead_owner',
    # 'is_converted'
]

In [24]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [25]:
def f_score(beta, precision, recall):
    return (beta * beta + 1) * precision * recall / (beta * beta * precision + recall)

In [26]:
f_score(1, 0.609, 0.934), f_score(1.5, 0.609, 0.934)

(0.7372728451069345, 0.8022651622002821)

In [27]:
def objective(trial):
    param = {
        "random_state": 42,
        "iterations": trial.suggest_int("iterations", 90, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 5, 50),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 11.0, 20.0)
    }

    X_train, X_val, y_train, y_val = train_test_split(
        df_train[used_features], df_train['is_converted'].astype(int),
        test_size=0.2, stratify=df_train['is_converted'],
        random_state=42
    )

    neg_pos_ratio = np.sum(y_train == 0) / np.sum(y_train == 1)

    model = CatBoostClassifier(
        iterations=param['iterations'],
        learning_rate=None,
        depth=param['max_depth'],
        l2_leaf_reg=None,
        colsample_bylevel=param['colsample_bylevel'],
        random_seed=42,
        scale_pos_weight=neg_pos_ratio,
        early_stopping_rounds=param['early_stopping_rounds'],
        cat_features=[
            used_features.index('bant_submit'),
            used_features.index('customer_country'),
            used_features.index('business_unit'),
            used_features.index('customer_type'),
            used_features.index('enterprise'),
            used_features.index('customer_job'),
            used_features.index('inquiry_type'),
            used_features.index('product_category'),
            used_features.index('product_subcategory'),
            used_features.index('product_modelname'),
            used_features.index('customer_country.1'),
            used_features.index('customer_position'),
            used_features.index('response_corporate'),
            used_features.index('expected_timeline'),
            used_features.index('business_area'),
            used_features.index('business_subarea'),
        ],  # train set에서 categorical feature의 indices
        verbose=True
    )

    model.fit(
        X_train.fillna(-1), y_train,
        eval_set=(X_val.fillna(-1), y_val)
    )

    pred = model.predict(X_val.fillna(-1))
    precision = precision_score(y_val, pred)
    recall = recall_score(y_val, pred)
    f_res = f_score(1.5, precision, recall)

    return f_res

In [28]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name='cat_param_opt',
    direction='maximize',
    sampler=sampler
)
study.optimize(objective, n_trials=10)

[I 2024-02-25 18:40:34,903] A new study created in memory with name: cat_param_opt


Learning rate set to 0.152116
0:	learn: 0.5857130	test: 0.5883277	best: 0.5883277 (0)	total: 387ms	remaining: 1m 33s
1:	learn: 0.4636729	test: 0.4715702	best: 0.4715702 (1)	total: 800ms	remaining: 1m 36s
2:	learn: 0.4105978	test: 0.4217040	best: 0.4217040 (2)	total: 1.06s	remaining: 1m 24s
3:	learn: 0.3724031	test: 0.3846052	best: 0.3846052 (3)	total: 1.29s	remaining: 1m 17s
4:	learn: 0.3457519	test: 0.3584577	best: 0.3584577 (4)	total: 1.72s	remaining: 1m 21s
5:	learn: 0.3167055	test: 0.3326088	best: 0.3326088 (5)	total: 2.02s	remaining: 1m 19s
6:	learn: 0.2939002	test: 0.3110045	best: 0.3110045 (6)	total: 2.3s	remaining: 1m 17s
7:	learn: 0.2733087	test: 0.2916129	best: 0.2916129 (7)	total: 2.59s	remaining: 1m 16s
8:	learn: 0.2633507	test: 0.2815149	best: 0.2815149 (8)	total: 2.85s	remaining: 1m 14s
9:	learn: 0.2564200	test: 0.2753017	best: 0.2753017 (9)	total: 3.19s	remaining: 1m 14s
10:	learn: 0.2462367	test: 0.2660665	best: 0.2660665 (10)	total: 3.9s	remaining: 1m 22s
11:	learn: 0.

[I 2024-02-25 18:41:49,052] Trial 0 finished with value: 0.8034633253554836 and parameters: {'iterations': 243, 'max_depth': 8, 'colsample_bylevel': 0.8659969709057025, 'early_stopping_rounds': 32, 'scale_pos_weight': 12.404167763981928}. Best is trial 0 with value: 0.8034633253554836.


Learning rate set to 0.1855
0:	learn: 0.5894966	test: 0.5883618	best: 0.5883618 (0)	total: 48.1ms	remaining: 7.37s
1:	learn: 0.4999931	test: 0.5021990	best: 0.5021990 (1)	total: 93ms	remaining: 7.07s
2:	learn: 0.4782238	test: 0.4802314	best: 0.4802314 (2)	total: 130ms	remaining: 6.53s
3:	learn: 0.4289933	test: 0.4310082	best: 0.4310082 (3)	total: 171ms	remaining: 6.43s
4:	learn: 0.4150085	test: 0.4162940	best: 0.4162940 (4)	total: 229ms	remaining: 6.82s
5:	learn: 0.4082160	test: 0.4093356	best: 0.4093356 (5)	total: 265ms	remaining: 6.53s
6:	learn: 0.3983442	test: 0.3999827	best: 0.3999827 (6)	total: 298ms	remaining: 6.26s
7:	learn: 0.3900896	test: 0.3931008	best: 0.3931008 (7)	total: 336ms	remaining: 6.14s
8:	learn: 0.3794195	test: 0.3791611	best: 0.3791611 (8)	total: 375ms	remaining: 6.04s
9:	learn: 0.3533459	test: 0.3552944	best: 0.3552944 (9)	total: 410ms	remaining: 5.91s
10:	learn: 0.3486705	test: 0.3497315	best: 0.3497315 (10)	total: 453ms	remaining: 5.88s
11:	learn: 0.3446567	tes

[I 2024-02-25 18:41:57,356] Trial 1 finished with value: 0.7430725608970227 and parameters: {'iterations': 154, 'max_depth': 3, 'colsample_bylevel': 0.9330880728874675, 'early_stopping_rounds': 32, 'scale_pos_weight': 17.372653200164407}. Best is trial 0 with value: 0.8034633253554836.


Learning rate set to 0.225804
0:	learn: 0.5232926	test: 0.5291309	best: 0.5291309 (0)	total: 96ms	remaining: 9.31s
1:	learn: 0.3991743	test: 0.4104255	best: 0.4104255 (1)	total: 186ms	remaining: 8.91s
2:	learn: 0.3413161	test: 0.3547668	best: 0.3547668 (2)	total: 278ms	remaining: 8.81s
3:	learn: 0.3111187	test: 0.3249492	best: 0.3249492 (3)	total: 358ms	remaining: 8.42s
4:	learn: 0.2834195	test: 0.3008877	best: 0.3008877 (4)	total: 444ms	remaining: 8.26s
5:	learn: 0.2698735	test: 0.2858529	best: 0.2858529 (5)	total: 529ms	remaining: 8.12s
6:	learn: 0.2574394	test: 0.2741588	best: 0.2741588 (6)	total: 607ms	remaining: 7.89s
7:	learn: 0.2504607	test: 0.2690268	best: 0.2690268 (7)	total: 684ms	remaining: 7.69s
8:	learn: 0.2340226	test: 0.2531582	best: 0.2531582 (8)	total: 768ms	remaining: 7.6s
9:	learn: 0.2293551	test: 0.2481794	best: 0.2481794 (9)	total: 845ms	remaining: 7.44s
10:	learn: 0.2180746	test: 0.2387984	best: 0.2387984 (10)	total: 921ms	remaining: 7.28s
11:	learn: 0.2130281	tes

[I 2024-02-25 18:42:05,606] Trial 2 finished with value: 0.7945561963707976 and parameters: {'iterations': 98, 'max_depth': 8, 'colsample_bylevel': 0.9162213204002109, 'early_stopping_rounds': 14, 'scale_pos_weight': 12.636424704863906}. Best is trial 0 with value: 0.8034633253554836.


96:	learn: 0.0905242	test: 0.1805033	best: 0.1774835 (88)	total: 7.67s	remaining: 79.1ms
97:	learn: 0.0901059	test: 0.1809399	best: 0.1774835 (88)	total: 7.74s	remaining: 0us

bestTest = 0.1774834689
bestIteration = 88

Shrink model to first 89 iterations.
Learning rate set to 0.180015
0:	learn: 0.5823801	test: 0.5847448	best: 0.5847448 (0)	total: 57.2ms	remaining: 9.39s
1:	learn: 0.5412540	test: 0.5439137	best: 0.5439137 (1)	total: 156ms	remaining: 12.7s
2:	learn: 0.4677995	test: 0.4703743	best: 0.4703743 (2)	total: 223ms	remaining: 12.1s
3:	learn: 0.4480759	test: 0.4498783	best: 0.4498783 (3)	total: 292ms	remaining: 11.7s
4:	learn: 0.4363330	test: 0.4375978	best: 0.4375978 (4)	total: 379ms	remaining: 12.1s
5:	learn: 0.4239712	test: 0.4245520	best: 0.4245520 (5)	total: 454ms	remaining: 12s
6:	learn: 0.4091581	test: 0.4098302	best: 0.4098302 (6)	total: 530ms	remaining: 12s
7:	learn: 0.3968733	test: 0.3966270	best: 0.3966270 (7)	total: 610ms	remaining: 12s
8:	learn: 0.3435538	test: 0.34

[I 2024-02-25 18:42:15,094] Trial 3 finished with value: 0.7533533012110952 and parameters: {'iterations': 165, 'max_depth': 4, 'colsample_bylevel': 0.762378215816119, 'early_stopping_rounds': 24, 'scale_pos_weight': 13.621062261782377}. Best is trial 0 with value: 0.8034633253554836.


164:	learn: 0.1567845	test: 0.1862274	best: 0.1857277 (155)	total: 8.91s	remaining: 0us

bestTest = 0.1857276738
bestIteration = 155

Shrink model to first 156 iterations.
Learning rate set to 0.13127
0:	learn: 0.6164589	test: 0.6174858	best: 0.6174858 (0)	total: 68.1ms	remaining: 23.1s
1:	learn: 0.5463896	test: 0.5487516	best: 0.5487516 (1)	total: 157ms	remaining: 26.6s
2:	learn: 0.5216430	test: 0.5244555	best: 0.5244555 (2)	total: 243ms	remaining: 27.4s
3:	learn: 0.5055103	test: 0.5086101	best: 0.5086101 (3)	total: 302ms	remaining: 25.4s
4:	learn: 0.4875713	test: 0.4904643	best: 0.4904643 (4)	total: 374ms	remaining: 25.1s
5:	learn: 0.4724055	test: 0.4754264	best: 0.4754264 (5)	total: 428ms	remaining: 23.9s
6:	learn: 0.4604331	test: 0.4626492	best: 0.4626492 (6)	total: 493ms	remaining: 23.5s
7:	learn: 0.4505403	test: 0.4525227	best: 0.4525227 (7)	total: 561ms	remaining: 23.4s
8:	learn: 0.4364566	test: 0.4383754	best: 0.4383754 (8)	total: 631ms	remaining: 23.3s
9:	learn: 0.4075141	test

[I 2024-02-25 18:42:42,293] Trial 4 finished with value: 0.7677840834103208 and parameters: {'iterations': 341, 'max_depth': 3, 'colsample_bylevel': 0.6460723242676091, 'early_stopping_rounds': 21, 'scale_pos_weight': 15.104629857953324}. Best is trial 0 with value: 0.8034633253554836.


Learning rate set to 0.120902
0:	learn: 0.6097527	test: 0.6118859	best: 0.6118859 (0)	total: 108ms	remaining: 44.5s
1:	learn: 0.5263258	test: 0.5300010	best: 0.5300010 (1)	total: 212ms	remaining: 43.5s
2:	learn: 0.5020158	test: 0.5060229	best: 0.5060229 (2)	total: 329ms	remaining: 44.8s
3:	learn: 0.4543840	test: 0.4589783	best: 0.4589783 (3)	total: 430ms	remaining: 43.9s
4:	learn: 0.4383892	test: 0.4436296	best: 0.4436296 (4)	total: 529ms	remaining: 43s
5:	learn: 0.4269777	test: 0.4313428	best: 0.4313428 (5)	total: 627ms	remaining: 42.4s
6:	learn: 0.3979212	test: 0.4036158	best: 0.4036158 (6)	total: 724ms	remaining: 41.9s
7:	learn: 0.3867494	test: 0.3929169	best: 0.3929169 (7)	total: 828ms	remaining: 41.8s
8:	learn: 0.3786412	test: 0.3851863	best: 0.3851863 (8)	total: 916ms	remaining: 41s
9:	learn: 0.3715395	test: 0.3776333	best: 0.3776333 (9)	total: 1.03s	remaining: 41.5s
10:	learn: 0.3640023	test: 0.3701804	best: 0.3701804 (10)	total: 1.14s	remaining: 41.5s
11:	learn: 0.3466798	test:

[I 2024-02-25 18:43:32,853] Trial 5 finished with value: 0.7912199755733478 and parameters: {'iterations': 412, 'max_depth': 4, 'colsample_bylevel': 0.7571172192068059, 'early_stopping_rounds': 32, 'scale_pos_weight': 11.41805371447998}. Best is trial 0 with value: 0.8034633253554836.


Learning rate set to 0.131607
0:	learn: 0.6482031	test: 0.6475947	best: 0.6475947 (0)	total: 102ms	remaining: 34.6s
1:	learn: 0.6146754	test: 0.6140911	best: 0.6140911 (1)	total: 193ms	remaining: 32.5s
2:	learn: 0.5840107	test: 0.5830383	best: 0.5830383 (2)	total: 306ms	remaining: 34.3s
3:	learn: 0.5303574	test: 0.5316691	best: 0.5316691 (3)	total: 405ms	remaining: 33.9s
4:	learn: 0.4999326	test: 0.5002280	best: 0.5002280 (4)	total: 494ms	remaining: 33s
5:	learn: 0.4182529	test: 0.4223227	best: 0.4223227 (5)	total: 571ms	remaining: 31.7s
6:	learn: 0.4019253	test: 0.4067397	best: 0.4067397 (6)	total: 662ms	remaining: 31.4s
7:	learn: 0.3922453	test: 0.3960353	best: 0.3960353 (7)	total: 735ms	remaining: 30.4s
8:	learn: 0.3811635	test: 0.3856836	best: 0.3856836 (8)	total: 824ms	remaining: 30.2s
9:	learn: 0.3726781	test: 0.3778570	best: 0.3778570 (9)	total: 900ms	remaining: 29.6s
10:	learn: 0.3630441	test: 0.3675948	best: 0.3675948 (10)	total: 1s	remaining: 29.9s
11:	learn: 0.3413281	test: 

[I 2024-02-25 18:44:07,008] Trial 6 finished with value: 0.7876106194690266 and parameters: {'iterations': 339, 'max_depth': 4, 'colsample_bylevel': 0.5325257964926398, 'early_stopping_rounds': 48, 'scale_pos_weight': 19.690688297671034}. Best is trial 0 with value: 0.8034633253554836.


Learning rate set to 0.119647
0:	learn: 0.6309156	test: 0.6321690	best: 0.6321690 (0)	total: 111ms	remaining: 46.6s
1:	learn: 0.5606486	test: 0.5610853	best: 0.5610853 (1)	total: 205ms	remaining: 43s
2:	learn: 0.5054302	test: 0.5074575	best: 0.5074575 (2)	total: 280ms	remaining: 39.2s
3:	learn: 0.4818988	test: 0.4841527	best: 0.4841527 (3)	total: 394ms	remaining: 41.2s
4:	learn: 0.4663337	test: 0.4686831	best: 0.4686831 (4)	total: 498ms	remaining: 41.6s
5:	learn: 0.4307973	test: 0.4346710	best: 0.4346710 (5)	total: 580ms	remaining: 40.2s
6:	learn: 0.4157695	test: 0.4191983	best: 0.4191983 (6)	total: 696ms	remaining: 41.2s
7:	learn: 0.4030973	test: 0.4063685	best: 0.4063685 (7)	total: 793ms	remaining: 41s
8:	learn: 0.3953933	test: 0.3986806	best: 0.3986806 (8)	total: 871ms	remaining: 40s
9:	learn: 0.3835630	test: 0.3868300	best: 0.3868300 (9)	total: 958ms	remaining: 39.5s
10:	learn: 0.3751501	test: 0.3784410	best: 0.3784410 (10)	total: 1.06s	remaining: 39.7s
11:	learn: 0.3701669	test: 0

[I 2024-02-25 18:44:50,113] Trial 7 finished with value: 0.7915587072139643 and parameters: {'iterations': 422, 'max_depth': 4, 'colsample_bylevel': 0.5488360570031919, 'early_stopping_rounds': 36, 'scale_pos_weight': 14.961372443656412}. Best is trial 0 with value: 0.8034633253554836.


Learning rate set to 0.193352
0:	learn: 0.5546819	test: 0.5536744	best: 0.5536744 (0)	total: 52.8ms	remaining: 7.33s
1:	learn: 0.4684693	test: 0.4658852	best: 0.4658852 (1)	total: 102ms	remaining: 7.07s
2:	learn: 0.4226201	test: 0.4224744	best: 0.4224744 (2)	total: 152ms	remaining: 6.96s
3:	learn: 0.4047364	test: 0.4047636	best: 0.4047636 (3)	total: 200ms	remaining: 6.79s
4:	learn: 0.3905864	test: 0.3918009	best: 0.3918009 (4)	total: 243ms	remaining: 6.57s
5:	learn: 0.3736211	test: 0.3758890	best: 0.3758890 (5)	total: 306ms	remaining: 6.83s
6:	learn: 0.3492041	test: 0.3529703	best: 0.3529703 (6)	total: 343ms	remaining: 6.52s
7:	learn: 0.3384611	test: 0.3427567	best: 0.3427567 (7)	total: 382ms	remaining: 6.3s
8:	learn: 0.3307753	test: 0.3320777	best: 0.3320777 (8)	total: 433ms	remaining: 6.31s
9:	learn: 0.3151265	test: 0.3170581	best: 0.3170581 (9)	total: 478ms	remaining: 6.21s
10:	learn: 0.3031149	test: 0.3058023	best: 0.3058023 (10)	total: 517ms	remaining: 6.06s
11:	learn: 0.2966106	t

[I 2024-02-25 18:44:58,723] Trial 8 finished with value: 0.7559735973597359 and parameters: {'iterations': 140, 'max_depth': 5, 'colsample_bylevel': 0.5171942605576092, 'early_stopping_rounds': 46, 'scale_pos_weight': 13.329019834400153}. Best is trial 0 with value: 0.8034633253554836.


138:	learn: 0.1449388	test: 0.1836736	best: 0.1836736 (138)	total: 8s	remaining: 57.6ms
139:	learn: 0.1448544	test: 0.1836717	best: 0.1836717 (139)	total: 8.04s	remaining: 0us

bestTest = 0.183671715
bestIteration = 139

Learning rate set to 0.127902
0:	learn: 0.6141916	test: 0.6160113	best: 0.6160113 (0)	total: 125ms	remaining: 45.1s
1:	learn: 0.5416127	test: 0.5458238	best: 0.5458238 (1)	total: 241ms	remaining: 43.4s
2:	learn: 0.5145370	test: 0.5200851	best: 0.5200851 (2)	total: 347ms	remaining: 41.5s
3:	learn: 0.4882926	test: 0.4928142	best: 0.4928142 (3)	total: 459ms	remaining: 41.1s
4:	learn: 0.4477035	test: 0.4535270	best: 0.4535270 (4)	total: 551ms	remaining: 39.4s
5:	learn: 0.4326487	test: 0.4389848	best: 0.4389848 (5)	total: 652ms	remaining: 38.7s
6:	learn: 0.4207390	test: 0.4265112	best: 0.4265112 (6)	total: 738ms	remaining: 37.4s
7:	learn: 0.4033330	test: 0.4086471	best: 0.4086471 (7)	total: 851ms	remaining: 37.7s
8:	learn: 0.3725199	test: 0.3789334	best: 0.3789334 (8)	total

[I 2024-02-25 18:45:41,388] Trial 9 finished with value: 0.7854424357754519 and parameters: {'iterations': 362, 'max_depth': 4, 'colsample_bylevel': 0.7600340105889054, 'early_stopping_rounds': 30, 'scale_pos_weight': 12.663690099729743}. Best is trial 0 with value: 0.8034633253554836.


In [32]:
study.best_trial.params

{'iterations': 243,
 'max_depth': 8,
 'colsample_bylevel': 0.8659969709057025,
 'early_stopping_rounds': 32,
 'scale_pos_weight': 12.404167763981928}

In [33]:
study.best_value

0.8034633253554836

In [34]:
study.trials_dataframe().sort_values(by='value', ascending=False)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bylevel,params_early_stopping_rounds,params_iterations,params_max_depth,params_scale_pos_weight,state
0,0,0.803463,2024-02-25 18:40:34.906856,2024-02-25 18:41:49.051186,0 days 00:01:14.144330,0.865997,32,243,8,12.404168,COMPLETE
2,2,0.794556,2024-02-25 18:41:57.358362,2024-02-25 18:42:05.605849,0 days 00:00:08.247487,0.916221,14,98,8,12.636425,COMPLETE
7,7,0.791559,2024-02-25 18:44:07.014313,2024-02-25 18:44:50.113239,0 days 00:00:43.098926,0.548836,36,422,4,14.961372,COMPLETE
5,5,0.79122,2024-02-25 18:42:42.295526,2024-02-25 18:43:32.852646,0 days 00:00:50.557120,0.757117,32,412,4,11.418054,COMPLETE
6,6,0.787611,2024-02-25 18:43:32.861789,2024-02-25 18:44:07.007717,0 days 00:00:34.145928,0.532526,48,339,4,19.690688,COMPLETE
9,9,0.785442,2024-02-25 18:44:58.724770,2024-02-25 18:45:41.388344,0 days 00:00:42.663574,0.760034,30,362,4,12.66369,COMPLETE
4,4,0.767784,2024-02-25 18:42:15.096441,2024-02-25 18:42:42.293053,0 days 00:00:27.196612,0.646072,21,341,3,15.10463,COMPLETE
8,8,0.755974,2024-02-25 18:44:50.115285,2024-02-25 18:44:58.722812,0 days 00:00:08.607527,0.517194,46,140,5,13.32902,COMPLETE
3,3,0.753353,2024-02-25 18:42:05.610207,2024-02-25 18:42:15.094615,0 days 00:00:09.484408,0.762378,24,165,4,13.621062,COMPLETE
1,1,0.743073,2024-02-25 18:41:49.054040,2024-02-25 18:41:57.356662,0 days 00:00:08.302622,0.933088,32,154,3,17.372653,COMPLETE
