In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
!pip install rdkit
!pip install duckdb

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: rdkit
Successfully installed rdkit-2023.9.6
Collecting duckdb
  Downloading duckdb-0.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (763 bytes)
Downloading duckdb-0.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.5/18.5 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: duckdb
Successfully installed duckdb-0.10.3


In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import optuna
import lightgbm as lgb
import duckdb
import xgboost as xgb
from catboost import CatBoostClassifier
import numpy as np

In [4]:
train_path = '/kaggle/input/leash-BELKA/train.parquet'
test_path = '/kaggle/input/leash-BELKA/test.parquet'

In [5]:
con = duckdb.connect()

# 각 단백질에 대해 binds=0,1 데이터를 각각 12000개씩 불러오기
df_brd4_0 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 0 AND protein_name = 'BRD4'
    ORDER BY random()
    LIMIT 12000
""").df()

df_brd4_1 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 1 AND protein_name = 'BRD4'
    ORDER BY random()
    LIMIT 12000
""").df()

df_hsa_0 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 0 AND protein_name = 'HSA'
    ORDER BY random()
    LIMIT 12000
""").df()

df_hsa_1 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 1 AND protein_name = 'HSA'
    ORDER BY random()
    LIMIT 12000
""").df()

df_seh_0 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 0 AND protein_name = 'sEH'
    ORDER BY random()
    LIMIT 12000
""").df()

df_seh_1 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 1 AND protein_name = 'sEH'
    ORDER BY random()
    LIMIT 12000
""").df()

# 데이터프레임 결합
df = pd.concat([df_brd4_0, df_brd4_1, df_hsa_0, df_hsa_1, df_seh_0, df_seh_1], axis=0).reset_index(drop=True)
con.close()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [6]:
# ECFP 생성 함수
def generate_ecfp(molecule, radius=3, bits=2048):
    if molecule is None:
        return None
    return list(AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits))

In [7]:
# 분자의 물리화학적 특성을 추출하는 함수
def generate_physchem_features(molecule):
    if molecule is None:
        return [np.nan] * 7
    return [
        Descriptors.MolWt(molecule),                   # 분자량
        Descriptors.MolLogP(molecule),                 # 로그 P
        Descriptors.NumHDonors(molecule),              # 수소 결합 공여자 수
        Descriptors.NumHAcceptors(molecule),           # 수소 결합 수용체 수
        Descriptors.TPSA(molecule),                    # TPSA (극성 표면적)
        Descriptors.NumRotatableBonds(molecule),       # 회전 가능한 결합 수
        Descriptors.RingCount(molecule)                # 고리의 수
    ]

In [8]:
# 데이터 전처리
df['molecule'] = df['molecule_smiles'].apply(Chem.MolFromSmiles)
df['physchem'] = df['molecule'].apply(generate_physchem_features)
df['ecfp'] = df['molecule'].apply(generate_ecfp)

In [9]:
# 물리화학적 특성을 정규화
physchem_features = np.array(df['physchem'].tolist())
scaler = StandardScaler()
physchem_features_scaled = scaler.fit_transform(physchem_features)

In [10]:
# ECFP와 정규화된 물리화학적 특성을 결합
df['features'] = [ecfp + physchem_scaled.tolist() for ecfp, physchem_scaled in zip(df['ecfp'], physchem_features_scaled)]

In [11]:
# 단백질 이름을 원-핫 인코딩
onehot_encoder = OneHotEncoder(sparse_output=False)
protein_onehot = onehot_encoder.fit_transform(df['protein_name'].values.reshape(-1, 1))

In [12]:
# 최종 입력 데이터 생성
X = [features + list(protein) for features, protein in zip(df['features'].tolist(), protein_onehot.tolist())]
y = df['binds'].tolist()

In [13]:
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# LightGBM 하이퍼파라미터 최적화
def objective_lgb(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 31, 255),
    }

    model = lgb.LGBMClassifier(**param)
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    map_score = average_precision_score(y_test, y_pred_proba)
    return map_score

study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(objective_lgb, n_trials=50)
best_params_lgb = study_lgb.best_params

[I 2024-06-02 12:50:33,071] A new study created in memory with name: no-name-0242bb24-a65a-4b28-a2ab-21e1c0592f2a


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.028036 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 12:51:03,397] Trial 0 finished with value: 0.9572007582252896 and parameters: {'n_estimators': 124, 'max_depth': 6, 'learning_rate': 0.0859222580689771, 'subsample': 0.6682925681651174, 'colsample_bytree': 0.9488506342429788, 'num_leaves': 200}. Best is trial 0 with value: 0.9572007582252896.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.488959 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 12:51:50,271] Trial 1 finished with value: 0.9677296258634012 and parameters: {'n_estimators': 225, 'max_depth': 8, 'learning_rate': 0.07373430538341771, 'subsample': 0.6720872823357572, 'colsample_bytree': 0.7313661762245987, 'num_leaves': 223}. Best is trial 1 with value: 0.9677296258634012.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.569790 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 12:52:29,890] Trial 2 finished with value: 0.9708043811420604 and parameters: {'n_estimators': 157, 'max_depth': 10, 'learning_rate': 0.24228745525755482, 'subsample': 0.7926616373930826, 'colsample_bytree': 0.9170693923469794, 'num_leaves': 126}. Best is trial 2 with value: 0.9708043811420604.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.528771 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 12:53:13,077] Trial 3 finished with value: 0.9715828786845951 and parameters: {'n_estimators': 253, 'max_depth': 9, 'learning_rate': 0.13072570114174817, 'subsample': 0.7258511306183091, 'colsample_bytree': 0.7144802855451061, 'num_leaves': 174}. Best is trial 3 with value: 0.9715828786845951.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.610533 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 12:53:48,285] Trial 4 finished with value: 0.9260381634973358 and parameters: {'n_estimators': 128, 'max_depth': 4, 'learning_rate': 0.040990363563125895, 'subsample': 0.8721857823663055, 'colsample_bytree': 0.8892151338536902, 'num_leaves': 135}. Best is trial 3 with value: 0.9715828786845951.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.576636 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 12:54:32,357] Trial 5 finished with value: 0.9413420239962774 and parameters: {'n_estimators': 190, 'max_depth': 4, 'learning_rate': 0.052533708701605286, 'subsample': 0.8700683050975553, 'colsample_bytree': 0.6051058385494902, 'num_leaves': 32}. Best is trial 3 with value: 0.9715828786845951.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.547942 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 12:55:16,486] Trial 6 finished with value: 0.9717066269503403 and parameters: {'n_estimators': 280, 'max_depth': 9, 'learning_rate': 0.11444730716801396, 'subsample': 0.6614221414013876, 'colsample_bytree': 0.6986612665697279, 'num_leaves': 129}. Best is trial 6 with value: 0.9717066269503403.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.582752 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 12:55:52,653] Trial 7 finished with value: 0.9144044308338335 and parameters: {'n_estimators': 125, 'max_depth': 5, 'learning_rate': 0.016433492929003438, 'subsample': 0.7002792140237458, 'colsample_bytree': 0.8363524662278032, 'num_leaves': 156}. Best is trial 6 with value: 0.9717066269503403.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.616911 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 12:56:32,822] Trial 8 finished with value: 0.95723100731245 and parameters: {'n_estimators': 161, 'max_depth': 8, 'learning_rate': 0.03374745925376782, 'subsample': 0.6232371812048636, 'colsample_bytree': 0.8924009227867411, 'num_leaves': 105}. Best is trial 6 with value: 0.9717066269503403.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.646665 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 12:57:05,390] Trial 9 finished with value: 0.9498063128381387 and parameters: {'n_estimators': 59, 'max_depth': 5, 'learning_rate': 0.16613696436021547, 'subsample': 0.8751045353549758, 'colsample_bytree': 0.8876111786258392, 'num_leaves': 213}. Best is trial 6 with value: 0.9717066269503403.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.560365 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 12:58:09,398] Trial 10 finished with value: 0.9587182147941665 and parameters: {'n_estimators': 298, 'max_depth': 10, 'learning_rate': 0.013550905900249767, 'subsample': 0.9886956879274666, 'colsample_bytree': 0.6012925381462463, 'num_leaves': 91}. Best is trial 6 with value: 0.9717066269503403.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.504442 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 12:58:53,113] Trial 11 finished with value: 0.9716977243516937 and parameters: {'n_estimators': 292, 'max_depth': 8, 'learning_rate': 0.1309771306751198, 'subsample': 0.7586356242497526, 'colsample_bytree': 0.7132596287761032, 'num_leaves': 177}. Best is trial 6 with value: 0.9717066269503403.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.553113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 12:59:36,581] Trial 12 finished with value: 0.9709551229161961 and parameters: {'n_estimators': 288, 'max_depth': 8, 'learning_rate': 0.26536128350730115, 'subsample': 0.7680446386990318, 'colsample_bytree': 0.7149508142345458, 'num_leaves': 248}. Best is trial 6 with value: 0.9717066269503403.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.541982 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:00:17,793] Trial 13 finished with value: 0.968537421962314 and parameters: {'n_estimators': 250, 'max_depth': 7, 'learning_rate': 0.1192331594438496, 'subsample': 0.7437281042145828, 'colsample_bytree': 0.7638227938768093, 'num_leaves': 175}. Best is trial 6 with value: 0.9717066269503403.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.574709 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:01:01,184] Trial 14 finished with value: 0.9705586529228192 and parameters: {'n_estimators': 264, 'max_depth': 9, 'learning_rate': 0.09542719846234851, 'subsample': 0.6090810890128502, 'colsample_bytree': 0.6686586381131745, 'num_leaves': 73}. Best is trial 6 with value: 0.9717066269503403.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.686948 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:01:41,057] Trial 15 finished with value: 0.9708069912710955 and parameters: {'n_estimators': 217, 'max_depth': 7, 'learning_rate': 0.1805005193229722, 'subsample': 0.809558135118289, 'colsample_bytree': 0.7991569368600151, 'num_leaves': 168}. Best is trial 6 with value: 0.9717066269503403.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.688561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:02:28,223] Trial 16 finished with value: 0.9649750943495866 and parameters: {'n_estimators': 277, 'max_depth': 9, 'learning_rate': 0.031617971707842495, 'subsample': 0.6538241429981602, 'colsample_bytree': 0.6638579673586154, 'num_leaves': 125}. Best is trial 6 with value: 0.9717066269503403.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.614812 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:03:07,592] Trial 17 finished with value: 0.9600723862860147 and parameters: {'n_estimators': 216, 'max_depth': 6, 'learning_rate': 0.06558216079338579, 'subsample': 0.8358239958526618, 'colsample_bytree': 0.6716102974682965, 'num_leaves': 61}. Best is trial 6 with value: 0.9717066269503403.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.591936 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:03:54,167] Trial 18 finished with value: 0.9719440363925316 and parameters: {'n_estimators': 300, 'max_depth': 10, 'learning_rate': 0.1649362263591542, 'subsample': 0.9563702696697495, 'colsample_bytree': 0.8228943847018343, 'num_leaves': 202}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.562265 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:04:38,245] Trial 19 finished with value: 0.9707763945334331 and parameters: {'n_estimators': 238, 'max_depth': 10, 'learning_rate': 0.20802080463243144, 'subsample': 0.964658080368639, 'colsample_bytree': 0.8215370164933175, 'num_leaves': 242}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.638308 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:05:14,502] Trial 20 finished with value: 0.9583998468172754 and parameters: {'n_estimators': 192, 'max_depth': 3, 'learning_rate': 0.29356334618477065, 'subsample': 0.9339054282370287, 'colsample_bytree': 0.8571754828052492, 'num_leaves': 194}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.525720 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:05:59,892] Trial 21 finished with value: 0.971486129649108 and parameters: {'n_estimators': 300, 'max_depth': 9, 'learning_rate': 0.12772850917100742, 'subsample': 0.9254018118234951, 'colsample_bytree': 0.7757673261302178, 'num_leaves': 150}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.478006 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:06:42,916] Trial 22 finished with value: 0.9712189754026188 and parameters: {'n_estimators': 272, 'max_depth': 8, 'learning_rate': 0.15836989266261275, 'subsample': 0.7060880289801464, 'colsample_bytree': 0.9861122928188168, 'num_leaves': 189}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.521164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:07:28,818] Trial 23 finished with value: 0.9715148937938003 and parameters: {'n_estimators': 275, 'max_depth': 10, 'learning_rate': 0.101417262613249, 'subsample': 0.7692732005306954, 'colsample_bytree': 0.7580662248972921, 'num_leaves': 226}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.521563 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:08:13,093] Trial 24 finished with value: 0.9683780960487905 and parameters: {'n_estimators': 242, 'max_depth': 9, 'learning_rate': 0.05757978222937, 'subsample': 0.8281698726001004, 'colsample_bytree': 0.6375541617938444, 'num_leaves': 106}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.577371 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:08:55,374] Trial 25 finished with value: 0.9702473764030124 and parameters: {'n_estimators': 283, 'max_depth': 7, 'learning_rate': 0.14773922143433732, 'subsample': 0.910420168579828, 'colsample_bytree': 0.693827445481615, 'num_leaves': 152}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.498389 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:09:39,367] Trial 26 finished with value: 0.97119363928776 and parameters: {'n_estimators': 299, 'max_depth': 8, 'learning_rate': 0.20087166739126416, 'subsample': 0.6408320333883443, 'colsample_bytree': 0.7999212617128182, 'num_leaves': 208}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.501302 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:10:13,897] Trial 27 finished with value: 0.9630041620996176 and parameters: {'n_estimators': 62, 'max_depth': 10, 'learning_rate': 0.1083145649565976, 'subsample': 0.7124580310336766, 'colsample_bytree': 0.7350388039052961, 'num_leaves': 179}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.509662 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:10:58,526] Trial 28 finished with value: 0.9708172546932645 and parameters: {'n_estimators': 262, 'max_depth': 9, 'learning_rate': 0.0787847944879455, 'subsample': 0.7552877883566694, 'colsample_bytree': 0.6456008170661981, 'num_leaves': 164}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.563692 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:11:39,161] Trial 29 finished with value: 0.9671984572999653 and parameters: {'n_estimators': 231, 'max_depth': 7, 'learning_rate': 0.08680383077921107, 'subsample': 0.6789073130392078, 'colsample_bytree': 0.6960560515258495, 'num_leaves': 194}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.617419 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:12:14,071] Trial 30 finished with value: 0.9667460647484051 and parameters: {'n_estimators': 87, 'max_depth': 8, 'learning_rate': 0.2038313194562549, 'subsample': 0.9967389634124213, 'colsample_bytree': 0.8427619913783587, 'num_leaves': 116}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.622023 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:12:57,582] Trial 31 finished with value: 0.971302149130769 and parameters: {'n_estimators': 250, 'max_depth': 9, 'learning_rate': 0.1368217628688631, 'subsample': 0.7266715063107552, 'colsample_bytree': 0.742974575521913, 'num_leaves': 141}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.493653 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:13:41,436] Trial 32 finished with value: 0.9714512961527888 and parameters: {'n_estimators': 261, 'max_depth': 9, 'learning_rate': 0.12073339952511769, 'subsample': 0.6901391507416064, 'colsample_bytree': 0.7087204419397712, 'num_leaves': 230}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.600509 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:14:27,088] Trial 33 finished with value: 0.9718145195184935 and parameters: {'n_estimators': 281, 'max_depth': 10, 'learning_rate': 0.23802729067527886, 'subsample': 0.7908496176622076, 'colsample_bytree': 0.7805856529952236, 'num_leaves': 177}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.624964 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:15:13,734] Trial 34 finished with value: 0.97171661558898 and parameters: {'n_estimators': 286, 'max_depth': 10, 'learning_rate': 0.2377672221107912, 'subsample': 0.7892778002448114, 'colsample_bytree': 0.8049224053071052, 'num_leaves': 214}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.532935 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:15:59,169] Trial 35 finished with value: 0.9715090855046266 and parameters: {'n_estimators': 278, 'max_depth': 10, 'learning_rate': 0.21963252179824355, 'subsample': 0.8000538023670655, 'colsample_bytree': 0.7802763712406479, 'num_leaves': 213}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.606584 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:16:40,697] Trial 36 finished with value: 0.9709350607240064 and parameters: {'n_estimators': 202, 'max_depth': 10, 'learning_rate': 0.2394946208391635, 'subsample': 0.8406911777957984, 'colsample_bytree': 0.817216432326843, 'num_leaves': 202}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.619298 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:17:20,027] Trial 37 finished with value: 0.9705628631898966 and parameters: {'n_estimators': 147, 'max_depth': 10, 'learning_rate': 0.2911832985481902, 'subsample': 0.7837723578609862, 'colsample_bytree': 0.8691881393869564, 'num_leaves': 231}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.648902 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:18:04,859] Trial 38 finished with value: 0.9709510478057088 and parameters: {'n_estimators': 264, 'max_depth': 10, 'learning_rate': 0.17705499708769992, 'subsample': 0.8976976532672545, 'colsample_bytree': 0.7900713825891303, 'num_leaves': 187}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.518365 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:18:41,449] Trial 39 finished with value: 0.9687942935975669 and parameters: {'n_estimators': 108, 'max_depth': 9, 'learning_rate': 0.26426782089886236, 'subsample': 0.8583543162797205, 'colsample_bytree': 0.9141541501340179, 'num_leaves': 129}. Best is trial 18 with value: 0.9719440363925316.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.506796 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:19:27,587] Trial 40 finished with value: 0.9722327871148015 and parameters: {'n_estimators': 286, 'max_depth': 10, 'learning_rate': 0.1782248298545351, 'subsample': 0.7313703841846635, 'colsample_bytree': 0.8227076996034106, 'num_leaves': 218}. Best is trial 40 with value: 0.9722327871148015.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.530023 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:20:13,872] Trial 41 finished with value: 0.9711677871748582 and parameters: {'n_estimators': 287, 'max_depth': 10, 'learning_rate': 0.2339792763209856, 'subsample': 0.7368748140006612, 'colsample_bytree': 0.8215966610115182, 'num_leaves': 255}. Best is trial 40 with value: 0.9722327871148015.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.644092 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:21:00,824] Trial 42 finished with value: 0.9721747974531871 and parameters: {'n_estimators': 286, 'max_depth': 10, 'learning_rate': 0.16558697071319547, 'subsample': 0.8114303254448518, 'colsample_bytree': 0.8441876649151097, 'num_leaves': 216}. Best is trial 40 with value: 0.9722327871148015.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.543514 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:21:45,815] Trial 43 finished with value: 0.970586440334581 and parameters: {'n_estimators': 270, 'max_depth': 10, 'learning_rate': 0.17960973449961537, 'subsample': 0.8094642533386665, 'colsample_bytree': 0.8480204234329454, 'num_leaves': 214}. Best is trial 40 with value: 0.9722327871148015.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.646808 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:22:32,301] Trial 44 finished with value: 0.9723988412971686 and parameters: {'n_estimators': 287, 'max_depth': 10, 'learning_rate': 0.15361272393386424, 'subsample': 0.7801838829760902, 'colsample_bytree': 0.8664852897313672, 'num_leaves': 237}. Best is trial 44 with value: 0.9723988412971686.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.480551 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:23:17,165] Trial 45 finished with value: 0.9718342503513341 and parameters: {'n_estimators': 253, 'max_depth': 10, 'learning_rate': 0.15025319714838511, 'subsample': 0.8253254453104464, 'colsample_bytree': 0.8720646820300323, 'num_leaves': 236}. Best is trial 44 with value: 0.9723988412971686.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.693248 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:24:01,923] Trial 46 finished with value: 0.9720276469027916 and parameters: {'n_estimators': 258, 'max_depth': 10, 'learning_rate': 0.15355748967524374, 'subsample': 0.8217384637669813, 'colsample_bytree': 0.9310088903250457, 'num_leaves': 235}. Best is trial 44 with value: 0.9723988412971686.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.465903 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:24:49,794] Trial 47 finished with value: 0.9673079284320739 and parameters: {'n_estimators': 294, 'max_depth': 9, 'learning_rate': 0.04183752257545602, 'subsample': 0.9606132398741215, 'colsample_bytree': 0.9367182647208648, 'num_leaves': 221}. Best is trial 44 with value: 0.9723988412971686.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.550563 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:25:31,943] Trial 48 finished with value: 0.9343248061280454 and parameters: {'n_estimators': 254, 'max_depth': 5, 'learning_rate': 0.017335936156931717, 'subsample': 0.8885571777813396, 'colsample_bytree': 0.9678156464776111, 'num_leaves': 243}. Best is trial 44 with value: 0.9723988412971686.


[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.662824 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


[I 2024-06-02 13:26:12,662] Trial 49 finished with value: 0.9687966393064786 and parameters: {'n_estimators': 177, 'max_depth': 9, 'learning_rate': 0.09327481204825568, 'subsample': 0.8535591746892414, 'colsample_bytree': 0.9136464498184963, 'num_leaves': 252}. Best is trial 44 with value: 0.9723988412971686.


In [15]:
# XGBoost 하이퍼파라미터 최적화
def objective_xgb(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
    }

    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    map_score = average_precision_score(y_test, y_pred_proba)
    return map_score

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=50)
best_params_xgb = study_xgb.best_params

[I 2024-06-02 13:26:13,179] A new study created in memory with name: no-name-641016ea-76ec-424b-a770-413059e7b270
[I 2024-06-02 13:30:22,897] Trial 0 finished with value: 0.9709088779394104 and parameters: {'n_estimators': 222, 'max_depth': 10, 'learning_rate': 0.2023389591601935, 'subsample': 0.6709054059133913, 'colsample_bytree': 0.7306175080935562}. Best is trial 0 with value: 0.9709088779394104.
[I 2024-06-02 13:33:43,870] Trial 1 finished with value: 0.960872985690399 and parameters: {'n_estimators': 147, 'max_depth': 8, 'learning_rate': 0.05429783359347274, 'subsample': 0.7878962711205596, 'colsample_bytree': 0.9025281914806954}. Best is trial 0 with value: 0.9709088779394104.
[I 2024-06-02 13:37:03,012] Trial 2 finished with value: 0.9594618774521935 and parameters: {'n_estimators': 140, 'max_depth': 8, 'learning_rate': 0.04774858865521196, 'subsample': 0.614962471065997, 'colsample_bytree': 0.9821638024854997}. Best is trial 0 with value: 0.9709088779394104.
[I 2024-06-02 13:3

In [16]:
# CatBoost 하이퍼파라미터 최적화
def objective_cat(trial):
    param = {
        'iterations': trial.suggest_int('iterations', 50, 300),
        'depth': trial.suggest_int('depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
    }

    model = CatBoostClassifier(**param, verbose=0)
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    map_score = average_precision_score(y_test, y_pred_proba)
    return map_score

study_cat = optuna.create_study(direction='maximize')
study_cat.optimize(objective_cat, n_trials=50)
best_params_cat = study_cat.best_params

[I 2024-06-02 16:49:51,446] A new study created in memory with name: no-name-88a7ede1-4cde-4d54-b937-fb210d65b462
[I 2024-06-02 16:52:23,651] Trial 0 finished with value: 0.9683394018448709 and parameters: {'iterations': 262, 'depth': 7, 'learning_rate': 0.2170632940657459, 'l2_leaf_reg': 0.07105089587273596, 'border_count': 198}. Best is trial 0 with value: 0.9683394018448709.
[I 2024-06-02 16:54:03,366] Trial 1 finished with value: 0.9369968393090451 and parameters: {'iterations': 145, 'depth': 3, 'learning_rate': 0.15782463446468453, 'l2_leaf_reg': 0.00520947801026005, 'border_count': 240}. Best is trial 0 with value: 0.9683394018448709.
[I 2024-06-02 16:56:07,193] Trial 2 finished with value: 0.9514326458963398 and parameters: {'iterations': 82, 'depth': 8, 'learning_rate': 0.10567422319758611, 'l2_leaf_reg': 0.1302810665457657, 'border_count': 41}. Best is trial 0 with value: 0.9683394018448709.
[I 2024-06-02 16:58:39,226] Trial 3 finished with value: 0.9155197867848557 and parame

In [17]:
# 최적의 파라미터로 모델 학습
model_lgb = lgb.LGBMClassifier(**best_params_lgb)
model_lgb.fit(X_train, y_train)

model_xgb = xgb.XGBClassifier(**best_params_xgb)
model_xgb.fit(X_train, y_train)

model_cat = CatBoostClassifier(**best_params_cat, verbose=0)
model_cat.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 28821, number of negative: 28779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.527230 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4881
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 2040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500365 -> initscore=0.001458
[LightGBM] [Info] Start training from score 0.001458


<catboost.core.CatBoostClassifier at 0x7cf2c74ab5e0>

In [18]:
# 앙상블 예측 함수
def ensemble_predict_proba(models, X):
    predictions = np.zeros(len(X))
    for model in models:
        predictions += model.predict_proba(X)[:, 1]
    return predictions / len(models)

# 테스트 데이터 예측 및 저장
test_file = '/kaggle/input/leash-BELKA/test.csv'  
output_file = '/kaggle/working/submission6.csv'  # 출력 파일 경로

In [19]:
import os

models = [model_lgb, model_xgb, model_cat]

for df_test in pd.read_csv(test_file, chunksize=100000):
    df_test['molecule'] = df_test['molecule_smiles'].apply(Chem.MolFromSmiles)
    df_test['ecfp'] = df_test['molecule'].apply(generate_ecfp)
    df_test['physchem'] = df_test['molecule'].apply(generate_physchem_features)
    
    # 테스트 데이터의 물리화학적 특성 정규화
    physchem_features_test = np.array(df_test['physchem'].tolist())
    physchem_features_test_scaled = scaler.transform(physchem_features_test)
    
    df_test['features'] = [ecfp + physchem_scaled.tolist() for ecfp, physchem_scaled in zip(df_test['ecfp'], physchem_features_test_scaled)]
    protein_onehot = onehot_encoder.transform(df_test['protein_name'].values.reshape(-1, 1))
    X_test = [features + list(protein) for features, protein in zip(df_test['features'].tolist(), protein_onehot.tolist())]
    
    probabilities = ensemble_predict_proba(models, X_test)
    output_df = pd.DataFrame({'id': df_test['id'], 'binds': probabilities})
    output_df.to_csv(output_file, index=False, mode='a', header=not os.path.exists(output_file))