In [1]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2023.9.6


In [2]:
!pip install duckdb

Collecting duckdb
  Downloading duckdb-0.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (763 bytes)
Downloading duckdb-0.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.5/18.5 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: duckdb
Successfully installed duckdb-0.10.3


In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import OneHotEncoder
import optuna
import lightgbm as lgb
import numpy as np

In [4]:
import duckdb
import pandas as pd

train_path = '/kaggle/input/leash-predict-chemical-bindings/train.parquet'
test_path = '/kaggle/input/leash-predict-chemical-bindings/test.parquet'

con = duckdb.connect()

df = con.query(f"""(SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 0
                        ORDER BY random()
                        LIMIT 35000)
                        UNION ALL
                        (SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 1
                        ORDER BY random()
                        LIMIT 35000)""").df()

con.close()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [5]:
df.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
0,98216988,O=C(N[C@@H](Cc1ccc(F)cc1)C(=O)O)OCC1c2ccccc2-c...,CNC(=O)c1ccc(N)cc1F,Cl.NCCC1CN(c2ncnc3[nH]ncc23)c2ccccc21,CNC(=O)c1ccc(Nc2nc(NCCC3CN(c4ncnc5[nH]ncc45)c4...,BRD4,0
1,278543698,O=C(O)Cc1ccc(NC(=O)OCC2c3ccccc3-c3ccccc32)cc1,NCC(O)COc1ccccc1Br,N#Cc1ncc(N)cc1C(F)(F)F,N#Cc1ncc(Nc2nc(NCC(O)COc3ccccc3Br)nc(Nc3ccc(CC...,HSA,0
2,145368917,O=C(Nc1c(Cl)cc(Cl)nc1C(=O)O)OCC1c2ccccc2-c2ccc...,Cl.Cl.NCCC(=O)Nc1ccncc1,Cc1ccc(C)c(CN)n1,Cc1ccc(C)c(CNc2nc(NCCC(=O)Nc3ccncc3)nc(Nc3c(Cl...,sEH,0
3,163978090,O=C(Nc1cc(C(=O)O)ccc1Br)OCC1c2ccccc2-c2ccccc21,Cl.NCC1CC(C(N)=O)=NO1,COc1cc(C)c(N)cn1,COc1cc(C)c(Nc2nc(NCC3CC(C(N)=O)=NO3)nc(Nc3cc(C...,HSA,0
4,166294371,O=C(Nc1cc(C(=O)O)ccc1C(F)(F)F)OCC1c2ccccc2-c2c...,Cn1nccc1CN1C[C@@H](F)C[C@H]1CN,Cc1cc(Cl)nnc1N,Cc1cc(Cl)nnc1Nc1nc(NC[C@@H]2C[C@H](F)CN2Cc2ccn...,BRD4,0


In [6]:
# ECFP 생성 함수
def generate_ecfp(molecule, radius=2, bits=2048):
    if molecule is None:
        return None
    return list(AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits))

# 분자의 물리화학적 특성을 추출하는 함수
def generate_physchem_features(molecule):
    if molecule is None:
        return [np.nan] * 12
    return [
        Descriptors.MolWt(molecule),                   # 분자량
        Descriptors.MolLogP(molecule),                 # 로그 P
        Descriptors.NumHDonors(molecule),              # 수소 결합 공여자 수
        Descriptors.NumHAcceptors(molecule),           # 수소 결합 수용체 수
        Descriptors.TPSA(molecule)                     # TPSA (극성 표면적)
    ]

In [7]:
# 데이터 전처리
df['molecule'] = df['molecule_smiles'].apply(Chem.MolFromSmiles)
df['physchem'] = df['molecule'].apply(generate_physchem_features)
df['ecfp'] = df['molecule'].apply(generate_ecfp)

In [8]:
# ECFP와 물리화학적 특성을 결합
df['features'] = df.apply(lambda row: row['ecfp'] + row['physchem'], axis=1)

In [9]:
# 단백질 이름을 원-핫 인코딩
onehot_encoder = OneHotEncoder(sparse_output=False)
protein_onehot = onehot_encoder.fit_transform(df['protein_name'].values.reshape(-1, 1))

## Train Model

In [10]:
# 최종 입력 데이터 생성
X = [features + list(protein) for features, protein in zip(df['features'].tolist(), protein_onehot.tolist())]
y = df['binds'].tolist()

In [11]:
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Optuna를 사용하여 하이퍼파라미터 최적화
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 31, 255),

    }

    model = lgb.LGBMClassifier(**param)
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    map_score = average_precision_score(y_test, y_pred_proba)
    return map_score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[I 2024-05-31 06:38:37,391] A new study created in memory with name: no-name-4031ef84-b5ef-4503-8a02-d6d1c33c7d31
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.519117 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:39:05,363] Trial 0 finished with value: 0.9652201206769182 and parameters: {'n_estimators': 99, 'max_depth': 8, 'learning_rate': 0.19171048600266727, 'subsample': 0.9835131250866096, 'colsample_bytree': 0.9691701393277286, 'num_leaves': 163}. Best is trial 0 with value: 0.9652201206769182.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.931588 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:39:41,900] Trial 1 finished with value: 0.9638720843661411 and parameters: {'n_estimators': 188, 'max_depth': 8, 'learning_rate': 0.06703000994099725, 'subsample': 0.9985354371708882, 'colsample_bytree': 0.9630258395923298, 'num_leaves': 39}. Best is trial 0 with value: 0.9652201206769182.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.993874 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:40:11,837] Trial 2 finished with value: 0.9060393045822148 and parameters: {'n_estimators': 83, 'max_depth': 6, 'learning_rate': 0.01020766509054991, 'subsample': 0.9705647825653804, 'colsample_bytree': 0.6789401010407619, 'num_leaves': 45}. Best is trial 0 with value: 0.9652201206769182.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.966816 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:40:42,452] Trial 3 finished with value: 0.925034744799915 and parameters: {'n_estimators': 124, 'max_depth': 5, 'learning_rate': 0.0274657508755815, 'subsample': 0.8632719671731257, 'colsample_bytree': 0.7563465286633466, 'num_leaves': 94}. Best is trial 0 with value: 0.9652201206769182.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.968757 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:41:12,529] Trial 4 finished with value: 0.9552143785289513 and parameters: {'n_estimators': 75, 'max_depth': 9, 'learning_rate': 0.0566062124474121, 'subsample': 0.9451489437493216, 'colsample_bytree': 0.8302819039423414, 'num_leaves': 171}. Best is trial 0 with value: 0.9652201206769182.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.011116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:41:51,208] Trial 5 finished with value: 0.9532405792790661 and parameters: {'n_estimators': 297, 'max_depth': 9, 'learning_rate': 0.0148196610952377, 'subsample': 0.8748109796847269, 'colsample_bytree': 0.7269815382869679, 'num_leaves': 43}. Best is trial 0 with value: 0.9652201206769182.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.005209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:42:20,690] Trial 6 finished with value: 0.925454905514708 and parameters: {'n_estimators': 122, 'max_depth': 3, 'learning_rate': 0.0740267558005112, 'subsample': 0.7788679560338126, 'colsample_bytree': 0.8021162050172849, 'num_leaves': 33}. Best is trial 0 with value: 0.9652201206769182.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.991523 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:42:59,064] Trial 7 finished with value: 0.9533248487678911 and parameters: {'n_estimators': 201, 'max_depth': 10, 'learning_rate': 0.01652363326885563, 'subsample': 0.7727474187545108, 'colsample_bytree': 0.869394431691106, 'num_leaves': 95}. Best is trial 0 with value: 0.9652201206769182.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.962310 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:43:30,947] Trial 8 finished with value: 0.902147620684344 and parameters: {'n_estimators': 237, 'max_depth': 3, 'learning_rate': 0.01841688025528152, 'subsample': 0.9394528453027904, 'colsample_bytree': 0.6927465205117176, 'num_leaves': 225}. Best is trial 0 with value: 0.9652201206769182.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.944317 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:44:06,145] Trial 9 finished with value: 0.9664177858043663 and parameters: {'n_estimators': 220, 'max_depth': 9, 'learning_rate': 0.05839394340707794, 'subsample': 0.8677891980671255, 'colsample_bytree': 0.8506402698166184, 'num_leaves': 137}. Best is trial 9 with value: 0.9664177858043663.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.944010 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:44:48,121] Trial 10 finished with value: 0.9699919089944906 and parameters: {'n_estimators': 259, 'max_depth': 7, 'learning_rate': 0.17872575842790986, 'subsample': 0.6439978936087959, 'colsample_bytree': 0.6006652340045533, 'num_leaves': 242}. Best is trial 10 with value: 0.9699919089944906.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.072610 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:45:27,647] Trial 11 finished with value: 0.9704092749359812 and parameters: {'n_estimators': 260, 'max_depth': 7, 'learning_rate': 0.24070711730673242, 'subsample': 0.6251814486099299, 'colsample_bytree': 0.6080809273658964, 'num_leaves': 248}. Best is trial 11 with value: 0.9704092749359812.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.981804 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:46:09,064] Trial 12 finished with value: 0.9694193950996116 and parameters: {'n_estimators': 274, 'max_depth': 6, 'learning_rate': 0.2641156703596831, 'subsample': 0.6122066697092471, 'colsample_bytree': 0.6033986609223295, 'num_leaves': 248}. Best is trial 11 with value: 0.9704092749359812.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.987346 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:46:50,840] Trial 13 finished with value: 0.9691012441466589 and parameters: {'n_estimators': 254, 'max_depth': 7, 'learning_rate': 0.13345481440642878, 'subsample': 0.6119966357646554, 'colsample_bytree': 0.6006584336740103, 'num_leaves': 219}. Best is trial 11 with value: 0.9704092749359812.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.971194 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:47:21,545] Trial 14 finished with value: 0.9594691213142035 and parameters: {'n_estimators': 157, 'max_depth': 5, 'learning_rate': 0.12006438194262915, 'subsample': 0.6855682429553982, 'colsample_bytree': 0.6486931812878307, 'num_leaves': 252}. Best is trial 11 with value: 0.9704092749359812.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.948195 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:47:57,502] Trial 15 finished with value: 0.9706939722472142 and parameters: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.23638863016137007, 'subsample': 0.6792533174091213, 'colsample_bytree': 0.6402655844110696, 'num_leaves': 210}. Best is trial 15 with value: 0.9706939722472142.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.166914 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:48:31,730] Trial 16 finished with value: 0.9692764470522988 and parameters: {'n_estimators': 297, 'max_depth': 5, 'learning_rate': 0.2342799846929056, 'subsample': 0.707152980028102, 'colsample_bytree': 0.6555602299160355, 'num_leaves': 196}. Best is trial 15 with value: 0.9706939722472142.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.960961 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:49:06,007] Trial 17 finished with value: 0.9694577274991576 and parameters: {'n_estimators': 226, 'max_depth': 8, 'learning_rate': 0.108825536499696, 'subsample': 0.7222128605563857, 'colsample_bytree': 0.7651768578720334, 'num_leaves': 196}. Best is trial 15 with value: 0.9706939722472142.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.974921 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:49:39,345] Trial 18 finished with value: 0.9660524089859416 and parameters: {'n_estimators': 275, 'max_depth': 4, 'learning_rate': 0.28424587285062775, 'subsample': 0.666028354109556, 'colsample_bytree': 0.7115046346355636, 'num_leaves': 205}. Best is trial 15 with value: 0.9706939722472142.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.982704 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:50:11,957] Trial 19 finished with value: 0.9502765154348951 and parameters: {'n_estimators': 157, 'max_depth': 7, 'learning_rate': 0.032092676062163775, 'subsample': 0.739225271081247, 'colsample_bytree': 0.6391915817585063, 'num_leaves': 138}. Best is trial 15 with value: 0.9706939722472142.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.972489 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:50:46,831] Trial 20 finished with value: 0.9666087057785242 and parameters: {'n_estimators': 297, 'max_depth': 6, 'learning_rate': 0.0933442823079782, 'subsample': 0.6387222243275174, 'colsample_bytree': 0.75315660727879, 'num_leaves': 170}. Best is trial 15 with value: 0.9706939722472142.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.983090 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:51:22,903] Trial 21 finished with value: 0.9702106112732737 and parameters: {'n_estimators': 254, 'max_depth': 7, 'learning_rate': 0.17083206915781424, 'subsample': 0.6618331692933761, 'colsample_bytree': 0.6275572720733291, 'num_leaves': 233}. Best is trial 15 with value: 0.9706939722472142.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.978759 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:51:57,639] Trial 22 finished with value: 0.9699830774092596 and parameters: {'n_estimators': 251, 'max_depth': 7, 'learning_rate': 0.16519259090242056, 'subsample': 0.6819933836295643, 'colsample_bytree': 0.6382909500938224, 'num_leaves': 222}. Best is trial 15 with value: 0.9706939722472142.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.963875 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:52:33,119] Trial 23 finished with value: 0.970255470358079 and parameters: {'n_estimators': 277, 'max_depth': 8, 'learning_rate': 0.20641812557831757, 'subsample': 0.6001348928537785, 'colsample_bytree': 0.9124212984893227, 'num_leaves': 233}. Best is trial 15 with value: 0.9706939722472142.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.986986 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:53:08,644] Trial 24 finished with value: 0.9708391678857494 and parameters: {'n_estimators': 284, 'max_depth': 8, 'learning_rate': 0.22343974480023457, 'subsample': 0.6011854094755926, 'colsample_bytree': 0.9022548121586703, 'num_leaves': 255}. Best is trial 24 with value: 0.9708391678857494.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.936962 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:53:43,063] Trial 25 finished with value: 0.9710234220968118 and parameters: {'n_estimators': 210, 'max_depth': 10, 'learning_rate': 0.28745283131613203, 'subsample': 0.6379929447414436, 'colsample_bytree': 0.8980425068985276, 'num_leaves': 254}. Best is trial 25 with value: 0.9710234220968118.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.170133 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:54:17,856] Trial 26 finished with value: 0.9705389040229249 and parameters: {'n_estimators': 208, 'max_depth': 10, 'learning_rate': 0.1482690263729586, 'subsample': 0.7469450608421382, 'colsample_bytree': 0.907464904562953, 'num_leaves': 208}. Best is trial 25 with value: 0.9710234220968118.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.912034 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:54:46,610] Trial 27 finished with value: 0.965544933961611 and parameters: {'n_estimators': 55, 'max_depth': 10, 'learning_rate': 0.29995295791889426, 'subsample': 0.8219855892351837, 'colsample_bytree': 0.93284431269528, 'num_leaves': 255}. Best is trial 25 with value: 0.9710234220968118.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.958340 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:55:19,854] Trial 28 finished with value: 0.9670820543371297 and parameters: {'n_estimators': 173, 'max_depth': 9, 'learning_rate': 0.09522362048870682, 'subsample': 0.7004906901936773, 'colsample_bytree': 0.9923973513690179, 'num_leaves': 179}. Best is trial 25 with value: 0.9710234220968118.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.933252 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:55:57,047] Trial 29 finished with value: 0.963207800003276 and parameters: {'n_estimators': 281, 'max_depth': 8, 'learning_rate': 0.037761173796576765, 'subsample': 0.6525476864565394, 'colsample_bytree': 0.8855120785663644, 'num_leaves': 153}. Best is trial 25 with value: 0.9710234220968118.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.966221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:56:31,550] Trial 30 finished with value: 0.9710291195212355 and parameters: {'n_estimators': 236, 'max_depth': 9, 'learning_rate': 0.20057030593593406, 'subsample': 0.6360352361965923, 'colsample_bytree': 0.8177503728230144, 'num_leaves': 111}. Best is trial 30 with value: 0.9710291195212355.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.970332 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:57:05,908] Trial 31 finished with value: 0.9711493794126911 and parameters: {'n_estimators': 225, 'max_depth': 9, 'learning_rate': 0.2111418551263328, 'subsample': 0.6348983477917665, 'colsample_bytree': 0.8304685961982354, 'num_leaves': 111}. Best is trial 31 with value: 0.9711493794126911.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.961458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:57:40,482] Trial 32 finished with value: 0.9715741869326938 and parameters: {'n_estimators': 238, 'max_depth': 9, 'learning_rate': 0.20700096452492145, 'subsample': 0.6381176369635262, 'colsample_bytree': 0.8127032950711086, 'num_leaves': 105}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.968161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:58:14,313] Trial 33 finished with value: 0.9708844270725236 and parameters: {'n_estimators': 195, 'max_depth': 10, 'learning_rate': 0.18854644514425384, 'subsample': 0.6485401895685466, 'colsample_bytree': 0.8146032740640532, 'num_leaves': 115}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.017189 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:58:49,321] Trial 34 finished with value: 0.9709048560446445 and parameters: {'n_estimators': 234, 'max_depth': 9, 'learning_rate': 0.14726166680920635, 'subsample': 0.6393237957006992, 'colsample_bytree': 0.7811538016364192, 'num_leaves': 68}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.967344 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:59:24,051] Trial 35 finished with value: 0.9678937970841461 and parameters: {'n_estimators': 180, 'max_depth': 10, 'learning_rate': 0.0813371691460515, 'subsample': 0.6297252529028118, 'colsample_bytree': 0.8453781580112204, 'num_leaves': 117}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.974531 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 06:59:58,857] Trial 36 finished with value: 0.9635564516214986 and parameters: {'n_estimators': 210, 'max_depth': 9, 'learning_rate': 0.04305816969949548, 'subsample': 0.7109032442929457, 'colsample_bytree': 0.8263874449704229, 'num_leaves': 76}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.970843 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 07:00:33,014] Trial 37 finished with value: 0.9706771219771692 and parameters: {'n_estimators': 220, 'max_depth': 9, 'learning_rate': 0.1944962405225732, 'subsample': 0.7331054280581751, 'colsample_bytree': 0.9424269697815446, 'num_leaves': 114}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.014055 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 07:01:08,097] Trial 38 finished with value: 0.9708899944641115 and parameters: {'n_estimators': 239, 'max_depth': 10, 'learning_rate': 0.2984800733258963, 'subsample': 0.675894003884023, 'colsample_bytree': 0.7928822051855386, 'num_leaves': 97}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.991452 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 07:01:41,102] Trial 39 finished with value: 0.967945278201471 and parameters: {'n_estimators': 188, 'max_depth': 8, 'learning_rate': 0.11546213202437808, 'subsample': 0.7580921484918322, 'colsample_bytree': 0.864297771898044, 'num_leaves': 74}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.985259 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 07:02:15,771] Trial 40 finished with value: 0.9536290528979763 and parameters: {'n_estimators': 164, 'max_depth': 9, 'learning_rate': 0.02199535636952915, 'subsample': 0.8103139705457161, 'colsample_bytree': 0.738027455002132, 'num_leaves': 126}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.987374 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 07:02:50,032] Trial 41 finished with value: 0.9702832093947761 and parameters: {'n_estimators': 234, 'max_depth': 9, 'learning_rate': 0.14380929777331164, 'subsample': 0.6265898946592492, 'colsample_bytree': 0.7855249149319529, 'num_leaves': 68}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.987406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 07:03:24,285] Trial 42 finished with value: 0.9712751577792944 and parameters: {'n_estimators': 242, 'max_depth': 9, 'learning_rate': 0.16433513240958958, 'subsample': 0.6608663399652774, 'colsample_bytree': 0.7806249229122932, 'num_leaves': 54}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.001578 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 07:03:57,979] Trial 43 finished with value: 0.9714232439542989 and parameters: {'n_estimators': 213, 'max_depth': 10, 'learning_rate': 0.21020552177976734, 'subsample': 0.6954911761196891, 'colsample_bytree': 0.8127571145611854, 'num_leaves': 49}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.033847 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 07:04:32,426] Trial 44 finished with value: 0.9703766679779198 and parameters: {'n_estimators': 243, 'max_depth': 8, 'learning_rate': 0.20203866177203575, 'subsample': 0.6954486663496664, 'colsample_bytree': 0.8104413441237192, 'num_leaves': 49}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.022141 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 07:05:09,900] Trial 45 finished with value: 0.9473834238097467 and parameters: {'n_estimators': 224, 'max_depth': 10, 'learning_rate': 0.011609788195536064, 'subsample': 0.6575491976574045, 'colsample_bytree': 0.8346024887300347, 'num_leaves': 54}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.003422 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 07:05:45,631] Trial 46 finished with value: 0.9710377086539214 and parameters: {'n_estimators': 264, 'max_depth': 9, 'learning_rate': 0.16880319572152788, 'subsample': 0.6196143969779352, 'colsample_bytree': 0.7730415941615023, 'num_leaves': 87}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.036656 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 07:06:20,852] Trial 47 finished with value: 0.9714564123653937 and parameters: {'n_estimators': 262, 'max_depth': 9, 'learning_rate': 0.16358492240871536, 'subsample': 0.6150235210451214, 'colsample_bytree': 0.7734409216876837, 'num_leaves': 85}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.999262 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 07:06:56,257] Trial 48 finished with value: 0.966933237622045 and parameters: {'n_estimators': 267, 'max_depth': 10, 'learning_rate': 0.06708746924261558, 'subsample': 0.834225272951177, 'colsample_bytree': 0.7227821701882273, 'num_leaves': 32}. Best is trial 32 with value: 0.9715741869326938.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),


[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.006526 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


[I 2024-05-31 07:07:30,609] Trial 49 finished with value: 0.9701414178570871 and parameters: {'n_estimators': 250, 'max_depth': 8, 'learning_rate': 0.1298417955682621, 'subsample': 0.6674185913849512, 'colsample_bytree': 0.7464001060618694, 'num_leaves': 57}. Best is trial 32 with value: 0.9715741869326938.


In [13]:
# 최적의 하이퍼파라미터로 모델 생성
best_params = study.best_params
best_model = lgb.LGBMClassifier(**best_params)
best_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 27969, number of negative: 28031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.990723 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499446 -> initscore=-0.002214
[LightGBM] [Info] Start training from score -0.002214


## Submission

In [18]:
# 테스트 데이터 예측 및 저장
test_file = '/kaggle/input/leash-predict-chemical-bindings/test.csv'  
output_file = '/kaggle/working/submission3.csv'  # 출력 파일 경로

In [19]:
import os

In [20]:
for df_test in pd.read_csv(test_file, chunksize=100000):
    df_test['molecule'] = df_test['molecule_smiles'].apply(Chem.MolFromSmiles)
    df_test['ecfp'] = df_test['molecule'].apply(generate_ecfp)
    df_test['physchem'] = df_test['molecule'].apply(generate_physchem_features)
    df_test['features'] = df_test.apply(lambda row: row['ecfp'] + row['physchem'], axis=1)
    protein_onehot = onehot_encoder.transform(df_test['protein_name'].values.reshape(-1, 1))
    X_test = [features + list(protein) for features, protein in zip(df_test['features'].tolist(), protein_onehot.tolist())]
    probabilities = best_model.predict_proba(X_test)[:, 1]
    output_df = pd.DataFrame({'id': df_test['id'], 'binds': probabilities})
    output_df.to_csv(output_file, index=False, mode='a', header=not os.path.exists(output_file))