In [1]:
!pip install rdkit
!pip install duckdb

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: rdkit
Successfully installed rdkit-2023.9.6
Collecting duckdb
  Downloading duckdb-0.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (763 bytes)
Downloading duckdb-0.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.5/18.5 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: duckdb
Successfully installed duckdb-0.10.3


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import average_precision_score
import numpy as np
import lightgbm as lgb
import optuna
import duckdb
from sklearn.metrics import roc_auc_score

In [4]:
train_path = '/kaggle/input/leash-BELKA/train.parquet'
test_path = '/kaggle/input/leash-BELKA/test.parquet'

In [5]:
con = duckdb.connect()

# 각 단백질에 대해 binds=0,1 데이터를 각각 12000개씩 불러오기
df_brd4_0 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 0 AND protein_name = 'BRD4'
    ORDER BY random()
    LIMIT 20000
""").df()

df_brd4_1 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 1 AND protein_name = 'BRD4'
    ORDER BY random()
    LIMIT 20000
""").df()

df_hsa_0 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 0 AND protein_name = 'HSA'
    ORDER BY random()
    LIMIT 20000
""").df()

df_hsa_1 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 1 AND protein_name = 'HSA'
    ORDER BY random()
    LIMIT 20000
""").df()

df_seh_0 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 0 AND protein_name = 'sEH'
    ORDER BY random()
    LIMIT 20000
""").df()

df_seh_1 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 1 AND protein_name = 'sEH'
    ORDER BY random()
    LIMIT 20000
""").df()

# 데이터프레임 결합
df = pd.concat([df_brd4_0, df_brd4_1, df_hsa_0, df_hsa_1, df_seh_0, df_seh_1], axis=0).reset_index(drop=True)
con.close()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [6]:
# ECFP 생성 함수
def generate_ecfp(molecule, radius=2, bits=2048):
    if molecule is None:
        return None
    return list(AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits))

# MACCS keys 생성 함수
def generate_maccs_keys(molecule):
    if molecule is None:
        return None
    return list(AllChem.GetMACCSKeysFingerprint(molecule))

# 분자의 물리화학적 특성을 추출하는 함수
def generate_physchem_features(molecule):
    if molecule is None:
        return [np.nan] * 6
    return [
        Descriptors.MolWt(molecule),                   # 분자량
        Descriptors.MolLogP(molecule),                 # 로그 P
        Descriptors.NumHDonors(molecule),              # 수소 결합 공여자 수
        Descriptors.NumHAcceptors(molecule),           # 수소 결합 수용체 수
        Descriptors.TPSA(molecule),                    # TPSA (극성 표면적)
        Descriptors.NumRotatableBonds(molecule)        # 회전 가능한 결합 수
    ]

In [7]:
df['molecule'] = df['molecule_smiles'].apply(Chem.MolFromSmiles)
df['physchem'] = df['molecule'].apply(generate_physchem_features)
df['ecfp'] = df['molecule'].apply(generate_ecfp)
df['maccs'] = df['molecule'].apply(generate_maccs_keys)

In [8]:
# 물리화학적 특성을 정규화
physchem_features = np.array(df['physchem'].tolist())
scaler = StandardScaler()
physchem_features_scaled = scaler.fit_transform(physchem_features)


In [9]:
# ECFP, MACCS keys, 정규화된 물리화학적 특성을 결합
df['features'] = [ecfp + maccs + physchem_scaled.tolist() for ecfp, maccs, physchem_scaled in zip(df['ecfp'], df['maccs'], physchem_features_scaled)]


In [10]:
# 단백질 이름을 원-핫 인코딩
onehot_encoder = OneHotEncoder(sparse_output=False)
protein_onehot = onehot_encoder.fit_transform(df['protein_name'].values.reshape(-1, 1))


In [11]:
# 최종 입력 데이터 생성
X = np.array([features + list(protein) for features, protein in zip(df['features'].tolist(), protein_onehot.tolist())])
y = np.array(df['binds'].tolist())


In [12]:
# 최종 입력 데이터 생성
X = np.array([features + list(protein) for features, protein in zip(df['features'].tolist(), protein_onehot.tolist())])
y = np.array(df['binds'].tolist())


In [13]:
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optuna를 사용한 하이퍼파라미터 최적화
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 6, 12),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 0.9),
        'num_leaves': trial.suggest_int('num_leaves', 31, 255)
    }
    
    model = lgb.LGBMClassifier(**param)
    
    # fit 메서드에서 early_stopping_rounds와 eval_metric을 명시적으로 지정
    model.fit(X_train, y_train, 
              eval_set=[(X_test, y_test)], 
              eval_metric='auc')
    
    preds = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, preds)
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# 최적의 파라미터 출력
print("Best parameters found: ", study.best_params)

[I 2024-06-03 12:50:04,190] A new study created in memory with name: no-name-4d427a49-7dbf-411d-b34c-3fcbcec36337


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.013059 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 12:50:15,567] Trial 0 finished with value: 0.9671749531328786 and parameters: {'n_estimators': 101, 'max_depth': 8, 'learning_rate': 0.2366605912888248, 'subsample': 0.6987756361967712, 'colsample_bytree': 0.7653816648957348, 'num_leaves': 93}. Best is trial 0 with value: 0.9671749531328786.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.421207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 12:50:44,701] Trial 1 finished with value: 0.9720653112637178 and parameters: {'n_estimators': 164, 'max_depth': 11, 'learning_rate': 0.18587475087902058, 'subsample': 0.7131432847962176, 'colsample_bytree': 0.7206836234583778, 'num_leaves': 222}. Best is trial 1 with value: 0.9720653112637178.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.938035 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 12:51:30,950] Trial 2 finished with value: 0.9616600820519557 and parameters: {'n_estimators': 412, 'max_depth': 8, 'learning_rate': 0.027476894209544935, 'subsample': 0.7561104394869239, 'colsample_bytree': 0.6876777214055415, 'num_leaves': 196}. Best is trial 1 with value: 0.9720653112637178.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.009203 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 12:52:02,619] Trial 3 finished with value: 0.9707851347276117 and parameters: {'n_estimators': 302, 'max_depth': 7, 'learning_rate': 0.15876839923801778, 'subsample': 0.7115919332389624, 'colsample_bytree': 0.8931676694741644, 'num_leaves': 230}. Best is trial 1 with value: 0.9720653112637178.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.991571 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 12:52:36,673] Trial 4 finished with value: 0.9227806225875144 and parameters: {'n_estimators': 277, 'max_depth': 6, 'learning_rate': 0.013226004261547988, 'subsample': 0.6703376334863078, 'colsample_bytree': 0.8840032838966783, 'num_leaves': 205}. Best is trial 1 with value: 0.9720653112637178.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.089676 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 12:53:13,623] Trial 5 finished with value: 0.9627244197558542 and parameters: {'n_estimators': 174, 'max_depth': 12, 'learning_rate': 0.027490085602279335, 'subsample': 0.7693969291372188, 'colsample_bytree': 0.6254884044074238, 'num_leaves': 175}. Best is trial 1 with value: 0.9720653112637178.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.031618 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 12:53:36,482] Trial 6 finished with value: 0.8911655921341038 and parameters: {'n_estimators': 125, 'max_depth': 6, 'learning_rate': 0.010568518199412538, 'subsample': 0.8161797805026074, 'colsample_bytree': 0.8361652786551647, 'num_leaves': 171}. Best is trial 1 with value: 0.9720653112637178.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.007625 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 12:54:32,184] Trial 7 finished with value: 0.9740213489442218 and parameters: {'n_estimators': 477, 'max_depth': 12, 'learning_rate': 0.14945602656030735, 'subsample': 0.7005340873643074, 'colsample_bytree': 0.7304360403477379, 'num_leaves': 182}. Best is trial 7 with value: 0.9740213489442218.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.037678 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 12:55:09,583] Trial 8 finished with value: 0.9597450923666363 and parameters: {'n_estimators': 226, 'max_depth': 11, 'learning_rate': 0.022810770458545557, 'subsample': 0.7285997402450082, 'colsample_bytree': 0.8181413493706897, 'num_leaves': 121}. Best is trial 7 with value: 0.9740213489442218.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.969219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 12:56:06,191] Trial 9 finished with value: 0.9726616740404539 and parameters: {'n_estimators': 484, 'max_depth': 11, 'learning_rate': 0.05378077936220079, 'subsample': 0.7212651677949614, 'colsample_bytree': 0.7827909538179368, 'num_leaves': 221}. Best is trial 7 with value: 0.9740213489442218.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.074312 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 12:56:45,890] Trial 10 finished with value: 0.9715080447923781 and parameters: {'n_estimators': 383, 'max_depth': 10, 'learning_rate': 0.09231788523372858, 'subsample': 0.6014194496682608, 'colsample_bytree': 0.6516328747706008, 'num_leaves': 36}. Best is trial 7 with value: 0.9740213489442218.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.060352 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 12:57:41,496] Trial 11 finished with value: 0.9737644134982228 and parameters: {'n_estimators': 479, 'max_depth': 12, 'learning_rate': 0.0633980225212455, 'subsample': 0.6425937145063385, 'colsample_bytree': 0.7693818487520294, 'num_leaves': 149}. Best is trial 7 with value: 0.9740213489442218.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.077669 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 12:58:37,399] Trial 12 finished with value: 0.9741785329000621 and parameters: {'n_estimators': 494, 'max_depth': 12, 'learning_rate': 0.08553296446784235, 'subsample': 0.6320567449891585, 'colsample_bytree': 0.7212043607937464, 'num_leaves': 137}. Best is trial 12 with value: 0.9741785329000621.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.087410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 12:59:20,255] Trial 13 finished with value: 0.9729228115231849 and parameters: {'n_estimators': 409, 'max_depth': 9, 'learning_rate': 0.10919775412980497, 'subsample': 0.8895020479876475, 'colsample_bytree': 0.7095914303104909, 'num_leaves': 112}. Best is trial 12 with value: 0.9741785329000621.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.056131 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:00:09,686] Trial 14 finished with value: 0.972672995230327 and parameters: {'n_estimators': 496, 'max_depth': 12, 'learning_rate': 0.2843601896199085, 'subsample': 0.6088792404898664, 'colsample_bytree': 0.6657161756177087, 'num_leaves': 72}. Best is trial 12 with value: 0.9741785329000621.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.100494 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:00:50,549] Trial 15 finished with value: 0.973278602487874 and parameters: {'n_estimators': 352, 'max_depth': 10, 'learning_rate': 0.0981515883772233, 'subsample': 0.6534716299501576, 'colsample_bytree': 0.7285281137990444, 'num_leaves': 140}. Best is trial 12 with value: 0.9741785329000621.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.078982 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:01:41,424] Trial 16 finished with value: 0.9715800489490745 and parameters: {'n_estimators': 442, 'max_depth': 10, 'learning_rate': 0.05201251311978295, 'subsample': 0.7994136689692293, 'colsample_bytree': 0.7444194313478171, 'num_leaves': 252}. Best is trial 12 with value: 0.9741785329000621.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.028698 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:02:22,611] Trial 17 finished with value: 0.973847065129812 and parameters: {'n_estimators': 325, 'max_depth': 12, 'learning_rate': 0.13970501025032633, 'subsample': 0.6759446971390966, 'colsample_bytree': 0.8148272609891727, 'num_leaves': 169}. Best is trial 12 with value: 0.9741785329000621.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.042966 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:03:12,653] Trial 18 finished with value: 0.9737167186449349 and parameters: {'n_estimators': 444, 'max_depth': 11, 'learning_rate': 0.08119280430432164, 'subsample': 0.6360147761498243, 'colsample_bytree': 0.6890230552563816, 'num_leaves': 142}. Best is trial 12 with value: 0.9741785329000621.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.079383 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:04:12,313] Trial 19 finished with value: 0.9682412355810496 and parameters: {'n_estimators': 447, 'max_depth': 9, 'learning_rate': 0.03955216585201459, 'subsample': 0.6816852110155666, 'colsample_bytree': 0.6178296268233523, 'num_leaves': 68}. Best is trial 12 with value: 0.9741785329000621.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.065753 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:04:53,556] Trial 20 finished with value: 0.9736640855302976 and parameters: {'n_estimators': 362, 'max_depth': 10, 'learning_rate': 0.17675223183387398, 'subsample': 0.8654887770810311, 'colsample_bytree': 0.7958651570000295, 'num_leaves': 190}. Best is trial 12 with value: 0.9741785329000621.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.008201 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:05:32,930] Trial 21 finished with value: 0.9731299406792571 and parameters: {'n_estimators': 304, 'max_depth': 12, 'learning_rate': 0.13771482954234857, 'subsample': 0.6794849531288684, 'colsample_bytree': 0.8411059502268398, 'num_leaves': 167}. Best is trial 12 with value: 0.9741785329000621.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.059710 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:06:07,457] Trial 22 finished with value: 0.9734044899519543 and parameters: {'n_estimators': 249, 'max_depth': 12, 'learning_rate': 0.13004773744869103, 'subsample': 0.6233276714337566, 'colsample_bytree': 0.7430659225764348, 'num_leaves': 158}. Best is trial 12 with value: 0.9741785329000621.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.184480 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:06:48,636] Trial 23 finished with value: 0.9715062875770175 and parameters: {'n_estimators': 329, 'max_depth': 11, 'learning_rate': 0.06792671901419195, 'subsample': 0.6614122564177919, 'colsample_bytree': 0.8025464275418934, 'num_leaves': 126}. Best is trial 12 with value: 0.9741785329000621.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.052931 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:07:37,737] Trial 24 finished with value: 0.9731991041202124 and parameters: {'n_estimators': 414, 'max_depth': 12, 'learning_rate': 0.21108273672476663, 'subsample': 0.6865269179928313, 'colsample_bytree': 0.7013844011741106, 'num_leaves': 182}. Best is trial 12 with value: 0.9741785329000621.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.098742 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:08:28,053] Trial 25 finished with value: 0.9742020643057616 and parameters: {'n_estimators': 465, 'max_depth': 11, 'learning_rate': 0.1244774646358469, 'subsample': 0.630760485968507, 'colsample_bytree': 0.7587115280400839, 'num_leaves': 107}. Best is trial 25 with value: 0.9742020643057616.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.113215 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:09:18,615] Trial 26 finished with value: 0.9745242042490176 and parameters: {'n_estimators': 465, 'max_depth': 11, 'learning_rate': 0.11115531825960226, 'subsample': 0.6271813693364978, 'colsample_bytree': 0.7586120273366164, 'num_leaves': 102}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.108580 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:10:09,743] Trial 27 finished with value: 0.9711007389410087 and parameters: {'n_estimators': 445, 'max_depth': 11, 'learning_rate': 0.04143108009545914, 'subsample': 0.6293253061588471, 'colsample_bytree': 0.7596457614513026, 'num_leaves': 97}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.100569 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:11:00,132] Trial 28 finished with value: 0.9741578074825272 and parameters: {'n_estimators': 500, 'max_depth': 10, 'learning_rate': 0.11470821886966749, 'subsample': 0.6144546076310684, 'colsample_bytree': 0.6620780738921731, 'num_leaves': 76}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.168240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:11:42,428] Trial 29 finished with value: 0.9715550103665636 and parameters: {'n_estimators': 387, 'max_depth': 9, 'learning_rate': 0.07693053336879467, 'subsample': 0.6475290346626191, 'colsample_bytree': 0.7758978080096631, 'num_leaves': 99}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.175769 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:12:24,241] Trial 30 finished with value: 0.9726929080225764 and parameters: {'n_estimators': 463, 'max_depth': 8, 'learning_rate': 0.25219902943106154, 'subsample': 0.6011052814476562, 'colsample_bytree': 0.7545082763018842, 'num_leaves': 40}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.295002 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:13:12,697] Trial 31 finished with value: 0.9735865944165463 and parameters: {'n_estimators': 494, 'max_depth': 10, 'learning_rate': 0.11194795305823405, 'subsample': 0.6163318036667351, 'colsample_bytree': 0.6503949968777084, 'num_leaves': 53}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.129199 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:14:29,687] Trial 32 finished with value: 0.9741249899229254 and parameters: {'n_estimators': 462, 'max_depth': 11, 'learning_rate': 0.10956463770499891, 'subsample': 0.6307760844452001, 'colsample_bytree': 0.6004468910546173, 'num_leaves': 78}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.235567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:15:15,073] Trial 33 finished with value: 0.9734367588158506 and parameters: {'n_estimators': 424, 'max_depth': 10, 'learning_rate': 0.08566902017098167, 'subsample': 0.6544042397184104, 'colsample_bytree': 0.6775218244331377, 'num_leaves': 85}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.203148 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:16:08,107] Trial 34 finished with value: 0.9736784905289462 and parameters: {'n_estimators': 498, 'max_depth': 11, 'learning_rate': 0.18068931389096737, 'subsample': 0.6173488300920257, 'colsample_bytree': 0.7180521834834019, 'num_leaves': 108}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.164074 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:16:59,295] Trial 35 finished with value: 0.9735358226999984 and parameters: {'n_estimators': 466, 'max_depth': 11, 'learning_rate': 0.2107284941017709, 'subsample': 0.6398015839480511, 'colsample_bytree': 0.7003452897175457, 'num_leaves': 131}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.149242 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:17:44,443] Trial 36 finished with value: 0.9738447383822001 and parameters: {'n_estimators': 426, 'max_depth': 10, 'learning_rate': 0.12175131100329695, 'subsample': 0.6999081827104231, 'colsample_bytree': 0.7335117137910839, 'num_leaves': 92}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.143139 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:18:27,807] Trial 37 finished with value: 0.9678781170977617 and parameters: {'n_estimators': 388, 'max_depth': 9, 'learning_rate': 0.044148503218570154, 'subsample': 0.7315275782526994, 'colsample_bytree': 0.6464730716184033, 'num_leaves': 61}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.132057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:19:11,062] Trial 38 finished with value: 0.9713548406167815 and parameters: {'n_estimators': 465, 'max_depth': 7, 'learning_rate': 0.09944386699519608, 'subsample': 0.6657977004050967, 'colsample_bytree': 0.680717318588085, 'num_leaves': 116}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.154559 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:19:42,362] Trial 39 finished with value: 0.9682700039052549 and parameters: {'n_estimators': 207, 'max_depth': 11, 'learning_rate': 0.0648880047807415, 'subsample': 0.7578894200631615, 'colsample_bytree': 0.8470483200018042, 'num_leaves': 105}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.136304 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:20:22,125] Trial 40 finished with value: 0.9732759631920752 and parameters: {'n_estimators': 400, 'max_depth': 8, 'learning_rate': 0.16659875173024552, 'subsample': 0.781880469477337, 'colsample_bytree': 0.7851904777873544, 'num_leaves': 86}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.097070 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:21:36,527] Trial 41 finished with value: 0.9743068026758765 and parameters: {'n_estimators': 469, 'max_depth': 11, 'learning_rate': 0.1127256704037405, 'subsample': 0.6267718697301513, 'colsample_bytree': 0.602938867056477, 'num_leaves': 80}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.199298 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:22:24,786] Trial 42 finished with value: 0.9728078285180621 and parameters: {'n_estimators': 477, 'max_depth': 11, 'learning_rate': 0.07707038391218986, 'subsample': 0.6177125071511742, 'colsample_bytree': 0.668767811586206, 'num_leaves': 49}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.146649 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:23:21,651] Trial 43 finished with value: 0.9738021971012352 and parameters: {'n_estimators': 434, 'max_depth': 10, 'learning_rate': 0.15294312984431957, 'subsample': 0.6014477876319665, 'colsample_bytree': 0.6200738412256654, 'num_leaves': 134}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.381080 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:24:23,337] Trial 44 finished with value: 0.9737975644425567 and parameters: {'n_estimators': 500, 'max_depth': 12, 'learning_rate': 0.12315408885379356, 'subsample': 0.6331582367190052, 'colsample_bytree': 0.6301046752887847, 'num_leaves': 76}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.127311 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:25:14,289] Trial 45 finished with value: 0.9736766569129176 and parameters: {'n_estimators': 463, 'max_depth': 11, 'learning_rate': 0.09596209575996507, 'subsample': 0.8380552926364176, 'colsample_bytree': 0.7613591756885241, 'num_leaves': 120}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.410323 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:26:30,126] Trial 46 finished with value: 0.9685061097509902 and parameters: {'n_estimators': 482, 'max_depth': 12, 'learning_rate': 0.019903102915882737, 'subsample': 0.6459127900528612, 'colsample_bytree': 0.6324246863401249, 'num_leaves': 151}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.378062 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:27:18,497] Trial 47 finished with value: 0.9727845332598813 and parameters: {'n_estimators': 451, 'max_depth': 11, 'learning_rate': 0.07172233389530237, 'subsample': 0.659421749901719, 'colsample_bytree': 0.7190411991266076, 'num_leaves': 61}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.156695 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:27:49,997] Trial 48 finished with value: 0.9617174520080003 and parameters: {'n_estimators': 129, 'max_depth': 10, 'learning_rate': 0.05722706301240299, 'subsample': 0.7124646943947053, 'colsample_bytree': 0.6060205109248582, 'num_leaves': 87}. Best is trial 26 with value: 0.9745242042490176.


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.213548 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


[I 2024-06-03 13:28:32,982] Trial 49 finished with value: 0.973323977539065 and parameters: {'n_estimators': 366, 'max_depth': 11, 'learning_rate': 0.09130416127541595, 'subsample': 0.6113842181649033, 'colsample_bytree': 0.660856266774643, 'num_leaves': 101}. Best is trial 26 with value: 0.9745242042490176.


Best parameters found:  {'n_estimators': 465, 'max_depth': 11, 'learning_rate': 0.11115531825960226, 'subsample': 0.6271813693364978, 'colsample_bytree': 0.7586120273366164, 'num_leaves': 102}


In [14]:
# 최적의 파라미터로 모델 학습
best_params = study.best_params
best_model = lgb.LGBMClassifier(**best_params)
best_model.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.160984 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4741
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1975
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208


In [15]:
# 테스트 데이터 예측 및 평가
y_pred = best_model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred)
print(f'Test AUC Score: {auc_score:.4f}')

Test AUC Score: 0.9745


In [16]:
# 테스트 데이터 예측 및 저장
test_file = '/kaggle/input/leash-BELKA/test.csv'  
output_file = '/kaggle/working/submission9.csv'  # 출력 파일 경로

In [17]:
import os

In [18]:
for df_test in pd.read_csv(test_file, chunksize=100000):
    df_test['molecule'] = df_test['molecule_smiles'].apply(Chem.MolFromSmiles)
    df_test['ecfp'] = df_test['molecule'].apply(generate_ecfp)
    df_test['maccs'] = df_test['molecule'].apply(generate_maccs_keys)
    df_test['physchem'] = df_test['molecule'].apply(generate_physchem_features)
    
    # 테스트 데이터의 물리화학적 특성 정규화
    physchem_features_test = np.array(df_test['physchem'].tolist())
    physchem_features_test_scaled = scaler.transform(physchem_features_test)
    
    df_test['features'] = [ecfp + maccs + physchem_scaled.tolist() for ecfp, maccs, physchem_scaled in zip(df_test['ecfp'], df_test['maccs'], physchem_features_test_scaled)]
    protein_onehot = onehot_encoder.transform(df_test['protein_name'].values.reshape(-1, 1))
    X_test = np.array([features + list(protein) for features, protein in zip(df_test['features'].tolist(), protein_onehot.tolist())])
    
    probabilities = best_model.predict_proba(X_test)[:, 1]
    output_df = pd.DataFrame({'id': df_test['id'], 'binds': probabilities})
    output_df.to_csv(output_file, index=False, mode='a', header=not os.path.exists(output_file))