In [1]:
!pip install rdkit
!pip install duckdb

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2023.9.6
Collecting duckdb
  Downloading duckdb-0.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (763 bytes)
Downloading duckdb-0.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.5/18.5 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: duckdb
Successfully installed duckdb-0.10.3


In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import optuna
import lightgbm as lgb
import xgboost as xgb
import numpy as np
import duckdb
import pandas as pd
import os
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

## 데이터 불러오기

In [3]:
train_path = '/kaggle/input/leash-predict-chemical-bindings/train.parquet'
test_path = '/kaggle/input/leash-predict-chemical-bindings/test.parquet'

In [4]:
# 각 단백질에 대해 binds=0,1 데이터를 각각 12000개씩 불러오기
con = duckdb.connect()

df_brd4_0 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 0 AND protein_name = 'BRD4'
    ORDER BY random()
    LIMIT 20000
""").df()

df_brd4_1 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 1 AND protein_name = 'BRD4'
    ORDER BY random()
    LIMIT 20000
""").df()

df_hsa_0 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 0 AND protein_name = 'HSA'
    ORDER BY random()
    LIMIT 20000
""").df()

df_hsa_1 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 1 AND protein_name = 'HSA'
    ORDER BY random()
    LIMIT 20000
""").df()

df_seh_0 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 0 AND protein_name = 'sEH'
    ORDER BY random()
    LIMIT 20000
""").df()

df_seh_1 = con.query(f"""
    SELECT * 
    FROM parquet_scan('{train_path}') 
    WHERE binds = 1 AND protein_name = 'sEH'
    ORDER BY random()
    LIMIT 20000
""").df()

# 데이터프레임 결합
df = pd.concat([df_brd4_0, df_brd4_1, df_hsa_0, df_hsa_1, df_seh_0, df_seh_1], axis=0).reset_index(drop=True)
con.close()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

## 피처 엔지니어링

In [5]:
# ECFP 생성 함수
def generate_ecfp(molecule, radius=2, bits=2048):
    if molecule is None:
        return None
    return list(AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits))

In [6]:
# 분자의 물리화학적 특성을 추출하는 함수
def generate_physchem_features(molecule):
    if molecule is None:
        return [np.nan] * 5
    return [
        Descriptors.MolWt(molecule),                   # 분자량
        Descriptors.MolLogP(molecule),                 # 로그 P
        Descriptors.NumHDonors(molecule),              # 수소 결합 공여자 수
        Descriptors.NumHAcceptors(molecule),           # 수소 결합 수용체 수
        Descriptors.TPSA(molecule)                     # TPSA (극성 표면적)
    ]

In [7]:
df['molecule'] = df['molecule_smiles'].apply(Chem.MolFromSmiles)

In [8]:
df['physchem'] = df['molecule'].apply(generate_physchem_features)

In [9]:
df['ecfp'] = df['molecule'].apply(generate_ecfp)

In [10]:
# 물리화학적 특성을 정규화
physchem_features = np.array(df['physchem'].tolist())
scaler = StandardScaler()
physchem_features_scaled = scaler.fit_transform(physchem_features)

In [11]:
# ECFP와 정규화된 물리화학적 특성을 결합
df['features'] = [ecfp + physchem_scaled.tolist() for ecfp, physchem_scaled in zip(df['ecfp'], physchem_features_scaled)]

In [12]:
# 단백질 이름을 원-핫 인코딩
onehot_encoder = OneHotEncoder(sparse_output=False)
protein_onehot = onehot_encoder.fit_transform(df['protein_name'].values.reshape(-1, 1))

## Train Model

In [13]:
# 최종 입력 데이터 생성
X = [features + list(protein) for features, protein in zip(df['features'].tolist(), protein_onehot.tolist())]
y = df['binds'].tolist()

In [14]:
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# 고정된 LightGBM 파라미터
params_lgb = {
    'n_estimators': 285,
    'max_depth': 9,
    'learning_rate': 0.14411693400043962,
    'subsample': 0.664965277451675,
    'colsample_bytree': 0.6264976016411175,
    'num_leaves': 226
}

In [16]:
# 고정된 XGBoost 파라미터
params_xgb = {
    'n_estimators': 300,
    'max_depth': 8,
    'learning_rate': 0.01461751822935858,
    'subsample': 0.6594401898016142,
    'colsample_bytree': 0.7589525793147899
}

In [17]:
params_cat = {
'iterations': 271, 
'depth': 9, 
'learning_rate': 0.216316733109778, 
'l2_leaf_reg': 4.434922376722466,
'border_count': 62
}

In [18]:
# 기본 모델 정의
model_lgb = lgb.LGBMClassifier(**params_lgb)
model_xgb = xgb.XGBClassifier(**params_xgb)
model_cat = CatBoostClassifier(**params_cat, verbose=0)

In [19]:
# 스태킹 모델 정의
estimators = [
    ('lgb', model_lgb),
    ('xgb', model_xgb),
    ('cat', model_cat)
]
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    passthrough=True
)

In [20]:
# 교차 검증 설정
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 교차 검증 실행 및 결과 출력
cross_val_scores = cross_val_score(stacking_model, X, y, cv=cv, scoring='roc_auc')
print(f'Cross-Validation AUC Scores: {cross_val_scores}')
print(f'Mean AUC Score: {cross_val_scores.mean()}')

[LightGBM] [Info] Number of positive: 48000, number of negative: 48000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.998442 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4495
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 38400, number of negative: 38400
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.317144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4495
[LightGBM] [Info] Number of data points in the train set: 76800, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 48000, number of negative: 48000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.717495 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4494
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 38400, number of negative: 38400
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.392453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4494
[LightGBM] [Info] Number of data points in the train set: 76800, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 38400, number of

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 48000, number of negative: 48000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.762689 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4495
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 38400, number of negative: 38400
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.792601 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4495
[LightGBM] [Info] Number of data points in the train set: 76800, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 38400, number of

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 48000, number of negative: 48000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.807819 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4494
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 38400, number of negative: 38400
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.428913 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4494
[LightGBM] [Info] Number of data points in the train set: 76800, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 48000, number of negative: 48000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.747252 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4495
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 38400, number of negative: 38400
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.466320 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4495
[LightGBM] [Info] Number of data points in the train set: 76800, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 38400, number of

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Cross-Validation AUC Scores: [0.97415104 0.97532423 0.97554286 0.97561208 0.97443756]
Mean AUC Score: 0.9750135527777777


In [21]:
# 스태킹 모델 학습
stacking_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 48149, number of negative: 47851
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.706078 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4494
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501552 -> initscore=0.006208
[LightGBM] [Info] Start training from score 0.006208
[LightGBM] [Info] Number of positive: 38520, number of negative: 38280
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.503142 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4493
[LightGBM] [Info] Number of data points in the train set: 76800, number of used features: 1858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501563 -> initscore=0.006250
[LightGBM] [Info] Start training from score 0.006250
[LightGBM] [

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Submission

In [22]:
# 테스트 데이터 예측 및 저장
test_file = '/kaggle/input/leash-predict-chemical-bindings/test.csv'  
output_file = '/kaggle/working/submission11.csv'  # 출력 파일 경로

In [23]:
for df_test in pd.read_csv(test_file, chunksize=100000):
    df_test['molecule'] = df_test['molecule_smiles'].apply(Chem.MolFromSmiles)
    df_test['ecfp'] = df_test['molecule'].apply(generate_ecfp)
    df_test['physchem'] = df_test['molecule'].apply(generate_physchem_features)
    
    # 테스트 데이터의 물리화학적 특성 정규화
    physchem_features_test = np.array(df_test['physchem'].tolist())
    physchem_features_test_scaled = scaler.transform(physchem_features_test)
    
    df_test['features'] = [ecfp + physchem_scaled.tolist() for ecfp, physchem_scaled in zip(df_test['ecfp'], physchem_features_test_scaled)]
    protein_onehot = onehot_encoder.transform(df_test['protein_name'].values.reshape(-1, 1))
    X_test = [features + list(protein) for features, protein in zip(df_test['features'].tolist(), protein_onehot.tolist())]
    
    probabilities = stacking_model.predict_proba(X_test)[:, 1]
    output_df = pd.DataFrame({'id': df_test['id'], 'binds': probabilities})
    output_df.to_csv(output_file, index=False, mode='a', header=not os.path.exists(output_file))