In [1]:
import warnings
# Suppress all warnings for the rest of the session (no category/module filter is given).
warnings.filterwarnings('ignore')

In [2]:
!pip install rdkit
!pip install duckdb

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: rdkit
Successfully installed rdkit-2023.9.6
Collecting duckdb
  Downloading duckdb-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (762 bytes)
Downloading duckdb-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.5/18.5 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: duckdb
Successfully installed duckdb-1.0.0


In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, MACCSkeys, rdMolDescriptors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import average_precision_score
import numpy as np
from catboost import CatBoostClassifier
import lightgbm as lgb
import os
import optuna
import duckdb
from sklearn.metrics import roc_auc_score

In [4]:
# Parquet inputs under the Kaggle input directory.
train_path = '/kaggle/input/leash-BELKA/train.parquet'
# NOTE(review): test_path is not referenced by any cell visible in this notebook;
# the inference loop reads a separate CSV path instead — confirm whether it is still needed.
test_path = '/kaggle/input/leash-BELKA/test.parquet'

In [5]:
# For every protein, draw up to SAMPLES_PER_CLASS random rows for each label
# (binds = 0 and binds = 1) directly from the training parquet file.
SAMPLES_PER_CLASS = 10000
PROTEIN_NAMES = ('BRD4', 'HSA', 'sEH')

con = duckdb.connect()
try:
    sampled_frames = {}
    for protein in PROTEIN_NAMES:
        for label in (0, 1):
            sampled_frames[(protein, label)] = con.query(f"""
                SELECT *
                FROM parquet_scan('{train_path}')
                WHERE binds = {label} AND protein_name = '{protein}'
                ORDER BY random()
                LIMIT {SAMPLES_PER_CLASS}
            """).df()
finally:
    # Release the DuckDB connection even when one of the queries fails.
    con.close()

# Keep the per-subset names available for any cell that inspects them.
df_brd4_0 = sampled_frames[('BRD4', 0)]
df_brd4_1 = sampled_frames[('BRD4', 1)]
df_hsa_0 = sampled_frames[('HSA', 0)]
df_hsa_1 = sampled_frames[('HSA', 1)]
df_seh_0 = sampled_frames[('sEH', 0)]
df_seh_1 = sampled_frames[('sEH', 1)]

# Stack the six subsets into one frame with a fresh 0..n-1 index.
df = pd.concat(
    [df_brd4_0, df_brd4_1, df_hsa_0, df_hsa_1, df_seh_0, df_seh_1],
    axis=0,
).reset_index(drop=True)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [6]:
# ECFP (Morgan) fingerprint helper
def generate_ecfp(molecule, radius=2, bits=2048):
    """Return the Morgan bit-vector fingerprint of ``molecule`` as a Python list.

    Args:
        molecule: RDKit molecule, or None.
        radius: Morgan radius forwarded to RDKit (default 2).
        bits: Length of the bit vector, forwarded as ``nBits`` (default 2048).

    Returns:
        list: The fingerprint bits, or None when ``molecule`` is None.
    """
    if molecule is not None:
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits)
        return list(bit_vector)
    return None

# MACCS keys helper
def generate_maccs_keys(molecule):
    """Return the MACCS keys of ``molecule`` as a Python list.

    Args:
        molecule: RDKit molecule, or None.

    Returns:
        list: The MACCS key bits, or None when ``molecule`` is None.
    """
    if molecule is not None:
        maccs_vector = MACCSkeys.GenMACCSKeys(molecule)
        return list(maccs_vector)
    return None

# Atom-pair fingerprint helper
def generate_atom_pairs(molecule):
    """Return the hashed atom-pair fingerprint (2048 bits) of ``molecule`` as a list.

    Args:
        molecule: RDKit molecule, or None.

    Returns:
        list: The fingerprint bits, or None when ``molecule`` is None.
    """
    if molecule is not None:
        return list(
            rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(molecule, nBits=2048)
        )
    return None

# Topological-torsion fingerprint helper
def generate_topological_torsion(molecule):
    """Return the hashed topological-torsion fingerprint (2048 bits) of ``molecule`` as a list.

    Args:
        molecule: RDKit molecule, or None.

    Returns:
        list: The fingerprint bits, or None when ``molecule`` is None.
    """
    if molecule is not None:
        return list(
            rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(molecule, nBits=2048)
        )
    return None

# "PubChem-like" fingerprint helper
def generate_pubchem_fingerprints(molecule):
    """Return an 881-bit Morgan (radius 2) fingerprint of ``molecule`` as a list.

    Despite the name, this does not compute the PubChem substructure
    fingerprint; it is a Morgan bit vector sized to 881 bits.

    Args:
        molecule: RDKit molecule, or None.

    Returns:
        list: The fingerprint bits, or None when ``molecule`` is None.
    """
    if molecule is not None:
        morgan_881 = AllChem.GetMorganFingerprintAsBitVect(molecule, radius=2, nBits=881)
        return list(morgan_881)
    return None

# Physicochemical descriptor helper
def generate_physchem_features(molecule):
    """Compute eight RDKit descriptors for ``molecule``.

    Args:
        molecule: RDKit molecule, or None.

    Returns:
        list: Eight values in the order MolWt, MolLogP, NumHDonors,
        NumHAcceptors, TPSA, NumRotatableBonds, FpDensityMorgan1,
        FpDensityMorgan2. When ``molecule`` is None, a list of eight NaNs is
        returned so the row keeps the same width.
    """
    if molecule is None:
        return [np.nan] * 8

    descriptor_functions = (
        Descriptors.MolWt,              # molecular weight
        Descriptors.MolLogP,            # logP
        Descriptors.NumHDonors,         # number of hydrogen-bond donors
        Descriptors.NumHAcceptors,      # number of hydrogen-bond acceptors
        Descriptors.TPSA,               # topological polar surface area
        Descriptors.NumRotatableBonds,  # number of rotatable bonds
        Descriptors.FpDensityMorgan1,   # example molecular-complexity indicator
        Descriptors.FpDensityMorgan2,   # example molecular-complexity indicator
    )
    return [descriptor(molecule) for descriptor in descriptor_functions]

In [7]:
# Parse every SMILES string into an RDKit molecule.
df['molecule'] = df['molecule_smiles'].apply(Chem.MolFromSmiles)

# A molecule that failed to parse is None; the fingerprint helpers then return
# None as well, which cannot be concatenated into a feature vector later on.
# Drop those rows here so every remaining row has a complete feature set.
parsed_ok = df['molecule'].notna()
if not parsed_ok.all():
    print(f'Dropping {int((~parsed_ok).sum())} rows with unparseable SMILES')
    df = df[parsed_ok].reset_index(drop=True)

# Add one column per feature family, computed from the parsed molecule.
feature_builders = {
    'physchem': generate_physchem_features,
    'ecfp': generate_ecfp,
    'maccs': generate_maccs_keys,
    'atom_pairs': generate_atom_pairs,
    'topological': generate_topological_torsion,
    'pubchem': generate_pubchem_fingerprints,
}
for column_name, builder in feature_builders.items():
    df[column_name] = df['molecule'].apply(builder)

In [8]:
# Standardize the physicochemical descriptors (zero mean, unit variance per column).
# The fitted scaler is kept so the same transformation can be applied at inference time.
physchem_features = np.array(df['physchem'].to_list())
scaler = StandardScaler().fit(physchem_features)
physchem_features_scaled = scaler.transform(physchem_features)

In [9]:
# Concatenate all fingerprints and the scaled descriptors into one flat list per row.
fingerprint_columns = ['ecfp', 'maccs', 'atom_pairs', 'topological', 'pubchem']
combined_features = []
for *fingerprints, physchem_scaled in zip(
    *(df[column] for column in fingerprint_columns), physchem_features_scaled
):
    row_vector = []
    for fingerprint in fingerprints:
        row_vector.extend(fingerprint)
    row_vector.extend(physchem_scaled.tolist())
    combined_features.append(row_vector)
df['features'] = combined_features

In [10]:
# One-hot encode the protein name; the fitted encoder is reused at inference time.
protein_names_2d = df['protein_name'].to_numpy().reshape(-1, 1)
onehot_encoder = OneHotEncoder(sparse_output=False)
protein_onehot = onehot_encoder.fit_transform(protein_names_2d)

In [11]:
# Build the final model input: molecular features followed by the protein one-hot columns.
feature_rows = df['features'].tolist()
protein_rows = protein_onehot.tolist()
X = np.array([
    feature_row + protein_row
    for feature_row, protein_row in zip(feature_rows, protein_rows)
])
# Target vector: the `binds` label of each row.
y = np.array(df['binds'].to_list())

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_arrays = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_arrays

# CatBoost hyperparameters.
# NOTE(review): presumably the result of an earlier tuning run (optuna is imported
# but no study is visible in this notebook) — confirm the provenance.
params_cat = dict(
    iterations=448,
    depth=11,
    learning_rate=0.10711332110740407,
    subsample=0.7577541518108377,
    colsample_bylevel=0.835626834709964,
)

In [13]:
# Train the CatBoost classifier with the chosen hyperparameters.
catboost_settings = {
    **params_cat,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'logging_level': 'Silent',
}
best_model = CatBoostClassifier(**catboost_settings)
best_model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7d8c6d72a080>

In [14]:
# Score the held-out split: take column 1 of predict_proba and compute ROC AUC.
holdout_proba = best_model.predict_proba(X_test)
y_pred = holdout_proba[:, 1]
auc_score = roc_auc_score(y_test, y_pred)
print(f'Test AUC Score: {auc_score:.4f}')

Test AUC Score: 0.9693


In [15]:
# Paths for predicting the test data and saving the result
test_file = '/kaggle/input/leash-BELKA/test.csv'  
output_file = '/kaggle/working/submission14.csv'  # output file path

In [16]:
# Predict the test file chunk by chunk and stream the predictions to output_file.
for chunk_index, df_test in enumerate(pd.read_csv(test_file, chunksize=20000)):
    # Same feature pipeline as for the training frame.
    df_test['molecule'] = df_test['molecule_smiles'].apply(Chem.MolFromSmiles)
    df_test['ecfp'] = df_test['molecule'].apply(generate_ecfp)
    df_test['maccs'] = df_test['molecule'].apply(generate_maccs_keys)
    df_test['atom_pairs'] = df_test['molecule'].apply(generate_atom_pairs)
    df_test['topological'] = df_test['molecule'].apply(generate_topological_torsion)
    df_test['pubchem'] = df_test['molecule'].apply(generate_pubchem_fingerprints)
    df_test['physchem'] = df_test['molecule'].apply(generate_physchem_features)

    # Scale the descriptors with the scaler fitted on the training data (transform only).
    physchem_features_test = np.array(df_test['physchem'].tolist())
    physchem_features_test_scaled = scaler.transform(physchem_features_test)

    df_test['features'] = [ecfp + maccs + atom_pairs + topological + pubchem + physchem_scaled.tolist()
                           for ecfp, maccs, atom_pairs, topological, pubchem, physchem_scaled
                           in zip(df_test['ecfp'], df_test['maccs'], df_test['atom_pairs'], df_test['topological'], df_test['pubchem'], physchem_features_test_scaled)]

    # Chunk-local names: do not rebind `protein_onehot` / `X_test`, which earlier
    # cells use for the training one-hot matrix and the hold-out evaluation split.
    protein_onehot_chunk = onehot_encoder.transform(df_test['protein_name'].values.reshape(-1, 1))
    X_chunk = np.array([features + list(protein) for features, protein in zip(df_test['features'].tolist(), protein_onehot_chunk.tolist())])

    probabilities = best_model.predict_proba(X_chunk)[:, 1]
    output_df = pd.DataFrame({'id': df_test['id'], 'binds': probabilities})

    # The first chunk overwrites any file left by a previous run and writes the header;
    # later chunks append. Appending unconditionally would duplicate rows on re-run.
    is_first_chunk = chunk_index == 0
    output_df.to_csv(
        output_file,
        index=False,
        mode='w' if is_first_chunk else 'a',
        header=is_first_chunk,
    )