In [None]:
!pip install mordred

In [None]:
import logging
import os
import time
import shutil
import copy
import pandas as pd
import sys
import sklearn
import warnings
import os
import glob
import numpy as np
import pandas as pd
from rdkit import RDLogger
import utils
from rdkit import Chem
from mordred import Calculator, descriptors
import warnings , os
import subprocess
import shutil

In [None]:

def calc_descriptor(smiles_list):
    """Compute 2D Mordred descriptors for every parsable SMILES string.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to featurize (e.g. a list or a pandas Series).

    Returns
    -------
    pandas.DataFrame
        One row per molecule that RDKit could parse, restricted to the
        'float64' / 'int' columns of the Mordred result. The row index
        identifies the source entry: the Series label when `smiles_list`
        is a pandas Series, otherwise the 0-based position. Entries that
        could not be parsed are skipped (with a warning), so callers must
        use the index to pair rows with labels.
    """
    # 2D descriptors only; no conformer is needed, so no 3D embedding is done.
    calc = Calculator(descriptors, ignore_3D=True)

    if isinstance(smiles_list, pd.Series):
        labelled_smiles = smiles_list.items()
    else:
        labelled_smiles = enumerate(smiles_list)

    kept_labels = []
    mols = []
    skipped_labels = []
    for label, smi in labelled_smiles:
        # Non-string entries (e.g. NaN from an empty CSV field) are treated
        # like unparsable SMILES instead of crashing the parser.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            skipped_labels.append(label)
        else:
            kept_labels.append(label)
            mols.append(mol)

    if skipped_labels:
        warnings.warn(
            f'calc_descriptor: skipped {len(skipped_labels)} entries that could not be '
            f'parsed as SMILES (first labels: {skipped_labels[:5]}); '
            'use the index of the returned frame to select the matching labels.',
            stacklevel=2,
        )

    df = calc.pandas(mols, nproc=1)
    # Preserve the link between each descriptor row and its source entry.
    df.index = kept_labels

    # NOTE(review): columns in which a descriptor could not be computed are
    # presumably non-numeric and are removed here, so the returned column set
    # can differ from one call to the next -- confirm against Mordred docs.
    new_df = df.select_dtypes(include=['float64', 'int'])
    return new_df

In [None]:
# Load the pre-split train / test tables. No index column is requested
# (pandas' default), so each frame gets a fresh 0..n-1 integer index.
train = pd.read_csv('../MS_BACL/data_OURS/train.csv')
test = pd.read_csv('../MS_BACL/data_OURS/test.csv')

In [None]:
# Featurize each split from its 'SMILES' column. calc_descriptor skips SMILES
# that RDKit cannot parse, so a returned frame can have fewer rows than the
# table it was computed from.
train_data = calc_descriptor(train['SMILES'])
test_data = calc_descriptor(test['SMILES'])

In [None]:
# Keep only the descriptor columns present in BOTH splits, in training-column
# order. calc_descriptor filters columns by dtype separately on every call, so
# a column kept for train can be absent from test; indexing test_data with
# train_data.columns directly would then raise KeyError.
shared_columns = [col for col in train_data.columns if col in test_data.columns]
n_dropped = len(train_data.columns) - len(shared_columns)
if n_dropped:
    warnings.warn(
        f'{n_dropped} training descriptor column(s) are missing from the test '
        'descriptors and were removed from both splits.'
    )
train_data = train_data[shared_columns]
test_data = test_data[shared_columns]

In [None]:
# Descriptor rows and labels are paired purely by position below, so both must
# cover exactly the same rows. calc_descriptor drops SMILES it cannot parse,
# which would otherwise shift every later row against its label.
for split_name, features, table in (('train', train_data, train), ('test', test_data, test)):
    if len(features) != len(table):
        raise ValueError(
            f'{split_name}: {len(features)} descriptor rows vs {len(table)} labels; '
            'remove the rows whose SMILES could not be parsed before pairing '
            'features with labels.'
        )

X_train = train_data.values
y_train = train['Label'].values

X_test = test_data.values
y_test = test['Label'].values

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest configured with the hyper-parameters quoted from the paper:
# 500 trees, depth capped at 30, at least 3 samples per leaf and 2 samples
# required to split a node. The fixed seed makes the fit repeatable.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=500,
    max_depth=30,
    min_samples_split=2,
    min_samples_leaf=3,
)

# Fit on the training descriptors; kept as the last expression so the
# notebook echoes the fitted estimator.
rf_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

In [None]:
# Class-membership probabilities for the test descriptors: a 2-D array with
# one row per test sample and one column per class.
probs = rf_model.predict_proba(X_test)

In [None]:
# Probability of the second class (column 1, ordered as rf_model.classes_).
# Recomputed from the model rather than slicing `probs` in place, so this cell
# can be re-run safely: slicing an already 1-D `probs` raises IndexError.
probs = rf_model.predict_proba(X_test)[:, 1]

In [None]:
# Second name bound to the same probability array (plain assignment, no copy).
PredMS_probs = probs

In [None]:
# Hard class predictions: 1 where the probability is strictly above 0.5, else 0.
preds = np.greater(probs, 0.5).astype(int)

In [None]:
# AUC is computed from the continuous probabilities; every other metric is
# computed from the thresholded class predictions.
auc = roc_auc_score(test['Label'], probs)
acc, prc, rec, f1s, mcc = (
    scorer(test['Label'], preds)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef)
)
# For reference, f1s is the harmonic mean of prc and rec:
# (2 * prc * rec) / (rec + prc)

print(f'auc : {auc:.4f}')
print(f'acc : {acc:.4f}')
print(f'prc : {prc:.4f}')
print(f'rec : {rec:.4f}')
print(f'f1s : {f1s:.4f}')
print(f'mcc : {mcc:.4f}')

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
import numpy as np
from scipy import stats

def calculate_metrics(y_true, y_pred, y_prob):
    """Return the six evaluation scores as a dict.

    'auc' is computed from the scores in `y_prob`; 'acc', 'prc', 'rec',
    'f1s' and 'mcc' are computed from the hard predictions in `y_pred`.
    All of them are measured against `y_true`.
    """
    hard_label_scorers = (
        ('acc', accuracy_score),
        ('prc', precision_score),
        ('rec', recall_score),
        ('f1s', f1_score),
        ('mcc', matthews_corrcoef),
    )

    # The ranking metric comes first so the key order matches the report order.
    scores = {'auc': roc_auc_score(y_true, y_prob)}
    for key, scorer in hard_label_scorers:
        scores[key] = scorer(y_true, y_pred)
    return scores

def bootstrap_metrics(test_labels, predicted_probs, predicted_labels, n_iterations=1000, ci=95,
                      random_state=None):
    """
    Calculate percentile-bootstrap confidence intervals for all metrics.

    Parameters:
        test_labels: true labels, one per test sample (array-like).
        predicted_probs: scores passed to calculate_metrics as y_prob.
        predicted_labels: hard predictions passed to calculate_metrics as y_pred.
        n_iterations: number of bootstrap resamples that are evaluated.
        ci: width of the confidence interval in percent (95 -> 2.5th / 97.5th
            percentiles of the bootstrap estimates).
        random_state: optional seed for a private RNG. When None, numpy's
            global RNG is used, so np.random.seed(...) still controls the draws.

    Returns:
        dict mapping each metric name to
        {'value': score on the full test set, 'ci_lower': ..., 'ci_upper': ...}.
    """
    # Plain arrays guarantee positional indexing; a pandas Series with a
    # non-default index would be indexed by label instead.
    labels = np.asarray(test_labels)
    probs_arr = np.asarray(predicted_probs)
    preds_arr = np.asarray(predicted_labels)
    n_samples = len(labels)

    # Point estimates on the full, un-resampled test set
    actual_metrics = calculate_metrics(labels, preds_arr, probs_arr)

    # One list of bootstrap estimates per reported metric
    bootstrap_estimates = {metric: [] for metric in actual_metrics}

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # ROC AUC is undefined for a resample that contains a single class, so such
    # resamples are redrawn. This is only done when the full label set has more
    # than one class; otherwise no resample could ever qualify.
    redraw_single_class = np.unique(labels).size > 1

    completed = 0
    while completed < n_iterations:
        # Sample n_samples row positions with replacement
        indices = rng.randint(0, n_samples, n_samples)
        if redraw_single_class and np.unique(labels[indices]).size < 2:
            continue

        sample_metrics = calculate_metrics(
            labels[indices],
            preds_arr[indices],
            probs_arr[indices]
        )
        for metric, value in sample_metrics.items():
            bootstrap_estimates[metric].append(value)
        completed += 1

    # Percentile interval: cut (100 - ci) / 2 percent from each tail
    tail = (100 - ci) / 2
    results = {}
    for metric, estimates in bootstrap_estimates.items():
        ci_lower, ci_upper = np.percentile(estimates, [tail, 100 - tail])
        results[metric] = {
            'value': actual_metrics[metric],
            'ci_lower': ci_lower,
            'ci_upper': ci_upper
        }

    return results

# Usage example
# Convert the labels, probabilities and hard predictions to numpy arrays so
# they can be resampled with integer position arrays.
test_labels = np.array(test['Label'])
predicted_probs = np.array(probs)
predicted_labels = np.array(preds)

# bootstrap_metrics draws its resamples through numpy's global RNG; seed it so
# the reported confidence intervals are identical on every run.
np.random.seed(42)

# Calculate bootstrap results
bootstrap_results = bootstrap_metrics(test_labels, predicted_probs, predicted_labels)

# Print results with confidence intervals
for metric, result in bootstrap_results.items():
    print(f"{metric:3s}: {result['value']:.4f}({result['ci_lower']:.4f}-{result['ci_upper']:.4f})")