### Part 1: descriptor 생성 후 csv 저장

In [18]:
# ====== Part 1: descriptors 생성 후 *_with_desc.csv 저장 ======
import os, glob
import numpy as np
import pandas as pd

DATA_DIR = "/home/ssm-user/LAIDD/tox21/Data/with_header"
MID_DIR  = "/home/ssm-user/LAIDD/tox21/train_data_inf/with_desc"
os.makedirs(MID_DIR, exist_ok=True)

# ====== RDKit 2D descriptor 유틸 ======
from rdkit import Chem
from rdkit.Chem import Descriptors

try:
    from rdkit.Chem import Descriptors3D
    desc3d = {n for n, _ in Descriptors3D._descList}
except Exception:
    desc3d = set()

DESC_2D_NAMES = [n for n, _ in Descriptors._descList if n not in desc3d]
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
_calc = MolecularDescriptorCalculator(DESC_2D_NAMES)

def rdkit_2d_descriptors_from_series(smiles_series: pd.Series,
                                     keep_all_rows: bool = True) -> pd.DataFrame:
    rows, idxs = [], []
    for idx, smi in smiles_series.items():
        smi = "" if pd.isna(smi) else str(smi).strip()
        if not smi:
            if keep_all_rows:
                rows.append([np.nan]*len(DESC_2D_NAMES)); idxs.append(idx)
            continue
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            if keep_all_rows:
                rows.append([np.nan]*len(DESC_2D_NAMES)); idxs.append(idx)
            continue
        try:
            vals = list(_calc.CalcDescriptors(mol))
        except Exception:
            vals = [np.nan]*len(DESC_2D_NAMES)
        rows.append(vals); idxs.append(idx)
    return pd.DataFrame(rows, columns=DESC_2D_NAMES, index=idxs)

def add_rdkit_2d_descriptors(df: pd.DataFrame,
                             smiles_col: str = "SMILES",
                             keep_all_rows: bool = False) -> pd.DataFrame:
    desc_df = rdkit_2d_descriptors_from_series(df[smiles_col], keep_all_rows=keep_all_rows)
    return (pd.concat([df, desc_df.reindex(df.index)], axis=1)
            if keep_all_rows else df.join(desc_df, how="inner"))

# ====== 보조 ======
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    rename_map = {}
    for c in df.columns:
        lc = c.strip().lower().replace(" ", "_")
        if lc in {"smiles", "smile", "smiles_str"}:
            rename_map[c] = "SMILES"
        elif lc in {"code", "sample_id", "id", "sampleid"}:
            rename_map[c] = "Sample ID"
        elif lc in {"toxicity", "label", "y", "active"}:
            rename_map[c] = "toxicity"
    return df.rename(columns=rename_map)

def clean_labels(df: pd.DataFrame) -> pd.DataFrame:
    df["toxicity"] = pd.to_numeric(df["toxicity"], errors="coerce")
    return df[df["toxicity"].isin([0, 1])].assign(toxicity=lambda x: x["toxicity"].astype(int))

# ====== 메인 ======
assay_files = sorted(glob.glob(os.path.join(DATA_DIR, "*.csv")))
total_used, total_skipped = 0, 0

for f in assay_files:
    assay_name = os.path.splitext(os.path.basename(f))[0]
    print(f"\n[INFO][P1] {assay_name}")

    df = pd.read_csv(f, dtype=str, engine="python")
    df = normalize_columns(df)

    required = {"SMILES", "Sample ID", "toxicity"}
    if not required.issubset(df.columns):
        total_skipped += 1
        print(f"  -> SKIP: missing {sorted(list(required - set(df.columns)))}")
        continue

    total_used += 1
    df["SMILES"]    = df["SMILES"].astype(str).str.strip()
    df["Sample ID"] = df["Sample ID"].astype(str).str.strip()

    # (선택) toxicity 정리: Part1에서는 원본 보존해도 되지만, 일관성을 위해 정리
    df = clean_labels(df)

    # 디스크립터 계산 (유효 SMILES만 남김)
    df_with_desc = add_rdkit_2d_descriptors(df, smiles_col="SMILES", keep_all_rows=False)

    # 저장: *_with_desc.csv
    out_path = os.path.join(MID_DIR, f"{assay_name}_with_desc.csv")
    df_with_desc.to_csv(out_path, index=False)
    print(f"  -> saved: {out_path} (rows={len(df_with_desc)}, cols={len(df_with_desc.columns)})")

print(f"\n[READY][P1] used={total_used}, skipped={total_skipped}")



[INFO][P1] nr-ahr


[01:52:15] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[01:52:32] Explicit valence for atom # 2 Cl, 1, is greater than permitted


  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/with_desc/nr-ahr_with_desc.csv (rows=8167, cols=220)

[INFO][P1] nr-ar-lbd


[01:53:30] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[01:53:48] Explicit valence for atom # 2 Cl, 1, is greater than permitted


  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/with_desc/nr-ar-lbd_with_desc.csv (rows=8597, cols=220)

[INFO][P1] nr-ar


[01:54:56] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[01:55:15] Explicit valence for atom # 2 Cl, 1, is greater than permitted


  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/with_desc/nr-ar_with_desc.csv (rows=9360, cols=220)

[INFO][P1] nr-aromatase


[01:56:07] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[01:56:23] Explicit valence for atom # 2 Cl, 1, is greater than permitted


  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/with_desc/nr-aromatase_with_desc.csv (rows=7224, cols=220)

[INFO][P1] nr-er-lbd


[01:57:22] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[01:57:40] Explicit valence for atom # 2 Cl, 1, is greater than permitted


  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/with_desc/nr-er-lbd_with_desc.csv (rows=8751, cols=220)

[INFO][P1] nr-er


[01:58:34] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[01:58:51] Explicit valence for atom # 2 Cl, 1, is greater than permitted


  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/with_desc/nr-er_with_desc.csv (rows=7695, cols=220)

[INFO][P1] nr-ppar-gamma


[01:59:43] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[02:00:01] Explicit valence for atom # 2 Cl, 1, is greater than permitted


  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/with_desc/nr-ppar-gamma_with_desc.csv (rows=8182, cols=220)

[INFO][P1] sr-are


[02:01:02] Explicit valence for atom # 2 Cl, 1, is greater than permitted


  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/with_desc/sr-are_with_desc.csv (rows=7166, cols=220)

[INFO][P1] sr-atad5


[02:02:03] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[02:02:22] Explicit valence for atom # 2 Cl, 1, is greater than permitted


  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/with_desc/sr-atad5_with_desc.csv (rows=9089, cols=220)

[INFO][P1] sr-hse


[02:02:55] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[02:03:32] Explicit valence for atom # 2 Cl, 1, is greater than permitted


  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/with_desc/sr-hse_with_desc.csv (rows=8148, cols=220)

[INFO][P1] sr-mmp


[02:04:25] Explicit valence for atom # 0 Cl, 1, is greater than permitted


  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/with_desc/sr-mmp_with_desc.csv (rows=7319, cols=220)

[INFO][P1] sr-p53


[02:05:37] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[02:05:56] Explicit valence for atom # 2 Cl, 1, is greater than permitted


  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/with_desc/sr-p53_with_desc.csv (rows=8632, cols=220)

[READY][P1] used=12, skipped=0


### Part 1.5: nan, inf 값 확인

In [2]:
# ====== Part 1.5 (DF 출력 + 전체 요약 CSV): with_desc 파일들에서 ±inf / NaN 현황 집계 ======
import os, glob
import numpy as np
import pandas as pd

MID_DIR  = "/home/ssm-user/LAIDD/tox21/train_data_inf/with_desc"     # Part1 출력 폴더
SAVE_DIR = "/home/ssm-user/LAIDD/tox21/train_data_inf/data_summary"   # 전체 요약 CSV 저장 폴더
os.makedirs(SAVE_DIR, exist_ok=True)

SUMMARY_LIMIT_PRINT = 30  # 콘솔 출력 상위 개수 제한

def summarize_inf_nan(X: pd.DataFrame) -> pd.DataFrame:
    """각 컬럼의 +inf, -inf, total_inf, nan_count 집계."""
    arr = X.to_numpy()
    pos = np.isposinf(arr).sum(axis=0)
    neg = np.isneginf(arr).sum(axis=0)
    tot = pos + neg
    nan = np.isnan(arr).sum(axis=0)
    out = pd.DataFrame({
        "pos_inf": pos,
        "neg_inf": neg,
        "total_inf": tot,
        "nan_count": nan,
    }, index=X.columns)
    return out

with_desc_files = sorted(glob.glob(os.path.join(MID_DIR, "*_with_desc.csv")))
print(f"[P1.5] Found {len(with_desc_files)} with_desc files under: {MID_DIR}")
if not with_desc_files:
    print("      -> No files matched *_with_desc.csv (check the path and Part1 output).")

used, skipped = 0, 0

# 메모리 보관용 dict들
summary_by_assay        = {}  # assay -> 전체 요약 DF
inf_columns_dfs         = {}  # assay -> ±inf 있는 컬럼 DF
nan_columns_dfs         = {}  # assay -> NaN 있는 컬럼 DF
nan_presence_by_assay   = {}  # assay -> (각 컬럼에 NaN 존재 여부 bool Series)

for path in with_desc_files:
    assay_name = os.path.basename(path).replace("_with_desc.csv", "")
    print(f"\n[INFO][P1.5] {assay_name} ({path})")

    df = pd.read_csv(path, dtype=str, engine="python")

    # 필수 컬럼 확인
    required = {"Sample ID", "SMILES", "toxicity"}
    if not required.issubset(df.columns):
        skipped += 1
        print(f"  -> SKIP: missing {sorted(list(required - set(df.columns)))}")
        continue
    used += 1

    # 피처 컬럼 선택 및 숫자화 (공백 제거 후 변환)
    non_feat = {"Sample ID", "SMILES", "toxicity"}
    feat_cols = [c for c in df.columns if c not in non_feat]
    X = df[feat_cols].apply(lambda s: s.str.strip())
    X = X.apply(pd.to_numeric, errors="coerce")

    # 1) 요약 집계
    summary = summarize_inf_nan(X)
    summary_by_assay[assay_name] = summary

    # 2) ±inf 있는 컬럼만
    inf_only = (summary[summary["total_inf"] > 0]
                .sort_values("total_inf", ascending=False)
                .reset_index()
                .rename(columns={"index": "column"}))
    inf_only = inf_only[["column", "pos_inf", "neg_inf", "total_inf"]]
    inf_columns_dfs[assay_name] = inf_only

    # 3) NaN 있는 컬럼만
    nan_only = (summary[summary["nan_count"] > 0]
                .sort_values("nan_count", ascending=False)
                .reset_index()
                .rename(columns={"index": "column"}))
    nan_only = nan_only[["column", "nan_count"]]
    nan_columns_dfs[assay_name] = nan_only

    # 4) 이 assay에서 "NaN 보유 여부"를 컬럼별 boolean으로 저장 (교차 집계 용)
    nan_presence_by_assay[assay_name] = X.isna().any(axis=0)

    # 콘솔 출력 (상위 N개 미리보기)
    if inf_only.empty:
        print("  -> No ±inf in any feature column.")
    else:
        print("  -> Columns with ±inf (top):")
        print(inf_only.head(SUMMARY_LIMIT_PRINT).to_string(index=False))
        if len(inf_only) > SUMMARY_LIMIT_PRINT:
            print(f"     ... and {len(inf_only) - SUMMARY_LIMIT_PRINT} more columns")

    if nan_only.empty:
        print("  -> No NaN in any feature column.")
    else:
        print("  -> Columns with NaN (top):")
        print(nan_only.head(SUMMARY_LIMIT_PRINT).to_string(index=False))
        if len(nan_only) > SUMMARY_LIMIT_PRINT:
            print(f"     ... and {len(nan_only) - SUMMARY_LIMIT_PRINT} more columns")

print(f"\n[READY][P1.5] used={used}, skipped={skipped}")

# ====== 교차 집계: 각 컬럼이 NaN을 가진 assay의 개수 ======
assays_with_nan_count_df = pd.DataFrame(columns=["column", "assays_with_nan"])
if nan_presence_by_assay:
    nan_presence_df = pd.DataFrame(nan_presence_by_assay).T  # (#assays, #columns)
    nan_presence_df = nan_presence_df.fillna(False).astype(bool)
    counts = nan_presence_df.sum(axis=0).sort_values(ascending=False)
    assays_with_nan_count_df = (counts.rename("assays_with_nan")
                                .to_frame()
                                .reset_index()
                                .rename(columns={"index": "column"}))
    total_assays = nan_presence_df.shape[0]
    print(f"\n[CROSS-ASSAY] Columns ranked by #assays with NaN (out of {total_assays} assays):")
    print(assays_with_nan_count_df.head(SUMMARY_LIMIT_PRINT).to_string(index=False))
    if len(assays_with_nan_count_df) > SUMMARY_LIMIT_PRINT:
        print(f"  ... and {len(assays_with_nan_count_df) - SUMMARY_LIMIT_PRINT} more columns")

# ====== 전체 요약을 하나의 CSV로 저장 (per-assay × per-column long form) ======
overall_summary_df = pd.DataFrame(
    columns=["assay", "column", "pos_inf", "neg_inf", "total_inf", "nan_count"]
)
for assay_name, summary in summary_by_assay.items():
    tmp = (summary.reset_index()
                 .rename(columns={"index": "column"}))
    tmp.insert(0, "assay", assay_name)
    overall_summary_df = pd.concat([overall_summary_df, tmp], ignore_index=True)

overall_csv_path = os.path.join(SAVE_DIR, "all_assays_inf_nan_summary.csv")
overall_summary_df.to_csv(overall_csv_path, index=False)
print(f"\n[INFO] Saved overall summary CSV -> {overall_csv_path}")


# ====== 12개 assay에서 공통으로 NaN을 가진 컬럼명 저장 ======
TARGET_ASSAYS = 12

cols_nan_in_12_df = pd.DataFrame(columns=["column"])
if nan_presence_by_assay:
    nan_presence_df = pd.DataFrame(nan_presence_by_assay).T  # (#assays, #columns)
    nan_presence_df = nan_presence_df.fillna(False).astype(bool)

    total_assays = nan_presence_df.shape[0]
    counts = nan_presence_df.sum(axis=0)  # 각 컬럼이 NaN을 가진 assay 수

    # 12개 assay 모두에서 NaN인 컬럼
    cols_nan_in_12 = counts[counts == TARGET_ASSAYS].index.tolist()

    # 저장 (CSV / TXT)
    os.makedirs(SAVE_DIR, exist_ok=True)
    save_csv = os.path.join(SAVE_DIR, "columns_with_nan_in_all_12_assays.csv")

    cols_nan_in_12_df = pd.DataFrame({"column": sorted(cols_nan_in_12)})
    cols_nan_in_12_df.to_csv(save_csv, index=False)

    print(f"\n[INFO] Columns with NaN in all {TARGET_ASSAYS} assays: {len(cols_nan_in_12)}")
    print(f"       Saved -> {save_csv}")

    if total_assays != TARGET_ASSAYS:
        print(f"       (NOTE) Detected {total_assays} assays, "
              f"but TARGET_ASSAYS={TARGET_ASSAYS}. Using exact match to {TARGET_ASSAYS}.")
else:
    print("\n[INFO] nan_presence_by_assay is empty. Nothing to save.")


[P1.5] Found 12 with_desc files under: /home/ssm-user/LAIDD/tox21/train_data_inf/with_desc

[INFO][P1.5] nr-ahr (/home/ssm-user/LAIDD/tox21/train_data_inf/with_desc/nr-ahr_with_desc.csv)
  -> Columns with ±inf (top):
             column  pos_inf  neg_inf  total_inf
   MaxPartialCharge        2        0          2
MaxAbsPartialCharge        2        0          2
  -> Columns with NaN (top):
             column  nan_count
       BCUT2D_CHGLO        506
       BCUT2D_CHGHI        506
       BCUT2D_MWLOW        506
        BCUT2D_MWHI        506
      BCUT2D_LOGPHI        506
     BCUT2D_LOGPLOW        506
        BCUT2D_MRHI        506
       BCUT2D_MRLOW        506
MinAbsPartialCharge         76
MaxAbsPartialCharge         76
   MinPartialCharge         76
   MaxPartialCharge         76

[INFO][P1.5] nr-ar-lbd (/home/ssm-user/LAIDD/tox21/train_data_inf/with_desc/nr-ar-lbd_with_desc.csv)
  -> Columns with ±inf (top):
             column  pos_inf  neg_inf  total_inf
   MaxPartialCharge    

### Part 2: 저장된 *_with_desc.csv 불러와 전처리 & 최종 CSV 저장

In [1]:
# ====== Part 2: *_with_desc.csv 전처리 후 *_2Ddesc.csv 저장 (±inf만 cap, NaN 열 삭제) ======
import os, glob
import numpy as np
import pandas as pd
import json

MID_DIR = "/home/ssm-user/LAIDD/tox21/train_data_inf/with_desc"        # Part1 출력
OUT_DIR = "/home/ssm-user/LAIDD/tox21/train_data_inf/processed/"       # 최종 출력
os.makedirs(OUT_DIR, exist_ok=True)

# ---- (선택) 공통 NaN 컬럼 드롭 리스트 경로(Part 1.5에서 만든 것) ----
DROP_LIST_TXT = "/home/ssm-user/LAIDD/tox21/train_data_inf/data_summary/columns_with_nan_in_all_12_assays.txt"
DROP_LIST_CSV = "/home/ssm-user/LAIDD/tox21/train_data_inf/data_summary/columns_with_nan_in_all_12_assays.csv"

# === cap 계산 파라미터(±inf 치환 시 사용 / clip_caps.json 생성 시 사용) ===
ROBUST_Q = 0.999
MIN_CAP  = 1e6

# ====== helper: 공통 NaN 컬럼 로더 ======
def load_common_nan_columns(txt_path: str, csv_path: str) -> set[str]:
    cols = set()
    if os.path.exists(txt_path):
        with open(txt_path, "r", encoding="utf-8") as f:
            for ln in f:
                name = ln.strip()
                if name and not name.startswith("#"):
                    cols.add(name)
    if os.path.exists(csv_path):
        try:
            df = pd.read_csv(csv_path)
            if "column" in df.columns:
                cols.update(df["column"].dropna().astype(str).str.strip().tolist())
            else:
                first_col = df.columns[0]
                cols.update(df[first_col].dropna().astype(str).str.strip().tolist())
        except Exception as e:
            print(f"[WARN] Failed to read CSV drop list: {csv_path} ({e})")
    return cols

COMMON_NAN_COLS = load_common_nan_columns(DROP_LIST_TXT, DROP_LIST_CSV)
print(f"[P2] Loaded common-NaN drop list: {len(COMMON_NAN_COLS)} columns")

# ====== helper: (미사용) 유한값까지 자르는 robust clip (정의만, 사용하지 않음) ======
def robust_clip_df(X: pd.DataFrame, q: float = ROBUST_Q, min_cap: float = MIN_CAP) -> pd.DataFrame:
    """컬럼별 |값|의 q 분위수×10(최소 min_cap)으로 모든 유한값을 클리핑. (현재 미사용)"""
    Xc = X.copy()
    for c in Xc.columns:
        s = Xc[c]
        finite = s[np.isfinite(s.values)]
        if finite.empty:
            continue
        cap_q = np.nanquantile(np.abs(finite), q)
        cap = max(float(cap_q) * 10.0, min_cap) if np.isfinite(cap_q) and cap_q > 0 else min_cap
        Xc[c] = s.clip(lower=-cap, upper=+cap)
    return Xc

# ====== helper: ±inf만 cap으로 치환 (유한값은 변경하지 않음) ======
def cap_only_infinite_values(X: pd.DataFrame, q: float = ROBUST_Q, min_cap: float = MIN_CAP) -> pd.DataFrame:
    """
    각 컬럼별 cap(분위수 기반)으로 **±inf만** 치환. 유한값은 그대로 둠.
    cap = max(quantile(|finite|, q) * 10, MIN_CAP)
    """
    X = X.copy()
    arr = X.to_numpy()
    is_posinf = np.isposinf(arr)
    is_neginf = np.isneginf(arr)
    inf_cnt   = int(is_posinf.sum() + is_neginf.sum())
    print(f"  -> ±inf total before capping: {inf_cnt}")
    if inf_cnt == 0:
        return X

    for j, col in enumerate(X.columns):
        col_vals = X[col].to_numpy()
        # 안전: 정수 dtype이면 float64로 캐스팅
        if not np.issubdtype(col_vals.dtype, np.floating):
            col_vals = col_vals.astype(np.float64, copy=True)

        mask_pos = is_posinf[:, j]
        mask_neg = is_neginf[:, j]

        finite = col_vals[np.isfinite(col_vals)]
        if finite.size == 0:
            cap = min_cap
        else:
            cap_q = np.nanquantile(np.abs(finite), q)
            cap = max(float(cap_q) * 10.0, min_cap) if np.isfinite(cap_q) and cap_q > 0 else min_cap

        if mask_pos.any():
            col_vals[mask_pos] = +cap
        if mask_neg.any():
            col_vals[mask_neg] = -cap

        X[col] = col_vals

    # 혹시 남은 inf 강제 치환
    if np.isinf(X.to_numpy()).any():
        print("  -> WARNING: residual ±inf after capping. Forcing to ±MIN_CAP.")
        X = X.replace([np.inf, -np.inf], [MIN_CAP, -MIN_CAP])
    return X

# ====== 메인: with_desc 불러와 전처리 ======
with_desc_files = sorted(glob.glob(os.path.join(MID_DIR, "*_with_desc.csv")))
data_by_assay = {}

summary_rows = []
total_nan_cols_dropped = 0
total_used, total_skipped = 0, 0

for f in with_desc_files:
    assay_name = os.path.basename(f).replace("_with_desc.csv", "")
    print(f"\n[INFO][P2] {assay_name}")

    df = pd.read_csv(f, dtype=str, engine="python")

    required = {"SMILES", "Sample ID", "toxicity"}
    if not required.issubset(df.columns):
        total_skipped += 1
        print(f"  -> SKIP: missing {sorted(list(required - set(df.columns)))}")
        continue
    total_used += 1

    # 키/라벨 정리
    df["SMILES"]    = df["SMILES"].astype(str).str.strip()
    df["Sample ID"] = df["Sample ID"].astype(str).str.strip()
    df["toxicity"]  = pd.to_numeric(df["toxicity"], errors="coerce")

    # (1) 공통 NaN 컬럼 선삭제
    non_feat = {"Sample ID", "SMILES", "toxicity"}
    feat_cols = [c for c in df.columns if c not in non_feat]
    if COMMON_NAN_COLS:
        drop_cols_common = sorted(set(feat_cols).intersection(COMMON_NAN_COLS))
        if drop_cols_common:
            print(f"  -> Drop common-NaN columns first: {len(drop_cols_common)}")
            df = df.drop(columns=drop_cols_common, errors="ignore")
            feat_cols = [c for c in feat_cols if c not in drop_cols_common]
        else:
            print("  -> No common-NaN columns to drop in this file.")
    else:
        print("  -> No common-NaN drop list loaded or list is empty.")

    # (2) 숫자화 후 float64로 통일
    X = df[feat_cols].apply(pd.to_numeric, errors="coerce").astype(np.float64)

    # (3) ±inf만 cap으로 치환 (유한값은 그대로)
    X = cap_only_infinite_values(X, q=ROBUST_Q, min_cap=MIN_CAP)

    # (4) NaN이 하나라도 있는 열 전부 삭제
    nan_cols = X.columns[X.isna().any(axis=0)].tolist()
    if nan_cols:
        print(f"  -> Drop columns containing NaN ({len(nan_cols)}): "
              f"{nan_cols[:5]}{'...' if len(nan_cols) > 5 else ''}")
        X = X.drop(columns=nan_cols)
        feat_cols = [c for c in feat_cols if c not in nan_cols]
        total_nan_cols_dropped += len(nan_cols)
    else:
        print("  -> No columns contain NaN after INF capping.")

    if X.shape[1] == 0:
        print("  -> WARNING: No feature columns remain after dropping NaN columns. Skipping save.")
        continue

    # 저장
    left_part = df.loc[:, ["Sample ID", "SMILES", "toxicity"]].reset_index(drop=True)
    X_final   = X.reset_index(drop=True)
    save_base = os.path.join(OUT_DIR, f"{assay_name}_2Ddesc")
    df_save   = pd.concat([left_part, X_final], axis=1)
    df_save.to_csv(save_base + ".csv", index=False)

    data_by_assay[assay_name] = {"df": df_save, "X": X_final, "y": left_part["toxicity"]}

    summary_rows.append({
        "assay": assay_name,
        "rows_after_input": len(df),
        "rows_saved": len(df_save),
        "dropped_nan_cols": len(nan_cols),
        "dropped_common_nan_cols": len(drop_cols_common) if COMMON_NAN_COLS else 0,
        "inf_policy": "cap_inf_only",
        "robust_clip": False,   # 변경: 유한값 클립 안 함
    })

    print(f"  -> saved: {save_base}.csv "
          f"(rows={len(df_save)}, cols={len(df_save.columns)}) | "
          f"±inf=cap_only, NaN columns dropped={len(nan_cols)}")

# 요약 저장
print(f"\n[READY][P2] used={total_used}, skipped={total_skipped}")
if summary_rows:
    summary_df = pd.DataFrame(summary_rows).sort_values("assay").reset_index(drop=True)
    print("\n[ASSAY SUMMARY TABLE]")
    print(summary_df.head(20).to_string(index=False))
    summary_path = os.path.join(OUT_DIR, "preprocess_summary.csv")
    summary_df.to_csv(summary_path, index=False)
    print(f"\n[INFO] 요약 저장: {summary_path}")

# ====== (추가) train 분포 기반 clip caps 산출 & 저장 ======
# - train의 with_desc 원본들로부터 각 feature의 cap을 계산 (ROBUST_Q / MIN_CAP 규칙)
# - 여러 파일에 같은 컬럼이 있으면 더 "작은" cap을 선택(보수적 결합)
# - 공통 NaN 드롭 컬럼은 cap 계산 대상에서 제외
CAPS_PATH = "/home/ssm-user/LAIDD/tox21/train_data_inf/clip_caps.json"

# COMMON_NAN_COLS 가 위에서 로드되어 있지 않다면 빈 집합으로 처리
try:
    _common_nan_cols = set(COMMON_NAN_COLS)
except NameError:
    _common_nan_cols = set()

caps = {}
with_desc_all = sorted(glob.glob(os.path.join(MID_DIR, "*_with_desc.csv")))
print(f"[P2] Building clip caps from {len(with_desc_all)} with_desc files ...")

for path in with_desc_all:
    dfw = pd.read_csv(path, dtype=str, engine="python")
    feat_cols = [c for c in dfw.columns if c not in {"Sample ID", "SMILES", "toxicity"}]
    if _common_nan_cols:
        feat_cols = [c for c in feat_cols if c not in _common_nan_cols]
    if not feat_cols:
        continue

    Xw = dfw[feat_cols].apply(pd.to_numeric, errors="coerce")
    for c in feat_cols:
        s = Xw[c].to_numpy()
        finite = s[np.isfinite(s)]
        if finite.size == 0:
            continue
        cap_q = np.nanquantile(np.abs(finite), ROBUST_Q)
        cap = max(float(cap_q) * 10.0, MIN_CAP) if np.isfinite(cap_q) and cap_q > 0 else MIN_CAP
        caps[c] = min(caps.get(c, cap), cap)  # 보수적: 더 작은 cap

with open(CAPS_PATH, "w") as f:
    json.dump(caps, f, indent=2, sort_keys=True)
print(f"[P2] Saved clip caps for {len(caps)} features -> {CAPS_PATH}")


[P2] Loaded common-NaN drop list: 12 columns

[INFO][P2] nr-ahr
  -> Drop common-NaN columns first: 12
  -> ±inf total before capping: 0
  -> No columns contain NaN after INF capping.
  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/processed/nr-ahr_2Ddesc.csv (rows=8167, cols=208) | ±inf=cap_only, NaN columns dropped=0

[INFO][P2] nr-ar-lbd
  -> Drop common-NaN columns first: 12
  -> ±inf total before capping: 0
  -> No columns contain NaN after INF capping.
  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/processed/nr-ar-lbd_2Ddesc.csv (rows=8597, cols=208) | ±inf=cap_only, NaN columns dropped=0

[INFO][P2] nr-ar
  -> Drop common-NaN columns first: 12
  -> ±inf total before capping: 0
  -> No columns contain NaN after INF capping.
  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/processed/nr-ar_2Ddesc.csv (rows=9360, cols=208) | ±inf=cap_only, NaN columns dropped=0

[INFO][P2] nr-aromatase
  -> Drop common-NaN columns first: 12
  -> ±inf total before capping: 0
  -> No

In [3]:
# ====== Part 2: *_with_desc.csv 전처리 후 *_2Ddesc.csv 저장 (inf clip 유지, NaN 열 삭제) ======
import os, glob
import numpy as np
import pandas as pd

MID_DIR = "/home/ssm-user/LAIDD/tox21/train_data_inf/with_desc"        # Part1 출력
OUT_DIR = "/home/ssm-user/LAIDD/tox21/train_data_inf/processed/"       # 최종 출력
os.makedirs(OUT_DIR, exist_ok=True)

# ---- (선택) 공통 NaN 컬럼 드롭 리스트 경로(Part 1.5에서 만든 것) ----
DROP_LIST_TXT = "/home/ssm-user/LAIDD/tox21/train_data_inf/data_summary/columns_with_nan_in_all_12_assays.txt"
DROP_LIST_CSV = "/home/ssm-user/LAIDD/tox21/train_data_inf/data_summary/columns_with_nan_in_all_12_assays.csv"

# === clipping 설정 ===
ROBUST_Q = 0.999   # 컬럼별 |값|의 99.9% 분위수
MIN_CAP  = 1e6     # 분위수 계산 실패/너무 작을 때 최소 캡

# ====== helper: 공통 NaN 컬럼 로더 ======
def load_common_nan_columns(txt_path: str, csv_path: str) -> set[str]:
    cols = set()
    if os.path.exists(txt_path):
        with open(txt_path, "r", encoding="utf-8") as f:
            for ln in f:
                name = ln.strip()
                if name and not name.startswith("#"):
                    cols.add(name)
    if os.path.exists(csv_path):
        try:
            df = pd.read_csv(csv_path)
            if "column" in df.columns:
                cols.update(df["column"].dropna().astype(str).str.strip().tolist())
            else:
                first_col = df.columns[0]
                cols.update(df[first_col].dropna().astype(str).str.strip().tolist())
        except Exception as e:
            print(f"[WARN] Failed to read CSV drop list: {csv_path} ({e})")
    return cols

COMMON_NAN_COLS = load_common_nan_columns(DROP_LIST_TXT, DROP_LIST_CSV)
print(f"[P2] Loaded common-NaN drop list: {len(COMMON_NAN_COLS)} columns")

# ====== helper: clipping ======
def robust_clip_df(X: pd.DataFrame, q: float = ROBUST_Q, min_cap: float = MIN_CAP) -> pd.DataFrame:
    """컬럼별 |값|의 q 분위수×10(최소 min_cap)으로 모든 유한값을 클리핑."""
    Xc = X.copy()
    for c in Xc.columns:
        s = Xc[c]
        finite = s[np.isfinite(s.values)]
        if finite.empty:
            continue
        cap_q = np.nanquantile(np.abs(finite), q)
        cap = max(float(cap_q) * 10.0, min_cap) if np.isfinite(cap_q) and cap_q > 0 else min_cap
        Xc[c] = s.clip(lower=-cap, upper=+cap)
    return Xc

def clip_infinite_values(X: pd.DataFrame, q: float = ROBUST_Q, min_cap: float = MIN_CAP) -> pd.DataFrame:
    """±inf를 컬럼별 cap으로 치환 (NaN 변환 없이 clip만)."""
    X = X.copy()
    arr = X.to_numpy()
    is_posinf = np.isposinf(arr)
    is_neginf = np.isneginf(arr)
    inf_cnt   = int(is_posinf.sum() + is_neginf.sum())
    print(f"  -> ±inf total before clipping: {inf_cnt}")
    if inf_cnt == 0:
        return X

    for j, col in enumerate(X.columns):
        col_vals = X[col].to_numpy()
        mask_pos = is_posinf[:, j]
        mask_neg = is_neginf[:, j]
        finite = col_vals[np.isfinite(col_vals)]
        if finite.size == 0:
            cap = min_cap
        else:
            cap_q = np.nanquantile(np.abs(finite), q)
            cap = max(float(cap_q) * 10.0, min_cap) if np.isfinite(cap_q) and cap_q > 0 else min_cap
        col_vals[mask_pos] = +cap
        col_vals[mask_neg] = -cap
        X[col] = col_vals

    if np.isinf(X.to_numpy()).any():
        print("  -> WARNING: residual ±inf after clipping. Forcing to ±MIN_CAP.")
        X = X.replace([np.inf, -np.inf], [MIN_CAP, -MIN_CAP])
    return X

# ====== 메인: with_desc 불러와 전처리 ======
with_desc_files = sorted(glob.glob(os.path.join(MID_DIR, "*_with_desc.csv")))
data_by_assay = {}

summary_rows = []
total_cols_allnan_dropped = 0     # (참고) all-NaN 컬럼은 아래 NaN열 드롭에 포함되므로 옵션
total_rows_allnan_dropped = 0     # (사용 안 함) 행 드롭은 수행하지 않음
total_nan_cols_dropped    = 0     # 새로 추가: NaN 포함 열 드롭 총합
total_used, total_skipped = 0, 0

for f in with_desc_files:
    assay_name = os.path.basename(f).replace("_with_desc.csv", "")
    print(f"\n[INFO][P2] {assay_name}")

    df = pd.read_csv(f, dtype=str, engine="python")

    required = {"SMILES", "Sample ID", "toxicity"}
    if not required.issubset(df.columns):
        total_skipped += 1
        print(f"  -> SKIP: missing {sorted(list(required - set(df.columns)))}")
        continue
    total_used += 1

    # 키/라벨 정리
    df["SMILES"]    = df["SMILES"].astype(str).str.strip()
    df["Sample ID"] = df["Sample ID"].astype(str).str.strip()
    df["toxicity"]  = pd.to_numeric(df["toxicity"], errors="coerce")

    # === (1) 공통 NaN 컬럼을 "가장 먼저" 제거 ===
    non_feat = {"Sample ID", "SMILES", "toxicity"}
    feat_cols = [c for c in df.columns if c not in non_feat]
    if COMMON_NAN_COLS:
        drop_cols_common = sorted(set(feat_cols).intersection(COMMON_NAN_COLS))
        if drop_cols_common:
            print(f"  -> Drop common-NaN columns first: {len(drop_cols_common)}")
            df = df.drop(columns=drop_cols_common, errors="ignore")
            feat_cols = [c for c in feat_cols if c not in drop_cols_common]
        else:
            print("  -> No common-NaN columns to drop in this file.")
    else:
        print("  -> No common-NaN drop list loaded or list is empty.")

    # === (2) 숫자화
    X = df[feat_cols].apply(pd.to_numeric, errors="coerce")

    # === (3) inf clip (NaN 변경 없음)
    X = clip_infinite_values(X, q=ROBUST_Q, min_cap=MIN_CAP)

    # === (4) (선택) 유한 outlier 클리핑 — NaN에는 영향 없음
    X = robust_clip_df(X, q=ROBUST_Q, min_cap=MIN_CAP)

    # === (5) NaN이 1개라도 존재하는 "열"은 모두 삭제 (요청사항)
    nan_cols = X.columns[X.isna().any(axis=0)].tolist()
    if nan_cols:
        print(f"  -> Drop columns containing NaN ({len(nan_cols)}): "
              f"{nan_cols[:5]}{'...' if len(nan_cols) > 5 else ''}")
        X = X.drop(columns=nan_cols)
        feat_cols = [c for c in feat_cols if c not in nan_cols]
        total_nan_cols_dropped += len(nan_cols)
    else:
        print("  -> No columns contain NaN after inf clipping / robust clipping.")

    # (행 드롭은 수행하지 않음; 모든 NaN 열 삭제로 X에는 NaN이 없어야 함)
    if X.shape[1] == 0:
        print("  -> WARNING: No feature columns remain after dropping NaN columns. Skipping save.")
        continue

    # 저장
    left_part = df.loc[:, ["Sample ID", "SMILES", "toxicity"]].reset_index(drop=True)
    X_final   = X.reset_index(drop=True)
    save_base = os.path.join(OUT_DIR, f"{assay_name}_2Ddesc")
    df_save   = pd.concat([left_part, X_final], axis=1)
    df_save.to_csv(save_base + ".csv", index=False)

    data_by_assay[assay_name] = {"df": df_save, "X": X_final, "y": left_part["toxicity"]}

    summary_rows.append({
        "assay": assay_name,
        "rows_after_input": len(df),
        "rows_saved": len(df_save),
        "dropped_nan_cols": len(nan_cols),
        "dropped_common_nan_cols": len(drop_cols_common) if COMMON_NAN_COLS else 0,
        "inf_policy": "clip",
        "robust_clip": True,
    })

    print(f"  -> saved: {save_base}.csv "
          f"(rows={len(df_save)}, cols={len(df_save.columns)}) | "
          f"±inf=clip, NaN columns dropped={len(nan_cols)}")

# 요약 저장
print(f"\n[READY][P2] used={total_used}, skipped={total_skipped}")
if summary_rows:
    summary_df = pd.DataFrame(summary_rows).sort_values("assay").reset_index(drop=True)
    print("\n[ASSAY SUMMARY TABLE]")
    print(summary_df.head(20).to_string(index=False))
    summary_path = os.path.join(OUT_DIR, "preprocess_summary.csv")
    summary_df.to_csv(summary_path, index=False)
    print(f"\n[INFO] 요약 저장: {summary_path}")

# ====== (추가) train 분포 기반 clip caps 산출 & 저장 ======
# - train의 with_desc 원본들로부터 각 feature의 cap을 계산 (ROBUST_Q / MIN_CAP 규칙)
# - 여러 파일에 같은 컬럼이 있으면 더 "작은" cap을 선택(보수적 결합)
# - 공통 NaN 드롭 컬럼은 cap 계산 대상에서 제외

import json

CAPS_PATH = "/home/ssm-user/LAIDD/tox21/train_data_inf/clip_caps.json"

# COMMON_NAN_COLS 가 위에서 로드되어 있지 않다면 빈 집합으로 처리
try:
    _common_nan_cols = set(COMMON_NAN_COLS)
except NameError:
    _common_nan_cols = set()

caps = {}

# MID_DIR에는 train 시 Part1에서 만든 *_with_desc.csv 들이 있습니다.
with_desc_all = sorted(glob.glob(os.path.join(MID_DIR, "*_with_desc.csv")))
print(f"[P2] Building clip caps from {len(with_desc_all)} with_desc files ...")

for path in with_desc_all:
    dfw = pd.read_csv(path, dtype=str, engine="python")
    feat_cols = [c for c in dfw.columns if c not in {"Sample ID", "SMILES", "toxicity"}]
    # 공통 NaN 컬럼은 애초에 학습에서 쓰지 않으므로 cap 계산 제외
    if _common_nan_cols:
        feat_cols = [c for c in feat_cols if c not in _common_nan_cols]

    if not feat_cols:
        continue

    Xw = dfw[feat_cols].apply(pd.to_numeric, errors="coerce")
    for c in feat_cols:
        s = Xw[c].to_numpy()
        finite = s[np.isfinite(s)]
        if finite.size == 0:
            continue
        cap_q = np.nanquantile(np.abs(finite), ROBUST_Q)
        cap = max(float(cap_q) * 10.0, MIN_CAP) if np.isfinite(cap_q) and cap_q > 0 else MIN_CAP
        # 여러 파일에서 같은 컬럼이 나오면 더 작은 cap로 병합(보수적)
        caps[c] = min(caps.get(c, cap), cap)

with open(CAPS_PATH, "w") as f:
    json.dump(caps, f, indent=2, sort_keys=True)

print(f"[P2] Saved clip caps for {len(caps)} features -> {CAPS_PATH}")


[P2] Loaded common-NaN drop list: 12 columns

[INFO][P2] nr-ahr
  -> Drop common-NaN columns first: 12
  -> ±inf total before clipping: 0
  -> No columns contain NaN after inf clipping / robust clipping.
  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/processed/nr-ahr_2Ddesc.csv (rows=8167, cols=208) | ±inf=clip, NaN columns dropped=0

[INFO][P2] nr-ar-lbd
  -> Drop common-NaN columns first: 12
  -> ±inf total before clipping: 0
  -> No columns contain NaN after inf clipping / robust clipping.
  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/processed/nr-ar-lbd_2Ddesc.csv (rows=8597, cols=208) | ±inf=clip, NaN columns dropped=0

[INFO][P2] nr-ar
  -> Drop common-NaN columns first: 12
  -> ±inf total before clipping: 0
  -> No columns contain NaN after inf clipping / robust clipping.
  -> saved: /home/ssm-user/LAIDD/tox21/train_data_inf/processed/nr-ar_2Ddesc.csv (rows=9360, cols=208) | ±inf=clip, NaN columns dropped=0

[INFO][P2] nr-aromatase
  -> Drop common-NaN columns fir

In [None]:
찐찐찐

In [7]:
# ====== Part 2: *_with_desc.csv 전처리 후 *_2Ddesc.csv 저장 (NaN 열 삭제만 수행) ======
import os, glob
import numpy as np
import pandas as pd

MID_DIR = "/home/ssm-user/LAIDD/tox21/train_data_inf/with_desc"        # Part1 출력
OUT_DIR = "/home/ssm-user/LAIDD/tox21/train_processed/"       # 최종 출력
os.makedirs(OUT_DIR, exist_ok=True)

# ---- (선택) 공통 NaN 컬럼 드롭 리스트 경로(Part 1.5에서 만든 것) ----
DROP_LIST_TXT = "/home/ssm-user/LAIDD/tox21/train_data_inf/data_summary/columns_with_nan_in_all_12_assays.txt"
DROP_LIST_CSV = "/home/ssm-user/LAIDD/tox21/train_data_inf/data_summary/columns_with_nan_in_all_12_assays.csv"

# ====== helper: 공통 NaN 컬럼 로더 ======
def load_common_nan_columns(txt_path: str, csv_path: str) -> set[str]:
    cols = set()
    if os.path.exists(txt_path):
        with open(txt_path, "r", encoding="utf-8") as f:
            for ln in f:
                name = ln.strip()
                if name and not name.startswith("#"):
                    cols.add(name)
    if os.path.exists(csv_path):
        try:
            df = pd.read_csv(csv_path)
            if "column" in df.columns:
                cols.update(df["column"].dropna().astype(str).str.strip().tolist())
            else:
                first_col = df.columns[0]
                cols.update(df[first_col].dropna().astype(str).str.strip().tolist())
        except Exception as e:
            print(f"[WARN] Failed to read CSV drop list: {csv_path} ({e})")
    return cols

COMMON_NAN_COLS = load_common_nan_columns(DROP_LIST_TXT, DROP_LIST_CSV)
print(f"[P2] Loaded common-NaN drop list: {len(COMMON_NAN_COLS)} columns")

# ====== 메인: with_desc 불러와 전처리 ======
with_desc_files = sorted(glob.glob(os.path.join(MID_DIR, "*_with_desc.csv")))
data_by_assay = {}

summary_rows = []
total_nan_cols_dropped = 0
total_used, total_skipped = 0, 0

for f in with_desc_files:
    assay_name = os.path.basename(f).replace("_with_desc.csv", "")
    print(f"\n[INFO][P2] {assay_name}")

    df = pd.read_csv(f, dtype=str, engine="python")

    required = {"SMILES", "Sample ID", "toxicity"}
    if not required.issubset(df.columns):
        total_skipped += 1
        print(f"  -> SKIP: missing {sorted(list(required - set(df.columns)))}")
        continue
    total_used += 1

    # 키/라벨 정리
    df["SMILES"]    = df["SMILES"].astype(str).str.strip()
    df["Sample ID"] = df["Sample ID"].astype(str).str.strip()
    df["toxicity"]  = pd.to_numeric(df["toxicity"], errors="coerce")

    # === (1) 공통 NaN 컬럼 선삭제 ===
    non_feat = {"Sample ID", "SMILES", "toxicity"}
    feat_cols = [c for c in df.columns if c not in non_feat]
    if COMMON_NAN_COLS:
        drop_cols_common = sorted(set(feat_cols).intersection(COMMON_NAN_COLS))
        if drop_cols_common:
            print(f"  -> Drop common-NaN columns first: {len(drop_cols_common)}")
            df = df.drop(columns=drop_cols_common, errors="ignore")
            feat_cols = [c for c in feat_cols if c not in drop_cols_common]
        else:
            print("  -> No common-NaN columns to drop in this file.")
    else:
        print("  -> No common-NaN drop list loaded or list is empty.")

    # === (2) 숫자화
    X = df[feat_cols].apply(pd.to_numeric, errors="coerce")

    # === (3) NaN 포함 열 삭제 ===
    nan_cols = X.columns[X.isna().any(axis=0)].tolist()
    if nan_cols:
        num_dropped = len(nan_cols)
        total_nan_cols_dropped += num_dropped
        print(f"  -> Drop columns containing NaN ({num_dropped} in this file): "
              f"{nan_cols[:5]}{'...' if num_dropped > 5 else ''}")
        print(f"  -> Cumulative NaN-dropped columns so far: {total_nan_cols_dropped}")
        X = X.drop(columns=nan_cols)
        feat_cols = [c for c in feat_cols if c not in nan_cols]
    else:
        print("  -> No columns contain NaN.")

    # (행 드롭 없음)
    if X.shape[1] == 0:
        print("  -> WARNING: No feature columns remain after dropping NaN columns. Skipping save.")
        continue

    # === (4) 저장 ===
    left_part = df.loc[:, ["Sample ID", "SMILES", "toxicity"]].reset_index(drop=True)
    X_final   = X.reset_index(drop=True)
    save_base = os.path.join(OUT_DIR, f"{assay_name}_2Ddesc")
    df_save   = pd.concat([left_part, X_final], axis=1)
    df_save.to_csv(save_base + ".csv", index=False)

    data_by_assay[assay_name] = {"df": df_save, "X": X_final, "y": left_part["toxicity"]}

    summary_rows.append({
        "assay": assay_name,
        "rows_after_input": len(df),
        "rows_saved": len(df_save),
        "dropped_nan_cols": num_dropped if nan_cols else 0,
        "dropped_common_nan_cols": len(drop_cols_common) if COMMON_NAN_COLS else 0,
    })

    print(f"  -> saved: {save_base}.csv "
          f"(rows={len(df_save)}, cols={len(df_save.columns)}) | "
          f"NaN columns dropped={num_dropped if nan_cols else 0}")

# ====== 요약 저장 ======
print(f"\n[READY][P2] used={total_used}, skipped={total_skipped}")
if summary_rows:
    summary_df = pd.DataFrame(summary_rows).sort_values("assay").reset_index(drop=True)
    print("\n[ASSAY SUMMARY TABLE]")
    print(summary_df.head(20).to_string(index=False))
    summary_path = os.path.join(OUT_DIR, "preprocess_summary.csv")
    summary_df.to_csv(summary_path, index=False)
    print(f"\n[INFO] 요약 저장: {summary_path}")


[P2] Loaded common-NaN drop list: 12 columns

[INFO][P2] nr-ahr
  -> Drop common-NaN columns first: 12
  -> No columns contain NaN.
  -> saved: /home/ssm-user/LAIDD/tox21/train_processed/nr-ahr_2Ddesc.csv (rows=8167, cols=208) | NaN columns dropped=0

[INFO][P2] nr-ar-lbd
  -> Drop common-NaN columns first: 12
  -> No columns contain NaN.
  -> saved: /home/ssm-user/LAIDD/tox21/train_processed/nr-ar-lbd_2Ddesc.csv (rows=8597, cols=208) | NaN columns dropped=0

[INFO][P2] nr-ar
  -> Drop common-NaN columns first: 12
  -> No columns contain NaN.
  -> saved: /home/ssm-user/LAIDD/tox21/train_processed/nr-ar_2Ddesc.csv (rows=9360, cols=208) | NaN columns dropped=0

[INFO][P2] nr-aromatase
  -> Drop common-NaN columns first: 12
  -> No columns contain NaN.
  -> saved: /home/ssm-user/LAIDD/tox21/train_processed/nr-aromatase_2Ddesc.csv (rows=7224, cols=208) | NaN columns dropped=0

[INFO][P2] nr-er-lbd
  -> Drop common-NaN columns first: 12
  -> No columns contain NaN.
  -> saved: /home/ssm-use

### Part 3: train

In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os, json
import numpy as np
import pandas as pd
import optuna
from datetime import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    matthews_corrcoef, roc_auc_score,
    precision_score, recall_score, f1_score,
    accuracy_score, confusion_matrix
)
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from tqdm.auto import tqdm
import joblib

# ================== 경로/설정 ==================
PROCESSED_DIR = "/home/ssm-user/LAIDD/tox21/train_data_inf/processed/"
RESULT_DIR    = "/home/ssm-user/LAIDD/tox21/Results_imputer_inf/"
os.makedirs(RESULT_DIR, exist_ok=True)

N_TRIALS    = 50
N_SPLITS    = 5
RANDOM_SEED = 42
ANCHOR_PREFERENCE = "NR-ER"   # 앵커 파일 우선 prefix (없으면 첫 파일)

def log(msg: str):
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}", flush=True)

# ================== 유틸 ==================
def list_assay_csvs(processed_dir: str):
    files = sorted([f for f in os.listdir(processed_dir) if f.endswith("_2Ddesc.csv")])
    if not files:
        raise FileNotFoundError(f"No *_2Ddesc.csv under {processed_dir}")
    return files

def normalize_keys(df: pd.DataFrame) -> pd.DataFrame:
    df["Sample ID"] = df["Sample ID"].astype("string").str.strip()
    df["SMILES"]    = df["SMILES"].astype("string").str.strip()
    return df

def find_best_thresh(y_true: np.ndarray, y_prob: np.ndarray):
    best_t, best_m = 0.5, -1.0
    for t in np.linspace(0.05, 0.95, 181):
        pred = (y_prob >= t).astype(int)
        if len(np.unique(pred)) < 2:
            continue
        m = matthews_corrcoef(y_true, pred)
        if m > best_m:
            best_m, best_t = m, t
    return float(best_t), float(best_m)

def mcc_weighted(y_true: pd.DataFrame, y_pred_bin: np.ndarray) -> float:
    mccs, weights = [], []
    for i in range(y_true.shape[1]):
        yt = y_true.iloc[:, i].astype(int)
        if yt.nunique() < 2:
            continue
        mccs.append(matthews_corrcoef(yt, y_pred_bin[:, i]))
        weights.append(len(yt))
    return float(np.average(mccs, weights=weights)) if mccs else 0.0

def build_pipeline(params: dict) -> Pipeline:
    return Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("clf", MultiOutputClassifier(
            RandomForestClassifier(
                n_estimators=params["n_estimators"],
                max_depth=params["max_depth"],
                min_samples_split=params["min_samples_split"],
                min_samples_leaf=params["min_samples_leaf"],
                max_features=params["max_features"],
                class_weight="balanced",
                n_jobs=-1,
                random_state=RANDOM_SEED,
            ),
            n_jobs=-1
        ))
    ])

# ================== 데이터 구성 (메모리 안전) ==================
def choose_anchor(files):
    for f in files:
        if f.startswith(ANCHOR_PREFERENCE):
            return f
    return files[0]

def read_anchor_features(processed_dir: str, anchor_file: str):
    path = os.path.join(processed_dir, anchor_file)
    head = pd.read_csv(path, nrows=5, low_memory=False)
    non_feat = {"Sample ID", "SMILES", "toxicity"}
    feat_cols = [c for c in head.columns if c not in non_feat]
    usecols = ["Sample ID", "SMILES"] + feat_cols
    X = pd.read_csv(
        path, usecols=usecols,
        dtype={"Sample ID":"string","SMILES":"string"},
        low_memory=False
    )
    X = normalize_keys(X)
    X[feat_cols] = X[feat_cols].apply(pd.to_numeric, errors="coerce").astype(np.float32)
    return X, feat_cols

def build_labels_left_on_anchor(processed_dir: str, files, anchor_keys: pd.DataFrame):
    labels = anchor_keys.copy()
    assays = []
    for f in tqdm(files, desc="Left-joining labels onto anchor"):
        assay = f.replace("_2Ddesc.csv", "")
        path  = os.path.join(processed_dir, f)
        df_lab = pd.read_csv(
            path,
            usecols=["Sample ID","SMILES","toxicity"],
            dtype={"Sample ID":"string","SMILES":"string","toxicity":"float32"},
            low_memory=False
        )
        df_lab = normalize_keys(df_lab).drop_duplicates(["Sample ID","SMILES"], keep="first")
        labels = labels.merge(df_lab.rename(columns={"toxicity": assay}),
                              on=["Sample ID","SMILES"], how="left")
        assays.append(assay)
    return labels, assays

# ================== Optuna 목적함수 ==================
def make_objective(X: pd.DataFrame, Y: pd.DataFrame):
    # Stratify는 라벨 합으로 근사 (모든 라벨 보유 행만 사용)
    sums = Y.sum(axis=1).values
    def objective(trial: optuna.trial.Trial) -> float:
        params = {
            "n_estimators":      trial.suggest_int("n_estimators", 200, 900),
            "max_depth":         trial.suggest_int("max_depth", 4, 40),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 40),
            "min_samples_leaf":  trial.suggest_int("min_samples_leaf", 1, 20),
            "max_features":      trial.suggest_float("max_features", 0.2, 1.0),
        }
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
        scores = []
        for tr_idx, va_idx in cv.split(X, sums):
            pipe = build_pipeline(params)
            pipe.fit(X.iloc[tr_idx], Y.iloc[tr_idx])

            proba_list = pipe.named_steps["clf"].predict_proba(X.iloc[va_idx])
            prob_pos   = np.column_stack([p[:, 1] for p in proba_list])
            pred_bin   = (prob_pos >= 0.5).astype(int)  # 탐색 단계에선 임계값 고정
            scores.append(mcc_weighted(Y.iloc[va_idx], pred_bin))
        return float(np.mean(scores))
    return objective

# ================== 학습/저장/평가 ==================
def train_and_save():
    files = list_assay_csvs(PROCESSED_DIR)
    log(f"Found {len(files)} CSVs")

    # 1) 앵커 피처 로드 (한 벌만)
    anchor = choose_anchor(files)
    X_feat, feat_cols = read_anchor_features(PROCESSED_DIR, anchor)
    anchor_keys = X_feat[["Sample ID","SMILES"]]
    log(f"Anchor: {anchor} | #features={len(feat_cols)} | rows={len(anchor_keys)}")

    # 2) 라벨 테이블을 앵커 키에만 left-join
    labels, assays = build_labels_left_on_anchor(PROCESSED_DIR, files, anchor_keys)
    log(f"#assays={len(assays)} | labels shape={labels.shape}")

    # 3) 피처 + 라벨 결합 (inner: 피처가 있는 행만)
    df = labels.merge(X_feat, on=["Sample ID","SMILES"], how="inner")
    # 멀티라벨 학습은 모든 라벨이 있는 행만 사용
    mask_all = df[assays].notna().all(axis=1)
    df = df.loc[mask_all].reset_index(drop=True)

    Y = df[assays].astype(int).reset_index(drop=True)
    X = df[feat_cols].reset_index(drop=True)

    log(f"Samples with all labels: {len(X)}")

    # 4) Optuna 탐색
    log("Optuna search (maximize weighted MCC)...")
    study = optuna.create_study(direction="maximize")
    study.optimize(make_objective(X, Y), n_trials=N_TRIALS, show_progress_bar=True)
    log(f"BEST mean MCC: {study.best_value:.4f}")
    log(f"BEST params : {study.best_params}")

    # 5) OOF 예측 생성 (best params로 5-fold)
    cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    sums = Y.sum(axis=1).values
    oof_prob = np.zeros((len(X), len(assays)), dtype=np.float32)
    base_pipe = build_pipeline(study.best_params)

    for fold, (tr_idx, va_idx) in enumerate(cv.split(X, sums), start=1):
        pipe = clone(base_pipe)
        pipe.fit(X.iloc[tr_idx], Y.iloc[tr_idx])
        proba_list = pipe.named_steps["clf"].predict_proba(X.iloc[va_idx])
        oof_prob[va_idx, :] = np.column_stack([p[:, 1] for p in proba_list]).astype(np.float32)
        log(f"Fold {fold} done.")

    # 6) assay별 threshold 탐색 + 성능 리포트 (OOF 기준)
    results = []
    for i, assay in enumerate(assays):
        y_true = Y.iloc[:, i].values
        if np.unique(y_true).size < 2:
            log(f"{assay}: skipped (single class)")
            continue

        y_prob = oof_prob[:, i]
        thr, best_mcc = find_best_thresh(y_true, y_prob)
        y_pred = (y_prob >= thr).astype(int)

        res = {
            "assay": assay,
            "best_threshold": float(thr),
            "mcc": float(best_mcc),
            "accuracy": float(accuracy_score(y_true, y_pred)),
            "precision": float(precision_score(y_true, y_pred, zero_division=0)),
            "recall": float(recall_score(y_true, y_pred, zero_division=0)),
            "f1": float(f1_score(y_true, y_pred, zero_division=0)),
            "roc_auc": float(roc_auc_score(y_true, y_prob)),
            "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
        }
        results.append(res)

        # 콘솔 출력
        print(f"\n=== {assay} ===")
        print(f"best_threshold: {res['best_threshold']:.3f}")
        print(f"mcc          : {res['mcc']:.4f}")
        print(f"accuracy     : {res['accuracy']:.4f}")
        print(f"precision    : {res['precision']:.4f}")
        print(f"recall       : {res['recall']:.4f}")
        print(f"f1           : {res['f1']:.4f}")
        print(f"roc_auc      : {res['roc_auc']:.4f}")
        print("confusion_matrix:\n", np.array(res["confusion_matrix"]))

    # 결과 저장
    pd.DataFrame(results).to_csv(os.path.join(RESULT_DIR, "validation_results_oof.csv"), index=False)
    np.save(os.path.join(RESULT_DIR, "oof_prob.npy"), oof_prob)
    with open(os.path.join(RESULT_DIR, "assays.json"), "w") as f:
        json.dump(assays, f, indent=2)

    # 7) 최종 모델 적합(전체 데이터) + 저장
    final_pipe = build_pipeline(study.best_params)
    final_pipe.fit(X, Y)
    model_path = os.path.join(RESULT_DIR, "multilabel_rf_best.joblib")
    joblib.dump(final_pipe, model_path)
    log(f"Saved best model -> {model_path}")

    # 메타 저장
    meta = {
        "best_value_mean_mcc": float(study.best_value),
        "best_params": study.best_params,
        "assays": assays,
        "n_features": len(feat_cols),
        "feature_anchor": anchor,
        "n_trials": N_TRIALS,
        "n_splits": N_SPLITS,
        "seed": RANDOM_SEED,
    }
    with open(os.path.join(RESULT_DIR, "study_meta.json"), "w") as f:
        json.dump(meta, f, indent=2)
    log("Saved study_meta.json")

# ================== main ==================
if __name__ == "__main__":
    train_and_save()


[08:12:22] Found 12 CSVs
[08:12:23] Anchor: nr-ahr_2Ddesc.csv | #features=205 | rows=8167


  return arr.astype(dtype, copy=True)


Left-joining labels onto anchor:   0%|          | 0/12 [00:00<?, ?it/s]

[08:12:24] #assays=12 | labels shape=(8167, 14)
[08:12:24] Samples with all labels: 2543
[08:12:24] Optuna search (maximize weighted MCC)...


[I 2025-09-11 08:12:24,913] A new study created in memory with name: no-name-9080f198-86e9-4875-a3b4-59a1b3d185b1


  0%|          | 0/50 [00:00<?, ?it/s]



[I 2025-09-11 08:13:12,247] Trial 0 finished with value: 0.2232153067930743 and parameters: {'n_estimators': 305, 'max_depth': 36, 'min_samples_split': 37, 'min_samples_leaf': 6, 'max_features': 0.36174526910083027}. Best is trial 0 with value: 0.2232153067930743.




[I 2025-09-11 08:16:04,943] Trial 1 finished with value: 0.22431194521543452 and parameters: {'n_estimators': 714, 'max_depth': 5, 'min_samples_split': 18, 'min_samples_leaf': 1, 'max_features': 0.7771233686353489}. Best is trial 1 with value: 0.22431194521543452.




[I 2025-09-11 08:18:05,705] Trial 2 finished with value: 0.22438180441779876 and parameters: {'n_estimators': 668, 'max_depth': 12, 'min_samples_split': 31, 'min_samples_leaf': 12, 'max_features': 0.5020033432800792}. Best is trial 2 with value: 0.22438180441779876.




[I 2025-09-11 08:19:11,069] Trial 3 finished with value: 0.23380600396901557 and parameters: {'n_estimators': 273, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 11, 'max_features': 0.6831692098677047}. Best is trial 3 with value: 0.23380600396901557.




[I 2025-09-11 08:21:13,586] Trial 4 finished with value: 0.2006583989151851 and parameters: {'n_estimators': 769, 'max_depth': 4, 'min_samples_split': 30, 'min_samples_leaf': 18, 'max_features': 0.522496185902151}. Best is trial 3 with value: 0.23380600396901557.




[I 2025-09-11 08:22:25,820] Trial 5 finished with value: 0.21161395462444038 and parameters: {'n_estimators': 288, 'max_depth': 32, 'min_samples_split': 36, 'min_samples_leaf': 20, 'max_features': 0.7819936877965648}. Best is trial 3 with value: 0.23380600396901557.




[I 2025-09-11 08:25:43,334] Trial 6 finished with value: 0.22945936316718676 and parameters: {'n_estimators': 642, 'max_depth': 26, 'min_samples_split': 15, 'min_samples_leaf': 9, 'max_features': 0.9254532102325779}. Best is trial 3 with value: 0.23380600396901557.




[I 2025-09-11 08:26:56,828] Trial 7 finished with value: 0.2131846167080179 and parameters: {'n_estimators': 456, 'max_depth': 20, 'min_samples_split': 37, 'min_samples_leaf': 7, 'max_features': 0.41250364119958205}. Best is trial 3 with value: 0.23380600396901557.




[I 2025-09-11 08:29:59,906] Trial 8 finished with value: 0.22492504949868994 and parameters: {'n_estimators': 649, 'max_depth': 18, 'min_samples_split': 35, 'min_samples_leaf': 13, 'max_features': 0.8775502299222346}. Best is trial 3 with value: 0.23380600396901557.




[I 2025-09-11 08:33:30,905] Trial 9 finished with value: 0.21492849725729357 and parameters: {'n_estimators': 900, 'max_depth': 4, 'min_samples_split': 14, 'min_samples_leaf': 5, 'max_features': 0.8267050253646699}. Best is trial 3 with value: 0.23380600396901557.




[I 2025-09-11 08:35:09,420] Trial 10 finished with value: 0.22521979438265127 and parameters: {'n_estimators': 441, 'max_depth': 14, 'min_samples_split': 3, 'min_samples_leaf': 15, 'max_features': 0.6592134959141924}. Best is trial 3 with value: 0.23380600396901557.




[I 2025-09-11 08:36:02,928] Trial 11 finished with value: 0.22727279697733316 and parameters: {'n_estimators': 517, 'max_depth': 26, 'min_samples_split': 2, 'min_samples_leaf': 9, 'max_features': 0.2115798449796908}. Best is trial 3 with value: 0.23380600396901557.




[I 2025-09-11 08:37:14,871] Trial 12 finished with value: 0.22949091334762234 and parameters: {'n_estimators': 219, 'max_depth': 26, 'min_samples_split': 11, 'min_samples_leaf': 10, 'max_features': 0.9889577200559052}. Best is trial 3 with value: 0.23380600396901557.




[I 2025-09-11 08:38:05,071] Trial 13 finished with value: 0.21745230766460125 and parameters: {'n_estimators': 219, 'max_depth': 26, 'min_samples_split': 8, 'min_samples_leaf': 15, 'max_features': 0.6666430951651598}. Best is trial 3 with value: 0.23380600396901557.




[I 2025-09-11 08:39:09,939] Trial 14 finished with value: 0.228525325151699 and parameters: {'n_estimators': 202, 'max_depth': 12, 'min_samples_split': 9, 'min_samples_leaf': 11, 'max_features': 0.9748552303889977}. Best is trial 3 with value: 0.23380600396901557.




[I 2025-09-11 08:40:44,706] Trial 15 finished with value: 0.22503958870696322 and parameters: {'n_estimators': 349, 'max_depth': 40, 'min_samples_split': 25, 'min_samples_leaf': 1, 'max_features': 0.7242923164181684}. Best is trial 3 with value: 0.23380600396901557.




[I 2025-09-11 08:42:01,620] Trial 16 finished with value: 0.216987074493582 and parameters: {'n_estimators': 379, 'max_depth': 31, 'min_samples_split': 8, 'min_samples_leaf': 15, 'max_features': 0.5829559138517819}. Best is trial 3 with value: 0.23380600396901557.




[I 2025-09-11 08:43:27,415] Trial 17 finished with value: 0.23167394842892194 and parameters: {'n_estimators': 260, 'max_depth': 16, 'min_samples_split': 13, 'min_samples_leaf': 8, 'max_features': 0.9783373081844862}. Best is trial 3 with value: 0.23380600396901557.




[I 2025-09-11 08:46:14,699] Trial 18 finished with value: 0.23750516267766267 and parameters: {'n_estimators': 547, 'max_depth': 16, 'min_samples_split': 22, 'min_samples_leaf': 4, 'max_features': 0.8784801333430073}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 08:48:46,535] Trial 19 finished with value: 0.23526582646290722 and parameters: {'n_estimators': 522, 'max_depth': 10, 'min_samples_split': 22, 'min_samples_leaf': 4, 'max_features': 0.8540452874543243}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 08:51:24,491] Trial 20 finished with value: 0.23201582756031086 and parameters: {'n_estimators': 562, 'max_depth': 8, 'min_samples_split': 23, 'min_samples_leaf': 4, 'max_features': 0.8467257176848314}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 08:53:43,121] Trial 21 finished with value: 0.2341758281297331 and parameters: {'n_estimators': 551, 'max_depth': 9, 'min_samples_split': 20, 'min_samples_leaf': 2, 'max_features': 0.7176764841004897}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 08:56:07,628] Trial 22 finished with value: 0.2212854470658412 and parameters: {'n_estimators': 555, 'max_depth': 9, 'min_samples_split': 20, 'min_samples_leaf': 3, 'max_features': 0.7530484005170006}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 08:58:36,159] Trial 23 finished with value: 0.23354945873873373 and parameters: {'n_estimators': 488, 'max_depth': 10, 'min_samples_split': 25, 'min_samples_leaf': 3, 'max_features': 0.8975881627011972}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:01:33,412] Trial 24 finished with value: 0.2347759476033613 and parameters: {'n_estimators': 596, 'max_depth': 22, 'min_samples_split': 28, 'min_samples_leaf': 2, 'max_features': 0.8354518008362614}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:04:23,802] Trial 25 finished with value: 0.23483983522291602 and parameters: {'n_estimators': 603, 'max_depth': 22, 'min_samples_split': 30, 'min_samples_leaf': 5, 'max_features': 0.8185164017687262}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:08:19,746] Trial 26 finished with value: 0.23345644471454916 and parameters: {'n_estimators': 758, 'max_depth': 21, 'min_samples_split': 32, 'min_samples_leaf': 5, 'max_features': 0.9207531462524128}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:10:21,148] Trial 27 finished with value: 0.23550895354890714 and parameters: {'n_estimators': 448, 'max_depth': 17, 'min_samples_split': 40, 'min_samples_leaf': 6, 'max_features': 0.7924567313644002}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:11:50,726] Trial 28 finished with value: 0.22414509534223653 and parameters: {'n_estimators': 408, 'max_depth': 14, 'min_samples_split': 17, 'min_samples_leaf': 7, 'max_features': 0.6023454471095798}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:14:22,561] Trial 29 finished with value: 0.22791760908838565 and parameters: {'n_estimators': 492, 'max_depth': 17, 'min_samples_split': 39, 'min_samples_leaf': 6, 'max_features': 0.923730445885029}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:15:00,917] Trial 30 finished with value: 0.21298336360953876 and parameters: {'n_estimators': 341, 'max_depth': 7, 'min_samples_split': 27, 'min_samples_leaf': 7, 'max_features': 0.2533483799341179}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:17:51,485] Trial 31 finished with value: 0.23560591350902813 and parameters: {'n_estimators': 627, 'max_depth': 23, 'min_samples_split': 40, 'min_samples_leaf': 5, 'max_features': 0.7910526128445505}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:19:52,748] Trial 32 finished with value: 0.23508157155427822 and parameters: {'n_estimators': 442, 'max_depth': 13, 'min_samples_split': 40, 'min_samples_leaf': 4, 'max_features': 0.7957659933551583}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:23:17,162] Trial 33 finished with value: 0.23229733241727887 and parameters: {'n_estimators': 691, 'max_depth': 24, 'min_samples_split': 33, 'min_samples_leaf': 6, 'max_features': 0.8757555231213926}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:25:55,412] Trial 34 finished with value: 0.2366247510293143 and parameters: {'n_estimators': 606, 'max_depth': 19, 'min_samples_split': 34, 'min_samples_leaf': 3, 'max_features': 0.7321290082777191}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:29:12,566] Trial 35 finished with value: 0.23455414970413385 and parameters: {'n_estimators': 743, 'max_depth': 19, 'min_samples_split': 38, 'min_samples_leaf': 1, 'max_features': 0.7422184944077194}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:31:30,729] Trial 36 finished with value: 0.2290049603891377 and parameters: {'n_estimators': 615, 'max_depth': 31, 'min_samples_split': 35, 'min_samples_leaf': 3, 'max_features': 0.6116739540034614}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:34:41,403] Trial 37 finished with value: 0.23130980638277215 and parameters: {'n_estimators': 685, 'max_depth': 29, 'min_samples_split': 34, 'min_samples_leaf': 2, 'max_features': 0.7826311033352578}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:37:52,628] Trial 38 finished with value: 0.2273527816417032 and parameters: {'n_estimators': 803, 'max_depth': 15, 'min_samples_split': 40, 'min_samples_leaf': 8, 'max_features': 0.6930846397618265}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:40:08,734] Trial 39 finished with value: 0.23036397496995317 and parameters: {'n_estimators': 589, 'max_depth': 24, 'min_samples_split': 29, 'min_samples_leaf': 5, 'max_features': 0.6392701341284507}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:42:16,965] Trial 40 finished with value: 0.22611164452427643 and parameters: {'n_estimators': 638, 'max_depth': 18, 'min_samples_split': 37, 'min_samples_leaf': 6, 'max_features': 0.5483116423616157}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:44:42,230] Trial 41 finished with value: 0.23084125713544507 and parameters: {'n_estimators': 496, 'max_depth': 11, 'min_samples_split': 22, 'min_samples_leaf': 4, 'max_features': 0.863368452181305}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:47:08,279] Trial 42 finished with value: 0.2235940868727305 and parameters: {'n_estimators': 529, 'max_depth': 19, 'min_samples_split': 18, 'min_samples_leaf': 4, 'max_features': 0.7749742253763925}. Best is trial 18 with value: 0.23750516267766267.




[I 2025-09-11 09:49:19,261] Trial 43 finished with value: 0.23942689992732014 and parameters: {'n_estimators': 466, 'max_depth': 24, 'min_samples_split': 37, 'min_samples_leaf': 3, 'max_features': 0.8076527699228728}. Best is trial 43 with value: 0.23942689992732014.




[I 2025-09-11 09:50:40,558] Trial 44 finished with value: 0.22662577604740167 and parameters: {'n_estimators': 447, 'max_depth': 28, 'min_samples_split': 36, 'min_samples_leaf': 2, 'max_features': 0.47075434697676694}. Best is trial 43 with value: 0.23942689992732014.




[I 2025-09-11 09:52:53,451] Trial 45 finished with value: 0.23958562033468303 and parameters: {'n_estimators': 472, 'max_depth': 24, 'min_samples_split': 38, 'min_samples_leaf': 3, 'max_features': 0.80953386083005}. Best is trial 45 with value: 0.23958562033468303.




[I 2025-09-11 09:54:38,813] Trial 46 finished with value: 0.23695640049651776 and parameters: {'n_estimators': 410, 'max_depth': 24, 'min_samples_split': 32, 'min_samples_leaf': 1, 'max_features': 0.6930687655951047}. Best is trial 45 with value: 0.23958562033468303.




[I 2025-09-11 09:56:21,991] Trial 47 finished with value: 0.2308298554871056 and parameters: {'n_estimators': 404, 'max_depth': 20, 'min_samples_split': 32, 'min_samples_leaf': 1, 'max_features': 0.6961154864194412}. Best is trial 45 with value: 0.23958562033468303.




[I 2025-09-11 09:58:36,044] Trial 48 finished with value: 0.23052258396207526 and parameters: {'n_estimators': 410, 'max_depth': 28, 'min_samples_split': 37, 'min_samples_leaf': 3, 'max_features': 0.9550062333508589}. Best is trial 45 with value: 0.23958562033468303.




[I 2025-09-11 09:59:57,065] Trial 49 finished with value: 0.23460948618985303 and parameters: {'n_estimators': 337, 'max_depth': 34, 'min_samples_split': 34, 'min_samples_leaf': 1, 'max_features': 0.6436197634590701}. Best is trial 45 with value: 0.23958562033468303.
[09:59:57] BEST mean MCC: 0.2396
[09:59:57] BEST params : {'n_estimators': 472, 'max_depth': 24, 'min_samples_split': 38, 'min_samples_leaf': 3, 'max_features': 0.80953386083005}




[10:00:24] Fold 1 done.




[10:00:50] Fold 2 done.




[10:01:17] Fold 3 done.




[10:01:43] Fold 4 done.




[10:02:08] Fold 5 done.

=== nr-ahr ===
best_threshold: 0.565
mcc          : 0.2965
accuracy     : 0.9603
precision    : 0.4565
recall       : 0.2165
f1           : 0.2937
roc_auc      : 0.8303
confusion_matrix:
 [[2421   25]
 [  76   21]]

=== nr-ar-lbd ===
best_threshold: 0.195
mcc          : 0.6784
accuracy     : 0.9972
precision    : 0.4615
recall       : 1.0000
f1           : 0.6316
roc_auc      : 0.9975
confusion_matrix:
 [[2530    7]
 [   0    6]]

=== nr-ar ===
best_threshold: 0.495
mcc          : 0.4746
accuracy     : 0.9957
precision    : 0.5000
recall       : 0.4545
f1           : 0.4762
roc_auc      : 0.9745
confusion_matrix:
 [[2527    5]
 [   6    5]]

=== nr-aromatase ===
best_threshold: 0.195
mcc          : 0.1127
accuracy     : 0.9453
precision    : 0.0534
recall       : 0.3182
f1           : 0.0915
roc_auc      : 0.8356
confusion_matrix:
 [[2397  124]
 [  15    7]]

=== nr-er-lbd ===
best_threshold: 0.560
mcc          : 0.2781
accuracy     : 0.9925
precision    : 0.50

Exception ignored in: <function ResourceTracker.__del__ at 0x74bffd38eac0>
Traceback (most recent call last):
  File "/home/ssm-user/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
