# test data

## test data 전처리

### Part 1: 디스크립터 계산 & with_desc 저장

In [1]:
# ===================== Part 1: 디스크립터 계산 & with_desc 저장 (단일 입력 전용) =====================
import os
import numpy as np
import pandas as pd

# Single input file and output directory for Part 1.
DATA_FILE = "/home/ssm-user/LAIDD/tox21/Data/test_data/test_data.csv"
OUT_DIR   = "/home/ssm-user/LAIDD/tox21/Data/test_data/"
# Create the output directory up front so the later to_csv call cannot fail on a missing folder.
os.makedirs(OUT_DIR, exist_ok=True)

# RDKit 2D descriptor utilities
from rdkit import Chem
from rdkit.Chem import Descriptors
# Best effort: collect the names listed in Descriptors3D._descList so they can be
# excluded below. If the import or the attribute lookup fails for any reason,
# fall back to an empty set (nothing is excluded).
try:
    from rdkit.Chem import Descriptors3D
    desc3d = {n for n, _ in Descriptors3D._descList}
except Exception:
    desc3d = set()

# Descriptor names from Descriptors._descList, in RDKit's order, minus any name found in desc3d.
DESC_2D_NAMES = [n for n, _ in Descriptors._descList if n not in desc3d]
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
# Shared calculator instance; CalcDescriptors(mol) is called on it once per molecule below.
_calc = MolecularDescriptorCalculator(DESC_2D_NAMES)

def rdkit_2d_descriptors_from_series(smiles_series: pd.Series,
                                     keep_all_rows: bool = True) -> pd.DataFrame:
    """Compute the DESC_2D_NAMES descriptors for every SMILES in a Series.

    Args:
        smiles_series: SMILES strings; the Series index labels are reused as
            the index of the returned frame.
        keep_all_rows: When True, entries that are missing, blank, or that
            RDKit cannot parse are kept as all-NaN rows. When False, those
            entries are left out of the result entirely.

    Returns:
        A DataFrame with one column per name in DESC_2D_NAMES. A molecule that
        parses but whose descriptor calculation raises is always kept, as an
        all-NaN row, regardless of ``keep_all_rows``.
    """
    n_desc = len(DESC_2D_NAMES)
    records, labels = [], []
    for label, raw in smiles_series.items():
        text = "" if pd.isna(raw) else str(raw).strip()
        # Blank input is never handed to RDKit; it is treated like a parse failure.
        mol = Chem.MolFromSmiles(text) if text else None
        if mol is None:
            if keep_all_rows:
                records.append([np.nan] * n_desc)
                labels.append(label)
            continue
        try:
            values = list(_calc.CalcDescriptors(mol))
        except Exception:
            # Descriptor failure on a valid molecule: keep the row, blank the values.
            values = [np.nan] * n_desc
        records.append(values)
        labels.append(label)
    return pd.DataFrame(records, columns=DESC_2D_NAMES, index=labels)

def add_rdkit_2d_descriptors(df: pd.DataFrame,
                             smiles_col: str = "SMILES",
                             keep_all_rows: bool = False) -> pd.DataFrame:
    """Return ``df`` with the RDKit 2D descriptor columns appended.

    Args:
        df: Input table containing a SMILES column.
        smiles_col: Name of the column holding the SMILES strings.
        keep_all_rows: When True, every row of ``df`` is kept and the
            descriptor frame is reindexed to ``df.index`` before being
            concatenated column-wise. When False, an inner join on the index
            keeps only the rows present in the descriptor frame.

    Returns:
        A new DataFrame: the original columns followed by the descriptors.
    """
    descriptors = rdkit_2d_descriptors_from_series(df[smiles_col], keep_all_rows=keep_all_rows)
    if keep_all_rows:
        aligned = descriptors.reindex(df.index)
        return pd.concat([df, aligned], axis=1)
    # keep_all_rows=False: the inner join leaves only rows that have a descriptor row.
    return df.join(descriptors, how="inner")

# --- Main (processes a single file only) ---
# The output file name is derived from the input file's base name.
assay_name = os.path.splitext(os.path.basename(DATA_FILE))[0]
print(f"\n[INFO][P1] Processing single file: {DATA_FILE}")

# 1) Load. Every column is read as a string; empty cells still come back as NaN.
df = pd.read_csv(DATA_FILE, dtype=str, engine="python")

# 2) Check required columns (only SMILES and Sample ID are needed)
required = {"SMILES", "Sample ID"}
if not required.issubset(df.columns):
    raise ValueError(f"Input must have SMILES and Sample ID columns. Found: {list(df.columns)}")

# 3) Tidy the key columns and drop rows with an empty SMILES (count is logged).
#    Missing SMILES cells are NaN here; without fillna("") the astype(str) call
#    would turn them into the literal string "nan", which passes the length
#    filter below, is not counted in the log, and gets handed to RDKit.
df["SMILES"] = df["SMILES"].fillna("").astype(str).str.strip()
df["Sample ID"] = df["Sample ID"].astype(str).str.strip()
before = len(df)
df = df[df["SMILES"].str.len() > 0]
removed_empty = before - len(df)
if removed_empty > 0:
    print(f"  -> Removed {removed_empty} rows with empty SMILES.")

# 4) Compute descriptors; keep_all_rows=False keeps only rows RDKit could parse.
df_with_desc = add_rdkit_2d_descriptors(df, smiles_col="SMILES", keep_all_rows=False)

# 5) Save as-is, without any further cleaning (NaN/inf handling happens in Part 2).
out_path = os.path.join(OUT_DIR, f"{assay_name}_with_desc.csv")
df_with_desc.to_csv(out_path, index=False)

print(f"[READY][P1] saved -> {out_path} (rows={len(df_with_desc)}, cols={len(df_with_desc.columns)})")



[INFO][P1] Processing single file: /home/ssm-user/LAIDD/tox21/Data/test_data/test_data.csv


[02:44:05] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 10
[02:44:08] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 10 11 12 13 14


[READY][P1] saved -> /home/ssm-user/LAIDD/tox21/Data/test_data/test_data_with_desc.csv (rows=645, cols=219)


### Part 2: test_data_with_desc.csv 전처리 후 test_data_processed.csv 저장

In [9]:
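# ====== Part 2 (test set, single file): preprocess test_data_with_desc.csv and save test_data_processed.csv
#  - Flow: drop the common-NaN columns first → replace ±inf with the per-column caps saved from training (finite values are not clipped) → drop any column that still contains NaN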

import os
import json
import numpy as np
import pandas as pd

# --- Input/output paths ---
# WITH_DESC_FILE is the Part 1 output; SAVE_BASE is the output path without
# its ".csv" extension (the extension is appended when saving).
OUT_DIR         = "/home/ssm-user/LAIDD/tox21/Data/test_data/"
WITH_DESC_FILE  = os.path.join(OUT_DIR, "test_data_with_desc.csv")
SAVE_BASE       = os.path.join(OUT_DIR, "test_data_processed")
os.makedirs(OUT_DIR, exist_ok=True)

# --- Drop list of common-NaN columns (CSV only) ---
DROP_LIST_CSV   = "/home/ssm-user/LAIDD/tox21/train_data_inf/data_summary/columns_with_nan_in_all_12_assays.csv"

# --- Path to the per-column caps saved during training ---
CLIP_CAPS_JSON  = "/home/ssm-user/LAIDD/tox21/train_data_inf/clip_caps.json"

def load_common_nan_columns_from_csv(csv_path: str) -> set[str]:
    """Read the set of column names to drop from a CSV file.

    The names are taken from the ``column`` field when the CSV has one,
    otherwise from its first column. Missing cells are skipped and the
    remaining values are converted to stripped strings.

    Args:
        csv_path: Path of the drop-list CSV.

    Returns:
        The set of column names. The set is empty when the file does not
        exist or cannot be read; both cases only print a message.
    """
    if not os.path.exists(csv_path):
        print(f"[P2-Test] Drop-list CSV not found: {csv_path} (skip)")
        return set()

    names = set()
    try:
        table = pd.read_csv(csv_path)
        source = "column" if "column" in table.columns else table.columns[0]
        names.update(table[source].dropna().astype(str).str.strip().tolist())
    except Exception as e:
        # Deliberately best-effort: an unreadable drop list means "drop nothing".
        print(f"[WARN] Failed to read CSV drop list: {csv_path} ({e})")
    return names

def load_clip_caps(json_path: str) -> dict:
    """Load the per-feature caps from a JSON object.

    Args:
        json_path: Path of the JSON file mapping feature name to cap value.

    Returns:
        A dict of ``str`` feature name to ``float`` cap. Entries whose value
        cannot be converted with ``float()`` are silently left out.

    Raises:
        FileNotFoundError: If ``json_path`` does not exist.
    """
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"[P2-Test] clip_caps.json not found: {json_path}")
    with open(json_path, "r") as handle:
        raw_caps = json.load(handle)

    def _to_float_or_none(value):
        # None marks "not convertible"; float() itself never returns None.
        try:
            return float(value)
        except Exception:
            return None

    converted = ((str(name), _to_float_or_none(value)) for name, value in raw_caps.items())
    clean = {name: cap for name, cap in converted if cap is not None}
    print(f"[P2-Test] Loaded caps: {len(clean)} features from {json_path}")
    return clean

# Load the Part 1 output. Every column is read as a string here and converted
# to numbers further down.
print(f"\n[INFO][P2-Test] Load with_desc: {WITH_DESC_FILE}")
df = pd.read_csv(WITH_DESC_FILE, dtype=str, engine="python")

# The two key columns must be present; everything else is treated as a feature.
required = {"SMILES", "Sample ID"}
if not required.issubset(df.columns):
    raise ValueError(f"with_desc must have SMILES and Sample ID. Found: {list(df.columns)}")

# Normalise the key columns to stripped strings.
df["SMILES"]    = df["SMILES"].astype(str).str.strip()
df["Sample ID"] = df["Sample ID"].astype(str).str.strip()

# Drop list (may be empty) and the caps saved from training (file is mandatory).
COMMON_NAN_COLS = load_common_nan_columns_from_csv(DROP_LIST_CSV)
CLIP_CAPS       = load_clip_caps(CLIP_CAPS_JSON)

# Step 1: remove the feature columns named in the common-NaN drop list.
non_feat  = {"Sample ID", "SMILES"}
feat_cols = [c for c in df.columns if c not in non_feat]
if COMMON_NAN_COLS:
    drop_cols_common = sorted(set(feat_cols).intersection(COMMON_NAN_COLS))
    if drop_cols_common:
        print(f"  -> Drop common-NaN columns first: {len(drop_cols_common)}")
        df = df.drop(columns=drop_cols_common, errors="ignore")
        feat_cols = [c for c in feat_cols if c not in drop_cols_common]
    else:
        print("  -> No common-NaN columns to drop in this file.")
else:
    print("  -> No common-NaN drop list loaded or list is empty.")

# (Revised) convert to numeric, then unify as float64; unparseable cells become NaN.
X = df[feat_cols].apply(pd.to_numeric, errors="coerce").astype(np.float64)

# Keep only the columns that have a cap defined from training.
# Note: caps_cols is sorted, so X's columns end up in alphabetical order.
caps_cols = sorted(set(feat_cols).intersection(CLIP_CAPS.keys()))
unused_cols = sorted(set(feat_cols) - set(caps_cols))
if unused_cols:
    print(f"  -> Drop columns without train caps ({len(unused_cols)}): "
          f"{unused_cols[:5]}{'...' if len(unused_cols) > 5 else ''}")
X = X[caps_cols]

# Apply caps: substitute ±inf only; finite values are deliberately left
# untouched (no np.clip).
for col in caps_cols:
    cap = CLIP_CAPS[col]
    vals = X[col].to_numpy()  # float64
    # Build a fresh array instead of writing into the array returned by
    # to_numpy(): that array can be a read-only view of the frame's data
    # (pandas Copy-on-Write), in which case an in-place np.copyto raises.
    # +inf -> +cap, -inf -> -cap, everything else (NaN included) unchanged.
    X[col] = np.where(np.isposinf(vals), cap,
                      np.where(np.isneginf(vals), -cap, vals))

# Drop every column that contains at least one NaN.
nan_cols = X.columns[X.isna().any(axis=0)].tolist()
if nan_cols:
    print(f"  -> Drop columns containing NaN ({len(nan_cols)}): "
          f"{nan_cols[:5]}{'...' if len(nan_cols) > 5 else ''}")
    X = X.drop(columns=nan_cols)
else:
    print("  -> No columns contain NaN after capping.")

# Refuse to write a file that has no feature columns at all.
if X.shape[1] == 0:
    raise RuntimeError("No feature columns remain after dropping NaN columns. Check preprocessing settings.")

# Final check for non-finite values (neither NaN nor ±inf may remain).
# NOTE(review): this is an assert, so it is skipped when Python runs with -O.
assert np.isfinite(X.values).all(), "Processed X still has non-finite values (NaN or ±inf)."

# Re-attach the key columns; both sides get a fresh 0..n-1 index so the
# column-wise concat lines rows up by position.
left_part = df.loc[:, ["Sample ID", "SMILES"]].reset_index(drop=True)
X_final   = X.reset_index(drop=True)
df_save   = pd.concat([left_part, X_final], axis=1)
df_save.to_csv(SAVE_BASE + ".csv", index=False)

print(f"[READY][P2-Test] saved -> {SAVE_BASE+'.csv'} "
      f"(rows={len(df_save)}, cols={len(df_save.columns)}) | "
      f"cap_from_train=True, NaN columns dropped={len(nan_cols)}")



[INFO][P2-Test] Load with_desc: /home/ssm-user/LAIDD/tox21/Data/test_data/test_data_with_desc.csv
[P2-Test] Loaded caps: 205 features from /home/ssm-user/LAIDD/tox21/train_data_inf/clip_caps.json
  -> Drop common-NaN columns first: 12
  -> No columns contain NaN after capping.
[READY][P2-Test] saved -> /home/ssm-user/LAIDD/tox21/Data/test_data/test_data_processed.csv (rows=645, cols=207) | cap_from_train=True, NaN columns dropped=0


In [10]:
X

Unnamed: 0,AvgIpc,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,Chi2n,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,qed
0,3.936855,4.908470e-07,1682.841056,37.329487,31.212127,31.212127,25.440364,19.621438,19.621438,16.370604,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.276843
1,2.969295,1.689999e-06,837.416772,16.353007,12.273696,16.754120,10.871811,6.486421,8.740012,5.050654,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.262347
2,2.942977,1.927142e+00,1104.420013,14.681434,11.835691,11.835691,10.826500,7.423033,7.423033,5.683487,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.540057
3,3.106280,2.253331e-06,700.609974,16.777810,13.491812,14.308309,11.720347,7.823811,7.823811,5.518027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.608509
4,2.544034,2.227488e+00,758.185306,16.872033,12.698674,14.271099,10.919852,7.137723,9.155409,5.276592,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.779959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,1.405639,2.190610e+00,14.364528,4.121320,3.015748,3.910175,2.414214,1.632456,2.264911,0.800767,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.469276
641,2.185557,2.399922e+00,136.152234,11.096012,9.047243,9.047243,7.295555,5.294733,5.294733,3.778366,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.468960
642,1.905400,3.391829e+00,197.454754,5.276021,3.749889,4.566386,3.304530,1.745765,2.154013,1.141591,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.480265
643,3.086707,1.928025e+00,709.994725,15.087576,11.220996,13.549351,9.935071,6.684735,7.848912,5.521094,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.646146


In [2]:
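# ====== Part 2 (test set, single file): preprocess test_data_with_desc.csv and save test_data_processed.csv
#  - Flow: drop the common-NaN columns first → drop any column that contains NaN (no caps are applied in this variant)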

import os
import numpy as np
import pandas as pd

# --- Input/output paths ---
# WITH_DESC_FILE is the Part 1 output; SAVE_BASE is the output path without
# its ".csv" extension (the extension is appended when saving).
OUT_DIR         = "/home/ssm-user/LAIDD/tox21/Data/test_data/"
WITH_DESC_FILE  = os.path.join(OUT_DIR, "test_data_with_desc.csv")
SAVE_BASE       = os.path.join(OUT_DIR, "test_data_processed")
os.makedirs(OUT_DIR, exist_ok=True)

# --- Drop list of common-NaN columns (CSV only) ---
DROP_LIST_CSV   = "/home/ssm-user/LAIDD/tox21/train_data_inf/data_summary/columns_with_nan_in_all_12_assays.csv"

def load_common_nan_columns_from_csv(csv_path: str) -> set[str]:
    """Read the set of column names to drop from a CSV file.

    The names are taken from the ``column`` field when the CSV has one,
    otherwise from its first column. Missing cells are skipped and the
    remaining values are converted to stripped strings.

    Args:
        csv_path: Path of the drop-list CSV.

    Returns:
        The set of column names. The set is empty when the file does not
        exist or cannot be read; both cases only print a message.
    """
    if not os.path.exists(csv_path):
        print(f"[P2-Test] Drop-list CSV not found: {csv_path} (skip)")
        return set()

    names = set()
    try:
        table = pd.read_csv(csv_path)
        source = "column" if "column" in table.columns else table.columns[0]
        names.update(table[source].dropna().astype(str).str.strip().tolist())
    except Exception as e:
        # Deliberately best-effort: an unreadable drop list means "drop nothing".
        print(f"[WARN] Failed to read CSV drop list: {csv_path} ({e})")
    return names

# Load the Part 1 output. Every column is read as a string here and converted
# to numbers further down.
print(f"\n[INFO][P2-Test] Load with_desc: {WITH_DESC_FILE}")
df = pd.read_csv(WITH_DESC_FILE, dtype=str, engine="python")

# The two key columns must be present; everything else is treated as a feature.
required = {"SMILES", "Sample ID"}
if not required.issubset(df.columns):
    raise ValueError(f"with_desc must have SMILES and Sample ID. Found: {list(df.columns)}")

# Normalise the key columns to stripped strings.
df["SMILES"]    = df["SMILES"].astype(str).str.strip()
df["Sample ID"] = df["Sample ID"].astype(str).str.strip()

COMMON_NAN_COLS = load_common_nan_columns_from_csv(DROP_LIST_CSV)

# Step 1: remove the feature columns named in the common-NaN drop list.
non_feat  = {"Sample ID", "SMILES"}
feat_cols = [c for c in df.columns if c not in non_feat]
if COMMON_NAN_COLS:
    drop_cols_common = sorted(set(feat_cols).intersection(COMMON_NAN_COLS))
    if drop_cols_common:
        print(f"  -> Drop common-NaN columns first: {len(drop_cols_common)}")
        df = df.drop(columns=drop_cols_common, errors="ignore")
        feat_cols = [c for c in feat_cols if c not in drop_cols_common]
    else:
        print("  -> No common-NaN columns to drop in this file.")
else:
    print("  -> No common-NaN drop list loaded or list is empty.")

# (Revised) convert to numeric, then unify as float64; unparseable cells become NaN.
X = df[feat_cols].apply(pd.to_numeric, errors="coerce").astype(np.float64)

# Drop every column that contains at least one NaN.
nan_cols = X.columns[X.isna().any(axis=0)].tolist()
if nan_cols:
    print(f"  -> Drop columns containing NaN ({len(nan_cols)}): "
          f"{nan_cols[:5]}{'...' if len(nan_cols) > 5 else ''}")
    X = X.drop(columns=nan_cols)
else:
    print("  -> No columns contain NaN.")

# Refuse to write a file that has no feature columns at all.
if X.shape[1] == 0:
    raise RuntimeError("No feature columns remain after dropping NaN columns. Check preprocessing settings.")

# Final check for non-finite values (neither NaN nor ±inf may remain).
# This variant never substitutes ±inf, so any infinite descriptor value makes
# this assertion fail.
# NOTE(review): this is an assert, so it is skipped when Python runs with -O.
assert np.isfinite(X.values).all(), "Processed X still has non-finite values (NaN or ±inf)."

# Re-attach the key columns; both sides get a fresh 0..n-1 index so the
# column-wise concat lines rows up by position.
left_part = df.loc[:, ["Sample ID", "SMILES"]].reset_index(drop=True)
X_final   = X.reset_index(drop=True)
df_save   = pd.concat([left_part, X_final], axis=1)
df_save.to_csv(SAVE_BASE + ".csv", index=False)

print(f"[READY][P2-Test] saved -> {SAVE_BASE+'.csv'} "
      f"(rows={len(df_save)}, cols={len(df_save.columns)}) | "
      f"NaN columns dropped={len(nan_cols)}")



[INFO][P2-Test] Load with_desc: /home/ssm-user/LAIDD/tox21/Data/test_data/test_data_with_desc.csv
  -> Drop common-NaN columns first: 12
  -> No columns contain NaN.
[READY][P2-Test] saved -> /home/ssm-user/LAIDD/tox21/Data/test_data/test_data_processed.csv (rows=645, cols=207) | NaN columns dropped=0


### Part 3: test

In [12]:
import os
import json
import numpy as np
import pandas as pd
import joblib

# ===================== Path settings =====================
# The fitted model and the assay-name list are both read from MODEL_DIR;
# the predictions are written back into the same directory.
MODEL_DIR   = "/home/ssm-user/LAIDD/tox21/Results_imputer_inf"
MODEL_PATH  = os.path.join(MODEL_DIR, "multilabel_rf_best.joblib")
ASSAYS_PATH = os.path.join(MODEL_DIR, "assays.json")

TEST_PATH   = "/home/ssm-user/LAIDD/tox21/Data/test_data/test_data_processed.csv"  # already preprocessed (Part 2 output)
TEST_OUT    = os.path.join(MODEL_DIR, "2D_predictions.csv")                  # output intended for stacking

# ===================== 유틸 =====================
def load_assays(path: str):
    """Load the assay names from a JSON file.

    Args:
        path: Path of the JSON file.

    Returns:
        The decoded JSON value, which is guaranteed to be a non-empty list.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the JSON content is not a list or is an empty list.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"assays.json not found: {path}")
    with open(path, "r") as handle:
        loaded = json.load(handle)
    if isinstance(loaded, list) and loaded:
        return loaded
    raise ValueError("assays.json is not a non-empty list.")

def get_training_feature_names(pipe) -> list:
    """Recover the training feature names, in order, from a fitted pipeline.

    Lookup order:
      1. ``feature_names_in_`` of the step registered as ``"imp"`` in
         ``pipe.named_steps``, when that step exists and has the attribute.
      2. ``feature_names_in_`` of the pipeline object itself.

    Args:
        pipe: The fitted pipeline (or any object exposing the attributes above).

    Returns:
        The feature names as a list.

    Raises:
        RuntimeError: If neither location provides ``feature_names_in_``.
    """
    holders = []
    if hasattr(pipe, "named_steps") and "imp" in pipe.named_steps:
        holders.append(pipe.named_steps["imp"])
    holders.append(pipe)

    # The first holder that exposes the attribute wins.
    for holder in holders:
        if hasattr(holder, "feature_names_in_"):
            return list(holder.feature_names_in_)

    raise RuntimeError(
        "Cannot find training feature names in the fitted pipeline. "
        "Ensure the model was trained with a pandas DataFrame so steps expose 'feature_names_in_'."
    )

# ===================== 메인 =====================
# Inference entry point: load the fitted pipeline, align the preprocessed test
# features to the training schema, predict per-assay probabilities and save
# them next to the model.
if __name__ == "__main__":
    print("\n[TEST] Loading model & assays ...")
    # NOTE(review): joblib.load unpickles the file; only load model files from a trusted source.
    pipe = joblib.load(MODEL_PATH)                       # per the original author: Pipeline(SimpleImputer + MultiOutput(RandomForest))
    assays = load_assays(ASSAYS_PATH)
    expected_features = get_training_feature_names(pipe)
    print(f"[TEST] #features expected by model: {len(expected_features)}")

    # (Optional) log the feature count seen by the first fitted estimator of
    # the "clf" step; skipped silently if the pipeline is shaped differently.
    try:
        n_expected_internal = pipe.named_steps["clf"].estimators_[0].n_features_in_
        print(f"[TEST] Model internal n_features_in_: {n_expected_internal}")
    except Exception:
        pass

    print(f"[TEST] Loading test data: {TEST_PATH}")
    test_df = pd.read_csv(TEST_PATH, low_memory=False,
                          dtype={"Sample ID":"string","SMILES":"string"})
    # Tidy the key columns
    test_df["Sample ID"] = test_df["Sample ID"].astype("string").str.strip()
    test_df["SMILES"]    = test_df["SMILES"].astype("string").str.strip()

    # Separate the feature part: every column that is not a key/label column.
    non_feat = {"Sample ID", "SMILES", "toxicity"}
    test_feat_cols = [c for c in test_df.columns if c not in non_feat]
    X_test = test_df[test_feat_cols].copy()

    # Numeric conversion plus a safety net (±inf -> NaN)
    X_test = X_test.apply(pd.to_numeric, errors="coerce")
    X_test = X_test.replace([np.inf, -np.inf], np.nan).astype("float64")

    # Align to the training schema: missing columns are created as NaN,
    # extra columns are dropped (both done by the reindex below).
    exp_set = set(expected_features)
    cur_set = set(X_test.columns)
    missing = sorted(exp_set - cur_set)
    extra   = sorted(cur_set - exp_set)

    if missing:
        print(f"[TEST][WARN] Missing {len(missing)} feature(s) in test "
              f"(filled as NaN). e.g., {missing[:5]}{'...' if len(missing)>5 else ''}")
    if extra:
        print(f"[TEST] Dropping {len(extra)} unexpected test feature(s). "
              f"e.g., {extra[:5]}{'...' if len(extra)>5 else ''}")

    X_test = X_test.reindex(columns=expected_features)

    assert list(X_test.columns) == list(expected_features)  # exactly the training features, same order

    # ===== Prediction (whole pipeline is called; per the original author: imputer -> classifier) =====
    print("[TEST] Predicting probabilities ...")
    proba_out = pipe.predict_proba(X_test)  # expected: list of (n,2) arrays, or a single (n,2) ndarray
    proba_list = proba_out if isinstance(proba_out, list) else [proba_out]
    # Column index 1 of each output is taken as that assay's probability.
    # NOTE(review): this assumes every output has at least two columns; an
    # output with a single column would raise IndexError here.
    prob_mat = np.column_stack([p[:, 1] for p in proba_list]).astype(np.float32)

    # ===== Save: Sample ID, SMILES + one probability column per assay =====
    # The number of entries in `assays` must equal the number of columns in prob_mat.
    out_df = pd.concat(
        [test_df.loc[:, ["Sample ID", "SMILES"]].reset_index(drop=True),
         pd.DataFrame(prob_mat, columns=assays)],
        axis=1
    )
    os.makedirs(os.path.dirname(TEST_OUT), exist_ok=True)
    out_df.to_csv(TEST_OUT, index=False)
    print(f"[READY] Saved stacking-ready test probabilities -> {TEST_OUT}")
    print(f"        rows={len(out_df)}, assays={len(assays)}")



[TEST] Loading model & assays ...
[TEST] #features expected by model: 205
[TEST] Model internal n_features_in_: 205
[TEST] Loading test data: /home/ssm-user/LAIDD/tox21/Data/test_data/test_data_processed.csv
[TEST] Predicting probabilities ...
[READY] Saved stacking-ready test probabilities -> /home/ssm-user/LAIDD/tox21/Results_imputer_inf/2D_predictions.csv
        rows=645, assays=12
