In [4]:
# build_ecfp4.py
import argparse
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

def ecfp4_bits(smiles: str, n_bits: int = 2048, use_chirality: bool = True):
    """Compute the ECFP4 (Morgan, radius 2) bit fingerprint of a SMILES string.

    Args:
        smiles: SMILES string describing the molecule.
        n_bits: Length of the folded bit vector.
        use_chirality: Whether chirality is encoded in the fingerprint.

    Returns:
        A ``(bits, canonical_smiles)`` tuple. ``bits`` is a ``np.uint8`` array
        of shape ``(n_bits,)`` holding the fingerprint bits and
        ``canonical_smiles`` is the canonical SMILES RDKit writes for the
        parsed molecule. ``(None, None)`` is returned when ``smiles`` is not a
        non-empty string or when RDKit cannot parse it.
    """
    # Reject missing values (None, NaN, ...) and blank strings up front.
    # A non-string would make RDKit raise, and an empty SMILES parses into an
    # atom-less molecule whose all-zero fingerprint would look like a success.
    if not isinstance(smiles, str) or not smiles.strip():
        return None, None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None, None

    # radius=2 corresponds to ECFP4 (diameter 4); useFeatures=False keeps the
    # atom-identity (ECFP) variant rather than the feature-based (FCFP) one.
    # NOTE(review): newer RDKit releases reportedly deprecate this call in
    # favour of rdFingerprintGenerator -- confirm against the installed version.
    bv = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius=2, nBits=n_bits, useChirality=use_chirality, useFeatures=False
    )
    # ConvertToNumpyArray fills the destination array in place.
    arr = np.zeros((n_bits,), dtype=np.uint8)
    DataStructs.ConvertToNumpyArray(bv, arr)
    cano = Chem.MolToSmiles(mol, canonical=True)
    return arr, cano

def main():
    """Build the ECFP4 fingerprint matrix for every drug in the input table.

    Reads a two-column table (SMILES, drug name), removes rows with missing or
    blank fields, keeps the first row per drug name, and computes one
    fingerprint per remaining row with ``ecfp4_bits``.

    Outputs:
        * ``out_npy``: ``uint8`` matrix of shape ``(n_drugs, bits)``. A row
          whose SMILES could not be parsed stays all-zero.
        * ``out_csv``: one record per matrix row with its index, drug name,
          input SMILES, canonical SMILES, and an ``ok`` flag (1 = parsed,
          0 = failed).
    """
    infile = "Drug_SMILE.txt"      # input file: two columns SMILES<TAB>drug_name (no header)
    out_npy      = "ECFP4_2048.npy"      # output fingerprint matrix (.npy)
    out_csv      = "ECFP4_2048_map.csv"  # row-index mapping / log (.csv)
    bits         = 2048                  # number of ECFP4 bits
    use_chirality= True                  # whether to encode chirality
    sep          = "\t"                  # column separator
    has_header   = False

    # Load the table.
    if has_header:
        df = pd.read_csv(infile, sep=sep)
        if not {"smiles","drug_name"}.issubset(df.columns):
            raise KeyError("Header mode requires columns: smiles, drug_name")
        df = df[["smiles","drug_name"]]
    else:
        df = pd.read_csv(infile, sep=sep, header=None, names=["smiles","drug_name"])

    # Drop missing values BEFORE casting to str: astype(str) turns NaN into the
    # literal text "nan", after which dropna() can no longer detect it.
    df = df.dropna(subset=["smiles","drug_name"])
    df = df.assign(
        smiles=df["smiles"].astype(str).str.strip(),
        drug_name=df["drug_name"].astype(str).str.strip(),
    )
    # Whitespace-only fields are empty after stripping; treat them as missing too.
    df = df[(df["smiles"] != "") & (df["drug_name"] != "")]
    df = df.drop_duplicates(subset=["drug_name"]).reset_index(drop=True)

    n = len(df)
    X = np.zeros((n, bits), dtype=np.uint8)
    rows = []
    ok = 0

    # Row i of X always corresponds to record i of the map, including failures,
    # so the "index" column can be used to look up a drug's fingerprint.
    for i, (smi, name) in enumerate(zip(df["smiles"], df["drug_name"])):
        fp, cano = ecfp4_bits(smi, n_bits=bits, use_chirality=use_chirality)
        if fp is not None:
            X[i] = fp
            ok += 1
            rows.append({"index": i, "drug_name": name, "smiles": smi, "canonical_smiles": cano, "ok": 1})
        else:
            # Unparseable SMILES: keep the all-zero row and flag it in the map.
            rows.append({"index": i, "drug_name": name, "smiles": smi, "canonical_smiles": "", "ok": 0})

    np.save(out_npy, X)
    pd.DataFrame(rows).to_csv(out_csv, index=False, encoding="utf-8")
    print(f"Saved matrix: {out_npy} shape={X.shape}, success={ok}/{n}")
    print(f"Saved map   : {out_csv}")

# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()



Saved matrix: ECFP4_2048.npy shape=(4453, 2048), success=4453/4453
Saved map   : ECFP4_2048_map.csv


In [10]:
import pickle
# Load the pickled KPGT data from disk.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("../Process/cls/KPGT_emb2304.pickle", "rb") as f:
    kpgt_data = pickle.load(f)

# Peek at the first (key, value) entry. Judging by the printed sample, keys are
# SMILES strings and values are embedding vectors -- TODO confirm for all entries.
k, v = next(iter(kpgt_data.items()))
print(k,v)

# Compute the ECFP4 bits for the same key; the canonical SMILES is discarded.
# new_v is None if the key cannot be parsed, and is not displayed or used in this cell.
new_v,_ = ecfp4_bits(k)


BrC1C(Br)C(Br)C(Br)C(Br)C1Br [ 0.9660427  4.656059  -1.6086305 ...  2.2268302 -1.0574567 -1.6837476]
