In [3]:
from rdkit import Chem
from rdkit.Chem import AllChem

In [4]:
import os
import sys

import numpy as np
import pandas as pd
from rdkit.Chem import AllChem,DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors

In [31]:
folder_path = r"C:\Users\re\Desktop\data\dataset\fenli\huan_RS"

# Column layout of the feature table. Most columns hold one scalar descriptor;
# the three fingerprint columns hold a NumPy bit array per cell.
columns = [
    'Analyte_Molecular Weight',
    'Analyte_heavy_atom_count',
    'Analyte_Num Aliphatic Carbocycles',
    'Analyte_Num Aliphatic Heterocycles',
    'Analyte_Num Aliphatic Rings',
    'Analyte_Ring Count',
    'Analyte_Num Valence Electrons',
    'Analyte_Labute ASA',
    'Analyte_LogP',
    'Analyte_Number of Hydrogen Acceptors',
    'Analyte_Number of Rotatable Bonds',
    'Analyte_Topological Polar Surface Area',
    'Analyte_chi0',
    'Analyte_chi1',
    'Analyte_kappa1',
    'Analyte_kappa2',
    'Analyte_kappa3',
    'Analyte_HallKierAlpha',
    'Analyte_maccs_fingerprint',
    'Analyte_topological_torsion',
    'Analyte_Morgan Fingerprint',
    'Fsp3',
]
df = pd.DataFrame(columns=columns)


def _fingerprint_to_array(fingerprint, dtype):
    """Copy an RDKit bit-vector fingerprint into a 1-D NumPy array.

    Args:
        fingerprint: bit vector accepted by ``DataStructs.ConvertToNumpyArray``.
        dtype: NumPy dtype of the returned array.

    Returns:
        The array filled by ``DataStructs.ConvertToNumpyArray``.
    """
    # ConvertToNumpyArray is expected to resize the destination to the
    # fingerprint length, so the initial size does not matter.
    bits = np.zeros((0,), dtype=dtype)
    DataStructs.ConvertToNumpyArray(fingerprint, bits)
    return bits


# Process every .sdf file in the folder; each file contributes one row that is
# indexed by the file name without its extension.
for filename in os.listdir(folder_path):
    if not filename.endswith(".sdf"):
        continue

    file_path = os.path.join(folder_path, filename)
    # Read without sanitization.
    # NOTE(review): all descriptors below are therefore computed on an
    # unsanitized molecule -- confirm this is intended.
    mol = Chem.MolFromMolFile(file_path, sanitize=False)

    if mol is None:
        print(f"Failed to load molecule from {filename}")
        continue

    # With sanitization skipped, the property cache and ring information have
    # to be initialised by hand before the descriptors are computed.
    mol.UpdatePropertyCache(strict=False)
    Chem.GetSymmSSSR(mol)

    data = {}

    # Fraction of sp3 carbons.
    data['Fsp3'] = rdMolDescriptors.CalcFractionCSP3(mol)
    # Molecular weight and heavy-atom count.
    data['Analyte_Molecular Weight'] = Descriptors.MolWt(mol)
    data['Analyte_heavy_atom_count'] = Descriptors.HeavyAtomCount(mol)
    # Ring counts: aliphatic carbocycles, aliphatic heterocycles, all
    # aliphatic rings, all rings.
    data['Analyte_Num Aliphatic Carbocycles'] = Descriptors.NumAliphaticCarbocycles(mol)
    data['Analyte_Num Aliphatic Heterocycles'] = Descriptors.NumAliphaticHeterocycles(mol)
    data['Analyte_Num Aliphatic Rings'] = Descriptors.NumAliphaticRings(mol)
    data['Analyte_Ring Count'] = Descriptors.RingCount(mol)
    # Number of valence electrons.
    data['Analyte_Num Valence Electrons'] = Descriptors.NumValenceElectrons(mol)
    # Labute approximate surface area.
    data['Analyte_Labute ASA'] = rdMolDescriptors.CalcLabuteASA(mol)
    # LogP.
    data['Analyte_LogP'] = Descriptors.MolLogP(mol)
    # Hydrogen-bond acceptors and rotatable bonds.
    data['Analyte_Number of Hydrogen Acceptors'] = Descriptors.NumHAcceptors(mol)
    data['Analyte_Number of Rotatable Bonds'] = Descriptors.NumRotatableBonds(mol)
    # Topological polar surface area.
    data['Analyte_Topological Polar Surface Area'] = rdMolDescriptors.CalcTPSA(mol)
    # Connectivity indices.
    data['Analyte_chi0'] = Descriptors.Chi0(mol)
    data['Analyte_chi1'] = Descriptors.Chi1(mol)
    # Kappa shape descriptors.
    data['Analyte_kappa1'] = Descriptors.Kappa1(mol)
    data['Analyte_kappa2'] = Descriptors.Kappa2(mol)
    data['Analyte_kappa3'] = Descriptors.Kappa3(mol)
    # Hall-Kier alpha.
    data['Analyte_HallKierAlpha'] = Descriptors.HallKierAlpha(mol)

    # MACCS keys fingerprint.
    fp_maccs = MACCSkeys.GenMACCSKeys(mol)
    data['Analyte_maccs_fingerprint'] = _fingerprint_to_array(fp_maccs, np.int8)

    # Morgan fingerprint (radius 2, 1024 bits).
    fp_Morgan = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
    data['Analyte_Morgan Fingerprint'] = _fingerprint_to_array(fp_Morgan, np.int8)

    # Hashed topological-torsion fingerprint (library default length).
    fp_top = AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(mol)
    data['Analyte_topological_torsion'] = _fingerprint_to_array(fp_top, int)

    # The dict is aligned to the frame's columns by key.
    df.loc[filename[:-4]] = data


# Save the feature table. The fingerprint cells are NumPy arrays, which the CSV
# writer turns into text with str(); by default NumPy abbreviates arrays longer
# than 1000 elements with "...", which would silently drop most of the bits of
# the 1024-bit Morgan fingerprint. Raising the threshold writes every bit while
# keeping the same bracketed text format.
with np.printoptions(threshold=sys.maxsize):
    df.to_csv('fenli_huan_features_01.csv')




In [71]:
# Input folder for this cell.
folder_path = r"C:/Users/re/Desktop/data/dataset/fenli/R_S"

# Start from an empty table with one PBF column per enantiomer label.
columns = [
    'R_PBF',
    'S_PBF',
]
df = pd.DataFrame(data=None, columns=columns)

def calc_stable_pbf(mol):
    """Compute the PBF descriptor on a freshly generated, force-field-relaxed 3D conformer.

    Hydrogens are added, a conformer is embedded with ETKDGv3 using a fixed
    random seed (42), the geometry is relaxed with MMFF (at most 1000
    iterations) and ``rdMolDescriptors.CalcPBF`` is evaluated on the result.

    Args:
        mol: RDKit molecule. The local name is rebound to the result of
            ``Chem.AddHs``; all later steps operate on that result.

    Returns:
        The value returned by ``rdMolDescriptors.CalcPBF``.

    Raises:
        ValueError: if no 3D conformer could be embedded.
    """
    # 1. Add explicit hydrogens so every input is treated the same way.
    mol = Chem.AddHs(mol)

    # 2. Embed a 3D conformer with a fixed seed.
    params = AllChem.ETKDGv3()
    params.useRandomCoords = False
    params.randomSeed = 42
    if AllChem.EmbedMolecule(mol, params) < 0:
        # A negative return value means embedding failed and no conformer was
        # stored. Retry once from random starting coordinates (same seed)
        # before giving up, so the failure is reported here rather than as an
        # opaque conformer error from the calls below.
        params.useRandomCoords = True
        if AllChem.EmbedMolecule(mol, params) < 0:
            raise ValueError("3D embedding failed; cannot compute PBF")

    # 3. Relax the geometry with MMFF.
    # NOTE(review): the status returned by MMFFOptimizeMolecule is not
    # inspected, so a non-converged or skipped optimisation goes unnoticed.
    AllChem.MMFFOptimizeMolecule(mol, maxIters=1000)

    # 4. Compute PBF on the resulting conformer.
    return rdMolDescriptors.CalcPBF(mol)

# Walk every file in the folder and collect the PBF of each enantiomer.
# File names are expected to look like "R-<compound>.<ext>" / "S-<compound>.<ext>";
# both files of a pair end up in the same row, keyed by <compound>.
#
# Values are gathered in a dict first and the DataFrame is built once at the
# end. This makes the result independent of the order in which the R- and S-
# files are listed (the S- file no longer needs an existing R- row) and avoids
# growing the frame row by row.
i = 0  # number of files RDKit could not parse
pbf_by_compound = {}
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    # Read without sanitization.
    mol = Chem.MolFromMolFile(file_path, sanitize=False)

    if mol is None:
        print(f"Failed to load molecule from {filename}")
        i += 1
        continue

    prefix = filename[:2]
    if prefix == "R-":
        target_column = 'R_PBF'
    elif prefix == "S-":
        target_column = 'S_PBF'
    else:
        # Neither enantiomer label: there is no column for this value, and
        # writing an empty row could blank out an existing compound.
        print(f"Skipping {filename}: expected an 'R-' or 'S-' prefix")
        continue

    # With sanitization skipped, the property cache and ring information have
    # to be initialised by hand.
    mol.UpdatePropertyCache(strict=False)
    Chem.GetSymmSSSR(mol)

    # Row key: file name without the enantiomer prefix and without extension.
    compound = os.path.splitext(filename)[0][2:]
    row = pbf_by_compound.setdefault(compound, dict.fromkeys(columns, np.nan))
    row[target_column] = calc_stable_pbf(mol)

# Rows keep the order in which each compound was first encountered.
df = pd.DataFrame(
    list(pbf_by_compound.values()),
    index=list(pbf_by_compound),
    columns=columns,
)

# Save the table to CSV and display it.
df.to_csv('PBF_features_01.csv')
df



Unnamed: 0,R_PBF,S_PBF
2-FMC,0.501361,0.788127
2-PPA,0.890121,0.788839
"3,4-DMMC",0.79668,0.902586
3-FA,0.699611,0.734711
3-FMC,0.665592,0.748836
4-BMC (2),0.782734,0.685445
4-BMC,0.685437,0.703179
4-Chloromandelic_acid,0.854154,0.7952
4-EMC,0.77521,0.775088
4-FA,0.640175,0.763423


In [5]:
# Input folder for this cell.
folder_path = r"C:/Users/re/Desktop/data/dataset/fenli/R_S"

# Start from an empty table with a single PBF column.
columns = [
    'PBF',
]
df = pd.DataFrame(data=None, columns=columns)

def calc_stable_pbf(mol):
    """Compute the PBF descriptor on a freshly generated, force-field-relaxed 3D conformer.

    Hydrogens are added, a conformer is embedded with ETKDGv3 using a fixed
    random seed (42), the geometry is relaxed with MMFF (at most 1000
    iterations) and ``rdMolDescriptors.CalcPBF`` is evaluated on the result.

    Args:
        mol: RDKit molecule. The local name is rebound to the result of
            ``Chem.AddHs``; all later steps operate on that result.

    Returns:
        The value returned by ``rdMolDescriptors.CalcPBF``.

    Raises:
        ValueError: if no 3D conformer could be embedded.
    """
    # 1. Add explicit hydrogens so every input is treated the same way.
    mol = Chem.AddHs(mol)

    # 2. Embed a 3D conformer with a fixed seed.
    params = AllChem.ETKDGv3()
    params.useRandomCoords = False
    params.randomSeed = 42
    if AllChem.EmbedMolecule(mol, params) < 0:
        # A negative return value means embedding failed and no conformer was
        # stored. Retry once from random starting coordinates (same seed)
        # before giving up, so the failure is reported here rather than as an
        # opaque conformer error from the calls below.
        params.useRandomCoords = True
        if AllChem.EmbedMolecule(mol, params) < 0:
            raise ValueError("3D embedding failed; cannot compute PBF")

    # 3. Relax the geometry with MMFF.
    # NOTE(review): the status returned by MMFFOptimizeMolecule is not
    # inspected, so a non-converged or skipped optimisation goes unnoticed.
    AllChem.MMFFOptimizeMolecule(mol, maxIters=1000)

    # 4. Compute PBF on the resulting conformer.
    return rdMolDescriptors.CalcPBF(mol)

# Visit every entry of the folder (no extension filter is applied) and store
# one PBF value per file, indexed by the file name minus its last four
# characters.
i = 0  # incremented for each file that could not be parsed
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    # Read without sanitization.
    mol = Chem.MolFromMolFile(file_path, sanitize=False)

    if mol is not None:
        # Initialise the property cache and ring information by hand.
        mol.UpdatePropertyCache(strict=False)
        Chem.GetSymmSSSR(mol)

        # Compute the descriptor and write it as this file's row.
        cal_pbf = calc_stable_pbf(mol)
        data = {'PBF': cal_pbf}
        df.loc[filename[:-4]] = data
    else:
        print(f"Failed to load molecule from {filename}")
        i += 1

# Save the table to CSV and display it.
df.to_csv('PBF_features_02.csv')
df



Unnamed: 0,PBF
R-2-FMC,0.501361
R-2-PPA,0.890121
"R-3,4-DMMC",0.796680
R-3-FA,0.699611
R-3-FMC,0.665592
...,...
S-Pioglitazone,0.986092
S-PLA,0.622961
S-Terazosin,0.868995
S-Tirofiban,1.002929
