In [1]:
import os
import pandas as pd
import numpy as np
from rdkit.Chem import AllChem,DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors


In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem

In [4]:
# Folder that is scanned for .sdf files (one feature row is produced per file).
folder_path = r"C:\Users\re\Desktop\data\dataset\tianjia\huan"

# Column order of the resulting feature table.
columns = [
    'CMPA_Molecular Weight',
    'CMPA_heavy_atom_count',
    'CMPA_Num Aliphatic Carbocycles',
    'CMPA_Num Aliphatic Heterocycles',
    'CMPA_Num Aliphatic Rings',
    'CMPA_Ring Count',
    'CMPA_Num Valence Electrons',
    'CMPA_Labute ASA',
    'CMPA_LogP',
    'CMPA_Number of Hydrogen Acceptors',
    'CMPA_Number of Rotatable Bonds',
    'CMPA_Topological Polar Surface Area',
    'CMPA_chi0',
    'CMPA_chi1',
    'CMPA_kappa1',
    'CMPA_kappa2',
    'CMPA_kappa3',
    'CMPA_HallKierAlpha',
    'CMPA_maccs_fingerprint',
    'CMPA_topological_torsion',
    'CMPA_Morgan Fingerprint',
]

# Columns whose cells hold a whole bit-vector fingerprint instead of a scalar.
fingerprint_columns = [
    'CMPA_maccs_fingerprint',
    'CMPA_topological_torsion',
    'CMPA_Morgan Fingerprint',
]


def _fingerprint_to_array(fingerprint, dtype):
    """Return the bits of an RDKit bit vector as a 1-D NumPy array of ``dtype``.

    ``DataStructs.ConvertToNumpyArray`` resizes the destination array in place
    to the fingerprint's length, so an empty array is a sufficient target; no
    length has to be chosen up front.
    """
    bits = np.zeros((0,), dtype=dtype)
    DataStructs.ConvertToNumpyArray(fingerprint, bits)
    return bits


def _compute_features(mol):
    """Compute every descriptor and fingerprint listed in ``columns`` for ``mol``.

    ``mol`` must already have its property cache and ring information
    initialised (the caller does this because molecules are read unsanitized).
    Returns a dict keyed by the names in ``columns``.
    """
    return {
        # Molecular weight and heavy-atom count.
        'CMPA_Molecular Weight': Descriptors.MolWt(mol),
        'CMPA_heavy_atom_count': Descriptors.HeavyAtomCount(mol),
        # Ring statistics: aliphatic carbocycles / heterocycles / rings, all rings.
        'CMPA_Num Aliphatic Carbocycles': Descriptors.NumAliphaticCarbocycles(mol),
        'CMPA_Num Aliphatic Heterocycles': Descriptors.NumAliphaticHeterocycles(mol),
        'CMPA_Num Aliphatic Rings': Descriptors.NumAliphaticRings(mol),
        'CMPA_Ring Count': Descriptors.RingCount(mol),
        # Valence electron count.
        'CMPA_Num Valence Electrons': Descriptors.NumValenceElectrons(mol),
        # Labute approximate surface area.
        'CMPA_Labute ASA': rdMolDescriptors.CalcLabuteASA(mol),
        # LogP.
        'CMPA_LogP': Descriptors.MolLogP(mol),
        # Hydrogen-bond acceptors and rotatable bonds.
        'CMPA_Number of Hydrogen Acceptors': Descriptors.NumHAcceptors(mol),
        'CMPA_Number of Rotatable Bonds': Descriptors.NumRotatableBonds(mol),
        # Topological polar surface area.
        'CMPA_Topological Polar Surface Area': rdMolDescriptors.CalcTPSA(mol),
        # Connectivity indices.
        'CMPA_chi0': Descriptors.Chi0(mol),
        'CMPA_chi1': Descriptors.Chi1(mol),
        # Kappa shape descriptors.
        'CMPA_kappa1': Descriptors.Kappa1(mol),
        'CMPA_kappa2': Descriptors.Kappa2(mol),
        'CMPA_kappa3': Descriptors.Kappa3(mol),
        # Hall-Kier alpha.
        'CMPA_HallKierAlpha': Descriptors.HallKierAlpha(mol),
        # MACCS keys fingerprint.
        'CMPA_maccs_fingerprint': _fingerprint_to_array(
            MACCSkeys.GenMACCSKeys(mol), np.int8
        ),
        # Hashed topological-torsion fingerprint (RDKit default size).
        'CMPA_topological_torsion': _fingerprint_to_array(
            AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(mol), int
        ),
        # Morgan fingerprint, radius 2, 1024 bits.
        'CMPA_Morgan Fingerprint': _fingerprint_to_array(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024), np.int8
        ),
    }


# Walk every SDF file in the folder. The listing is sorted so the row order is
# reproducible regardless of the order the file system reports entries in.
records = []
sample_names = []
for filename in sorted(os.listdir(folder_path)):
    # Case-insensitive so that e.g. "X.SDF" is not silently skipped.
    if not filename.lower().endswith(".sdf"):
        continue

    file_path = os.path.join(folder_path, filename)
    # Read the molecule without sanitization.
    # NOTE(review): all descriptors below are computed on unsanitized
    # molecules - confirm this is intended for every input file.
    mol = Chem.MolFromMolFile(file_path, sanitize=False)
    if mol is None:
        print(f"Failed to load molecule from {filename}")
        continue

    # Sanitization was skipped, so initialise the property cache and the ring
    # information by hand before any descriptor is computed.
    mol.UpdatePropertyCache(strict=False)
    Chem.GetSymmSSSR(mol)

    data = _compute_features(mol)
    records.append(data)
    sample_names.append(filename[:-4])  # strip the ".sdf" suffix

# Build the table once from all records instead of enlarging it row by row.
df = pd.DataFrame(records, index=sample_names, columns=columns)

# Save the features to CSV. Fingerprints are written as plain Python lists:
# pandas stringifies each cell, and NumPy abbreviates the string form of any
# array longer than 1000 elements to "[0 0 0 ... 0 0 0]", which would drop
# the bits of the 1024-bit Morgan fingerprint (and of any other fingerprint
# above that size). `df` itself keeps the NumPy arrays.
csv_frame = df.copy()
for fingerprint_column in fingerprint_columns:
    csv_frame[fingerprint_column] = csv_frame[fingerprint_column].map(
        lambda bits: bits.tolist()
    )
csv_frame.to_csv('tianjia_huan_features_01.csv')



In [26]:
# Display the feature table.
# NOTE(review): the saved output below has 'tianjia_'-prefixed columns and an
# 'Unnamed: 0' column, which the feature-extraction cell (it builds 'CMPA_'
# columns) does not produce - presumably `df` was reassigned by a cell that is
# not shown here; confirm with a Restart & Run All before trusting this output.
df

Unnamed: 0,tianjia_Molecular Weight,tianjia_heavy_atom_count,tianjia_Num Aliphatic Carbocycles,tianjia_Num Aliphatic Heterocycles,tianjia_Num Aliphatic Rings,tianjia_Ring Count,tianjia_Num Valence Electrons,tianjia_Labute ASA,tianjia_LogP,tianjia_Number of Hydrogen Acceptors,...,tianjia_Topological Polar Surface Area,tianjia_chi0,tianjia_chi1,tianjia_kappa1,tianjia_kappa2,tianjia_kappa3,tianjia_HallKierAlpha,tianjia_maccs_fingerprint,tianjia_topological_torsion,tianjia_Morgan Fingerprint
B-CD,1134.987,77,0,15,15,15,448,534.091781,-15.2306,35,...,554.05,115.748737,65.423359,18.06718,26.029053,12.05526,-1.4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
CM-B-CD,1541.239,105,0,21,21,21,602,707.796029,-14.4683,49,...,738.15,149.239937,83.818148,28.494708,41.185505,22.713404,-1.96,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
DM-B-CD,1331.365,91,0,15,15,15,532,665.797122,-6.0732,35,...,400.05,150.748737,81.473611,18.57333,35.392845,16.348897,-1.4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
HP-B-CD,1541.547,105,0,15,15,15,616,762.559143,-12.3949,42,...,618.66,173.198485,94.873106,21.885093,41.381888,23.728684,-1.68,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
HP-R-CD,1297.128,88,0,32,32,32,512,610.19511,-17.4064,40,...,633.2,132.284271,74.769553,20.717461,30.000106,13.780189,-1.6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
L-Phe+Cu2+,227.73,13,1,1,2,2,74,97.389225,0.7612,2,...,43.4,17.955665,10.303254,3.46046,4.759513,2.672704,0.399481,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
L-proline+Cu2+,509.126,33,2,2,4,4,188,251.365641,3.7108,5,...,89.1,56.187716,30.246465,6.4408,11.529167,6.330897,0.239481,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
M-B-CD,1303.311,89,0,15,15,15,520,646.982073,-7.3814,35,...,422.05,145.748737,79.180718,18.469558,34.006663,14.806328,-1.4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
S-B-CD,1688.379,105,0,15,15,15,616,678.748955,-18.8391,56,...,877.45,133.248737,73.448485,37.218429,36.160562,24.637091,0.21,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
SBE-B-CD,328.496,24,0,0,0,0,132,192.266899,6.5489,2,...,37.3,45.71266,24.409584,4.166097,20.96363,21.92,-0.08,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
