In [1]:
import os
import pandas as pd
import numpy as np
from rdkit.Chem import AllChem,DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem

In [3]:
folder_path = r"C:\Users\re\Desktop\data\dataset\tianjia\huan"
# 初始化空的DataFrame
columns = ['CMPA_maccs_fingerprint','CMPA_topological_torsion','CMPA_Morgan Fingerprint']
df = pd.DataFrame(columns=columns)

for filename in os.listdir(folder_path):
    if filename.endswith(".sdf"):
        file_path = os.path.join(folder_path, filename)
        mol = Chem.MolFromMolFile(file_path, sanitize=False)  # 读取分子，不进行sanitize

        if mol is None:
            print(f"Failed to load molecule from {filename}")
            continue

        # 手动初始化环信息
        mol.UpdatePropertyCache(strict=False)
        Chem.GetSymmSSSR(mol)  # 计算并储存环信息

        # 初始化存储指纹值的字典
        data = {}

        # MACCS键型指纹
        fp_maccs = MACCSkeys.GenMACCSKeys(mol)
        arr_maccs = np.zeros((1, ), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fp_maccs, arr_maccs)
        for i in range(len(arr_maccs)):
            data[f'CMPA_maccs_fingerprint_{i}'] = arr_maccs[i]

        # 摩根指纹
        fp_Morgan = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
        arr_Morgan = np.zeros((1, ), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fp_Morgan, arr_Morgan)
        for i in range(len(arr_Morgan)):
            data[f'CMPA_Morgan_fingerprint_{i}'] = arr_Morgan[i]

        # 计算拓扑扭曲指纹
        fp_top = AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(mol)
        arr_top = np.zeros((1, ), dtype=int)
        DataStructs.ConvertToNumpyArray(fp_top, arr_top)
        for i in range(len(arr_top)):
            data[f'CMPA_topological_torsion_{i}'] = arr_top[i]

        # 将结果添加到DataFrame中
        df = df._append(pd.Series(data, name=filename[:-4]))

# 如果需要，可以保存DataFrame到CSV文件
print(df.columns)
df=df.drop(['CMPA_maccs_fingerprint','CMPA_topological_torsion','CMPA_Morgan Fingerprint'],axis=1)
df.to_csv('tianjia_huan_print_01.csv')



Index(['CMPA_maccs_fingerprint', 'CMPA_topological_torsion',
       'CMPA_Morgan Fingerprint', 'CMPA_maccs_fingerprint_0',
       'CMPA_maccs_fingerprint_1', 'CMPA_maccs_fingerprint_2',
       'CMPA_maccs_fingerprint_3', 'CMPA_maccs_fingerprint_4',
       'CMPA_maccs_fingerprint_5', 'CMPA_maccs_fingerprint_6',
       ...
       'CMPA_topological_torsion_2038', 'CMPA_topological_torsion_2039',
       'CMPA_topological_torsion_2040', 'CMPA_topological_torsion_2041',
       'CMPA_topological_torsion_2042', 'CMPA_topological_torsion_2043',
       'CMPA_topological_torsion_2044', 'CMPA_topological_torsion_2045',
       'CMPA_topological_torsion_2046', 'CMPA_topological_torsion_2047'],
      dtype='object', length=3242)




In [5]:
df

Unnamed: 0,CMPA_maccs_fingerprint_0,CMPA_maccs_fingerprint_1,CMPA_maccs_fingerprint_2,CMPA_maccs_fingerprint_3,CMPA_maccs_fingerprint_4,CMPA_maccs_fingerprint_5,CMPA_maccs_fingerprint_6,CMPA_maccs_fingerprint_7,CMPA_maccs_fingerprint_8,CMPA_maccs_fingerprint_9,...,CMPA_topological_torsion_2038,CMPA_topological_torsion_2039,CMPA_topological_torsion_2040,CMPA_topological_torsion_2041,CMPA_topological_torsion_2042,CMPA_topological_torsion_2043,CMPA_topological_torsion_2044,CMPA_topological_torsion_2045,CMPA_topological_torsion_2046,CMPA_topological_torsion_2047
B-CD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CM-B-CD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DM-B-CD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HP-B-CD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HP-R-CD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L-Phe+Cu2+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L-proline+Cu2+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
M-B-CD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S-B-CD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SBE-B-CD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
