In [47]:
import sys

from rdkit import Chem
from rdkit.Chem import AllChem

In [48]:
import os
import pandas as pd
import numpy as np
from rdkit.Chem import AllChem,DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import rdMolTransforms

In [52]:
def _bitvect_to_array(fp, dtype=np.int8):
    """Convert an RDKit bit-vector fingerprint into a 1-D NumPy array.

    The destination buffer is sized from the fingerprint itself
    (``fp.GetNumBits()``) rather than from a hard-coded placeholder length,
    so the result never depends on ``ConvertToNumpyArray`` resizing it.

    Args:
        fp: Bit vector returned by one of the RDKit fingerprint functions.
        dtype: NumPy dtype of the returned array (bits are 0/1, so ``int8``
            is sufficient).

    Returns:
        A 1-D ``numpy.ndarray`` with one element per fingerprint bit.
    """
    bits = np.zeros((fp.GetNumBits(),), dtype=dtype)
    DataStructs.ConvertToNumpyArray(fp, bits)
    return bits


def get_features(mol, *, morgan_radius=2, morgan_n_bits=1024):
    """Compute the analyte descriptor and fingerprint features for a molecule.

    Args:
        mol: RDKit molecule to describe.
        morgan_radius: Radius passed to the Morgan fingerprint (default 2,
            the value that used to be hard-coded).
        morgan_n_bits: Length of the Morgan bit vector (default 1024, the
            value that used to be hard-coded).

    Returns:
        dict mapping feature name (``'Analyte_...'``) to its value. Scalar
        descriptors are plain numbers; the three fingerprints
        (``'Analyte_maccs_fingerprint'``, ``'Analyte_Morgan Fingerprint'``,
        ``'Analyte_topological_torsion'``) are 1-D ``int8`` NumPy arrays.
    """
    data = {}

    # Molecular weight
    data['Analyte_Molecular Weight'] = Descriptors.MolWt(mol)
    # Heavy atom count
    data['Analyte_heavy_atom_count'] = Descriptors.HeavyAtomCount(mol)
    # Aliphatic carbocycles
    data['Analyte_Num Aliphatic Carbocycles'] = Descriptors.NumAliphaticCarbocycles(mol)
    # Aliphatic heterocycles
    data['Analyte_Num Aliphatic Heterocycles'] = Descriptors.NumAliphaticHeterocycles(mol)
    # Aliphatic rings
    data['Analyte_Num Aliphatic Rings'] = Descriptors.NumAliphaticRings(mol)
    # Total ring count
    data['Analyte_Ring Count'] = Descriptors.RingCount(mol)
    # Number of valence electrons
    data['Analyte_Num Valence Electrons'] = Descriptors.NumValenceElectrons(mol)
    # Molecular surface area (Labute approximate surface area)
    data['Analyte_Labute ASA'] = rdMolDescriptors.CalcLabuteASA(mol)
    # LogP
    data['Analyte_LogP'] = Descriptors.MolLogP(mol)
    # Hydrogen-bond acceptors
    data['Analyte_Number of Hydrogen Acceptors'] = Descriptors.NumHAcceptors(mol)
    # Rotatable bonds
    data['Analyte_Number of Rotatable Bonds'] = Descriptors.NumRotatableBonds(mol)
    # Topological polar surface area
    data['Analyte_Topological Polar Surface Area'] = rdMolDescriptors.CalcTPSA(mol)

    # Connectivity indices
    data['Analyte_chi0'] = Descriptors.Chi0(mol)
    data['Analyte_chi1'] = Descriptors.Chi1(mol)
    # Kappa shape descriptors
    data['Analyte_kappa1'] = Descriptors.Kappa1(mol)
    data['Analyte_kappa2'] = Descriptors.Kappa2(mol)
    data['Analyte_kappa3'] = Descriptors.Kappa3(mol)
    # Hall-Kier alpha
    data['Analyte_HallKierAlpha'] = Descriptors.HallKierAlpha(mol)

    # MACCS keys fingerprint
    data['Analyte_maccs_fingerprint'] = _bitvect_to_array(
        MACCSkeys.GenMACCSKeys(mol)
    )

    # Morgan (circular) fingerprint
    data['Analyte_Morgan Fingerprint'] = _bitvect_to_array(
        AllChem.GetMorganFingerprintAsBitVect(
            mol, radius=morgan_radius, nBits=morgan_n_bits
        )
    )

    # Hashed topological-torsion fingerprint (RDKit's default length).
    # Stored as int8 like the other two fingerprints; the values are 0/1.
    data['Analyte_topological_torsion'] = _bitvect_to_array(
        AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(mol)
    )
    return data

In [53]:
# Folder that holds one .sdf file per analyte.
folder_path = r"C:\Users\re\Desktop\data\dataset\fenli\huan_RS"

# Output schema. Columns that get_features() does not produce (the dipole and
# angle columns) are kept so the CSV layout is stable; they stay empty here.
columns = [
    'Analyte_Molecular Weight',
    'Analyte_heavy_atom_count',
    'Analyte_Num Aliphatic Carbocycles',
    'Analyte_Num Aliphatic Heterocycles',
    'Analyte_Num Aliphatic Rings',
    'Analyte_Ring Count',
    'Analyte_Num Valence Electrons',
    'Analyte_Labute ASA',
    'Analyte_LogP',
    'Analyte_Number of Hydrogen Acceptors',
    'Analyte_Number of Rotatable Bonds',
    'Analyte_Topological Polar Surface Area',
    'Analyte_chi0',
    'Analyte_chi1',
    'Analyte_kappa1',
    'Analyte_kappa2',
    'Analyte_kappa3',
    'Analyte_HallKierAlpha',
    'Analyte_maccs_fingerprint',
    'Analyte_topological_torsion',
    'Analyte_Morgan Fingerprint',
    'R_Analyte_dipole_x',
    'R_Analyte_dipole_y',
    'R_Analyte_dipole_z',
    'angle0',
    'angle1',
    'S_Analyte_dipole_x',
    'S_Analyte_dipole_y',
    'S_Analyte_dipole_z',
    'angle2',
]


def _array_to_text(value):
    """Render a NumPy array as a complete, single-line "[b0 b1 ...]" string.

    Writing an array to CSV through its default string form is lossy: NumPy
    abbreviates arrays above its print threshold (1000 elements by default)
    with "...", which would truncate the 1024-bit Morgan fingerprint.
    Non-array values are returned unchanged.
    """
    if isinstance(value, np.ndarray):
        return "[" + " ".join(str(int(bit)) for bit in value) + "]"
    return value


# Collect one record per molecule and build the DataFrame once, instead of
# growing it row by row.
records = []
record_names = []

# Sorted so the row order is reproducible (os.listdir order is unspecified).
for filename in sorted(os.listdir(folder_path)):
    stem, extension = os.path.splitext(filename)
    if extension.lower() != ".sdf":
        continue

    file_path = os.path.join(folder_path, filename)
    # Read the molecule without sanitization.
    mol = Chem.MolFromMolFile(file_path, sanitize=False)
    if mol is None:
        print(f"Failed to load molecule from {filename}")
        continue

    # Sanitization was skipped, so initialise the property cache and the
    # ring information by hand before computing descriptors.
    mol.UpdatePropertyCache(strict=False)
    Chem.GetSymmSSSR(mol)

    data_r = get_features(mol)
    records.append(data_r)
    record_names.append(stem)  # row label = file name without extension

df = pd.DataFrame(records, index=record_names, columns=columns)

# `df` keeps the raw fingerprint arrays for in-notebook use; only the copy
# written to disk has them serialised to full-length text.
df_for_csv = df.copy()
for column_name in df_for_csv.columns:
    df_for_csv[column_name] = df_for_csv[column_name].map(_array_to_text)

df_for_csv.to_csv('fenli_huan_features_RS_04.csv')


