In [1]:
import os
from rdkit import Chem
import pandas as pd
from tqdm import tqdm
from mordred import Calculator, descriptors # mordred==1.2.0, numpy==1.26.4, rdkit==2024.3.1

In [2]:
def mol_files_to_df(folder_path):
    """
    Creates a DataFrame from MOL files in the specified folder.
    Index will be filename without .mol extension.
    
    Parameters:
    folder_path (str): Path to the folder containing MOL files
    
    Returns:
    pandas.DataFrame: DataFrame with filename (without .mol) as index and RDKit mol object as 'MOL' column
    """
    mol_files = {}
    
    # Walk through the directory
    for filename in os.listdir(folder_path):
        if filename.endswith('.mol'):
            file_path = os.path.join(folder_path, filename)
            try:
                # Read the MOL file
                mol = Chem.MolFromMolFile(file_path)
                if mol is not None:
                    # Remove .mol extension from filename for index
                    name_without_extension = filename[:-4]
                    mol_files[name_without_extension] = mol
            except Exception as e:
                print(f"Error reading file {filename}: {str(e)}")
    
    # Create DataFrame
    df = pd.DataFrame.from_dict(mol_files, orient='index', columns=['MOL'])
    
    return df

In [3]:
df = mol_files_to_df("../gauss_files/antidiabetic/")

In [4]:
df

Unnamed: 0,MOL
DB00222,<rdkit.Chem.rdchem.Mol object at 0x7fa73d760ba0>
some_name,<rdkit.Chem.rdchem.Mol object at 0x7fa73d760b30>
DB00197,<rdkit.Chem.rdchem.Mol object at 0x7fa73d760900>


In [5]:
# Initialize calculator
calc = Calculator(descriptors, ignore_3D=False)

# Get descriptor names before calculation
descriptor_names = [str(d) for d in calc.descriptors]


# Calculate descriptors for each molecule
d = []
for mol in tqdm(df['MOL']):
    try:
        # Calculate descriptors
        result = calc(mol)
        # Convert result to list while preserving order
        # result_list = [result[desc] for desc in calc.descriptors()]
        d.append(result)
    except Exception as e:
        print(e)
        # Append None values for failed calculations
        d.append([None] * len(descriptor_names))

100%|█████████████████████████████████████████████| 3/3 [00:00<00:00,  6.14it/s]


In [6]:
# Create DataFrame with proper column names
descriptors_df = pd.DataFrame(d, index=df.index, columns=descriptor_names)

In [7]:
descriptors_df

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
DB00222,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,42.17682,2.420551,4.792735,42.17682,1.240495,4.436923,...,10.35051,84.366859,490.224991,7.209191,4390,52,174.0,201.0,12.923611,7.486111
some_name,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,11.189957,2.193993,4.387987,11.189957,1.243329,3.089765,...,8.590258,37.289972,122.073165,6.424903,88,9,40.0,43.0,3.472222,2.111111
DB00197,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,39.481227,2.506828,5.013626,39.481227,1.273588,4.37477,...,10.508049,81.223172,441.160994,7.606224,3093,53,170.0,203.0,11.284722,6.569444


### 📥 Load Precomputed Mordred Descriptors

In this step, we load two precomputed datasets containing molecular descriptors generated using the **Mordred** descriptor calculator:

- `antidiabetic_mordred_descriptors.csv`: Descriptors for molecules with potential **antidiabetic activity**
- `non_antidiabetic_mordred_descriptors.csv`: Descriptors for molecules **without known antidiabetic activity**

These datasets are provided in CSV format and can be used directly for machine learning tasks such as classification or clustering. The files are stored in the following directories:

- `../data/antidiabetic/`
- `../data/non_antidiabetic/`

Each row corresponds to a single molecule, and each column (except the first one, usually an ID or SMILES) represents a molecular descriptor computed by Mordred.

In [13]:
antidiabetic_mordred_descriptors = pd.read_csv('../data/antidiabetic/antidiabetic_mordred_descriptors.csv')
non_antidiabetic_mordred_descriptors  = pd.read_csv('../data/non_antidiabetic/non_antidiabetic_mordred_descriptors.csv')