In [1]:
import pandas as pd  # Import pandas for data manipulation
from rdkit import Chem  # Import RDKit for chemical informatics

from rdkit.Chem import rdMolDescriptors  # Import molecular descriptor functions from RDKit

# GABAA_processing

In [2]:
# Load the 'GABAA.csv' file into a pandas DataFrame
data = pd.read_csv('../data/GABAA.csv', encoding='gb2312')

In [4]:
# Function to check if a given SMILES string contains metal atoms
def contains_metal(smiles):
    mol = Chem.MolFromSmiles(smiles)  # Convert SMILES string to RDKit molecular object
    if mol:  # Check if molecule conversion was successful
        for atom in mol.GetAtoms():  # Iterate through each atom in the molecule
            # Check if the atomic number corresponds to a metal
            if atom.GetAtomicNum() in [3, 11, 13, 19, 20, 25, 26, 29, 30, 42, 47, 50, 73, 74, 78, 79, 80, 82, 83]:
                return True  # Return True if a metal atom is found
    return False  # Return False if no metal atoms are found

In [4]:
# Filter out molecules that contain metal atoms
filtered_df = data[~data['smiles'].apply(contains_metal)]

In [5]:
# Calculate the exact molecular weight for each molecule
filtered_df['molecular_weight'] = filtered_df['smiles'].apply(
    lambda x: rdMolDescriptors.CalcExactMolWt(Chem.MolFromSmiles(x)) if Chem.MolFromSmiles(x) else None
)

In [6]:
# Further filter molecules based on molecular weight (between 30 and 1000 Da)
filtered_df_2 = filtered_df[(filtered_df['molecular_weight'] >= 30) & (filtered_df['molecular_weight'] <= 1000)]

In [7]:
filtered_df_2

Unnamed: 0,name,origin,label,smiles,class,molecular_weight
0,gamma-Aminobutyric acid,10.3390/molecules24152678,GABAA agonist,C(CC(=O)O)CN,1,103.063329
1,Menthol,10.1111/bph.12602,GABAA agonist,CC1CCC(C(C1)O)C(C)C,1,156.151415
2,phenobarbital,10.1002/ana.24967,GABAA agonist,CCC1(C(=O)NC(=O)NC1=O)C2=CC=CC=C2,1,232.084792
3,isoguvacine oxide,10.1002/chir.530070608,GABAA agonist,C1CNCC2C1(O2)C(=O)O,1,143.058243
4,Clomethiazole,10.1016/s0014-2999(02)02233-1,GABAA agonist,CC1=C(CCCl)SC=N1,1,161.006598
...,...,...,...,...,...,...
483,chlorogenic acid,10.1021/jf0303971,GABAA inhibition,C1C(C(C(CC1(C(=O)O)O)OC(=O)C=CC2=CC(=C(C=C2)O)...,0,354.095082
484,maltol,10.1021/jf0303971,GABAA inhibition,CC1=C(C(=O)C=CO1)O,0,126.031694
485,Theobromine,10.1021/jf0303971,GABAA inhibition,CN1C=NC2=C1C(=O)NC(=O)N2C,0,180.064725
486,"2,3,5-trimethylpyrazine",10.1021/jf0303971,GABAA inhibition,CN1C2=C(C(=O)N(C1=O)C)NC=N2,0,180.064725


In [16]:
# Save the filtered dataset back to 'GABAA.csv'
filtered_df_2.to_csv('GABAA.csv')

# plant_processing

In [8]:
# Load the 'plant.xlsx' file into a pandas DataFrame
plant = pd.read_excel('../data/plant_raw.xlsx', sheet_name = "Delete samples without SMILES")

In [9]:
# Filter out molecules that contain metal atoms in the plant dataset
filtered_plant = plant[~plant['SMILES'].apply(contains_metal)]

In [10]:
# Calculate the exact molecular weight for each molecule in the plant dataset
filtered_plant['molecular_weight'] = filtered_plant['SMILES'].apply(
    lambda x: rdMolDescriptors.CalcExactMolWt(Chem.MolFromSmiles(x)) if Chem.MolFromSmiles(x) else None
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_plant['molecular_weight'] = filtered_plant['SMILES'].apply(


In [12]:
# Further filter plant molecules based on molecular weight (between 30 and 1000 Da)
filtered_plant_2 = filtered_plant[(filtered_plant['molecular_weight'] >= 30) & (filtered_plant['molecular_weight'] <= 1000)]

In [14]:
filtered_plant_2.to_excel('../data/processed_plant.xlsx')