In [1]:
import pandas as pd
import numpy as np

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import rdmolops
from rdkit.Chem import Crippen
from rdkit.Chem import PandasTools
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.rdchem import GetPeriodicTable
from rdkit import DataStructs
from rdkit.Chem import Descriptors
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.ML.Descriptors import MoleculeDescriptors

from padelpy import padeldescriptor, from_smiles
from mordred import Calculator, descriptors

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
#from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import metrics
from sklearn.svm import SVR

import xgboost as xgb

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import Linear, Sequential, BatchNorm1d #, ReLU
from torch.nn.functional import relu
from torch.utils.data.sampler import SubsetRandomSampler

#import torch_geometric
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, TopKPooling
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
from torch_geometric.utils import to_dense_adj
from torch_geometric.nn import ARMAConv
from torch_geometric.nn import SAGEConv
from torch_geometric.nn import global_add_pool








### Read Data from Files

In [2]:
# Load the CMC dataset into a pandas DataFrame; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv('dataset/My_CMC_data.csv')

# Report how many rows (one SMILES entry each) were read.
total_elements = df.shape[0]
print('Total no. of smiles:', total_elements)
df


Total no. of smiles: 473


Unnamed: 0,smiles,logCMC
0,CCCCCCCCC(O)CO,3.361728
1,CCCCCCCCC(O)CCO,3.361728
2,CCCCCCCCCCCCC(O)CCO,1.113943
3,CCCCCCCCOCCO,3.690196
4,CCCCCCCCOCCOCCOCCO,3.875061
...,...,...
468,CCCCCCCCCCCCCCOCC(Cn1cc[n+](c1)CCC[n+]1ccn(c1)...,1.851000
469,CCCCCCCCCCCCCCC([n+]1ccn(c1)C)CSCCSCC([n+]1ccn...,1.342000
470,CCCCCCCCCCCCCCC([n+]1ccn(c1)C)CSCCCSCC([n+]1cc...,1.322000
471,CCCCCCCCCCCCCCC([n+]1ccn(c1)C)CSCCCCSCC([n+]1c...,1.301000


In [3]:
# Add a column holding the RDKit canonical form of every SMILES string.
df['canonical_smiles'] = [Chem.CanonSmiles(smi) for smi in df['smiles']]

# Keep the first row for each canonical SMILES, renumber the rows from 0,
# let the canonical strings take over the 'smiles' column, and finally
# discard the helper column.
df_unique = (
    df.drop_duplicates(subset='canonical_smiles')
    .reset_index(drop=True)
    .assign(smiles=lambda frame: frame['canonical_smiles'])
    .drop(columns='canonical_smiles')
)
df_unique

Unnamed: 0,smiles,logCMC
0,CCCCCCCCC(O)CO,3.361728
1,CCCCCCCCC(O)CCO,3.361728
2,CCCCCCCCCCCCC(O)CCO,1.113943
3,CCCCCCCCOCCO,3.690196
4,CCCCCCCCOCCOCCOCCO,3.875061
...,...,...
435,CCCCCCCCCCCCCCOCC(O)Cn1cc[n+](CCC[n+]2ccn(CC(O...,1.851000
436,CCCCCCCCCCCCCCC(CSCCSCC(CCCCCCCCCCCCCC)[n+]1cc...,1.342000
437,CCCCCCCCCCCCCCC(CSCCCSCC(CCCCCCCCCCCCCC)[n+]1c...,1.322000
438,CCCCCCCCCCCCCCC(CSCCCCSCC(CCCCCCCCCCCCCC)[n+]1...,1.301000


In [4]:
# Remove counter-ions from the SMILES strings held in df_unique.
mols = [Chem.MolFromSmiles(smi, sanitize=True) for smi in df_unique['smiles']]

# MolFromSmiles returns None for a string it cannot parse; stop here and name
# the offending entries instead of handing None to the salt remover.
unparsed_smiles = [smi for smi, mol in zip(df_unique['smiles'], mols) if mol is None]
if unparsed_smiles:
    raise ValueError(f"RDKit could not parse these SMILES: {unparsed_smiles}")

# SMARTS definition of the counter-ions to strip.
# The bare N entry is there to remove the NH4+ counter-ion.
remover = SaltRemover(defnFormat='smarts', defnData="[Cl-,Br-,I-,Na+,Li+,K+,N]")

# One stripped SMILES per molecule, in the same order as df_unique's rows.
no_salt_smile = [Chem.MolToSmiles(remover.StripMol(mol)) for mol in mols]
count = len(no_salt_smile)  # same final value as the former per-molecule counter

# Work on a copy: plain assignment would only alias df_unique, so writing the
# stripped SMILES would silently overwrite the table shown by the earlier cell.
df_no_salt = df_unique.copy()
df_no_salt['smiles'] = no_salt_smile
df_no_salt

Unnamed: 0,smiles,logCMC
0,CCCCCCCCC(O)CO,3.361728
1,CCCCCCCCC(O)CCO,3.361728
2,CCCCCCCCCCCCC(O)CCO,1.113943
3,CCCCCCCCOCCO,3.690196
4,CCCCCCCCOCCOCCOCCO,3.875061
...,...,...
435,CCCCCCCCCCCCCCOCC(O)Cn1cc[n+](CCC[n+]2ccn(CC(O...,1.851000
436,CCCCCCCCCCCCCCC(CSCCSCC(CCCCCCCCCCCCCC)[n+]1cc...,1.342000
437,CCCCCCCCCCCCCCC(CSCCCSCC(CCCCCCCCCCCCCC)[n+]1c...,1.322000
438,CCCCCCCCCCCCCCC(CSCCCCSCC(CCCCCCCCCCCCCC)[n+]1...,1.301000


In [5]:
# Export df_no_salt (its 'smiles' column holds the salt-stripped SMILES) to CSV.
# index=False leaves the DataFrame row index out of the file.
df_no_salt.to_csv('total_smile_and_property.csv', index=False)

### Generating RDKit Descriptors

In [40]:
def RDkit_descriptors(Smiles):
    """Compute every descriptor listed in RDKit's ``Descriptors._descList``.

    Parameters
    ----------
    Smiles : iterable of str
        SMILES strings, one per molecule.

    Returns
    -------
    Mol_descriptors : list of tuple
        One tuple of descriptor values per input SMILES, in input order.
    desc_names : sequence of str
        Descriptor names reported by the calculator; callers use them as the
        column labels for the value tuples.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    # The calculator is built from the names of all registered descriptors;
    # how many there are depends on the installed RDKit version.
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(
        [entry[0] for entry in Descriptors._descList]
    )
    desc_names = calc.GetDescriptorNames()

    Mol_descriptors = []
    for smi in Smiles:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Never hand None to the calculator: at best that gives an opaque
            # error, at worst a meaningless row that stays in the feature table.
            raise ValueError(f"RDKit could not parse SMILES {smi!r}")
        # Hydrogens are left implicit (no Chem.AddHs) before the calculation.
        Mol_descriptors.append(calc.CalcDescriptors(mol))
    return Mol_descriptors, desc_names

# Compute the descriptor tuples and their names for the SMILES in df_no_salt
# (results follow the row order of df_no_salt['smiles']).
total_rdkit_descriptors, total_rdkit_desc_names = RDkit_descriptors(df_no_salt['smiles'])


In [41]:
# Wrap the descriptor values in a DataFrame with the descriptor names as column labels.
# NOTE: this rebinds total_rdkit_descriptors, so from here on the name refers
# to a DataFrame rather than to the value returned by RDkit_descriptors.
total_rdkit_descriptors = pd.DataFrame(total_rdkit_descriptors,columns=total_rdkit_desc_names)

In [42]:
# Max-abs scaling: every descriptor column is divided by its own largest
# absolute value. The unscaled frame is left untouched.
# Undefined results (e.g. 0/0 for a column that is zero everywhere, or values
# that were already missing) are replaced with 0.
# NOTE(review): the per-column maxima are taken over all rows, i.e. before the
# train/test split, so test rows influence the scaling -- confirm this is intended.
scaled_total_rdkit_descriptors = total_rdkit_descriptors.div(
    total_rdkit_descriptors.abs().max(), axis='columns'
).fillna(0)

# Show the scaled table.
display(scaled_total_rdkit_descriptors)

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0.657419,0.657419,0.011192,-0.054889,0.746020,0.299897,0.149272,0.145056,0.149263,0.153527,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053333,0.0
1,0.677939,0.677939,0.013701,-0.031535,0.736011,0.297721,0.161286,0.156510,0.161275,0.165975,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053333,0.0
2,0.686729,0.686729,0.013943,-0.031756,0.647406,0.291576,0.209342,0.202327,0.209323,0.215768,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.106667,0.0
3,0.613884,0.613884,0.018833,0.016663,0.733277,0.237654,0.149272,0.145056,0.149263,0.153527,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093333,0.0
4,0.618096,0.618096,0.008970,0.007937,0.620918,0.248971,0.224734,0.221387,0.224727,0.228216,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.173333,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,0.762473,0.762473,0.049787,-0.055154,0.072397,0.292907,0.615945,0.606988,0.615894,0.614108,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.320000,0.0
436,0.181055,0.181055,0.079508,0.070345,0.070797,0.294239,0.604049,0.595665,0.603852,0.576763,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.306667,0.0
437,0.181212,0.181212,0.080522,0.071242,0.069528,0.293777,0.616063,0.607119,0.615864,0.589212,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.320000,0.0
438,0.181334,0.181334,0.081260,0.071895,0.068559,0.293333,0.628077,0.618573,0.627876,0.601660,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0


### Saving/Exporting total RDKit, Mordred, and Padel descriptors to CSV file

In [60]:
# RDKit: export the max-abs-scaled descriptor table for the whole dataset.
# index=False leaves the DataFrame row index out of the file.
scaled_total_rdkit_descriptors.to_csv('total_rdkit_descriptors.csv', index=False)

In [61]:
# Model inputs/target taken from the salt-stripped table:
# the SMILES strings and the logCMC column (the regression target).
total_smiles = df_no_salt['smiles']
Z = df_no_salt['logCMC']

### Data Splitting

In [62]:
# Shuffled 80/20 split with a fixed seed. Descriptors, SMILES and target are
# passed together so all three are split on the same rows.
(
    train_rdkit_descriptors,
    test_rdkit_descriptors,
    train_smiles,
    test_smiles,
    Z_train,
    Z_test,
) = train_test_split(
    scaled_total_rdkit_descriptors,
    total_smiles,
    Z,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

### Saving/Exporting train/test RDKit, Mordred, and Padel descriptors to CSV file

In [63]:
# Write each train/test object to its own CSV file, without the row index.
# Files are written in the order listed below.
_csv_outputs = {
    # RDKit descriptors
    'train_rdkit_descriptors.csv': train_rdkit_descriptors,
    'test_rdkit_descriptors.csv': test_rdkit_descriptors,
    # SMILES strings
    'train_smile.csv': train_smiles,
    'test_smile.csv': test_smiles,
    # Property - CMC
    'train_property.csv': Z_train,
    'test_property.csv': Z_test,
}
for _csv_name, _table in _csv_outputs.items():
    _table.to_csv(_csv_name, index=False)