In [10]:
import torch
from torch_geometric.data import Data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv
from torch.nn import Linear, Sequential, BatchNorm1d #, ReLU
from torch.nn.functional import relu
from torch.utils.data.sampler import SubsetRandomSampler
from torch_geometric.nn import GCNConv, TopKPooling, global_mean_pool
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
from torch_geometric.utils import to_dense_adj
from torch_geometric.nn import ARMAConv
from torch_geometric.nn import SAGEConv
from torch_geometric.nn import global_add_pool

import argparse
import time
import math
import copy

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import rdmolops
from rdkit.Chem import Crippen
from rdkit.Chem import PandasTools
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.rdchem import GetPeriodicTable
from rdkit import DataStructs
from rdkit.Chem import Descriptors
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.ML.Descriptors import MoleculeDescriptors
import rdkit

from mordred import Calculator, descriptors
from padelpy import padeldescriptor, from_smiles

import xgboost as xgb

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
#from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import metrics
from sklearn.svm import SVR



In [11]:
# Load the mixed-micelle CMC dataset from a relative path into a DataFrame.
df = pd.read_csv('../dataset/updated_CMC-Data-for-mixed-micelle.csv')

# Report how many rows (one SMILES pair per row) were read.
total_elements = df.shape[0]
print('Total no. of smiles:', total_elements)

# Preview the first few rows.
df.head()


Total no. of smiles: 979


Unnamed: 0,smile_A,mol_fraction_A,smile_B,mol_fraction_B,logCMC
0,CCCCCCCCCCCCOS(=O)(=O)[O-].[Na+],0.0,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],1.0,2.857332
1,CCCCCCCCCCCCOS(=O)(=O)[O-].[Na+],0.1,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],0.9,0.991226
2,CCCCCCCCCCCCOS(=O)(=O)[O-].[Na+],0.2,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],0.8,0.991226
3,CCCCCCCCCCCCOS(=O)(=O)[O-].[Na+],0.3,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],0.7,0.90309
4,CCCCCCCCCCCCOS(=O)(=O)[O-].[Na+],0.4,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],0.6,0.732394


In [12]:
# Replace both SMILES columns (in place, no new column is added) with
# RDKit's canonical SMILES so identical molecules share one spelling.
df['smile_A'] = df['smile_A'].map(Chem.CanonSmiles)
df['smile_B'] = df['smile_B'].map(Chem.CanonSmiles)

df.head()

Unnamed: 0,smile_A,mol_fraction_A,smile_B,mol_fraction_B,logCMC
0,CCCCCCCCCCCCOS(=O)(=O)[O-].[Na+],0.0,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],1.0,2.857332
1,CCCCCCCCCCCCOS(=O)(=O)[O-].[Na+],0.1,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],0.9,0.991226
2,CCCCCCCCCCCCOS(=O)(=O)[O-].[Na+],0.2,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],0.8,0.991226
3,CCCCCCCCCCCCOS(=O)(=O)[O-].[Na+],0.3,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],0.7,0.90309
4,CCCCCCCCCCCCOS(=O)(=O)[O-].[Na+],0.4,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],0.6,0.732394


In [13]:
# Removing counterions from smile strings
# Parse the SMILES currently stored in `df` into RDKit molecules.
mol_A = [Chem.MolFromSmiles(i, sanitize=True) for i in df['smile_A']]
mol_B = [Chem.MolFromSmiles(i, sanitize=True) for i in df['smile_B']]

# Custom salt definition: halide anions, alkali cations and a lone nitrogen.
# N is to remove NH4+ counter-ion.
# NOTE(review): per the RDKit SaltRemover docs only whole disconnected
# fragments matching the pattern are stripped, which is consistent with the
# [N+] head groups surviving in the output below - confirm if the pattern changes.
remover = SaltRemover(defnFormat='smarts', defnData="[Cl-,Br-,I-,Na+,Li+,K+,N]")

# Strip the counter-ions and write each molecule back out as SMILES.
no_salt_smile_A = [Chem.MolToSmiles(remover.StripMol(mol)) for mol in mol_A]
no_salt_smile_B = [Chem.MolToSmiles(remover.StripMol(mol)) for mol in mol_B]

# Number of SMILES processed (A and B together); same final value the
# original per-iteration counter ended with.
count = len(no_salt_smile_A) + len(no_salt_smile_B)

# Work on a copy: plain `df_no_salt = df` would only create a second name for
# the same object, so the assignments below would silently overwrite the
# canonical (salt-containing) SMILES held in `df`.
df_no_salt = df.copy()
df_no_salt['smile_A'] = no_salt_smile_A
df_no_salt['smile_B'] = no_salt_smile_B

# Target property used by the saving cell at the end of the notebook.
Z = df_no_salt['logCMC']
df_no_salt

Unnamed: 0,smile_A,mol_fraction_A,smile_B,mol_fraction_B,logCMC
0,CCCCCCCCCCCCOS(=O)(=O)[O-],0.0,CCCCCCCCCCCCCCCC[N+](C)(C)C,1.0,2.857332
1,CCCCCCCCCCCCOS(=O)(=O)[O-],0.1,CCCCCCCCCCCCCCCC[N+](C)(C)C,0.9,0.991226
2,CCCCCCCCCCCCOS(=O)(=O)[O-],0.2,CCCCCCCCCCCCCCCC[N+](C)(C)C,0.8,0.991226
3,CCCCCCCCCCCCOS(=O)(=O)[O-],0.3,CCCCCCCCCCCCCCCC[N+](C)(C)C,0.7,0.903090
4,CCCCCCCCCCCCOS(=O)(=O)[O-],0.4,CCCCCCCCCCCCCCCC[N+](C)(C)C,0.6,0.732394
...,...,...,...,...,...
974,CCCCCCCCCCCCCCOCC(O)Cn1cc[n+](CCC[n+]2ccn(CC(O...,0.5,CCCCCCCCCCCCCCOCC(O)Cn1cc[n+](CCC[n+]2ccn(CC(O...,0.5,1.851000
975,CCCCCCCCCCCCCCC(CSCCSCC(CCCCCCCCCCCCCC)[n+]1cc...,0.5,CCCCCCCCCCCCCCC(CSCCSCC(CCCCCCCCCCCCCC)[n+]1cc...,0.5,1.342000
976,CCCCCCCCCCCCCCC(CSCCCSCC(CCCCCCCCCCCCCC)[n+]1c...,0.5,CCCCCCCCCCCCCCC(CSCCCSCC(CCCCCCCCCCCCCC)[n+]1c...,0.5,1.322000
977,CCCCCCCCCCCCCCC(CSCCCCSCC(CCCCCCCCCCCCCC)[n+]1...,0.5,CCCCCCCCCCCCCCC(CSCCCCSCC(CCCCCCCCCCCCCC)[n+]1...,0.5,1.301000


In [14]:
# Short handles on the salt-free SMILES columns and their mole fractions.
smile_A, smile_B = df_no_salt['smile_A'], df_no_salt['smile_B']
frac_a, frac_b = df_no_salt['mol_fraction_A'], df_no_salt['mol_fraction_B']

# Generating Padel Descriptors

In [15]:
def padel_descriptor(smiles, verbose=True):
    """Compute PaDEL descriptors for each SMILES string in ``smiles``.

    ``from_smiles`` is called once per *distinct* SMILES; repeated entries
    reuse the first result instead of recomputing it, so the number of
    descriptor calculations equals the number of unique molecules rather
    than the number of rows.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, in the order the results should be returned.
        Entries must be hashable (plain strings are).
    verbose : bool, default True
        When True, print the running position and the SMILES for every
        entry (the original behaviour). Pass False to keep the cell
        output short.

    Returns
    -------
    list
        One descriptor mapping per input entry, in input order. Each entry
        is an independent shallow copy, so editing one row's mapping does
        not affect other rows that share the same SMILES.
    """
    computed = {}  # SMILES -> descriptor mapping returned by from_smiles
    results = []   # named `results` so the imported `descriptors` module is not shadowed
    for position, smi in enumerate(smiles):
        if verbose:
            print(position, smi)
        if smi not in computed:
            computed[smi] = from_smiles(smi, descriptors=True, fingerprints=False)
        results.append(computed[smi].copy())

    return results


In [112]:
# Compute PaDEL descriptors for the salt-free smile_A column.
# The result is a list with one entry per row; a later cell turns it into a DataFrame.
total_A_padel_descriptors = padel_descriptor(df_no_salt['smile_A'])

0 CCCCCCCCCCCCOS(=O)(=O)[O-]
1 CCCCCCCCCCCCOS(=O)(=O)[O-]
2 CCCCCCCCCCCCOS(=O)(=O)[O-]
3 CCCCCCCCCCCCOS(=O)(=O)[O-]
4 CCCCCCCCCCCCOS(=O)(=O)[O-]
5 CCCCCCCCCCCCOS(=O)(=O)[O-]
6 CCCCCCCCCCCCOS(=O)(=O)[O-]
7 CCCCCCCCCCCCOS(=O)(=O)[O-]
8 CCCCCCCCCCCCOS(=O)(=O)[O-]
9 CCCCCCCCCCCCOS(=O)(=O)[O-]
10 CCCCCCCCCCCCOS(=O)(=O)[O-]
11 CCCCCCCCCCCCOS(=O)(=O)[O-]
12 CCCCCCCCCCCCOS(=O)(=O)[O-]
13 CCCCCCCCCCCCOS(=O)(=O)[O-]
14 CCCCCCCCCCCCOS(=O)(=O)[O-]
15 CCCCCCCCCCCCOS(=O)(=O)[O-]
16 CCCCCCCCCCCCOS(=O)(=O)[O-]
17 CCCCCCCCCCCCOS(=O)(=O)[O-]
18 CCCCCCCCCCCCOS(=O)(=O)[O-]
19 CCCCCCCCCCCCOS(=O)(=O)[O-]
20 CCCCCCCCCCCCOS(=O)(=O)[O-]
21 CCCCCCCCCCCCOS(=O)(=O)[O-]
22 CCCCCCCCCCCCOS(=O)(=O)[O-]
23 CCCCCCCCCCCCOS(=O)(=O)[O-]
24 CCCCCCCCCCCCOS(=O)(=O)[O-]
25 CCCCCCCCCCOS(=O)(=O)[O-]
26 CCCCCCCCCCOS(=O)(=O)[O-]
27 CCCCCCCCCCOS(=O)(=O)[O-]
28 CCCCCCCCCCOS(=O)(=O)[O-]
29 CCCCCCCCCCOS(=O)(=O)[O-]
30 CCCCCCCCCCOS(=O)(=O)[O-]
31 CCCCCCCCCCOS(=O)(=O)[O-]
32 CCCCCCCCCCOS(=O)(=O)[O-]
33 CCCCCCCCCCOS(=O)(=O)[O-]
34 CCCCC

In [110]:
# Compute PaDEL descriptors for the salt-free smile_B column.
# The result is a list with one entry per row; a later cell turns it into a DataFrame.
total_B_padel_descriptors = padel_descriptor(df_no_salt['smile_B'])

0 CCCCCCCCCCCCCCCC[N+](C)(C)C
1 CCCCCCCCCCCCCCCC[N+](C)(C)C
2 CCCCCCCCCCCCCCCC[N+](C)(C)C
3 CCCCCCCCCCCCCCCC[N+](C)(C)C
4 CCCCCCCCCCCCCCCC[N+](C)(C)C
5 CCCCCCCCCCCCCCCC[N+](C)(C)C
6 CCCCCCCCCCCCCCCC[N+](C)(C)C
7 CCCCCCCCCCCCCCCC[N+](C)(C)C
8 CCCCCCCCCCCCCCCC[N+](C)(C)C
9 CCCCCCCCCCCCCCCC[N+](C)(C)C
10 CCCCCCCCCCCCCCCC[N+](C)(C)C
11 CCCCCCCCCCOS(=O)(=O)[O-]
12 CCCCCCCCCCOS(=O)(=O)[O-]
13 CCCCCCCCCCOS(=O)(=O)[O-]
14 CCCCCCCCCCOS(=O)(=O)[O-]
15 CCCCCCCCCCOS(=O)(=O)[O-]
16 CCCCCCCCCCOS(=O)(=O)[O-]
17 CCCCCCCCCCOS(=O)(=O)[O-]
18 CCCCCCCCCCOS(=O)(=O)[O-]
19 CCCCCCCCCCOS(=O)(=O)[O-]
20 CCCCCCCCCCS(C)=O
21 CCCCCCCCCCS(C)=O
22 CCCCCCCCCCS(C)=O
23 CCCCCCCCCCS(C)=O
24 CCCCCCCCCCS(C)=O
25 CCCCCCCCCC[N+](C)(C)C
26 CCCCCCCCCC[N+](C)(C)C
27 CCCCCCCCCC[N+](C)(C)C
28 CCCCCCCCCC[N+](C)(C)C
29 CCCCCCCCCC[N+](C)(C)C
30 CCCCCCCCCC[N+](C)(C)C
31 CCCCCCCCCC[N+](C)(C)C
32 CCCCCCCCCC[N+](C)(C)C
33 CCCCCCCCCC[N+](C)(C)C
34 CCCCCCCCCC[N+](C)(C)C
35 CCCCCCCC(=O)N(C)CC(O)C(O)C(O)C(O)CO
36 CCCCCCCC(=O)N(C)CC(O)C(O)

In [113]:
# Turn the per-row descriptor records into DataFrames (one row per mixture,
# one column per descriptor), rebinding the same two names.
total_A_padel_descriptors, total_B_padel_descriptors = (
    pd.DataFrame(total_A_padel_descriptors),
    pd.DataFrame(total_B_padel_descriptors),
)

# Descriptor column labels, taken from component A and reused for both frames.
padel_descriptor_names = total_A_padel_descriptors.columns

In [114]:
# Force every descriptor column to a numeric dtype: values that cannot be
# parsed become NaN (errors='coerce'), and every NaN is then replaced by 0.
# NOTE(review): after this step a missing/failed descriptor is
# indistinguishable from a genuine 0 - confirm that is intended.
total_A_padel_descriptors = (
    total_A_padel_descriptors
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)
total_B_padel_descriptors = (
    total_B_padel_descriptors
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# Merge Padel Descriptors of SMILE A and SMILE B using Mole-Fraction-Weighted Arithmetic Mean

In [115]:
# Summation: mole-fraction-weighted combination of the two components,
#   row j = frac_a[j] * descriptors_A[j] + frac_b[j] * descriptors_B[j]
# DataFrame.mul(..., axis=0) scales each row by the fraction carrying the same
# row label in one vectorised step, replacing the original Python loop that
# built one Series per row. reindex(columns=...) keeps the original behaviour
# of selecting/ordering the columns by component A's descriptor names.
u_padel = total_A_padel_descriptors.mul(frac_a, axis=0).reindex(columns=padel_descriptor_names)
v_padel = total_B_padel_descriptors.mul(frac_b, axis=0).reindex(columns=padel_descriptor_names)
total_padel_descriptors_summation = u_padel + v_padel
total_padel_descriptors_summation

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,0.0,-5.10610,26.072257,68.46340,62.545306,0.0,0.0,62.0,20.0,42.0,...,0.965580,0.019574,0.586133,0.529526,0.459266,49.387927,81.776090,164.966584,0.948369,1.574926
1,0.0,-4.99589,25.068233,66.58893,60.680558,0.0,0.0,60.0,19.7,40.3,...,0.964549,0.020241,0.587892,0.542525,0.474392,47.411480,77.390780,156.429465,0.946823,1.604809
2,0.0,-4.88568,24.064209,64.71446,58.815810,0.0,0.0,58.0,19.4,38.6,...,0.963518,0.020907,0.589651,0.555523,0.489518,45.435032,73.005470,147.892345,0.945276,1.634692
3,0.0,-4.77547,23.060185,62.83999,56.951062,0.0,0.0,56.0,19.1,36.9,...,0.962487,0.021574,0.591410,0.568522,0.504644,43.458584,68.620160,139.355226,0.943730,1.664576
4,0.0,-4.66526,22.056161,60.96552,55.086314,0.0,0.0,54.0,18.8,35.2,...,0.961456,0.022240,0.593169,0.581520,0.519769,41.482137,64.234851,130.818106,0.942183,1.694459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
974,0.0,-11.19560,125.341459,143.24420,137.965026,10.0,10.0,133.0,51.0,82.0,...,0.987053,0.009161,0.540911,0.435668,0.353714,246.567769,779.045133,1538.809721,0.980579,1.330294
975,0.0,-8.91540,79.484357,145.92000,137.463440,10.0,10.0,128.0,48.0,80.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
976,0.0,-9.20340,84.702572,148.83160,140.557026,10.0,10.0,131.0,49.0,82.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
977,0.0,-9.49140,90.086674,151.74320,143.650612,10.0,10.0,134.0,50.0,84.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


# Scaling Merged Descriptors

In [116]:
# Maximum-absolute scaling: divide every descriptor column by the largest
# absolute value found in that column, so each column ends up within [-1, 1].
# A column that is entirely zero yields 0/0 = NaN, which fillna(0) maps back to 0.
scaled_total_padel_descriptors_summation = (
    total_padel_descriptors_summation
    .div(total_padel_descriptors_summation.abs().max(), axis='columns')
    .fillna(0)
)

# View the normalized data
display(scaled_total_padel_descriptors_summation)

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,0.0,-0.374446,0.140210,0.163433,0.206525,0.000000,0.000000,0.210169,0.168067,0.238636,...,0.972293,0.053436,0.878528,0.622760,0.565937,0.200302,0.032348,0.026997,0.958294,0.711438
1,0.0,-0.366364,0.134811,0.158959,0.200368,0.000000,0.000000,0.203390,0.165546,0.228977,...,0.971254,0.055255,0.881164,0.638047,0.584577,0.192286,0.030613,0.025600,0.956731,0.724937
2,0.0,-0.358282,0.129411,0.154484,0.194211,0.000000,0.000000,0.196610,0.163025,0.219318,...,0.970216,0.057074,0.883800,0.653334,0.603216,0.184270,0.028878,0.024203,0.955168,0.738436
3,0.0,-0.350200,0.124012,0.150009,0.188053,0.000000,0.000000,0.189831,0.160504,0.209659,...,0.969178,0.058894,0.886437,0.668621,0.621855,0.176254,0.027144,0.022805,0.953606,0.751935
4,0.0,-0.342118,0.118612,0.145535,0.181896,0.000000,0.000000,0.183051,0.157983,0.200000,...,0.968140,0.060713,0.889073,0.683908,0.640494,0.168238,0.025409,0.021408,0.952043,0.765434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
974,0.0,-0.821008,0.674055,0.341947,0.455562,0.833333,0.833333,0.450847,0.428571,0.465909,...,0.993915,0.025009,0.810746,0.512376,0.435870,1.000000,0.308163,0.251825,0.990841,0.600931
975,0.0,-0.653794,0.427447,0.348335,0.453906,0.833333,0.833333,0.433898,0.403361,0.454545,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
976,0.0,-0.674914,0.455509,0.355285,0.464121,0.833333,0.833333,0.444068,0.411765,0.465909,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
977,0.0,-0.696034,0.484464,0.362236,0.474336,0.833333,0.833333,0.454237,0.420168,0.477273,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


# Saving Merged and Scaled Padel Descriptors

In [117]:
# Padel summation
# Writes the max-abs *scaled* merged descriptors (the file name does not say
# "scaled"); the row index is not written.
scaled_total_padel_descriptors_summation.to_csv('total_padel_descriptors_summation.csv', index=False)

# Saving Target Property - log CMC

In [16]:
# Write the logCMC target column (Z) to CSV. The row index is not written, so
# rows correspond to the descriptor file by position only.
Z.to_csv('total_property.csv', index=False)

# End of File