<a href="https://colab.research.google.com/github/raissalohanna/Doutorado/blob/main/Anatole_BoB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Applying the Anatole's descriptor to the new PubChemQC dataset**

**Objective**: Make a representation of the molecules using Anatole´s descriptor

# Drive - Colab configuration

In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
!pip install rdkit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Importing libraries and configuration

In [25]:
import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem
import matplotlib.pyplot as plt

In [26]:
from tqdm import tqdm

In [27]:
pd.set_option('display.max_columns', None)

In [28]:
sns.set_theme(style="darkgrid", palette="husl", rc={"figure.figsize":(10, 5)})

Load data

In [29]:
df = pd.read_parquet("/content/drive/MyDrive/Colab Notebooks/Doutorado/PubchemQC_nova_base/joined_data.parquet")

In [30]:
df.head()

Unnamed: 0_level_0,smiles,smiles source,charge,total_dipole_moment,multiplicity,homo,lumo,gap,total_energy,TD_ET_00,TD_OS_00,TD_ET_01,TD_OS_01,TD_ET_02,TD_OS_02,TD_ET_03,TD_OS_03,TD_ET_04,TD_OS_04,TD_ET_05,TD_OS_05,TD_ET_06,TD_OS_06,TD_ET_07,TD_OS_07,TD_ET_08,TD_OS_08,TD_ET_09,TD_OS_09
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
3,O=C(O)C1=CC=C[C@H](O)[C@H]1O,PubChemQC,0,5.266252,1,-6.821894,-2.239497,4.582397,-15575.874068,32576.410766,0.053438,34937.703709,0.127298,35191.469929,0.03561,38959.724483,0.04552,45869.45208,0.002901,46837.502902,0.004995,48214.775245,0.008408,48851.170886,0.003279,50890.898644,0.04637,51729.941085,0.009966
4,C[C@H](O)CN,PubChemQC,0,2.681395,1,-6.187869,1.847653,8.035522,-6794.53586,45133.600212,0.003867,50498.44539,0.022722,52102.569231,0.001101,53565.199229,0.044094,54291.396641,0.024346,58571.208342,0.009868,60147.530252,0.014954,60319.624771,0.008397,60994.089781,0.004093,62458.026397,0.000847
5,NCC(=O)COP(=O)(O)O,PubChemQC,0,8.447997,1,-7.270882,-1.52928,5.741602,-24256.843704,34507.648887,0.001015,41628.443712,0.008713,45462.755074,0.001559,46943.403498,0.000598,48366.754296,0.003638,50735.499802,0.000874,53071.087855,0.009599,54805.905776,0.025285,54979.815042,0.000488,55740.210361,0.013173
6,O=[N+]([O-])c1ccc(Cl)c([N+](=O)[O-])c1,PubChemQC,0,4.157554,1,-8.14981,-3.246318,4.903492,-29955.022787,30008.462447,0.008708,30033.272061,0.00305,33529.032202,0.020309,33901.611955,0.001681,34123.341833,0.00552,35756.461306,0.200556,36520.961987,0.046772,38970.830737,0.019129,39701.819082,0.000927,40475.820974,0.073291
7,CCn1cnc2c(N)ncnc21,PubChemQC,0,2.775867,1,-5.823236,-0.389123,5.434114,-14856.035269,39855.991961,0.190582,40311.404854,0.020937,42125.119929,0.037105,43694.465144,0.011508,45023.18291,0.001745,45422.403156,0.002244,47443.717276,0.000626,48595.614115,0.159045,48912.840038,0.01633,49181.261352,0.015406


As we still have some null data, let us drop them

In [31]:
df.isnull().sum()

smiles                   0
smiles source            0
charge                   0
total_dipole_moment      0
multiplicity             0
homo                     0
lumo                     0
gap                      0
total_energy             0
TD_ET_00               387
TD_OS_00               388
TD_ET_01               391
TD_OS_01               392
TD_ET_02               391
TD_OS_02               392
TD_ET_03               391
TD_OS_03               392
TD_ET_04               395
TD_OS_04               396
TD_ET_05               398
TD_OS_05               398
TD_ET_06               398
TD_OS_06               398
TD_ET_07               398
TD_OS_07               398
TD_ET_08               398
TD_OS_08               398
TD_ET_09               408
TD_OS_09               408
dtype: int64

In [32]:
df.dropna(inplace=True)

In [33]:
df.isnull().sum()

smiles                 0
smiles source          0
charge                 0
total_dipole_moment    0
multiplicity           0
homo                   0
lumo                   0
gap                    0
total_energy           0
TD_ET_00               0
TD_OS_00               0
TD_ET_01               0
TD_OS_01               0
TD_ET_02               0
TD_OS_02               0
TD_ET_03               0
TD_OS_03               0
TD_ET_04               0
TD_OS_04               0
TD_ET_05               0
TD_OS_05               0
TD_ET_06               0
TD_OS_06               0
TD_ET_07               0
TD_OS_07               0
TD_ET_08               0
TD_OS_08               0
TD_ET_09               0
TD_OS_09               0
dtype: int64

Testing Anatole's descriptor

In [34]:
!pip install molml

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [47]:
from molml.features import BagOfBonds, EncodedBond, Connectivity

In [36]:
smi = df['smiles'].values[0]

In [37]:
smi_test = BagOfBonds(smi)

In [38]:
smi_test

BagOfBonds(input_type='O=C(O)C1=CC=C[C@H](O)[C@H]1O', n_jobs=1, drop_values=False, add_atoms=False)

In [39]:
type(smi_test)

molml.molecule.BagOfBonds

Implementing descriptor featurization

In [42]:
df.columns

Index(['smiles', 'smiles source', 'charge', 'total_dipole_moment',
       'multiplicity', 'homo', 'lumo', 'gap', 'total_energy', 'TD_ET_00',
       'TD_OS_00', 'TD_ET_01', 'TD_OS_01', 'TD_ET_02', 'TD_OS_02', 'TD_ET_03',
       'TD_OS_03', 'TD_ET_04', 'TD_OS_04', 'TD_ET_05', 'TD_OS_05', 'TD_ET_06',
       'TD_OS_06', 'TD_ET_07', 'TD_OS_07', 'TD_ET_08', 'TD_OS_08', 'TD_ET_09',
       'TD_OS_09'],
      dtype='object')

In [40]:
X = df.drop(columns = ['TD_ET_00'])
y = df[['TD_ET_00']]

In [49]:
X.head()

Unnamed: 0_level_0,smiles,smiles source,charge,total_dipole_moment,multiplicity,homo,lumo,gap,total_energy,TD_OS_00,TD_ET_01,TD_OS_01,TD_ET_02,TD_OS_02,TD_ET_03,TD_OS_03,TD_ET_04,TD_OS_04,TD_ET_05,TD_OS_05,TD_ET_06,TD_OS_06,TD_ET_07,TD_OS_07,TD_ET_08,TD_OS_08,TD_ET_09,TD_OS_09
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
3,O=C(O)C1=CC=C[C@H](O)[C@H]1O,PubChemQC,0,5.266252,1,-6.821894,-2.239497,4.582397,-15575.874068,0.053438,34937.703709,0.127298,35191.469929,0.03561,38959.724483,0.04552,45869.45208,0.002901,46837.502902,0.004995,48214.775245,0.008408,48851.170886,0.003279,50890.898644,0.04637,51729.941085,0.009966
4,C[C@H](O)CN,PubChemQC,0,2.681395,1,-6.187869,1.847653,8.035522,-6794.53586,0.003867,50498.44539,0.022722,52102.569231,0.001101,53565.199229,0.044094,54291.396641,0.024346,58571.208342,0.009868,60147.530252,0.014954,60319.624771,0.008397,60994.089781,0.004093,62458.026397,0.000847
5,NCC(=O)COP(=O)(O)O,PubChemQC,0,8.447997,1,-7.270882,-1.52928,5.741602,-24256.843704,0.001015,41628.443712,0.008713,45462.755074,0.001559,46943.403498,0.000598,48366.754296,0.003638,50735.499802,0.000874,53071.087855,0.009599,54805.905776,0.025285,54979.815042,0.000488,55740.210361,0.013173
6,O=[N+]([O-])c1ccc(Cl)c([N+](=O)[O-])c1,PubChemQC,0,4.157554,1,-8.14981,-3.246318,4.903492,-29955.022787,0.008708,30033.272061,0.00305,33529.032202,0.020309,33901.611955,0.001681,34123.341833,0.00552,35756.461306,0.200556,36520.961987,0.046772,38970.830737,0.019129,39701.819082,0.000927,40475.820974,0.073291
7,CCn1cnc2c(N)ncnc21,PubChemQC,0,2.775867,1,-5.823236,-0.389123,5.434114,-14856.035269,0.190582,40311.404854,0.020937,42125.119929,0.037105,43694.465144,0.011508,45023.18291,0.001745,45422.403156,0.002244,47443.717276,0.000626,48595.614115,0.159045,48912.840038,0.01633,49181.261352,0.015406


Pre-processing

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

feats = [
    ("encoded_bond", EncodedBond(n_jobs=-1, max_depth=3)),
    #("atom_count", Connectivity(depth=1, n_jobs=-1)),
    #("angle_count", Connectivity(depth=3, use_coordination=True, n_jobs=-1))
    ]

full_feat = FeatureUnion(feats)
X_train = full_feat.fit_transform(X_train)
X_test = full_feat.transform(X_test)