Извлечение дескрипторов из записей SMILES отобранных молекул

In [69]:
import pandas as pd

# Загрузка датасета и вывод первых 5 строк
df = pd.read_csv('dataset_filtered.csv')
df = df.drop(columns=['Standard Relation', 'Standard Units'])
df.head()

Unnamed: 0,Smiles,Standard Value
0,O=S(=O)(c1ccc(C(F)(F)F)cc1)N1C2Cc3[nH]ncc3C1CC...,36.0
1,O=S(=O)(c1ccc(C(F)(F)F)nc1)N1C2Cc3[nH]ncc3C1CC...,4.0
2,COc1cc(-c2nnc3n2CCCN3C(C)c2c(F)cc(F)cc2F)ccc1-...,342.0
3,COc1cc(-c2nnc3n2CCCN3Cc2ccc(Cl)c(Cl)c2)ccc1-n1...,126.0
4,COc1cc(-c2onc3c2CCCN3[C@@H](C)c2ccccc2)ccc1-n1...,537.0


Импорт необходимых библиотек

In [70]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors

In [71]:
# Получение списка доступных дескрипторов
descriptor_names = list(rdMolDescriptors.Properties.GetAvailableProperties())
get_descriptors = rdMolDescriptors.Properties(descriptor_names)
num_descriptors = len(descriptor_names)

# Инициализация пустой матрицы для дескрипторов
descriptors_set = np.empty((0, num_descriptors), float)

# Расчет дескрипторов
for _, row in df.iterrows():
    smiles = row['Smiles']
    molecule = Chem.MolFromSmiles(smiles)

    if molecule is not None:
        descriptor = np.array(get_descriptors.ComputeProperties(molecule)).reshape((-1, num_descriptors))
        descriptors_set = np.append(descriptors_set, descriptor, axis=0)

df_descriptors = pd.DataFrame(descriptors_set, columns=descriptor_names)

# Объединение существующего датасета с новыми дескрипторами
df_rdkit = pd.concat([df, df_descriptors], axis=1)

Выводим информацию о новом датасете с десткрипторами

In [72]:
df_rdkit.head()

Unnamed: 0,Smiles,Standard Value,exactmw,amw,lipinskiHBA,lipinskiHBD,NumRotatableBonds,NumHBD,NumHBA,NumHeavyAtoms,...,chi0n,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi
0,O=S(=O)(c1ccc(C(F)(F)F)cc1)N1C2Cc3[nH]ncc3C1CC...,36.0,513.144631,513.545,7.0,1.0,4.0,1.0,5.0,36.0,...,18.929067,11.528472,7.205882,7.205882,5.506248,-3.44,22.988319,8.308724,3.824418,5.305655
1,O=S(=O)(c1ccc(C(F)(F)F)nc1)N1C2Cc3[nH]ncc3C1CC...,4.0,466.10866,466.46,6.0,1.0,3.0,1.0,4.0,32.0,...,16.550417,9.993499,6.22903,6.22903,4.848409,-2.79,21.076572,7.376896,3.394202,4.85874
2,COc1cc(-c2nnc3n2CCCN3C(C)c2c(F)cc(F)cc2F)ccc1-...,342.0,468.188544,468.483,7.0,0.0,5.0,0.0,7.0,34.0,...,18.965546,10.907253,6.125779,6.125779,4.484056,-3.82,21.9961,8.510579,3.683284,5.505869
3,COc1cc(-c2nnc3n2CCCN3Cc2ccc(Cl)c(Cl)c2)ccc1-n1...,126.0,468.123215,469.376,7.0,0.0,5.0,0.0,7.0,32.0,...,17.794688,10.347161,5.60908,5.60908,4.059013,-3.03,20.849495,8.483873,3.823881,5.527639
4,COc1cc(-c2onc3c2CCCN3[C@@H](C)c2ccccc2)ccc1-n1...,537.0,414.205576,414.509,6.0,0.0,5.0,0.0,6.0,31.0,...,18.077524,10.649493,6.046511,6.046511,4.384465,-3.54,19.425078,7.945994,3.478645,4.979082


In [73]:
df_rdkit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3041 entries, 0 to 3040
Data columns (total 45 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Smiles                           3041 non-null   object 
 1   Standard Value                   3041 non-null   float64
 2   exactmw                          3041 non-null   float64
 3   amw                              3041 non-null   float64
 4   lipinskiHBA                      3041 non-null   float64
 5   lipinskiHBD                      3041 non-null   float64
 6   NumRotatableBonds                3041 non-null   float64
 7   NumHBD                           3041 non-null   float64
 8   NumHBA                           3041 non-null   float64
 9   NumHeavyAtoms                    3041 non-null   float64
 10  NumAtoms                         3041 non-null   float64
 11  NumHeteroatoms                   3041 non-null   float64
 12  NumAmideBonds       

Фильтрация по коэффициенту корреляции

In [74]:
# установка библиотеки sklearn
! pip install scikit-learn



In [75]:
from sklearn.preprocessing import LabelEncoder

# Кодирование SMILES
label_encoder = LabelEncoder()
df_rdkit['smiles_encoded'] = label_encoder.fit_transform(df_rdkit['Smiles'])

# Сохранение целевого столбца
df_target = pd.DataFrame()
df_target['Standard Value'] = df_rdkit['Standard Value']
df_target['smiles_encoded'] = df_rdkit['smiles_encoded']
df_rdkit.drop(columns=['Standard Value', 'Smiles'], inplace=True, axis=1)

In [76]:
# Вычисление матрицы корреляции
corr_matrix = df_rdkit.corr()

# Выбор дескрипторов с корреляцией > 0.8 или < -0.8
high_corr_descriptors = set()
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if abs(corr_matrix.iloc[i, j]) > 0.8:
            high_corr_descriptors.add(corr_matrix.columns[i])

# Удаление выбранных дескрипторов
data_filtered = df_rdkit.drop(high_corr_descriptors, axis=1)

In [77]:
# Вывод информации по оставшимся дескрипторам
data_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3041 entries, 0 to 3040
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   exactmw                          3041 non-null   float64
 1   FractionCSP3                     3041 non-null   float64
 2   NumRings                         3041 non-null   float64
 3   NumAromaticRings                 3041 non-null   float64
 4   NumAliphaticRings                3041 non-null   float64
 5   NumHeterocycles                  3041 non-null   float64
 6   NumAromaticHeterocycles          3041 non-null   float64
 7   NumSaturatedHeterocycles         3041 non-null   float64
 8   NumAliphaticHeterocycles         3041 non-null   float64
 9   NumSpiroAtoms                    3041 non-null   float64
 10  NumBridgeheadAtoms               3041 non-null   float64
 11  NumAtomStereoCenters             3041 non-null   float64
 12  NumUnspecifiedAtomSt

Объедение столбцов-дескрипторов и целевого столбца

In [82]:
merged_df = pd.merge(df_target, data_filtered, on='smiles_encoded', how='right')
merged_df.head()

Unnamed: 0,Standard Value,smiles_encoded,exactmw,FractionCSP3,NumRings,NumAromaticRings,NumAliphaticRings,NumHeterocycles,NumAromaticHeterocycles,NumSaturatedHeterocycles,NumAliphaticHeterocycles,NumSpiroAtoms,NumBridgeheadAtoms,NumAtomStereoCenters,NumUnspecifiedAtomStereoCenters,CrippenClogP,hallKierAlpha
0,36.0,2181,513.144631,0.28,6.0,4.0,2.0,4.0,2.0,1.0,2.0,0.0,2.0,3.0,3.0,4.8485,-3.44
1,10457.0,2181,513.144631,0.28,6.0,4.0,2.0,4.0,2.0,1.0,2.0,0.0,2.0,3.0,3.0,4.8485,-3.44
2,40.0,2181,513.144631,0.28,6.0,4.0,2.0,4.0,2.0,1.0,2.0,0.0,2.0,3.0,3.0,4.8485,-3.44
3,29717.0,2181,513.144631,0.28,6.0,4.0,2.0,4.0,2.0,1.0,2.0,0.0,2.0,3.0,3.0,4.8485,-3.44
4,4.0,2205,466.10866,0.333333,5.0,3.0,2.0,4.0,2.0,1.0,2.0,0.0,2.0,3.0,3.0,4.1969,-2.79


Удаление дубликатов и пропущенных значений

In [85]:
merged_df = merged_df.drop_duplicates()
merged_df = merged_df.dropna()

In [86]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3041 entries, 0 to 5076
Data columns (total 17 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Standard Value                   3041 non-null   float64
 1   smiles_encoded                   3041 non-null   int64  
 2   exactmw                          3041 non-null   float64
 3   FractionCSP3                     3041 non-null   float64
 4   NumRings                         3041 non-null   float64
 5   NumAromaticRings                 3041 non-null   float64
 6   NumAliphaticRings                3041 non-null   float64
 7   NumHeterocycles                  3041 non-null   float64
 8   NumAromaticHeterocycles          3041 non-null   float64
 9   NumSaturatedHeterocycles         3041 non-null   float64
 10  NumAliphaticHeterocycles         3041 non-null   float64
 11  NumSpiroAtoms                    3041 non-null   float64
 12  NumBridgeheadAtoms       

Сохранение подгатовленного датасета для обучения модели

In [87]:
merged_df.to_csv('prepared_dataset.csv', index=False)