# первая часть #

In [1]:
# Mount Google Drive at /content/drive/; the data paths used below live under it.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
!pip install pymatgen

In [3]:
import pandas as pd
from pymatgen.core import periodic_table, composition

In [27]:
# Location of the oxide table (CSV) on the mounted Drive.
path_to_pymatgen = '/content/drive/MyDrive/datacon2023/minitask_2/data_pymatgen.csv'

# Read the table and preview its first rows.
data = pd.read_csv(path_to_pymatgen)
data.head()

Unnamed: 0.1,Unnamed: 0,Material,Element,Ionic Radius,Average Electronegativity,Molecular weight
0,0,CuO,,,,
1,1,ZnO,,,,
2,4,Mn2O3,,,,
3,8,CoO,,,,
4,80,CeO2,,,,


In [28]:
def _oxide_features(formula):
  """Compute the feature values of one oxide from its chemical formula.

  Args:
    formula: chemical formula string, e.g. 'Mn2O3'.

  Returns:
    A dict keyed by the column names of ``data`` that this cell fills in.
  """
  compound = composition.Composition(formula)
  elements = compound.elements

  # Non-oxygen element of the oxide. It is selected by symbol instead of
  # assuming it is written first in the formula; if the formula contains
  # nothing but oxygen, the first element is used.
  cation = next((el for el in elements if el.symbol != 'O'), elements[0])

  # Unweighted mean over the distinct elements of the compound
  # (stoichiometric coefficients are not taken into account).
  average_electronegativity = sum(el.X for el in elements) / len(elements)

  return {
      # Store the plain symbol string rather than the pymatgen Element object.
      'Element': cation.symbol,
      # Ionic radius averaged over the oxidation states of the non-oxygen element.
      'Ionic Radius': float(cation.average_ionic_radius),
      # Molar mass of the compound.
      'Molecular weight': float(compound.weight),
      'Average Electronegativity': average_electronegativity,
  }


# Compute every row once and replace whole columns in a single step. This
# avoids re-scanning the Material column for each row and avoids writing
# strings into columns that were read from the CSV as all-NaN floats.
features = pd.DataFrame([_oxide_features(formula) for formula in data.Material])
for column in features.columns:
  data[column] = features[column].to_numpy()

In [29]:
# Display the table with the computed columns filled in.
data

Unnamed: 0.1,Unnamed: 0,Material,Element,Ionic Radius,Average Electronegativity,Molecular weight
0,0,CuO,Cu,0.82,2.67,79.5454
1,1,ZnO,Zn,0.88,2.545,81.4084
2,4,Mn2O3,Mn,0.648333,2.495,157.87429
3,8,CoO,Co,0.768333,2.66,74.932595
4,80,CeO2,Ce,1.08,2.28,172.1148
5,82,Fe2O3,Fe,0.8525,2.635,159.6882
6,83,Gd2O3,Gd,1.075,2.32,362.4982
7,84,HfO2,Hf,0.85,2.37,210.4888
8,87,In2O3,In,0.94,2.61,277.6342
9,92,La2O3,La,1.172,2.27,325.80914


In [30]:
# Save the filled-in table as a CSV file in the current working directory.
# NOTE(review): the row index is presumably written as an extra unnamed column
# (the input file already carries 'Unnamed: 0' / 'Unnamed: 0.1'); consider
# index=False if the expected file format allows it — confirm.
data.to_csv('minitask_2_1_13.csv')

# вторая часть #

In [11]:
!pip install rdkit
!pip install pubchempy

In [12]:
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors

In [13]:
# Load the list of descriptor names to compute (one name per line).

with open(r'/content/drive/MyDrive/datacon2023/minitask_2/chosen_descriptors.txt', 'r') as file:
    # Strip surrounding whitespace and skip blank lines, so that e.g. a
    # trailing empty line does not turn into an empty descriptor name.
    descriptors = [line.strip() for line in file if line.strip()]

descriptors

['Chi0n',
 'Chi0v',
 'EState_VSA9',
 'ExactMolWt',
 'FpDensityMorgan1',
 'NumAromaticRings',
 'NumHAcceptors',
 'NumHDonors',
 'NumHeteroatoms',
 'NumValenceElectrons',
 'PEOE_VSA1',
 'PEOE_VSA9',
 'RingCount',
 'SMR_VSA1',
 'SlogP_VSA2',
 'TPSA',
 'VSA_EState1']

Функция, которая принимает на вход молекулу и список нужных дескрипторов и возвращает список значений этих дескрипторов для данной молекулы.

In [14]:
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

In [15]:
def get_descriptor_values(mol, descriptors):
  """Return the values of the requested descriptors for a single molecule.

  Args:
    mol: molecule handed to the descriptor calculator.
    descriptors: names of the descriptors to compute.

  Returns:
    A list holding the calculator's results for ``mol``.
  """
  calculator = MolecularDescriptorCalculator(descriptors)
  return [value for value in calculator.CalcDescriptors(mol)]

In [16]:
# Print the chosen descriptor values for each of the 15 molecule files (file_0 ... file_14).
for i in range(15):
  # Despite its name, sdf_file holds whatever Chem.MolFromMolFile returns for
  # the file, not a path or an open file handle.
  sdf_file = Chem.MolFromMolFile(f'/content/drive/MyDrive/datacon2023/minitask_2/mol_files/mol_files/file_{i}.sdf')
  lst = get_descriptor_values(sdf_file, descriptors)
  print(lst)

[14.627154049305076, 15.383082995323532, 11.600939890232516, 375.140134876, 1.0769230769230769, 2, 3, 1, 5, 138, 10.006437125691184, 5.601050810983688, 3, 14.291479626587346, 35.42395128979385, 40.54, 12.905095918537544]
[23.128347803352913, 23.128347803352913, 42.63176658420044, 588.184291084, 0.8809523809523809, 2, 13, 3, 13, 226, 57.95134876872255, 39.4552314454906, 7, 62.745885952794374, 92.51110779981832, 160.82999999999998, 51.52619149706759]
[15.442229938874268, 15.442229938874268, 0.0, 360.193673996, 1.3076923076923077, 0, 5, 3, 5, 142, 15.319582184522117, 6.103966387748303, 4, 24.90865655266576, 45.1979712404969, 94.83000000000001, 0.0]
[9.737974215908514, 9.737974215908514, 9.843390348640755, 258.05282342, 0.8947368421052632, 2, 4, 2, 5, 96, 14.94991774348146, 0.0, 2, 24.538992111625106, 22.15166536558511, 83.83000000000001, 4.993935185185185]
[48.66008583125275, 48.66008583125275, 28.668337385810926, 1154.7499271960003, 0.5308641975308642, 0, 18, 18, 29, 464, 97.366066819562

Делаем датафрейм с молекулами в виде SMILES и вычисленными дескрипторами.

In [17]:
# List with all molecules read from the mol files (file_0 ... file_14).

mols = [
    Chem.MolFromMolFile(f'/content/drive/MyDrive/datacon2023/minitask_2/mol_files/mol_files/file_{i}.sdf')
    for i in range(15)
]

Через написанную функцию оказалось не так удобно, как сразу собрать датафрейм с помощью встроенной функции CalcMolDescriptors.

In [18]:
# One result of Descriptors.CalcMolDescriptors per molecule, collected into a DataFrame.
descrs = [Descriptors.CalcMolDescriptors(mol) for mol in mols] # get all of the descriptors
df = pd.DataFrame(descrs)

In [20]:
df_descr = df[descriptors] # keep only the descriptors we need

In [21]:
# Put the SMILES string of each molecule into the first column. The guard keeps
# this cell re-runnable: inserting a column that already exists raises ValueError.
if 'molecule' not in df_descr.columns:
  df_descr.insert(0, 'molecule', [Chem.MolToSmiles(mol) for mol in mols])

In [22]:
# Display the table of SMILES strings and selected descriptors.
df_descr

Unnamed: 0,molecule,Chi0n,Chi0v,EState_VSA9,ExactMolWt,FpDensityMorgan1,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumValenceElectrons,PEOE_VSA1,PEOE_VSA9,RingCount,SMR_VSA1,SlogP_VSA2,TPSA,VSA_EState1
0,O=C(CCCN1CCC(O)(c2ccc(Cl)cc2)CC1)c1ccc(F)cc1,14.627154,15.383083,11.60094,375.140135,1.076923,2,3,1,5,138,10.006437,5.601051,3,14.29148,35.423951,40.54,12.905096
1,COc1cc([C@@H]2c3cc4c(cc3[C@@H](O[C@@H]3O[C@@H]...,23.128348,23.128348,42.631767,588.184291,0.880952,2,13,3,13,226,57.951349,39.455231,7,62.745886,92.511108,160.83,51.526191
2,C[C@]12C=CC(=O)C=C1CC[C@@H]1[C@@H]2[C@@H](O)C[...,15.44223,15.44223,0.0,360.193674,1.307692,0,5,3,5,142,15.319582,6.103966,4,24.908657,45.197971,94.83,0.0
3,O=C(Oc1ccccc1C(=O)O)c1ccccc1O,9.737974,9.737974,9.84339,258.052823,0.894737,2,4,2,5,96,14.949918,0.0,2,24.538992,22.151665,83.83,4.993935
4,CCC(C)CCCC(=O)N[C@@H](CCN)C(=O)N[C@H](C(=O)N[C...,48.660086,48.660086,28.668337,1154.749927,0.530864,0,18,18,29,464,97.366067,12.207933,1,62.952964,187.086911,490.66,0.0
5,C[C@@H]1C[C@H]2[C@@H]3C[C@H](F)C4=CC(=O)C=C[C@...,16.491432,17.247361,11.60094,410.166015,1.321429,0,4,2,6,154,10.213055,10.978139,4,24.192544,45.536241,74.6,15.305172
6,O=C([O-])C(O)[C@H](O)[C@@H](O)[C@H](O)[C@H](O)...,15.894379,18.102749,61.278329,490.084676,0.516129,0,16,12,17,180,81.080458,25.152375,0,81.080458,185.208602,323.02,0.0
7,COC(=O)C[C@](O)(CCCC(C)(C)O)C(=O)O[C@@H]1C(OC)...,23.157198,23.157198,23.684315,545.262482,1.128205,1,10,2,10,214,33.89737,37.698299,5,43.486444,83.998619,123.99,28.09674
8,CC1(C)O[C@@H]2C[C@H]3[C@@H]4CCC5=CC(=O)C=C[C@]...,18.181756,18.181756,9.473726,416.219889,1.266667,0,6,2,6,164,19.686781,12.207933,5,29.275855,51.982521,93.06,12.5577
9,CC(=O)[C@H]1CC[C@H]2[C@@H]3C[C@H](C)C4=CC(=O)C...,16.047803,16.047803,0.0,344.235145,1.28,0,3,1,3,138,5.106527,6.103966,4,14.695602,22.776984,54.37,0.0


In [24]:
# Save the descriptor table as a CSV file in the current working directory.
df_descr.to_csv('minitask_2_2_13.csv')