In [None]:
# Mount Google Drive so files under /content/gdrive can be read and written.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip uninstall cython --y
!pip uninstall plotnine --y
!pip install cython==0.29.35
!pip install molmod

Found existing installation: Cython 3.0.12
Uninstalling Cython-3.0.12:
  Successfully uninstalled Cython-3.0.12
Found existing installation: plotnine 0.14.5
Uninstalling plotnine-0.14.5:
  Successfully uninstalled plotnine-0.14.5
Collecting cython==0.29.35
  Downloading Cython-0.29.35-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (3.1 kB)
Downloading Cython-0.29.35-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cython
Successfully installed cython-0.29.35
Collecting molmod
  Downloading molmod-1.4.8.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: molmod
  Building whe

In [None]:
import os
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from molmod.molecules import Molecule

In [None]:
import RankerModel

Instructions for updating:
non-resource variables are not supported in the long term


# Download Data
* QM7X train data

In [None]:
# Copy the dataset archive from the mounted Drive into the working directory as teste.zip.
!cp '/content/gdrive/MyDrive/teste_full.zip' teste.zip

In [None]:
!unzip teste.zip

In [None]:
# Root folder of the extracted archive; the loading loop treats every entry in
# it as one molecule folder containing a propriedades.csv.
diretorio_treino = "/content/teste/"
# os.listdir returns entries in arbitrary order; sorting makes the sample order
# (and therefore the saved results) reproducible across runs.
arquivos_treino = sorted(os.listdir(diretorio_treino))
print("Total Molecules:", len(arquivos_treino))

Total Molecules: 359


# Choose the Property

In [None]:
# Name of the target column read from each propriedades.csv; it is also part of
# the parameter and output paths built later in the notebook.
prop = "HOMO_energy"

# Loss function

In [None]:
# Loss identifier handed to RankerModel and embedded in the parameter/output paths.
# Alternative (disabled): loss_f = "BinaryCrossentropy"
loss_f="MSE"

# Auxiliary Functions

In [None]:
def ExtractIsomerosConfomeros(val):
  """Return the 3rd and 4th '-'-separated fields of a file name as a tuple.

  Every ".xyz" occurrence is removed from `val` before splitting.
  Raises IndexError when the name has fewer than four fields.
  NOTE(review): the fields are presumably the isomer and conformer ids --
  confirm against the dataset's file-naming scheme.
  """
  campos = val.replace(".xyz", "").split("-")
  return campos[2], campos[3]

In [None]:
def FeatureMatrix(simbols, Max_dim):
  """Build a zero-padded one-hot element matrix of shape (Max_dim, 6).

  Row i carries a 1 in the column of element simbols[i]
  (column order: C, Cl, H, N, O, S); rows past len(simbols) stay zero.
  Raises KeyError for a symbol outside that set and IndexError when there
  are more symbols than Max_dim.
  """
  coluna_por_elemento = {'C':0, 'Cl':1, 'H':2, 'N':3, 'O':4, 'S':5} #QM7-X
  one_hot = np.zeros((Max_dim, len(coluna_por_elemento)), dtype=float, order='C')
  for linha, simbolo in enumerate(simbols):
    one_hot[linha, coluna_por_elemento[simbolo]] = 1

  return one_hot


def Euclidian_Distance(vec_a, vec_b):
  """Euclidean distance between two atom positions.

  Both arguments must support element-wise `-` and `**` (e.g. 1-D NumPy
  arrays of equal length).
  """
  delta = vec_a - vec_b
  return sum(delta**2)**0.5


def Distance_Matrix(coordinates, max_dim):
  """Return the zero-padded matrix of pairwise Euclidean distances.

  Args:
    coordinates: array-like of shape (n_atoms, n_dims) with atom positions.
    max_dim: side length of the square output; must be >= n_atoms.

  Returns:
    Float ndarray of shape (max_dim, max_dim). Entry [i, j] is the distance
    between atoms i and j for i, j < n_atoms and 0 everywhere else.

  Raises:
    IndexError: if there are more atoms than max_dim.
  """
  coordinates = np.asarray(coordinates, dtype=float)
  n_atoms = coordinates.shape[0]
  if n_atoms > max_dim:
    raise IndexError(
        f"molecule has {n_atoms} atoms but max_dim is only {max_dim}")

  matrix_dist = np.zeros(shape=(max_dim, max_dim), dtype=float, order='C')
  if n_atoms:
    # Broadcasting (n, 1, d) - (1, n, d) produces every pairwise difference
    # vector in one step, replacing the per-pair Python loop.
    diff = coordinates[:, np.newaxis, :] - coordinates[np.newaxis, :, :]
    matrix_dist[:n_atoms, :n_atoms] = np.sqrt((diff**2).sum(axis=-1))

  return matrix_dist

# Load Data

In [None]:
# Per-structure model inputs and targets, filled by the loading loop.
lst_features_teste = []
lst_distancias_teste = []
lst_mol_sizes_teste = []
lst_target_teste = []

# Provenance of each structure: molecule folder and structure file name.
lst_molecule = []
lst_molecule_file_name = []

In [None]:
# Side length of the padded feature/distance matrices.
# NOTE(review): 23 presumably matches the input size the trained model
# expects (largest molecule in the dataset) -- confirm.
MAX_ATOMS = 23
PROPERTIES_FILE = "propriedades.csv"

# Start from empty accumulators so that re-running this cell does not append
# every structure a second time.
for acumulador in (lst_features_teste, lst_distancias_teste, lst_mol_sizes_teste,
                   lst_target_teste, lst_molecule, lst_molecule_file_name):
  acumulador.clear()

inicio = time.time()
for arq in tqdm(arquivos_treino, desc="Loading molecules"):
  pasta = os.path.join(diretorio_treino, arq)
  df_aux = pd.read_csv(os.path.join(pasta, PROPERTIES_FILE))

  # One lookup table per folder instead of filtering the frame for every file.
  # drop_duplicates keeps the first row per config, matching the previous
  # `.values[0]` selection.
  alvo_por_config = (
      df_aux.drop_duplicates(subset='molecula_config')
            .set_index('molecula_config')[prop]
  )

  # Every file except the properties table is a structure file; sorted because
  # os.listdir order is arbitrary.
  lst_files = sorted(f for f in os.listdir(pasta) if f != PROPERTIES_FILE)

  for molecula in lst_files:
    molecule = Molecule.from_file(os.path.join(pasta, molecula))

    # Raises KeyError naming the config if the structure has no properties row.
    val_target = alvo_por_config.loc[molecula.replace(".xyz", "")] #target
    val_dist = Distance_Matrix(molecule.coordinates, MAX_ATOMS)
    val_features = FeatureMatrix(molecule.symbols, MAX_ATOMS)
    val_mol_size = len(molecule.symbols)

    lst_target_teste.append(val_target)
    lst_features_teste.append(val_features)
    lst_distancias_teste.append(val_dist)
    lst_mol_sizes_teste.append(val_mol_size)

    lst_molecule.append(arq)
    lst_molecule_file_name.append(molecula)

fim = time.time()
print("Time to load data :", fim - inicio)

Time to load data : 247.11971378326416


In [None]:
# Number of structures loaded (one target value was appended per structure file).
print("Number of samples for testing:", len(lst_target_teste))

Number of samples for testing: 207252


# Model Setup and Prediction

In [None]:
# Create the ranker wrapper from the local RankerModel module, configured with the chosen loss.
# NOTE(review): constructor semantics live in RankerModel and are not visible here.
obj_ranker = RankerModel.ModelRanker(loss_f)

In [None]:
# Build the network object that is later passed to Predict.
# NOTE(review): predict_aggregation='mean' presumably averages per-atom outputs
# into one score per molecule -- confirm in RankerModel.
ranker_model = obj_ranker.AtomNeuralNet(loss_f, predict_aggregation = 'mean')

In [None]:
# Checkpoint location on Drive, keyed by target property and loss name.
caminho_parametros = f"/content/gdrive/MyDrive/ModelParameters_Pairwise/{prop}_{loss_f}/parameters/parameters.ckp"
print(caminho_parametros)

In [None]:
# Score every loaded structure using the parameters stored at caminho_parametros.
# NOTE(review): presumably returns one score per structure, in input order --
# the results table assigns it as a column next to the input lists; confirm.
predicted_ranking_scores = obj_ranker.Predict(ranker_model, caminho_parametros,
                                              lst_features_teste, lst_target_teste,
                                              lst_distancias_teste, lst_mol_sizes_teste)

# Saving Results

In [None]:
# Assemble the results table, one row per loaded structure.
df_resultados = pd.DataFrame()
df_resultados['Molecula'] = lst_molecule  # molecule folder name
df_resultados['FileName'] = lst_molecule_file_name  # structure file name inside that folder
df_resultados[prop] = lst_target_teste  # reference value read from propriedades.csv
df_resultados['Predito'] = predicted_ranking_scores  # model output ("Predito" = predicted)

In [None]:
# Parse each file name once, then spread the two returned fields over two columns.
pares_iso_conf = [ExtractIsomerosConfomeros(nome) for nome in df_resultados['FileName']]
df_resultados['Isomero'] = [par[0] for par in pares_iso_conf]
df_resultados['Conformero'] = [par[1] for par in pares_iso_conf]

In [None]:
# Persist the results on Drive under the same {prop}_{loss_f} folder that holds the parameters.
# NOTE: index=False is not passed, so the DataFrame index is written as the
# first (unnamed) CSV column.
df_resultados.to_csv(f"/content/gdrive/MyDrive/ModelParameters_Pairwise/{prop}_{loss_f}/structures_scores_{loss_f}.csv")

In [None]:
# Release the Colab runtime now that the results have been written to Drive.
# NOTE(review): anything kept only on the runtime's local disk or in memory is
# presumably lost after this call -- confirm before adding later cells.
from google.colab import runtime
runtime.unassign()