In [None]:
from sklearn import preprocessing
import torch
from rdkit.Chem import Descriptors, MolFromSmiles
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from botorch.models import SingleTaskGP
from botorch import fit_gpytorch_model
from gpytorch.mlls import ExactMarginalLogLikelihood

# Global plotting theme for every figure in this notebook: white grid
# background with fonts scaled up by 1.75x.
sns.set(font_scale=1.75, style="whitegrid")

In [None]:
# --- Configuration -----------------------------------------------------------
subsample_ratio = 0.25  # fraction of rows kept for the rest of the notebook
subsample_seed = 0      # fixed seed so Restart & Run All yields the same subsample
overwrite_df = False    # when True, write the frame (incl. new descriptor columns) back to src_path
src_path = '../conformalbo/assets/zinc_subsample.csv'

# --- Load --------------------------------------------------------------------
zinc_df = pd.read_csv(src_path)

# Output column name -> RDKit descriptor function applied to the parsed molecule.
# NOTE(review): "num_valence_electons" looks like a typo for "...electrons"; it is
# left as-is because it is a column name that may already exist in the CSV or be
# referenced by later cells -- confirm before renaming.
descriptors = {
    "exact_mol_wt": Descriptors.ExactMolWt,
    "fp_density_morgan_1": Descriptors.FpDensityMorgan1,
    "fp_density_morgan_2": Descriptors.FpDensityMorgan2,
    "fp_density_morgan_3": Descriptors.FpDensityMorgan3,
    "heavy_atom_mol_wt": Descriptors.HeavyAtomMolWt,
    "max_abs_partial_charge": Descriptors.MaxAbsPartialCharge,
    "max_partial_charge": Descriptors.MaxPartialCharge,
    "min_abs_partial_charge": Descriptors.MinAbsPartialCharge,
    "min_partial_charge": Descriptors.MinPartialCharge,
    "mol_weight": Descriptors.MolWt,
    "num_radical_electrons": Descriptors.NumRadicalElectrons,
    "num_valence_electons": Descriptors.NumValenceElectrons,
}

# --- Compute descriptors that are not already cached in the CSV ---------------
missing_descriptors = {
    descr_key: descr_fn
    for descr_key, descr_fn in descriptors.items()
    if descr_key not in zinc_df.columns
}

if missing_descriptors:
    # Parse every SMILES string exactly once and reuse the molecule for all
    # descriptors, instead of re-parsing once per descriptor column.
    smiles_strings = list(zinc_df.smiles)
    molecules = [MolFromSmiles(smiles) for smiles in smiles_strings]

    # MolFromSmiles returns None for strings it cannot parse; fail loudly here
    # rather than with an opaque argument error inside a descriptor call.
    unparsed = [s for s, mol in zip(smiles_strings, molecules) if mol is None]
    if unparsed:
        raise ValueError(
            f"{len(unparsed)} SMILES string(s) in {src_path} could not be parsed "
            f"by RDKit, e.g. {unparsed[:5]}"
        )

    for descr_key, descr_fn in missing_descriptors.items():
        # Values are in row order, so positional assignment lines up with zinc_df.
        zinc_df[descr_key] = [descr_fn(mol) for mol in molecules]

# --- Optionally persist the full (un-subsampled) frame ------------------------
if overwrite_df:
    zinc_df.to_csv(src_path, index=False)

# --- Subsample for the remainder of the notebook ------------------------------
zinc_df = zinc_df.sample(frac=subsample_ratio, random_state=subsample_seed)
zinc_df.tail()