In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn import metrics
import tqdm
import pickle as pkl

In [None]:
# Load the QM8 table; the path is relative to the notebook's working directory.
df = pd.read_csv('data/qm8.csv')

In [None]:
# Preview the first rows to check the loaded columns (a `smiles` column is used below).
df.head()

In [None]:
# Register tqdm with pandas; the `progress_apply` calls further down depend on this.
tqdm.tqdm.pandas()

In [None]:
def smile_is_3d(smile):
    """Report whether RDKit can embed a SMILES string into a 3D conformer.

    The molecule is parsed, explicit hydrogens are added, and a conformer
    embedding is attempted.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.

    Returns
    -------
    bool
        True if at least one conformer exists after embedding and reports
        itself as 3D, otherwise False (including when no conformer was
        generated).

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smile``.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending input rather than with an opaque error from AddHs.
        raise ValueError("could not parse SMILES: {!r}".format(smile))
    mol = Chem.AddHs(mol)
    AllChem.EmbedMolecule(mol)

    # With no conformers the generator is empty and any() yields False.
    return any(conf.Is3D() for conf in mol.GetConformers())

In [None]:
def smile_features(smile, random_seed=-1):
    """Compute atomic numbers and distance matrices for one SMILES string.

    The molecule is parsed, explicit hydrogens are added, a conformer is
    embedded and, when embedding succeeds, optimised with MMFF.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.
    random_seed : int, optional
        Seed forwarded to the conformer embedding. The default ``-1`` keeps
        RDKit's unseeded behaviour; pass a fixed non-negative value to get
        reproducible coordinates.

    Returns
    -------
    tuple
        ``(atomic_nums, euclid_D, graph_D, is_3D)`` where

        * ``atomic_nums`` is a list with the atomic number of every atom,
          including the added hydrogens;
        * ``euclid_D`` is the 3D distance matrix of the embedded conformer.
          If embedding failed it is instead the distance-geometry *bounds*
          matrix, which holds distance bounds rather than distances, so check
          ``is_3D`` before treating it as Euclidean distances;
        * ``graph_D`` is the topological (bond-count) distance matrix;
        * ``is_3D`` is False when embedding failed, otherwise the ``Is3D()``
          flag of the embedded conformer.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smile``.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending input rather than with an opaque error from AddHs.
        raise ValueError("could not parse SMILES: {!r}".format(smile))
    mol = Chem.AddHs(mol)

    # EmbedMolecule returns the new conformer's id, or -1 when embedding fails.
    conf_id = AllChem.EmbedMolecule(mol, randomSeed=random_seed)
    embedded = conf_id != -1

    is_3D = False
    if embedded:
        # The optimiser's status code is not inspected; the coordinates are
        # used whether or not the minimisation converged.
        AllChem.MMFFOptimizeMolecule(mol)
        is_3D = mol.GetConformer(conf_id).Is3D()

    atomic_nums = [atom.GetAtomicNum() for atom in mol.GetAtoms()]

    if embedded:
        euclid_D = Chem.rdmolops.Get3DDistanceMatrix(mol)
    else:
        # No coordinates are available: fall back to the bounds matrix.
        euclid_D = Chem.rdDistGeom.GetMoleculeBoundsMatrix(mol)
    graph_D = Chem.GetDistanceMatrix(mol)
    return (atomic_nums, euclid_D, graph_D, is_3D)

In [None]:
# Manual check on a single molecule (SMILES written with explicit hydrogens):
# parse it, add hydrogens and embed a conformer. The result stays in the
# notebook-level name `mol` for the distance-matrix cell that follows.
mol = Chem.MolFromSmiles('[H]C([H])([H])C12C([H])([H])N(C1([H])[H])C2([H])[H]')
mol = Chem.AddHs(mol)
AllChem.EmbedMolecule(mol)
# MMFF optimisation is left disabled for this check; uncomment the next line
# to relax the embedded geometry first.
# AllChem.MMFFOptimizeMolecule(mol)

In [None]:
# Display the 3D distance matrix of the notebook-level `mol` embedded in the previous cell.
Chem.rdmolops.Get3DDistanceMatrix(mol)

In [None]:
import numpy as np

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so columns later added through `sample_df` also appear on `df`.
sample_df = df

In [None]:
# Smoke-test the feature extraction on the first 10 SMILES before the full run;
# `res` is the list of tuples returned by `smile_features`.
res = df.smiles.head(10).progress_apply(smile_features).to_list()

In [None]:
# Compute the features for every row and store them as four columns.
# Each row yields a tuple whose entries are a list, two arrays whose size
# depends on the molecule, and a bool. Passing that ragged list straight to
# `.loc` forces pandas/NumPy to coerce it into one rectangular array, which is
# fragile, so the tuples are first laid out in an explicit frame (one object
# cell per entry) and then copied across column by column.
feature_columns = ['Z', 'euclid_D', 'graph_D', 'is_3D']
feature_rows = sample_df.smiles.progress_apply(smile_features).to_list()
feature_frame = pd.DataFrame(
    feature_rows,
    columns=feature_columns,
    index=sample_df.index,
)
# Per-column assignment updates `sample_df` in place and simply overwrites the
# columns if this cell is run again.
for column in feature_columns:
    sample_df[column] = feature_frame[column]

In [None]:
# Write the frame, including the feature columns added above, as JSON Lines
# (one record per row). The path is relative to the notebook's working directory.
sample_df.to_json('data/rdkit_euclid.json', lines=True, orient='records')