In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pickle as pkl

import numpy as np
import pandas as pd
import tqdm
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn import metrics

In [None]:
# Load the raw QM8 table from CSV (path is relative to the working directory).
df = pd.read_csv('data/qm8.csv')

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Enable tqdm's pandas integration; the `progress_apply` calls in later
# cells rely on it.
tqdm.tqdm.pandas()

In [None]:
def process_smile(smile, random_seed=-1):
    """Embed a molecule in 3D and return its atomic coordinates.

    The SMILES string is parsed with RDKit, explicit hydrogens are added and
    a conformer is generated with ``AllChem.EmbedMolecule``.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.
    random_seed : int, optional
        Forwarded to ``AllChem.EmbedMolecule`` as ``randomSeed``. The default
        ``-1`` is RDKit's own default, so existing callers are unaffected;
        pass a fixed non-negative value for reproducible coordinates.

    Returns
    -------
    numpy.ndarray
        The ``GetPositions()`` arrays of all conformers on the molecule,
        concatenated along the first axis (explicit hydrogens included).
        Presumably one row of x/y/z per atom -- confirm against RDKit docs.

    Raises
    ------
    ValueError
        If ``smile`` cannot be parsed, or if no conformer was produced by
        the embedding step.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # Fail with a readable message instead of the opaque argument error
        # that AddHs raises when handed None.
        raise ValueError("Could not parse SMILES: {!r}".format(smile))
    mol = Chem.AddHs(mol)

    AllChem.EmbedMolecule(mol, randomSeed=random_seed)

    pos = [conformer.GetPositions() for conformer in mol.GetConformers()]
    if not pos:
        # No conformer means the embedding did not succeed; np.concatenate
        # on an empty list would otherwise raise an unrelated-looking error.
        raise ValueError("3D embedding failed for SMILES: {!r}".format(smile))

    return np.concatenate(pos)

In [None]:
def smile_features(smile):
    """Compute the raw per-molecule features for one SMILES string.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.

    Returns
    -------
    tuple
        ``(atomic_nums, dist_matrix)`` where ``atomic_nums`` is a list with
        the atomic number of every atom (explicit hydrogens included, in
        RDKit atom order) and ``dist_matrix`` is whatever
        ``GetMoleculeBoundsMatrix`` returns for the hydrogen-added molecule.

    Raises
    ------
    ValueError
        If ``smile`` cannot be parsed by RDKit.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # Fail with a readable message instead of the opaque argument error
        # that AddHs raises when handed None.
        raise ValueError("Could not parse SMILES: {!r}".format(smile))
    mol = Chem.AddHs(mol)

    atomic_nums = [atom.GetAtomicNum() for atom in mol.GetAtoms()]

    # NOTE(review): per the RDKit docs this is a distance *bounds* matrix
    # (topology-derived limits), not measured 3D distances -- confirm that
    # this is what downstream code expects under the name "D".
    dist_matrix = Chem.rdDistGeom.GetMoleculeBoundsMatrix(mol)
    return (atomic_nums, dist_matrix)

In [None]:
# Worked example: parse the SMILES in row 1, add explicit hydrogens, then
# embed the molecule in 3D (the cell output is EmbedMolecule's return value).
mol = Chem.AddHs(Chem.MolFromSmiles(df.smiles[1]))
AllChem.EmbedMolecule(mol)

In [None]:
# Display the example molecule.
mol

In [None]:
# Work on an independent copy: plain assignment would only alias `df`, so
# every column added to `sample_df` later would silently appear on the raw
# frame too and make re-running cells order-dependent.
sample_df = df.copy()

In [None]:
# Smoke-test the featurizer on the first 10 molecules only.
res = (
    df['smiles']
    .head(10)
    .progress_apply(smile_features)
    .to_list()
)

In [None]:
# Featurize every molecule and store the two outputs of `smile_features`
# in columns 'Z' (atomic numbers) and 'D' (bounds matrix).
# NOTE(review): this assigns a list of (list, ndarray) tuples to two columns
# at once through .loc; whether it unpacks as intended (and whether new
# columns may be created this way) depends on the pandas version -- confirm.
sample_df.loc[:, ['Z', 'D']] = sample_df.smiles.progress_apply(smile_features).to_list()

In [None]:
# Persist the featurized table as JSON.
sample_df.to_json('data/preprocessed.json')

In [None]:
def gaussian_expansion(D, mu_min=-1, delta_mu=0.2, mu_max=10, sigma=0.2):
    """Expand every entry of ``D`` on a grid of Gaussian-shaped basis functions.

    The centers are ``mu = np.arange(mu_min, mu_max + delta_mu, delta_mu)``
    and the result is ``exp(-(D[...] - mu[k]) ** 2 / (2 * sigma))`` for every
    entry of ``D`` and every center ``mu[k]``.

    Parameters
    ----------
    D : array-like
        Values to expand. Any shape is accepted (a 2-D matrix behaves exactly
        as before); lists and scalars are converted with ``np.asarray``.
    mu_min, delta_mu, mu_max : float
        Start, spacing and (approximately inclusive) end of the center grid.
    sigma : float
        Width parameter of the basis functions.

    Returns
    -------
    numpy.ndarray
        Array of shape ``D.shape + (len(mu),)``; the new last axis indexes
        the centers.

    Notes
    -----
    NOTE(review): the denominator is ``2 * sigma`` rather than the textbook
    ``2 * sigma ** 2``. It is kept as-is so existing preprocessed data stays
    reproducible -- confirm which form is intended.
    """
    D = np.asarray(D)
    mu = np.arange(mu_min, mu_max + delta_mu, delta_mu)
    # A trailing axis on D broadcasts against the 1-D grid of centers,
    # whatever the rank of D is.
    diff = D[..., np.newaxis] - mu
    return np.exp(-diff ** 2 / (2 * sigma))

In [None]:
# Apply the Gaussian expansion to each stored 'D' entry and keep the result
# in a new 'D_hat' column.
sample_df['D_hat'] = sample_df['D'].progress_apply(gaussian_expansion)

In [None]:
# Persist the full frame (including the array-valued columns) as a pickle.
sample_df.to_pickle('data/preprocessed_df.pkl')

In [None]:
# Also write the frame as CSV, without the index column.
# NOTE(review): array-valued cells ('Z', 'D', 'D_hat') are presumably written
# as their text representation here and may not round-trip -- confirm that
# this file is not the one read back for training.
sample_df.to_csv('data/preprocessed.csv', index=False)

In [None]:
import data
from torch.utils.data import DataLoader

In [None]:
# Build the dataset, passing the name of the fourth column of `sample_df`
# (presumably the regression target -- confirm against data.QM8Dataset).
# NOTE(review): the meaning of the positional arguments 30 and 10 is defined
# in data.QM8Dataset, which is not visible here -- TODO name them.
d = data.QM8Dataset(sample_df.columns[3], 30, 10)

In [None]:
# Wrap the dataset in a loader that yields batches of 20 samples.
dl = DataLoader(d, batch_size=20)

In [None]:
# Pull a single batch from the loader and unpack its four components.
Zs, Ds, sizes, target = next(iter(dl))

In [None]:
# Show the shape of each component of the batch.
Zs.shape, Ds.shape, sizes.shape, target.shape

In [None]:
# Take the 'D' entry of the row labelled 0 as a single test input.
D = sample_df.D[0]

In [None]:
# Display the selected entry.
D

In [None]:
import numpy as np

In [None]:
# Check the output shape of the expansion for the single selected entry.
gaussian_expansion(D).shape