In [None]:
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import matplotlib.pyplot as plt

# Compare the target values in the two CSV files

In [None]:
# Load data/qm8.csv into a DataFrame.
df = pd.read_csv('data/qm8.csv')

In [None]:
# Load data/qm8.sdf.csv into a second DataFrame for comparison with df.
df2 = pd.read_csv('data/qm8.sdf.csv')

In [None]:
# Column labels of the first table.
df.columns

In [None]:
# Column labels of the second table.
df2.columns

In [None]:
# Columns whose values are compared between df and df2.
target_cols = ['E1-CC2', 'E2-CC2', 'f1-CC2', 'f2-CC2']

In [None]:
# For every target column, report whether df and df2 hold numerically close
# values (np.allclose with its default tolerances).
for target in target_cols:
    print("Checking", target)
    values_a = df[target].values
    values_b = df2[target].values
    print(np.allclose(values_a, values_b))

In [None]:
def check_3d(pos):
    """Return True when no coordinate axis is constant across all positions.

    Each column of ``pos`` is compared (with ``np.isclose``) against the
    corresponding entry of the first row. If any column matches the first row
    everywhere, the function returns False; otherwise it returns True.

    Note that a single-row input always returns False, because every column
    trivially equals the first row.
    """
    matches_first_row = np.isclose(pos, pos[0])
    constant_axes = matches_first_row.all(axis=0)
    return not constant_axes.any()

# Loading the dataset

In [None]:
# Open the SDF with hydrogens kept (removeHs=False) and sanitization
# turned off (sanitize=False).
suppl = Chem.SDMolSupplier('data/qm8.sdf', removeHs=False, sanitize=False)

In [None]:
# Per-molecule features, collected in the order the molecules appear in suppl:
#   Zs        - list of atomic numbers
#   euclid_Ds - 3D (Euclidean) distance matrix
#   graph_Ds  - topological (bond-path) distance matrix
#   is_3D     - result of check_3d on the first conformer's positions
Zs, euclid_Ds, graph_Ds, is_3D = [], [], [], []
for i, mol in enumerate(suppl):
    # RDKit suppliers yield None for records they cannot parse. Fail loudly
    # with the record index rather than with an opaque AttributeError, and
    # never skip silently: the lists must stay row-aligned with df.
    if mol is None:
        raise ValueError(
            "Could not parse molecule at index {} of the SDF".format(i)
        )
    pos = mol.GetConformer(0).GetPositions()
    is_3D.append(check_3d(pos))
    Zs.append([a.GetAtomicNum() for a in mol.GetAtoms()])
    euclid_Ds.append(Chem.Get3DDistanceMatrix(mol))
    graph_Ds.append(Chem.GetDistanceMatrix(mol))

In [None]:
# Sanity check: all four feature lists should have the same length.
tuple(map(len, (Zs, euclid_Ds, graph_Ds, is_3D)))

In [None]:
# Attach the extracted per-molecule lists to df as new columns.
new_columns = {
    'Z': Zs,
    'euclid_D': euclid_Ds,
    'graph_D': graph_Ds,
    'is_3D': is_3D,
}
for col_name, col_values in new_columns.items():
    df[col_name] = col_values

In [None]:
# Spot-check check_3d on the first conformer of the molecule at index 3.
check_3d(suppl[3].GetConformer(0).GetPositions())

In [None]:
# Print the supplier's raw text for the record at index 3.
print(suppl.GetItemText(3))

In [None]:
# Persist df as JSON Lines (one record per line).
df.to_json('data/sdf.json', orient='records', lines=True)

## Exploratory statistics of the saved dataset

In [None]:
def flatten(lst):
    """Flatten ``lst`` by exactly one level.

    Iterates over every element of ``lst`` and collects the items of each
    into a single new list, preserving order.
    """
    flat = []
    for slist in lst:
        flat.extend(slist)
    return flat

In [None]:
# Reload the saved JSON Lines file. NOTE(review): this rebinds df, replacing
# the frame built in the cells above.
df = pd.read_json('data/sdf.json', orient='records', lines=True)

In [None]:
# Flatten the per-molecule Z lists into one list of values.
# NOTE(review): this rebinds Zs, which an earlier cell used for the list of
# per-molecule lists; a distinct name would avoid confusion on re-runs.
Zs = flatten(df.Z.tolist())

In [None]:
# Histogram of the flattened Z values.
plt.hist(Zs)

In [None]:
# Count how often each value occurs in Zs.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
from collections import Counter
Counter(Zs)

In [None]:
# Length of each row's Z list.
sizes = df.Z.apply(len)

In [None]:
# Summary statistics of the per-row Z list lengths.
sizes.describe()

In [None]:
# Count how many rows have each is_3D value.
df.is_3D.value_counts()

In [None]:
# Flatten two levels of nesting in each distance column so that every entry
# across all rows ends up in a single flat list.
euclid_D = flatten(flatten(df.euclid_D))
graph_D = flatten(flatten(df.graph_D))

In [None]:
# Count how often each value occurs among the flattened graph distances.
Counter(graph_D)

In [None]:
import seaborn as sns

In [None]:
# Histogram of all flattened Euclidean distances, without a KDE overlay.
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
# sns.histplot is the documented replacement for the kde=False case.
# Default bin selection may differ slightly from distplot's.
sns.histplot(euclid_D, kde=False)

In [None]:
# Sentinel value looked for in the graph distance matrices. Presumably this
# is RDKit's placeholder for atom pairs with no connecting bond path
# (disconnected fragments) - TODO confirm against the RDKit documentation.
DISCONNECTED_DISTANCE = 100000000.0

# Boolean mask: True for rows whose graph_D contains the sentinel anywhere.
# A direct element-wise comparison replaces the original np.intersect1d
# membership test, which sorted and de-duplicated both inputs first.
idx = df.graph_D.apply(
    lambda x: bool(np.any(np.asarray(x) == DISCONNECTED_DISTANCE))
)

In [None]:
# is_3D values of the rows selected by the idx mask.
df[idx].is_3D

In [None]:
# Inspect the first-conformer atom positions of the molecule at index 32.
suppl[32].GetConformer(0).GetPositions()

In [None]:
# Run check_3d on the positions of the molecule at index 32. The positions are
# passed explicitly rather than via IPython's `_` (last cell output), which
# only works when the previous cell was the most recently executed one and
# breaks under Restart & Run All or out-of-order execution.
check_3d(suppl[32].GetConformer(0).GetPositions())