In [2]:
import numpy as np
import pandas as pd
from scipy.spatial import distance_matrix
from tqdm import tqdm

In [4]:
# Directory holding the QM9 files, relative to this notebook. The trailing
# slash matters: the file name is appended by plain string concatenation.
dataset = "../qm9/"
qm9_csv_path = f"{dataset}gdb9.sdf.csv"

# Per-molecule property table; the bare name on the last line displays it.
qm9 = pd.read_csv(qm9_csv_path)
qm9

Unnamed: 0,mol_id,A,B,C,mu,alpha,homo,lumo,gap,r2,zpve,u0,u298,h298,g298,cv,u0_atom,u298_atom,h298_atom,g298_atom
0,gdb_1,157.71180,157.709970,157.706990,0.0000,13.21,-0.3877,0.1171,0.5048,35.3641,0.044749,-40.478930,-40.476062,-40.475117,-40.498597,6.469,-395.999595,-398.643290,-401.014647,-372.471772
1,gdb_2,293.60975,293.541110,191.393970,1.6256,9.46,-0.2570,0.0829,0.3399,26.1563,0.034358,-56.525887,-56.523026,-56.522082,-56.544961,6.316,-276.861363,-278.620271,-280.399259,-259.338802
2,gdb_3,799.58812,437.903860,282.945450,1.8511,6.31,-0.2928,0.0687,0.3615,19.0002,0.021375,-76.404702,-76.401867,-76.400922,-76.422349,6.002,-213.087624,-213.974294,-215.159658,-201.407171
3,gdb_4,0.00000,35.610036,35.610036,0.0000,16.28,-0.2845,0.0506,0.3351,59.5248,0.026841,-77.308427,-77.305527,-77.304583,-77.327429,8.574,-385.501997,-387.237686,-389.016047,-365.800724
4,gdb_5,0.00000,44.593883,44.593883,2.8937,12.99,-0.3604,0.0191,0.3796,48.7476,0.016601,-93.411888,-93.409370,-93.408425,-93.431246,6.278,-301.820534,-302.906752,-304.091489,-288.720028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133880,gdb_133881,3.59483,2.198990,1.904230,1.6637,69.37,-0.2254,0.0588,0.2842,760.7472,0.127406,-400.633868,-400.628599,-400.627654,-400.663098,23.658,-1603.983913,-1614.898804,-1623.788097,-1492.819438
133881,gdb_133882,3.65648,2.142370,1.904390,1.2976,69.52,-0.2393,0.0608,0.3002,762.6354,0.127495,-400.629713,-400.624444,-400.623500,-400.658942,23.697,-1601.376613,-1612.291504,-1621.181424,-1490.211511
133882,gdb_133883,3.67118,2.143140,1.895010,1.2480,73.60,-0.2233,0.0720,0.2953,780.3553,0.140458,-380.753918,-380.748619,-380.747675,-380.783148,23.972,-1667.045429,-1678.830048,-1688.312964,-1549.143391
133883,gdb_133884,3.52845,2.151310,1.865820,1.9576,77.40,-0.2122,0.0881,0.3003,803.1904,0.152222,-364.720374,-364.714974,-364.714030,-364.749650,24.796,-1794.600439,-1807.210860,-1817.286772,-1670.349892


## Find representations for nodes and edges

In [5]:
def qm9_nodes(g, hydrogen=False):
    """Build one feature vector per node (atom) of a molecular graph.

    Parameters
    ----------
    g : graph
        Object whose ``g.nodes(data=True)`` yields ``(node, attributes)``
        pairs (the networkx API; ``nodes_iter`` was removed in networkx 2.0,
        while ``nodes(data=True)`` works on both 1.x and 2.x). Each attribute
        dict must provide the keys ``'a_type'``, ``'a_num'``, ``'pc'``,
        ``'acceptor'``, ``'donor'``, ``'aromatic'`` and ``'hybridization'``,
        plus ``'num_h'`` when ``hydrogen`` is true.
    hydrogen : bool, optional
        If true, append the node's ``'num_h'`` attribute as a last feature.

    Returns
    -------
    list of list
        One row per node, in iteration order, laid out as:
        one-hot atom type over (H, C, N, O, F), ``a_num``, ``pc``,
        ``acceptor``, ``donor``, ``int(aromatic)``, one-hot hybridization
        over (SP, SP2, SP3), and optionally ``num_h``.

    Notes
    -----
    The name ``rdkit`` (with ``rdkit.Chem`` loaded) must be available in the
    enclosing namespace; it is not imported in the visible notebook cells.
    """
    atom_symbols = ('H', 'C', 'N', 'O', 'F')
    # Looked up once instead of on every node.
    hybridization_type = rdkit.Chem.rdchem.HybridizationType
    hybridizations = (hybridization_type.SP,
                      hybridization_type.SP2,
                      hybridization_type.SP3)

    features = []
    for _, attrs in g.nodes(data=True):
        # Atom type (one-hot over H, C, N, O, F)
        row = [int(attrs['a_type'] == symbol) for symbol in atom_symbols]
        # Atomic number
        row.append(attrs['a_num'])
        # Partial charge
        row.append(attrs['pc'])
        # Acceptor
        row.append(attrs['acceptor'])
        # Donor
        row.append(attrs['donor'])
        # Aromatic
        row.append(int(attrs['aromatic']))
        # Hybridization (one-hot over SP, SP2, SP3)
        row += [int(attrs['hybridization'] == hyb) for hyb in hybridizations]
        # Optionally use the number of attached hydrogens as a feature
        if hydrogen:
            row.append(attrs['num_h'])
        features.append(row)
    return features

## Normalize scalar coupling

In [7]:
# Rescale the target linearly so that the training minimum maps to -1 and the
# training maximum maps to +1: scale_mid is the midpoint of the range and
# scale_norm is half its width. Keep both to undo the scaling on predictions.
scale_min  = train['scalar_coupling_constant'].min()
scale_max  = train['scalar_coupling_constant'].max()
scale_mid = (scale_max + scale_min)/2
scale_norm = scale_max - scale_mid

train['scalar_coupling_constant'] = (train['scalar_coupling_constant'] - scale_mid)/scale_norm

# One-hot encode the coupling type. The category list is pinned so that every
# frame gets the same eight columns in the same order even when a coupling
# type is absent from it; assigning the raw get_dummies() output positionally
# would otherwise fail or mislabel columns. Types outside the list encode as
# an all-zero row.
coupling_types = ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']
coupling_dtype = pd.CategoricalDtype(categories=coupling_types)
for frame in (train, test):
    coupling_dummies = pd.get_dummies(frame['type'].astype(coupling_dtype))
    coupling_dummies.columns = coupling_types
    frame[coupling_types] = coupling_dummies

In [8]:
train

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,1JHC,1JHN,2JHC,2JHH,2JHN,3JHC,3JHH,3JHN
0,0,dsgdb9nsd_000001,1,0,1JHC,0.026406,1,0,0,0,0,0,0,0
1,1,dsgdb9nsd_000001,1,2,2JHH,-0.734594,0,0,0,1,0,0,0,0
2,2,dsgdb9nsd_000001,1,3,2JHH,-0.734576,0,0,0,1,0,0,0,0
3,3,dsgdb9nsd_000001,1,4,2JHH,-0.734572,0,0,0,1,0,0,0,0
4,4,dsgdb9nsd_000001,2,0,1JHC,0.026404,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4659071,4659071,dsgdb9nsd_133884,17,4,2JHC,-0.617348,0,0,1,0,0,0,0,0
4659072,4659072,dsgdb9nsd_133884,17,5,3JHC,-0.640911,0,0,0,0,0,1,0,0
4659073,4659073,dsgdb9nsd_133884,17,6,3JHC,-0.636123,0,0,0,0,0,1,0,0
4659074,4659074,dsgdb9nsd_133884,17,7,2JHC,-0.607695,0,0,1,0,0,0,0,0


## One-hot encode atom types and rescale coordinates

In [9]:
# One-hot encode the element symbol. Categories are pinned so the five columns
# always exist in this order; assigning the raw get_dummies() output
# positionally would fail or mislabel columns if an element were missing.
# Symbols outside the list encode as an all-zero row.
atom_types = ['C', 'F', 'H', 'N', 'O']
atom_dummies = pd.get_dummies(
    structures['atom'].astype(pd.CategoricalDtype(categories=atom_types))
)
atom_dummies.columns = atom_types
structures[atom_types] = atom_dummies

# Scale the coordinates down by a factor of 10.
# NOTE: this overwrites x/y/z in place, so re-running the cell without
# reloading `structures` divides the coordinates again.
structures[['x', 'y', 'z']] = structures[['x', 'y', 'z']]/10.

## Process bonds

In [24]:
# One-hot encode the bond order. The categories (1.0, 2.0, 3.0) are pinned so
# both frames always get all three columns in this order, even if one of them
# contains no bond of a given order; assigning the raw get_dummies() output
# positionally would otherwise fail or mislabel columns.
bond_order_cols = ['nbond_1', 'nbond_2', 'nbond_3']
bond_order_dtype = pd.CategoricalDtype(categories=[1.0, 2.0, 3.0])
for bonds_frame in (train_bonds, test_bonds):
    bond_dummies = pd.get_dummies(bonds_frame['nbond'].astype(bond_order_dtype))
    bond_dummies.columns = bond_order_cols
    bonds_frame[bond_order_cols] = bond_dummies
train_bonds

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,nbond,L2dist,error,bond_type,nbond_1,nbond_2,nbond_3
0,dsgdb9nsd_000001,0,1,1.0,1.091953,0,1.0CH,1,0,0
1,dsgdb9nsd_000001,0,2,1.0,1.091952,0,1.0CH,1,0,0
2,dsgdb9nsd_000001,0,3,1.0,1.091946,0,1.0CH,1,0,0
3,dsgdb9nsd_000001,0,4,1.0,1.091948,0,1.0CH,1,0,0
4,dsgdb9nsd_000002,0,1,1.0,1.017190,0,1.0HN,1,0,0
...,...,...,...,...,...,...,...,...,...,...
76653,dsgdb9nsd_007969,1,2,1.0,1.542051,0,1.0CC,1,0,0
76654,dsgdb9nsd_007969,1,6,1.0,1.549548,0,1.0CC,1,0,0
76655,dsgdb9nsd_007969,2,3,1.0,1.513276,0,1.0CC,1,0,0
76656,dsgdb9nsd_007969,3,5,2.0,1.357137,0,2.0CC,0,1,0


## Process angles

In [33]:
# Rescale the angle features: the dihedral is divided by pi and the
# bond-path length by 6.0 (presumably the largest path length in the data --
# TODO confirm). Missing values are then replaced by 0.
# NOTE: the two divisions overwrite their columns, so re-running this cell
# without reloading `angles` scales them again.
angles['dihedral'] = angles['dihedral'].div(np.pi)
angles['shortest_path_n_bonds'] = angles['shortest_path_n_bonds'].div(6.0)
angles = angles.fillna(value=0)

## Split structures to train and test sets

In [52]:
# Molecule names present in each split.
train_mol_names = train['molecule_name'].unique()
test_mol_names  = test['molecule_name'].unique()

# Restrict the shared per-atom and per-pair tables to each split.
train_structures = structures.loc[structures['molecule_name'].isin(train_mol_names)]
test_structures = structures.loc[structures['molecule_name'].isin(test_mol_names)]

train_angs = angles.loc[angles['molecule_name'].isin(train_mol_names)]
test_angs = angles.loc[angles['molecule_name'].isin(test_mol_names)]

# Group every table by molecule so each molecule can be processed on its own.
train_group = train.groupby('molecule_name')
test_group  = test.groupby('molecule_name')

train_struct_group = train_structures.groupby('molecule_name')
test_struct_group  = test_structures.groupby('molecule_name')

train_bond_group = train_bonds.groupby('molecule_name')
test_bond_group  = test_bonds.groupby('molecule_name')

train_angs_group = train_angs.groupby('molecule_name')
test_angs_group  = test_angs.groupby('molecule_name')

# Find max nodes in graph. The test arrays are padded to the same size, so the
# maximum must cover both splits; using the training split alone would
# overflow the padded buffers if a test molecule had more atoms.
max_size = max(train_struct_group.size().max(), test_struct_group.size().max())
print(max_size)

29


## Define node and edge values

In [13]:
# Columns used as node (atom) features. The scaled coordinates 'x', 'y', 'z'
# could be appended here but are currently left out.
node_vals = ['C', 'F', 'H', 'N', 'O']

# Columns used as edge (atom-pair) features, by source table.
j_coup_vals = ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']
bond_vals = ['nbond_1', 'nbond_2', 'nbond_3']
ang_vals = ['shortest_path_n_bonds', 'cosinus', 'dihedral']
# Order matters: coupling type first, then bond order, then angle features.
edge_vals = [*j_coup_vals, *bond_vals, *ang_vals]

# Number of molecules in each split
n_train_mols, n_test_mols = len(train_mol_names), len(test_mol_names)

# Feature dimensions of nodes and of each edge feature group
node_dim, bond_dim, ang_dim, j_coup_dim, edge_dim = map(
    len, (node_vals, bond_vals, ang_vals, j_coup_vals, edge_vals)
)

# One extra edge channel for the inter-atomic distance
add_edge_dim = 1

## Pre-allocate arrays

In [14]:
# Zero-padded float32 buffers, one slot per molecule. Node buffers hold
# max_size atoms; edge buffers hold every ordered atom pair.
node_shape = (max_size, node_dim)
in_edge_shape = (max_size, max_size, edge_dim + add_edge_dim)
out_edge_shape = (max_size, max_size, 1)

train_nodes_array = np.zeros((n_train_mols, *node_shape), dtype=np.float32)
train_in_edges_array = np.zeros((n_train_mols, *in_edge_shape), dtype=np.float32)
train_out_edges_array = np.zeros((n_train_mols, *out_edge_shape), dtype=np.float32)

# The test split has no target (out-edge) array.
test_nodes_array = np.zeros((n_test_mols, *node_shape), dtype=np.float32)
test_in_edges_array = np.zeros((n_test_mols, *in_edge_shape), dtype=np.float32)

In [76]:
def _group_or_empty(grouped, name):
    """Return the rows of `grouped` for key `name`, or a zero-row frame if the key is absent."""
    try:
        return grouped.get_group(name)
    except KeyError:
        # Same columns and dtypes as the grouped frame, but no rows.
        return grouped.obj.iloc[0:0]


def _symmetric_pair_features(pair_index, pair_values, n_features):
    """Scatter per-pair feature rows into a (max_size, max_size, n_features) array and mirror them."""
    tensor = np.zeros((max_size, max_size, n_features))
    first, second = pair_index[:, 0], pair_index[:, 1]
    tensor[first, second] = pair_values
    # Copy each (i, j) entry to (j, i) so the pair features are direction-independent.
    tensor[second, first] = tensor[first, second]
    return tensor


def make_arrs(val_group, struct_group, bond_group, ang_group, mode):
    """Fill the pre-allocated node / edge arrays, one molecule per slot.

    Molecules are enumerated in the iteration order of `val_group`; the i-th
    molecule is written to index i of the module-level arrays. The other
    three groupings are looked up by molecule name, so a molecule that is
    absent from one of them cannot shift or truncate the pairing.

    Parameters
    ----------
    val_group : pandas GroupBy
        Coupling rows grouped by molecule. Reads 'atom_index_0',
        'atom_index_1', the `j_coup_vals` columns and, in "train" mode,
        'scalar_coupling_constant'.
    struct_group : pandas GroupBy
        Atom rows grouped by molecule. Reads 'x', 'y', 'z' and `node_vals`.
    bond_group : pandas GroupBy
        Bond rows grouped by molecule. Reads the atom index columns and
        `bond_vals`. A molecule with no rows gets all-zero bond features.
    ang_group : pandas GroupBy
        Angle rows grouped by molecule. Reads the atom index columns and
        `ang_vals`. A molecule with no rows gets all-zero angle features.
    mode : {"train", "test"}
        "train" writes train_nodes_array, train_in_edges_array and
        train_out_edges_array; "test" writes test_nodes_array and
        test_in_edges_array.

    Raises
    ------
    ValueError
        If `mode` is neither "train" nor "test".
    KeyError
        If a molecule of `val_group` has no entry in `struct_group`.

    Notes
    -----
    Relies on the module-level names max_size, add_edge_dim, node_vals,
    node_dim, j_coup_vals, j_coup_dim, bond_vals, bond_dim, ang_vals and
    ang_dim. Edge channels are concatenated in the order: coupling type,
    bond order, angle features, distance.
    """
    if mode not in ("train", "test"):
        raise ValueError("mode must be 'train' or 'test', got %r" % (mode,))

    molecules = tqdm(val_group, total=val_group.ngroups)
    for i, (mol_name, mol_vals) in enumerate(molecules):
        mol_structs = struct_group.get_group(mol_name)
        mol_bonds = _group_or_empty(bond_group, mol_name)
        mol_angs = _group_or_empty(ang_group, mol_name)

        # Pairwise inter-atomic distances, zero-padded to max_size
        coords = mol_structs[['x', 'y', 'z']].values
        dists = distance_matrix(coords, coords)
        distances = np.zeros((max_size, max_size, add_edge_dim))
        distances[:dists.shape[0], :dists.shape[1], 0] = dists

        # Create nodes (zero-padded to max_size atoms)
        mol_info = mol_structs[node_vals].values
        nodes = np.zeros((max_size, node_dim))
        nodes[:mol_info.shape[0], :mol_info.shape[1]] = mol_info

        # Create edges
        # in_feats is the one-hot coupling type -> used to filter on type later on
        ind = mol_vals[['atom_index_0', 'atom_index_1']].values
        in_feats = _symmetric_pair_features(ind, mol_vals[j_coup_vals].values, j_coup_dim)

        # Create bonds
        ind_bonds = mol_bonds[['atom_index_0', 'atom_index_1']].values
        in_bonds = _symmetric_pair_features(ind_bonds, mol_bonds[bond_vals].values, bond_dim)

        # Create angles
        ind_angs = mol_angs[['atom_index_0', 'atom_index_1']].values
        ang_mat = _symmetric_pair_features(ind_angs, mol_angs[ang_vals].values, ang_dim)

        # Concatenate all edge values along the feature axis
        in_edges = np.concatenate((in_feats, in_bonds, ang_mat, distances), axis=2)

        if mode == "train":
            # Target: the coupling constant on every coupled pair, mirrored
            out_edges = _symmetric_pair_features(
                ind, mol_vals[['scalar_coupling_constant']].values, 1
            )

            train_nodes_array[i]      = nodes
            train_in_edges_array[i]   = in_edges
            train_out_edges_array[i]  = out_edges
        else:
            test_nodes_array[i]      = nodes
            test_in_edges_array[i]   = in_edges

In [77]:
# Fill the training node, input-edge and target arrays.
make_arrs(
    val_group=train_group,
    struct_group=train_struct_group,
    bond_group=train_bond_group,
    ang_group=train_angs_group,
    mode="train",
)

311it [00:01, 223.02it/s]


In [78]:
# Fill the test arrays from the *test* groups. Passing the training groups
# here would copy training molecules into the test arrays.
make_arrs(test_group, test_struct_group, test_bond_group, test_angs_group, mode="test")

311it [00:01, 217.03it/s]


In [18]:
# Sanity check: print the shape of every filled array, train first, then test.
filled_arrays = (
    train_nodes_array,
    train_in_edges_array,
    train_out_edges_array,
    test_nodes_array,
    test_in_edges_array,
)
for filled in filled_arrays:
    print(filled.shape)

(85012, 29, 5)
(85012, 29, 29, 15)
(85012, 29, 29, 1)
(45777, 29, 5)
(45777, 29, 29, 15)


## Save arrays

In [19]:
# Write each array to its own compressed .npz archive in the working
# directory. Arrays are passed positionally, so each archive stores its array
# under numpy's default key 'arr_0'.
arrays_to_save = {
    "nodes_train.npz": train_nodes_array,
    "in_edges_train.npz": train_in_edges_array,
    "out_edges_train.npz": train_out_edges_array,
    "nodes_test.npz": test_nodes_array,
    "in_edges_test.npz": test_in_edges_array,
}
for file_name, array in arrays_to_save.items():
    np.savez_compressed(file_name, array)