In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import deepchem as dc
import gc
from tqdm import tqdm_notebook
from scipy.spatial.distance import norm
from glob import glob
from rdkit import Chem
from rdkit.Chem import ChemicalFeatures
from rdkit import RDConfig

In [2]:
pd.set_option("display.max_columns", 100)

In [3]:
# %load ../xyz2mol.py
##
# Copied from: https://github.com/jensengroup/xyz2mol
#
# Written by Jan H. Jensen based on this paper Yeonjoon Kim and Woo Youn Kim
# "Universal Structure Conversion Method for Organic Molecules: From Atomic Connectivity
# to Three-Dimensional Geometry" Bull. Korean Chem. Soc. 2015, Vol. 36, 1769-1777 DOI: 10.1002/bkcs.10334
#
from rdkit import Chem
from rdkit.Chem import AllChem
import itertools
from rdkit.Chem import rdmolops
from collections import defaultdict
import copy
import networkx as nx #uncomment if you don't want to use "quick"/install networkx


global __ATOM_LIST__
__ATOM_LIST__ = [ x.strip() for x in ['h ','he', \
      'li','be','b ','c ','n ','o ','f ','ne', \
      'na','mg','al','si','p ','s ','cl','ar', \
      'k ','ca','sc','ti','v ','cr','mn','fe','co','ni','cu', \
      'zn','ga','ge','as','se','br','kr', \
      'rb','sr','y ','zr','nb','mo','tc','ru','rh','pd','ag', \
      'cd','in','sn','sb','te','i ','xe', \
      'cs','ba','la','ce','pr','nd','pm','sm','eu','gd','tb','dy', \
      'ho','er','tm','yb','lu','hf','ta','w ','re','os','ir','pt', \
      'au','hg','tl','pb','bi','po','at','rn', \
      'fr','ra','ac','th','pa','u ','np','pu'] ]


def get_atom(atom):
    global __ATOM_LIST__
    atom = atom.lower()
    return __ATOM_LIST__.index(atom) + 1


def getUA(maxValence_list, valence_list):
    UA = []
    DU = []
    for i, (maxValence,valence) in enumerate(zip(maxValence_list, valence_list)):
        if maxValence - valence > 0:
            UA.append(i)
            DU.append(maxValence - valence)
    return UA,DU


def get_BO(AC,UA,DU,valences,UA_pairs,quick):
    BO = AC.copy()
    DU_save = []

    while DU_save != DU:
        for i,j in UA_pairs:
            BO[i,j] += 1
            BO[j,i] += 1

        BO_valence = list(BO.sum(axis=1))
        DU_save = copy.copy(DU)
        UA, DU = getUA(valences, BO_valence)
        UA_pairs = get_UA_pairs(UA,AC,quick)[0]

    return BO


def valences_not_too_large(BO,valences):
    number_of_bonds_list = BO.sum(axis=1)
    for valence, number_of_bonds in zip(valences,number_of_bonds_list):
        if number_of_bonds > valence:
            return False

    return True


def BO_is_OK(BO,AC,charge,DU,atomic_valence_electrons,atomicNumList,charged_fragments):
    Q = 0 # total charge
    q_list = []
    if charged_fragments:
        BO_valences = list(BO.sum(axis=1))
        for i,atom in enumerate(atomicNumList):
            q = get_atomic_charge(atom,atomic_valence_electrons[atom],BO_valences[i])
            Q += q
            if atom == 6:
                number_of_single_bonds_to_C = list(BO[i,:]).count(1)
                if number_of_single_bonds_to_C == 2 and BO_valences[i] == 2:
                    Q += 1
                    q = 2
                if number_of_single_bonds_to_C == 3 and Q + 1 < charge:
                    Q += 2
                    q = 1

            if q != 0:
                q_list.append(q)

    if (BO-AC).sum() == sum(DU) and charge == Q and len(q_list) <= abs(charge):
        return True
    else:
        return False


def get_atomic_charge(atom,atomic_valence_electrons,BO_valence):
    if atom == 1:
        charge = 1 - BO_valence
    elif atom == 5:
        charge = 3 - BO_valence
    elif atom == 15 and BO_valence == 5:
        charge = 0
    elif atom == 16 and BO_valence == 6:
        charge = 0
    else:
        charge = atomic_valence_electrons - 8 + BO_valence

    return charge

def clean_charges(mol):
    # this hack should not be needed any more but is kept just in case

    rxn_smarts = ['[N+:1]=[*:2]-[C-:3]>>[N+0:1]-[*:2]=[C-0:3]',
                  '[N+:1]=[*:2]-[O-:3]>>[N+0:1]-[*:2]=[O-0:3]',
                  '[N+:1]=[*:2]-[*:3]=[*:4]-[O-:5]>>[N+0:1]-[*:2]=[*:3]-[*:4]=[O-0:5]',
                  '[#8:1]=[#6:2]([!-:6])[*:3]=[*:4][#6-:5]>>[*-:1][*:2]([*:6])=[*:3][*:4]=[*+0:5]',
                  '[O:1]=[c:2][c-:3]>>[*-:1][*:2][*+0:3]',
                  '[O:1]=[C:2][C-:3]>>[*-:1][*:2]=[*+0:3]']

    fragments = Chem.GetMolFrags(mol,asMols=True,sanitizeFrags=False)

    for i,fragment in enumerate(fragments):
        for smarts in rxn_smarts:
            patt = Chem.MolFromSmarts(smarts.split(">>")[0])
            while fragment.HasSubstructMatch(patt):
                rxn = AllChem.ReactionFromSmarts(smarts)
                ps = rxn.RunReactants((fragment,))
                fragment = ps[0][0]
        if i == 0:
            mol = fragment
        else:
            mol = Chem.CombineMols(mol,fragment)

    return mol


def BO2mol(mol,BO_matrix, atomicNumList,atomic_valence_electrons,mol_charge,charged_fragments):
    # based on code written by Paolo Toscani

    l = len(BO_matrix)
    l2 = len(atomicNumList)
    BO_valences = list(BO_matrix.sum(axis=1))

    if (l != l2):
        raise RuntimeError('sizes of adjMat ({0:d}) and atomicNumList '
            '{1:d} differ'.format(l, l2))

    rwMol = Chem.RWMol(mol)

    bondTypeDict = {
        1: Chem.BondType.SINGLE,
        2: Chem.BondType.DOUBLE,
        3: Chem.BondType.TRIPLE
    }

    for i in range(l):
        for j in range(i + 1, l):
            bo = int(round(BO_matrix[i, j]))
            if (bo == 0):
                continue
            bt = bondTypeDict.get(bo, Chem.BondType.SINGLE)
            rwMol.AddBond(i, j, bt)
    mol = rwMol.GetMol()

    if charged_fragments:
        mol = set_atomic_charges(mol,atomicNumList,atomic_valence_electrons,BO_valences,BO_matrix,mol_charge)
    else:
        mol = set_atomic_radicals(mol,atomicNumList,atomic_valence_electrons,BO_valences)

    return mol

def set_atomic_charges(mol,atomicNumList,atomic_valence_electrons,BO_valences,BO_matrix,mol_charge):
    q = 0
    for i,atom in enumerate(atomicNumList):
        a = mol.GetAtomWithIdx(i)
        charge = get_atomic_charge(atom,atomic_valence_electrons[atom],BO_valences[i])
        q += charge
        if atom == 6:
            number_of_single_bonds_to_C = list(BO_matrix[i,:]).count(1)
            if number_of_single_bonds_to_C == 2 and BO_valences[i] == 2:
                    q += 1
                    charge = 0
            if number_of_single_bonds_to_C == 3 and q + 1 < mol_charge:
                    q += 2
                    charge = 1

        if (abs(charge) > 0):
            a.SetFormalCharge(int(charge))

    # shouldn't be needed anymore bit is kept just in case
    #mol = clean_charges(mol)

    return mol


def set_atomic_radicals(mol,atomicNumList,atomic_valence_electrons,BO_valences):
    # The number of radical electrons = absolute atomic charge
    for i,atom in enumerate(atomicNumList):
        a = mol.GetAtomWithIdx(i)
        charge = get_atomic_charge(atom,atomic_valence_electrons[atom],BO_valences[i])

        if (abs(charge) > 0):
            a.SetNumRadicalElectrons(abs(int(charge)))

    return mol

def get_bonds(UA,AC):
    bonds = []

    for k,i in enumerate(UA):
        for j in UA[k+1:]:
            if AC[i,j] == 1:
                bonds.append(tuple(sorted([i,j])))

    return bonds

def get_UA_pairs(UA,AC,quick):
    bonds = get_bonds(UA,AC)
    if len(bonds) == 0:
        return [()]

    if quick:
        G=nx.Graph()
        G.add_edges_from(bonds)
        UA_pairs = [list(nx.max_weight_matching(G))]
        return UA_pairs

    max_atoms_in_combo = 0
    UA_pairs = [()]
    for combo in list(itertools.combinations(bonds, int(len(UA)/2))):
        flat_list = [item for sublist in combo for item in sublist]
        atoms_in_combo = len(set(flat_list))
        if atoms_in_combo > max_atoms_in_combo:
            max_atoms_in_combo = atoms_in_combo
            UA_pairs = [combo]
 #           if quick and max_atoms_in_combo == 2*int(len(UA)/2):
 #               return UA_pairs
        elif atoms_in_combo == max_atoms_in_combo:
            UA_pairs.append(combo)

    return UA_pairs

def AC2BO(AC,atomicNumList,charge,charged_fragments,quick):
    # TODO
    atomic_valence = defaultdict(list)
    atomic_valence[1] = [1]
    atomic_valence[6] = [4]
    atomic_valence[7] = [4,3]
    atomic_valence[8] = [2,1]
    atomic_valence[9] = [1]
    atomic_valence[14] = [4]
    atomic_valence[15] = [5,4,3]
    atomic_valence[16] = [6,4,2]
    atomic_valence[17] = [1]
    atomic_valence[32] = [4]
    atomic_valence[35] = [1]
    atomic_valence[53] = [1]


    atomic_valence_electrons = {}
    atomic_valence_electrons[1] = 1
    atomic_valence_electrons[6] = 4
    atomic_valence_electrons[7] = 5
    atomic_valence_electrons[8] = 6
    atomic_valence_electrons[9] = 7
    atomic_valence_electrons[14] = 4
    atomic_valence_electrons[15] = 5
    atomic_valence_electrons[16] = 6
    atomic_valence_electrons[17] = 7
    atomic_valence_electrons[32] = 4
    atomic_valence_electrons[35] = 7
    atomic_valence_electrons[53] = 7

    # make a list of valences, e.g. for CO: [[4],[2,1]]
    valences_list_of_lists = []
    for atomicNum in atomicNumList:
        valences_list_of_lists.append(atomic_valence[atomicNum])

    # convert [[4],[2,1]] to [[4,2],[4,1]]
    valences_list = list(itertools.product(*valences_list_of_lists))

    best_BO = AC.copy()

    # implemenation of algorithm shown in Figure 2
    # UA: unsaturated atoms
    # DU: degree of unsaturation (u matrix in Figure)
    # best_BO: Bcurr in Figure
    #

    for valences in valences_list:
        AC_valence = list(AC.sum(axis=1))
        UA,DU_from_AC = getUA(valences, AC_valence)

        if len(UA) == 0 and BO_is_OK(AC,AC,charge,DU_from_AC,atomic_valence_electrons,atomicNumList,charged_fragments):
            return AC,atomic_valence_electrons

        UA_pairs_list = get_UA_pairs(UA,AC,quick)
        for UA_pairs in UA_pairs_list:
            BO = get_BO(AC,UA,DU_from_AC,valences,UA_pairs,quick)
            if BO_is_OK(BO,AC,charge,DU_from_AC,atomic_valence_electrons,atomicNumList,charged_fragments):
                return BO,atomic_valence_electrons

            elif BO.sum() >= best_BO.sum() and valences_not_too_large(BO,valences):
                best_BO = BO.copy()

    return best_BO,atomic_valence_electrons


def AC2mol(mol,AC,atomicNumList,charge,charged_fragments,quick):
    # convert AC matrix to bond order (BO) matrix
    BO,atomic_valence_electrons = AC2BO(AC,atomicNumList,charge,charged_fragments,quick)

    # add BO connectivity and charge info to mol object
    mol = BO2mol(mol,BO, atomicNumList,atomic_valence_electrons,charge,charged_fragments)

    return mol


def get_proto_mol(atomicNumList):
    mol = Chem.MolFromSmarts("[#"+str(atomicNumList[0])+"]")
    rwMol = Chem.RWMol(mol)
    for i in range(1,len(atomicNumList)):
        a = Chem.Atom(atomicNumList[i])
        rwMol.AddAtom(a)

    mol = rwMol.GetMol()

    return mol


def get_atomicNumList(atomic_symbols):
    atomicNumList = []
    for symbol in atomic_symbols:
        atomicNumList.append(get_atom(symbol))
    return atomicNumList


def read_xyz_file(filename):

    atomic_symbols = []
    xyz_coordinates = []

    with open(filename, "r") as file:
        for line_number,line in enumerate(file):
            if line_number == 0:
                num_atoms = int(line)
            elif line_number == 1:
                if "charge=" in line:
                    charge = int(line.split("=")[1])
                else:
                    charge = 0
            else:
                atomic_symbol, x, y, z = line.split()
                atomic_symbols.append(atomic_symbol)
                xyz_coordinates.append([float(x),float(y),float(z)])

    atomicNumList = get_atomicNumList(atomic_symbols)

    return atomicNumList,charge,xyz_coordinates

def xyz2AC(atomicNumList,xyz):
    import numpy as np
    mol = get_proto_mol(atomicNumList)

    conf = Chem.Conformer(mol.GetNumAtoms())
    for i in range(mol.GetNumAtoms()):
        conf.SetAtomPosition(i,(xyz[i][0],xyz[i][1],xyz[i][2]))
    mol.AddConformer(conf)

    dMat = Chem.Get3DDistanceMatrix(mol)
    pt = Chem.GetPeriodicTable()

    num_atoms = len(atomicNumList)
    AC = np.zeros((num_atoms,num_atoms)).astype(int)

    for i in range(num_atoms):
        a_i = mol.GetAtomWithIdx(i)
        Rcov_i = pt.GetRcovalent(a_i.GetAtomicNum())*1.30
        for j in range(i+1,num_atoms):
            a_j = mol.GetAtomWithIdx(j)
            Rcov_j = pt.GetRcovalent(a_j.GetAtomicNum())*1.30
            if dMat[i,j] <= Rcov_i + Rcov_j:
                AC[i,j] = 1
                AC[j,i] = 1

    return AC,mol,dMat

def chiral_stereo_check(mol):
    Chem.DetectBondStereochemistry(mol,-1)
    Chem.AssignStereochemistry(mol, flagPossibleStereoCenters=True, force=True)
    Chem.AssignAtomChiralTagsFromStructure(mol,-1)
    return mol

def xyz2mol(atomicNumList, charge, xyz_coordinates, charged_fragments, quick,
            check_chiral_stereo=True):

    # Get atom connectivity (AC) matrix, list of atomic numbers, molecular charge,
    # and mol object with no connectivity information
    AC,mol,dMat = xyz2AC(atomicNumList, xyz_coordinates)

    # Convert AC to bond order matrix and add connectivity and charge info to mol object
    new_mol = AC2mol(mol, AC, atomicNumList, charge, charged_fragments, quick)

    # sanitize
    try: Chem.SanitizeMol(new_mol)
    except ValueError as e: print(e)

    # Check for stereocenters and chiral centers
    if check_chiral_stereo:
        try: new_mol = chiral_stereo_check(new_mol)
        except ValueError as e: print(e)

    return new_mol,dMat

In [4]:
## Constants
N_TRAIN_DF = 4658147

# scalar coupling types
TYPES     = np.array(['1JHC', '2JHH', '1JHN', '2JHN', '2JHC', '3JHH', '3JHC',
                      '3JHN'])
TYPES_MAP = {t: i for i, t in enumerate(TYPES)}

# feature definition file
FDEF = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')

# feature collections
SYMBOLS        = ['H', 'C', 'N', 'O', 'F']
DEGREES        = [1, 2, 3, 4, 5]
HYBRIDIZATIONS = [
    Chem.rdchem.HybridizationType.SP,
    Chem.rdchem.HybridizationType.SP2,
    Chem.rdchem.HybridizationType.SP3,
    Chem.rdchem.HybridizationType.UNSPECIFIED
]

# feature maps
ATOMIC_RADIUS = {'H': 0.38, 'C': 0.77, 'N': 0.75, 'O': 0.73, 'F': 0.71} 
ELECTRO_NEG   = {'H': 2.2, 'C': 2.55, 'N': 3.04, 'O': 3.44, 'F': 3.98}

# feature column names
ATOM_COLUMNS = [f'type_{a}' for a in SYMBOLS] \
             + [f'degree_{i}' for i in DEGREES] \
             + ['SP', 'SP2', 'SP3', 'hybridization_unspecified'] \
             + ['aromatic', 'formal_charge', 'atomic_num',
                'donor', 'acceptor', 'ave_bond_length',
                'ave_inv_bond_length', 'ave_neighbor_weight']
EDGE_COLUMNS = ['single', 'double', 'triple', 'aromatic', 
                'conjugated', 'in_ring', 'dist', 'normed_dist',
                'inv_dist', 'normed_inv_dist']

# feature dimensions
N_EDGE_FEATURES = 10
N_ATOM_FEATURES = 22
N_TYPES         = len(TYPES)
MAX_N_ATOMS     = 29

# paths
DATA_PATH = '../data/'
PATH      = '../tmp/'

In [5]:
## Helper functions
def array_to_csv(arr, f_name, n, fmt='%.10f'):
    "Writes numpy array 'arr' to csv file."
    f = PATH + f_name + '.csv'
    np.savetxt(f, arr.reshape(n, -1), delimiter=',', fmt=fmt)

def print_progress(i, print_iter=10000):
    if (i%print_iter)==0: print(i)

def clear_memory(var_strs):
    for var_str in var_strs: del globals()[var_str]
    gc.collect()

In [6]:
## import data
train_df = pd.read_csv(DATA_PATH+'train.csv', index_col=0)
test_df = pd.read_csv(DATA_PATH+'test.csv', index_col=0)
structures_df = pd.read_csv(DATA_PATH+'structures.csv')

  mask |= (ar1 == a)


## Compute cosine angles

In [19]:
edge_df = pd.read_csv(PATH+'edge_df.csv', index_col=0)
pairs_idx = edge_df[['molecule_id', 'idx_0', 'idx_1']].astype(int)
clear_memory(['edge_df'])

  mask |= (ar1 == a)


In [22]:
xyz_filepath_list = list(glob(DATA_PATH+'structures/*.xyz'))
xyz_filepath_list.sort()
n_mols, mol_ids = len(xyz_filepath_list), {}
for i in tqdm_notebook(range(n_mols)):
    filepath = xyz_filepath_list[i]
    mol_name = filepath.split('/')[-1][:-4]
    mol_ids[mol_name] = i
structures_df['molecule_id'] = structures_df['molecule_name'].map(mol_ids)

HBox(children=(IntProgress(value=0, max=130775), HTML(value='')))




In [184]:
in_out_idx = pd.concat((pairs_idx, pairs_idx.rename(columns={'idx_0': 'idx_1', 'idx_1': 'idx_0'})), sort=False)
gb_pairs_mol_0 = in_out_idx.groupby(['molecule_id', 'idx_0'])

In [96]:
def get_combinations(idx_0_group):
    s = list(idx_0_group['idx_1'])[1:]
    return [list(itertools.combinations(s, r))[-1] for r in range(len(s), 0, -1)]

In [188]:
angle_idxs = []
for it, (mol_id, idx_0) in enumerate(gb_pairs_mol_0.groups):
    print_progress(it, print_iter=50000)
    idx_0_group = gb_pairs_mol_0.get_group((mol_id, idx_0))
    combs = get_combinations(idx_0_group)
    for i, comb in enumerate(combs):
        idx_1 = idx_0_group['idx_1'].iloc[i]
        for idx_2 in comb:
            angle_idxs.append((mol_id, idx_0, idx_1, idx_2))

0
50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000
650000
700000
750000
800000
850000
900000
950000
1000000
1050000
1100000
1150000
1200000
1250000
1300000
1350000
1400000
1450000
1500000
1550000
1600000
1650000
1700000
1750000
1800000
1850000
1900000
1950000
2000000
2050000
2100000
2150000
2200000
2250000
2300000
2350000


In [189]:
angle_df = pd.DataFrame(angle_idxs, columns=['molecule_id', 'atom_index_0', 'atom_index_1', 'atom_index_2'])
angle_df['molecule_name'] = angle_df['molecule_id'].map({v:k for k,v in mol_ids.items()})
angle_df.drop(columns='molecule_id', inplace=True)

for i in range(3): angle_df = map_atom_info(angle_df, i, structures_df)
angle_df.drop(columns=['atom_0', 'atom_1', 'atom_2', 'molecule_id_x', 'molecule_id_y'], inplace=True)

for c in ['x', 'y', 'z']:
    angle_df[f'{c}_0_1'] = angle_df[f'{c}_0'].values - angle_df[f'{c}_1'].values
    angle_df[f'{c}_0_2'] = angle_df[f'{c}_0'].values - angle_df[f'{c}_2'].values
def cos_angles(v1, v2):
    return (v1*v2).sum(1) / np.sqrt((v1**2).sum(1) * (v2**2).sum(1))
angle_df['cos_angle'] = cos_angles(angle_df[['x_0_1', 'y_0_1', 'z_0_1']].values, 
                                   angle_df[['x_0_2', 'y_0_2', 'z_0_2']].values)
angle_df = angle_df[['molecule_id', 'atom_index_0', 'atom_index_1', 'atom_index_2', 'cos_angle']]
angle_df.head(50)

Unnamed: 0,molecule_id,atom_index_0,atom_index_1,atom_index_2,cos_angle
0,0,0,1,2,-0.333287
1,0,0,1,3,-0.333335
2,0,0,1,4,-0.333347
3,0,0,2,3,-0.333352
4,0,0,2,4,-0.333337
5,0,0,3,4,-0.333342
6,1,0,1,2,-0.265915
7,1,0,1,3,-0.266182
8,1,0,2,3,-0.266179
9,2,0,1,2,-0.237105


In [191]:
gb_angle_mol = angle_df.groupby('molecule_id')
gb_pairs_mol = pairs_idx.groupby('molecule_id')

In [257]:
angle_to_in_edge, angle_to_out_edge, mol_ids = [], [], angle_df['molecule_id'].unique()
for mol_id in tqdm_notebook(mol_ids):
    p_df, a_df = gb_pairs_mol.get_group(mol_id), gb_angle_mol.get_group(mol_id)
    p_in_idx, p_out_idx = p_df[['idx_0', 'idx_1']].values, p_df[['idx_1', 'idx_0']].values
    a1 = a_df[['atom_index_0', 'atom_index_1', 'cos_angle']].values 
    a2 = a_df[['atom_index_0', 'atom_index_2', 'cos_angle']].values
    for a in np.concatenate((a1, a2)): 
        if any(np.all(p_in_idx==a[:2], axis=1)): 
            a_to_in_idx = np.where(np.all(p_in_idx==a[:2], axis=1))[0][0]
            angle_to_in_edge.append((mol_id, a_to_in_idx, a[-1]))
        if any(np.all(p_out_idx==a[:2], axis=1)): 
            a_to_out_idx = np.where(np.all(p_out_idx==a[:2], axis=1))[0][0]
            angle_to_out_edge.append((mol_id, a_to_out_idx, a[-1]))

HBox(children=(IntProgress(value=0, max=130775), HTML(value='')))

In [260]:
angle_in_df = pd.DataFrame(angle_to_in_edge, columns=['molecule_id', 'p_idx', 'cos_angle'])
angle_out_df = pd.DataFrame(angle_to_out_edge, columns=['molecule_id', 'p_idx', 'cos_angle'])

In [263]:
angle_in_df.to_csv(PATH+'angle_in_df.csv')
angle_out_df.to_csv(PATH+'angle_out_df.csv')

## Create features

In [73]:
# concatenate train and test into one dataframe
all_df = pd.concat((train_df, test_df), sort=True)
if 'id' in all_df.columns: all_df.drop(columns='id', inplace=True)
clear_memory(['train_df', 'test_df'])

In [14]:
all_df.head()

Unnamed: 0_level_0,atom_index_0,atom_index_1,molecule_name,scalar_coupling_constant,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,0,dsgdb9nsd_000001,84.8076,1JHC
1,1,2,dsgdb9nsd_000001,-11.257,2JHH
2,1,3,dsgdb9nsd_000001,-11.2548,2JHH
3,1,4,dsgdb9nsd_000001,-11.2543,2JHH
4,2,0,dsgdb9nsd_000001,84.8074,1JHC


## Create Scalar Coupling level features

In [118]:
# compute distance features
def map_atom_info(df, atom_idx, struct_df):
    df = pd.merge(df, struct_df, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])

    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df

def add_dist(df, struct_df):
    df = map_atom_info(df, 0, struct_df)
    df = map_atom_info(df, 1, struct_df)
    p_0 = df[['x_0', 'y_0', 'z_0']].values
    p_1 = df[['x_1', 'y_1', 'z_1']].values
    df['dist'] = np.linalg.norm(p_0 - p_1, axis=1)
    df.drop(
        columns=['x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1'],
        inplace=True
    )
    return df

def transform_per_atom_group(df, a_idx, col='dist', trans='mean'):
    return df.groupby(['molecule_name', f'atom_index_{a_idx}'])[col].transform(trans)

def inv_dist_per_atom(df, a_idx, d_col='dist', power=3):
    trans = lambda x: 1/(sum(x**-power))
    return transform_per_atom_group(df, a_idx, d_col, trans=trans)

def inv_dist_harm_mean(df, postfix=''):
    c0, c1 = 'inv_dist0' + postfix, 'inv_dist1' + postfix
    return (df[c0] * df[c1]) / (df[c0] + df[c1])

In [16]:
if 'dist' not in all_df.columns: all_df = add_dist(all_df, structures_df)

In [17]:
all_df['R0'] = all_df['atom_0'].map(ATOMIC_RADIUS)
all_df['R1'] = all_df['atom_1'].map(ATOMIC_RADIUS)
all_df['E0'] = all_df['atom_0'].map(ELECTRO_NEG)
all_df['E1'] = all_df['atom_1'].map(ELECTRO_NEG)
all_df['dist_min_rad'] = all_df['dist'] - all_df['R0'] - all_df['R1']
all_df['dist_electro_neg_adj'] = .5 * all_df['dist'] * (all_df['E0'] + all_df['E1'])

In [18]:
# for a_idx in [0, 1]: 
#     all_df[f'inv_dist{a_idx}'] = inv_dist_per_atom(all_df, a_idx, 'dist', 3)
#     all_df[f'inv_dist{a_idx}R'] = inv_dist_per_atom(all_df, a_idx, 'dist_min_rad', 2)
#     all_df[f'inv_dist{a_idx}E'] = inv_dist_per_atom(all_df, a_idx, 'dist_electro_neg_adj', 2)
# all_df['inv_distP'] = inv_dist_harm_mean(all_df, postfix='')
# all_df['inv_distPR'] = inv_dist_harm_mean(all_df, postfix='R')
# all_df['inv_distPE'] = inv_dist_harm_mean(all_df, postfix='E')

In [19]:
all_df.drop(columns=['R0','R1','E0','E1'], inplace=True)

In [20]:
# for a_idx in [0, 1]:
#     all_df[f'mean_dist{a_idx}'] = transform_per_atom_group(all_df, a_idx, col='dist', trans='mean')
#     all_df[f'std_dist{a_idx}'] = transform_per_atom_group(all_df, a_idx, col='dist', trans='std')

In [21]:
gb_type_dist = all_df.groupby('type')['dist']
all_df['normed_dist'] = (all_df['dist'] - gb_type_dist.transform('mean')) / gb_type_dist.transform('std')

In [22]:
all_df['inv_dist'] = 1 / all_df['dist']
gb_type_inv_dist = all_df.groupby('type')['inv_dist']
all_df['normed_inv_dist'] = ((all_df['inv_dist'] - gb_type_inv_dist.transform('mean')) 
                             / gb_type_inv_dist.transform('std'))

In [23]:
all_df.head(20)

Unnamed: 0,atom_index_0,atom_index_1,molecule_name,scalar_coupling_constant,type,atom_0,atom_1,dist,dist_min_rad,dist_electro_neg_adj,normed_dist,inv_dist,normed_inv_dist
0,1,0,dsgdb9nsd_000001,84.8076,1JHC,H,C,1.091953,-0.058047,2.593389,-0.140151,0.91579,0.13305
1,1,2,dsgdb9nsd_000001,-11.257,2JHH,H,H,1.78312,1.02312,3.922863,0.354391,0.560815,-0.359588
2,1,3,dsgdb9nsd_000001,-11.2548,2JHH,H,H,1.783147,1.023147,3.922924,0.355577,0.560806,-0.360743
3,1,4,dsgdb9nsd_000001,-11.2543,2JHH,H,H,1.783157,1.023157,3.922945,0.35597,0.560803,-0.361125
4,2,0,dsgdb9nsd_000001,84.8074,1JHC,H,C,1.091952,-0.058048,2.593385,-0.140363,0.915791,0.13326
5,2,3,dsgdb9nsd_000001,-11.2541,2JHH,H,H,1.783158,1.023158,3.922947,0.356012,0.560803,-0.361166
6,2,4,dsgdb9nsd_000001,-11.2548,2JHH,H,H,1.783148,1.023148,3.922926,0.355616,0.560806,-0.36078
7,3,0,dsgdb9nsd_000001,84.8093,1JHC,H,C,1.091946,-0.058054,2.593373,-0.141133,0.915796,0.134027
8,3,4,dsgdb9nsd_000001,-11.2543,2JHH,H,H,1.783148,1.023148,3.922925,0.355593,0.560806,-0.360759
9,4,0,dsgdb9nsd_000001,84.8095,1JHC,H,C,1.091948,-0.058052,2.593375,-0.140962,0.915795,0.133857


In [24]:
# add atome counts
def add_atom_counts(df, struct_df):
    pd.options.mode.chained_assignment = None
    atoms_per_mol_df = struct_df.groupby(['molecule_name', 'atom']).count()
    atoms_per_mol_map = atoms_per_mol_df['atom_index'].unstack().fillna(0)
    atoms_per_mol_map = atoms_per_mol_map.astype(int).to_dict()
    df['num_atoms'] = 0
    for a in atoms_per_mol_map:
        df[f'num_{a}_atoms'] = df['molecule_name'].map(atoms_per_mol_map[a])
        df['num_atoms'] += df[f'num_{a}_atoms']
    return df

if 'molecule_name' in all_df.columns: all_df = add_atom_counts(all_df, structures_df)

In [25]:
# process scalar coupling types
if all_df['type'].dtype == str:
    all_df['type'] = all_df['type'].map(TYPES_MAP)
    all_df = pd.concat((all_df, pd.get_dummies(all_df['type'], prefix='type')), axis=1)

In [26]:
all_df.head()

Unnamed: 0,atom_index_0,atom_index_1,molecule_name,scalar_coupling_constant,type,atom_0,atom_1,dist,dist_min_rad,dist_electro_neg_adj,normed_dist,inv_dist,normed_inv_dist,num_atoms,num_C_atoms,num_F_atoms,num_H_atoms,num_N_atoms,num_O_atoms
0,1,0,dsgdb9nsd_000001,84.8076,1JHC,H,C,1.091953,-0.058047,2.593389,-0.140151,0.91579,0.13305,5,1,0,4,0,0
1,1,2,dsgdb9nsd_000001,-11.257,2JHH,H,H,1.78312,1.02312,3.922863,0.354391,0.560815,-0.359588,5,1,0,4,0,0
2,1,3,dsgdb9nsd_000001,-11.2548,2JHH,H,H,1.783147,1.023147,3.922924,0.355577,0.560806,-0.360743,5,1,0,4,0,0
3,1,4,dsgdb9nsd_000001,-11.2543,2JHH,H,H,1.783157,1.023157,3.922945,0.35597,0.560803,-0.361125,5,1,0,4,0,0
4,2,0,dsgdb9nsd_000001,84.8074,1JHC,H,C,1.091952,-0.058048,2.593385,-0.140363,0.915791,0.13326,5,1,0,4,0,0


In [50]:
## Create molecules
def mol_from_xyz(filepath, add_hs=True, compute_dist_centre=False):
    """Wrapper function for calling xyz2mol function."""
    charged_fragments = True  # alternatively radicals are made

    # quick is faster for large systems but requires networkx
    # if you don't want to install networkx set quick=False and
    # uncomment 'import networkx as nx' at the top of the file
    quick = True

    atomicNumList, charge, xyz_coordinates = read_xyz_file(filepath)
    mol, dMat = xyz2mol(atomicNumList, charge, xyz_coordinates,
                        charged_fragments, quick, check_chiral_stereo=False)

    # Compute distance from centroid
    xyz_arr = np.array(xyz_coordinates)
    if compute_dist_centre:
        centroid = xyz_arr.mean(axis=0)
        dFromCentroid = norm(xyz_arr - centroid, axis=1)
    else:
        dFromCentroid = None

    return mol, xyz_arr, dMat, dFromCentroid

# get xyx files and number of molecules
xyz_filepath_list = list(glob(DATA_PATH+'structures/*.xyz'))
xyz_filepath_list.sort()
n_mols = len(xyz_filepath_list)
print('total xyz files # ', n_mols)

# transform .xyz to .mol files and store distance matrices
mol_feat_columns = ['ave_bond_length', 'std_bond_length', 'total_bond_length', 
                    'ave_inv_bond_length', 'total_inv_bond_length', 'ave_atom_weight', 
                    'total_atom_weight']
dist_matrices, graph_dist_matrices = {}, {}
xyzs, mols, mol_ids, mol_feats = {}, {}, {}, {}
for i in tqdm_notebook(range(n_mols)):
    filepath = xyz_filepath_list[i]
    mol_name = filepath.split('/')[-1][:-4]
    mol, xyz, dist_matrix, _ = mol_from_xyz(filepath)
    mols[mol_name] = mol
    xyzs[mol_name] = xyz
    dist_matrices[mol_name] = dist_matrix
    mol_ids[mol_name] = i
    
    n_atoms = len(xyz)
    graph_dist_matrix = pd.DataFrame(
        np.pad(rdmolops.GetDistanceMatrix(mol), [(0, 0), (0, MAX_N_ATOMS - n_atoms)], 'constant'))
    graph_dist_matrix['molecule_id'] = n_atoms * [i]
    graph_dist_matrices[mol_name] = graph_dist_matrix
    
    adj_matrix = rdmolops.GetAdjacencyMatrix(mol)
    atomic_num_list, _, _ = read_xyz_file(filepath)
    dists = dist_matrix.ravel()[np.tril(adj_matrix).ravel()==1]
    inv_dists = 1. / dists
    mol_feats[mol_name] = pd.Series([np.mean(dists), np.std(dists), sum(dists), np.mean(inv_dists), 
                                     sum(inv_dists), np.mean(atomic_num_list)/10, sum(atomic_num_list)/10],
                                   index=mol_feat_columns)

total xyz files #  130775


HBox(children=(IntProgress(value=0, max=130775), HTML(value='')))

Sanitization error: Explicit valence for atom # 4 C greater than permitted
Sanitization error: Explicit valence for atom # 4 C greater than permitted
Sanitization error: Explicit valence for atom # 5 C greater than permitted
Sanitization error: Explicit valence for atom # 3 C greater than permitted
Sanitization error: Explicit valence for atom # 3 C greater than permitted
Sanitization error: Explicit valence for atom # 5 C greater than permitted
Sanitization error: Explicit valence for atom # 5 C greater than permitted
Sanitization error: Explicit valence for atom # 2 C greater than permitted
Sanitization error: Explicit valence for atom # 7 C greater than permitted
Sanitization error: Explicit valence for atom # 2 C greater than permitted
Sanitization error: Explicit valence for atom # 4 C greater than permitted
Sanitization error: Explicit valence for atom # 3 C greater than permitted
Sanitization error: Explicit valence for atom # 5 C greater than permitted
Sanitization error: Expli

In [83]:
graph_dist_df = pd.concat(graph_dist_matrices)
graph_dist_df.reset_index(drop=True, inplace=True)
graph_dist_df.replace(1e8, 10, inplace=True)
graph_dist_df = graph_dist_df.astype(int)
graph_dist_df.tail(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,molecule_id
2358647,1,2,1,2,2,1,0,1,2,2,2,2,3,3,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,130774
2358648,2,1,2,2,2,2,1,0,1,3,3,3,3,3,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,130774
2358649,3,2,2,1,1,2,2,1,0,4,4,3,2,2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,130774
2358650,1,2,3,4,4,3,2,3,4,0,2,4,5,5,4,5,0,0,0,0,0,0,0,0,0,0,0,0,0,130774
2358651,1,2,3,4,4,3,2,3,4,2,0,4,5,5,4,5,0,0,0,0,0,0,0,0,0,0,0,0,0,130774
2358652,3,2,1,2,3,3,2,3,3,4,4,0,3,4,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,130774
2358653,4,3,2,1,2,3,3,3,2,5,5,3,0,3,4,3,0,0,0,0,0,0,0,0,0,0,0,0,0,130774
2358654,4,4,3,2,1,2,3,3,2,5,5,4,3,0,4,3,0,0,0,0,0,0,0,0,0,0,0,0,0,130774
2358655,3,2,3,3,3,3,2,1,2,4,4,4,4,4,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,130774
2358656,4,3,3,2,2,3,3,2,1,5,5,4,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,130774


In [85]:
graph_dist_df.to_csv(PATH+'graph_dist_df.csv')

## Add Molecule Features

In [28]:
if 'molecule_name' not in all_df.columns:
    mol_names = {v:k for k,v in mol_ids.items()}
    all_df['molecule_name'] = all_df['molecule_id'].map(mol_names)

In [29]:
mol_feat_df = pd.concat(mol_feats, axis=1).T
mol_feat_dict = mol_feat_df.to_dict()
for f in mol_feat_columns: 
    all_df[f] = all_df['molecule_name'].map(mol_feat_dict[f])

In [30]:
all_df.tail(20)

Unnamed: 0,atom_index_0,atom_index_1,molecule_name,scalar_coupling_constant,type,atom_0,atom_1,dist,dist_min_rad,dist_electro_neg_adj,normed_dist,inv_dist,normed_inv_dist,num_atoms,num_C_atoms,num_F_atoms,num_H_atoms,num_N_atoms,num_O_atoms,ave_bond_length,std_bond_length,total_bond_length,ave_inv_bond_length,total_inv_bond_length,ave_atom_weight,total_atom_weight
7163669,13,6,dsgdb9nsd_133885,,3JHC,H,C,3.11929,1.96929,7.408315,0.12875,0.320586,-0.225154,16,7,0,7,1,1,1.36647,0.209055,27.329405,0.751031,15.020621,0.4,6.4
7163670,13,7,dsgdb9nsd_133885,,3JHC,H,C,3.378975,2.228975,8.025065,0.96055,0.295948,-0.956605,16,7,0,7,1,1,1.36647,0.209055,27.329405,0.751031,15.020621,0.4,6.4
7163671,13,8,dsgdb9nsd_133885,,2JHC,H,C,2.32455,1.17455,5.520806,1.572423,0.430191,-1.468509,16,7,0,7,1,1,1.36647,0.209055,27.329405,0.751031,15.020621,0.4,6.4
7163672,13,15,dsgdb9nsd_133885,,3JHH,H,H,2.740317,1.980317,6.028697,0.145742,0.364921,-0.243861,16,7,0,7,1,1,1.36647,0.209055,27.329405,0.751031,15.020621,0.4,6.4
7163673,14,0,dsgdb9nsd_133885,,3JHC,H,C,2.532645,1.382645,6.015032,-1.750345,0.394844,1.979424,16,7,0,7,1,1,1.36647,0.209055,27.329405,0.751031,15.020621,0.4,6.4
7163674,14,1,dsgdb9nsd_133885,,2JHN,H,N,2.248295,1.118295,5.890534,1.659761,0.444781,-1.586774,16,7,0,7,1,1,1.36647,0.209055,27.329405,0.751031,15.020621,0.4,6.4
7163675,14,2,dsgdb9nsd_133885,,3JHC,H,C,3.093895,1.943895,7.348001,0.047407,0.323217,-0.147033,16,7,0,7,1,1,1.36647,0.209055,27.329405,0.751031,15.020621,0.4,6.4
7163676,14,3,dsgdb9nsd_133885,,3JHC,H,C,3.367225,2.217225,7.99716,0.922916,0.29698,-0.925948,16,7,0,7,1,1,1.36647,0.209055,27.329405,0.751031,15.020621,0.4,6.4
7163677,14,4,dsgdb9nsd_133885,,3JHC,H,C,3.192797,2.042797,7.582892,0.3642,0.313205,-0.444273,16,7,0,7,1,1,1.36647,0.209055,27.329405,0.751031,15.020621,0.4,6.4
7163678,14,6,dsgdb9nsd_133885,,2JHC,H,C,2.298957,1.148957,5.460023,1.272861,0.43498,-1.209034,16,7,0,7,1,1,1.36647,0.209055,27.329405,0.751031,15.020621,0.4,6.4


## Compute Angles

In [45]:
# add angular features
def dihedral(p):
    """Praxeolitic formula
    1 sqrt, 1 cross product"""
    p0 = p[0]
    p1 = p[1]
    p2 = p[2]
    p3 = p[3]

    b0 = -1.0*(p1 - p0)
    b1 = p2 - p1
    b2 = p3 - p2

    # normalize b1 so that it does not influence magnitude of vector
    # rejections that come next
    b1 /= np.linalg.norm(b1)

    # vector rejections
    # v = projection of b0 onto plane perpendicular to b1
    #   = b0 minus component that aligns with b1
    # w = projection of b2 onto plane perpendicular to b1
    #   = b2 minus component that aligns with b1
    v = b0 - np.dot(b0, b1)*b1
    w = b2 - np.dot(b2, b1)*b1

    # angle between v and w in a plane is the torsion angle
    # v and w may not be normalized but that's fine since tan is y/x
    x = np.dot(v, w)
    y = np.dot(np.cross(b1, v), w)
    return np.arctan2(y, x)

def cosine_angle(p):
    p0, p1, p2 = p[0], p[1], p[2]
    v1, v2 = p0 - p1, p2 - p1
    return np.dot(v1, v2) / np.sqrt(np.dot(v1, v1) * np.dot(v2, v2))

In [None]:
all_df['diangle'] = 0.0 # for 3J couplings
all_df['cos_angle'] = 0.0 # for 2J couplings
all_df['cos_angle0'] = 0.0 # for all types cos angle between sc atoms and atom closest to atom 0, given its not an 1J coupling 
all_df['cos_angle1'] = 0.0 # for all types cos angle between sc atoms and atom closest to atom 1

In [66]:
diangles, cos_angles, cos_angles0, cos_angles1 = {}, {}, {}, {}
for idx, row in all_df.iterrows():
    print_progress(idx, 500000)
    mol_name = row['molecule_name']
    mol, xyz = mols[mol_name], xyzs[mol_name]
    dist_matrix, adj_matrix = dist_matrices[mol_name], rdmolops.GetAdjacencyMatrix(mol)
    idx0, idx1 = row['atom_index_0'], row['atom_index_1']
    atom_ids = rdmolops.GetShortestPath(mol, idx0, idx1)
    
    if len(atom_ids)==4:
        diangles[idx] = dihedral(xyz[atom_ids,:])
    elif len(atom_ids)==3:
        cos_angles[idx] = cosine_angle(xyz[atom_ids,:])
    
    try:
        if row['type'] not in [0, 2]:
            neighbors0 = np.where(adj_matrix[idx0]==1)[0]
            idx0_closest = neighbors0[dist_matrix[idx0][neighbors0].argmin()]
            cos_angles0[idx] = cosine_angle(xyz[[idx0_closest, idx0, idx1],:])
        neighbors1 = np.setdiff1d(np.where(adj_matrix[idx1]==1)[0], [idx0])
        idx1_closest = neighbors1[dist_matrix[idx1][neighbors1].argmin()]
        cos_angles1[idx] = cosine_angle(xyz[[idx0, idx1, idx1_closest],:])
    except Exception as e:
        print(e)

0
500000
1000000
1500000
attempt to get argmin of an empty sequence
attempt to get argmin of an empty sequence
attempt to get argmin of an empty sequence
attempt to get argmin of an empty sequence
attempt to get argmin of an empty sequence
attempt to get argmin of an empty sequence
2000000
2500000
3000000
3500000
4000000
4500000
5000000
5500000
6000000
6500000
7000000


In [85]:
all_df['diangle'] = pd.Series(diangles).abs()
all_df['cos_angle'] = pd.Series(cos_angles)
all_df['cos_angle0'] = pd.Series(cos_angles0)
all_df['cos_angle1'] = pd.Series(cos_angles1)
all_df.fillna(0., inplace=True)

In [113]:
pd.set_option("display.max_columns", 100)
all_df.head(20)

Unnamed: 0,atom_index_0,atom_index_1,scalar_coupling_constant,type,atom_0,atom_1,dist,dist_min_rad,dist_electro_neg_adj,normed_dist,num_atoms,num_C_atoms,num_F_atoms,num_H_atoms,num_N_atoms,num_O_atoms,type_0,type_1,type_2,type_3,type_4,type_5,type_6,type_7,diangle,cos_angle,cos_angle0,cos_angle1,molecule_id
0,1,0,84.8076,0,H,C,1.091953,-0.058047,2.593389,-0.140151,5,1,0,4,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,-0.333335,0
1,1,2,-11.257,1,H,H,1.78312,1.02312,3.922863,0.354391,5,1,0,4,0,0,0,1,0,0,0,0,0,0,0.0,-0.333287,0.816483,0.816482,0
2,1,3,-11.2548,1,H,H,1.783147,1.023147,3.922924,0.355577,5,1,0,4,0,0,0,1,0,0,0,0,0,0,0.0,-0.333335,0.816498,0.816496,0
3,1,4,-11.2543,1,H,H,1.783157,1.023157,3.922945,0.35597,5,1,0,4,0,0,0,1,0,0,0,0,0,0,0.0,-0.333347,0.816502,0.8165,0
4,2,0,84.8074,0,H,C,1.091952,-0.058048,2.593385,-0.140363,5,1,0,4,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,-0.333352,0
5,2,3,-11.2541,1,H,H,1.783158,1.023158,3.922947,0.356012,5,1,0,4,0,0,0,1,0,0,0,0,0,0,0.0,-0.333352,0.816503,0.816501,0
6,2,4,-11.2548,1,H,H,1.783148,1.023148,3.922926,0.355616,5,1,0,4,0,0,0,1,0,0,0,0,0,0,0.0,-0.333337,0.816498,0.816497,0
7,3,0,84.8093,0,H,C,1.091946,-0.058054,2.593373,-0.141133,5,1,0,4,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,-0.333342,0
8,3,4,-11.2543,1,H,H,1.783148,1.023148,3.922925,0.355593,5,1,0,4,0,0,0,1,0,0,0,0,0,0,0.0,-0.333342,0.816499,0.8165,0
9,4,0,84.8095,0,H,C,1.091948,-0.058052,2.593375,-0.140962,5,1,0,4,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,-0.333342,0


In [109]:
## Store processed dfs
# add molecule ids to dataframes
all_df['molecule_id'] = all_df['molecule_name'].map(mol_ids)
all_df.drop(columns=['molecule_name'], inplace=True)

In [110]:
train_df = all_df.iloc[:N_TRAIN_DF]
test_df = all_df.iloc[N_TRAIN_DF:]
test_df.drop(columns='scalar_coupling_constant', inplace=True)
clear_memory(['all_df'])

In [111]:
train_df.shape, test_df.shape

((4658147, 38), (2505542, 37))

In [112]:
train_df.to_csv(PATH + 'train_proc_df.csv')
test_df.to_csv(PATH + 'test_proc_df.csv')

In [114]:
## Engineer features

# functions partially sourced from:
# https://deepchem.io/docs/_modules/deepchem/feat/graph_features.html
def one_hot_encoding(x, set):
    one_hot = [int(x == s) for s in set]
    if 0:
        if sum(one_hot)==0: print('one_hot_encoding() return NULL!', x, set)
    return one_hot

def one_of_k_encoding(x, allowable_set):
    if x not in allowable_set:
        raise Exception(f"input {x} not in allowable set{allowable_set}:")
    return list(map(lambda s: x == s, allowable_set))

def one_of_k_encoding_unk(x, allowable_set):
    """Maps inputs not in the allowable set to the last element."""
    if x not in allowable_set:
        x = allowable_set[-1]
    return list(map(lambda s: x == s, allowable_set))

def get_edge_features(mol, eucl_dist):
    """
    Compute the following features for each bond in 'mol':
        - bond type: categorical {1: single, 2: double, 3: triple,
            4: aromatic} (one-hot)
        - is conjugated: bool {0, 1}
        - is in ring: bool {0, 1}
        - euclidean distance: float
        - normalized eucl distance: float
    """
    n_edges = mol.GetNumBonds()
    features = np.zeros((n_edges, N_EDGE_FEATURES))
    pairs_idx = np.zeros((n_edges, 2))
    for n, e in enumerate(mol.GetBonds()):
        i = e.GetBeginAtomIdx()
        j = e.GetEndAtomIdx()
        dc_e_feats = dc.feat.graph_features.bond_features(e).astype(int)
        features[n, :6] = dc_e_feats
        features[n, 6] = eucl_dist[i, j]
        pairs_idx[n] = i, j
    sorted_idx = pairs_idx[:,0].argsort()
    dists = features[:, 6]
    features[:, 7] = (dists - dists.mean()) / dists.std() # normed_dist
    inv_dists = 1. / dists
    features[:, 8] = inv_dists
    features[:, 9] = (inv_dists - inv_dists.mean()) / inv_dists.std() # normed_inv_dist
    return features[sorted_idx], pairs_idx[sorted_idx]

def get_atom_features(mol, dist_matrix):
    """
    Compute the following features for each atom in 'mol':
        - atom type: H, C, N, O, F (one-hot)
        - degree: 1, 2, 3, 4, 5 (one-hot)
        - Hybridization: SP, SP2, SP3, UNSPECIFIED (one-hot)
        - is aromatic: bool {0, 1}
        - formal charge: int
        - atomic number: float
        - donor: bool {0, 1}
        - acceptor: bool {0, 1}
        - average bond length: float
    """
    n_atoms = mol.GetNumAtoms()
    features = np.zeros((n_atoms, N_ATOM_FEATURES))
    adj_matrix = rdmolops.GetAdjacencyMatrix(mol)
    for a in mol.GetAtoms():
        idx = a.GetIdx()
        if sum(adj_matrix[idx]) > 0:
            ave_bond_length = np.mean(dist_matrix[idx][adj_matrix[idx]==1])
            ave_inv_bond_length = np.mean(1. / dist_matrix[idx][adj_matrix[idx]==1])
            ave_neighbor_wt = np.mean([n.GetAtomicNum() / 10 for n in a.GetNeighbors()])
        else:
            ave_bond_length =  0.0
            ave_inv_bond_length = 0.0
            ave_neighbor_wt = 0.0
        
        sym = a.GetSymbol()
        a_feats = one_hot_encoding(sym, SYMBOLS) \
            + one_hot_encoding(a.GetDegree(), DEGREES) \
            + one_hot_encoding(a.GetHybridization(), HYBRIDIZATIONS) \
            + [a.GetIsAromatic(), a.GetFormalCharge(), a.GetAtomicNum() / 10,
               ave_bond_length, ave_inv_bond_length, ave_neighbor_wt]
        features[idx, :len(a_feats)] = np.array(a_feats)

    feat_factory = ChemicalFeatures.BuildFeatureFactory(FDEF)
    try:
        chem_feats = feat_factory.GetFeaturesForMol(mol)
        for t in range(len(chem_feats)):
            if chem_feats[t].GetFamily() == 'Donor':
                for i in chem_feats[t].GetAtomIds():
                    features[i, -2] = 1
            elif chem_feats[t].GetFamily() == 'Acceptor':
                for i in chem_feats[t].GetAtomIds():
                    features[i, -1] = 1
    except RuntimeError as e:
        print(e)

    return features

In [115]:
# create features
atom_features = []
edge_features = []
pairs_idx = []
atom_to_m_id = []
edge_to_m_id = []

for i, m_name in enumerate(mols):
    print_progress(i)
    m_id, mol, = mol_ids[m_name], mols[m_name]
    dist_matrix = dist_matrices[m_name]
    n_atoms, n_edges = mol.GetNumAtoms(), mol.GetNumBonds()
    
    atom_features.append(get_atom_features(mol, dist_matrix))

    e_feats, p_idx = get_edge_features(mol, dist_matrix)
    edge_features.append(e_feats)
    pairs_idx.append(p_idx)
    
    atom_to_m_id.append(np.repeat(m_id, n_atoms))
    edge_to_m_id.append(np.repeat(m_id, n_edges))

0
10000
Pre-condition Violation
	getNumImplicitHs() called without preceding call to calcImplicitValence()
	Violation occurred on line 152 in file Code/GraphMol/Atom.cpp
	Failed Expression: d_implicitValence > -1
	RDKIT: 2019.03.2
	BOOST: 1_68

20000
30000
Pre-condition Violation
	getExplicitValence() called without call to calcExplicitValence()
	Violation occurred on line 161 in file Code/GraphMol/Atom.cpp
	Failed Expression: d_explicitValence > -1
	RDKIT: 2019.03.2
	BOOST: 1_68

Pre-condition Violation
	getNumImplicitHs() called without preceding call to calcImplicitValence()
	Violation occurred on line 152 in file Code/GraphMol/Atom.cpp
	Failed Expression: d_implicitValence > -1
	RDKIT: 2019.03.2
	BOOST: 1_68

40000
Pre-condition Violation
	getExplicitValence() called without call to calcExplicitValence()
	Violation occurred on line 161 in file Code/GraphMol/Atom.cpp
	Failed Expression: d_explicitValence > -1
	RDKIT: 2019.03.2
	BOOST: 1_68

Pre-condition Violation
	getNumImplicitHs(

In [116]:
atom_features = pd.DataFrame(np.concatenate(atom_features), columns=ATOM_COLUMNS)
edge_features = pd.DataFrame(np.concatenate(edge_features), columns=EDGE_COLUMNS)
pairs_idx = np.concatenate(pairs_idx)
edge_features['idx_0'] = pairs_idx[:,0]
edge_features['idx_1'] = pairs_idx[:,1]
atom_features['molecule_id'] = np.concatenate(atom_to_m_id) 
edge_features['molecule_id'] = np.concatenate(edge_to_m_id) 

In [117]:
atom_features.to_csv(PATH + 'atom_df.csv')
edge_features.to_csv(PATH + 'edge_df.csv')

## Create distance dataframe
Zero added rows of distances for each atom to all other atoms in their corresponding molecule.

In [59]:
dist_df = pd.DataFrame(np.concatenate(
    [np.pad(dm, [(0,0), (0, 29-dm.shape[1])], mode='constant')
     for dm in dist_matrices.values()]
))
dist_df['molecule_id'] = atom_features['molecule_id']

In [64]:
dist_df.to_csv(PATH + 'dist_df.csv')