In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the entries of ../input; the read_csv calls and .xyz paths used later assume this location
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Install RDKit from the rdkit conda channel (-y answers the confirmation prompt)
!conda install -y -c rdkit rdkit;
from rdkit import Chem
# Install Open Babel from the openbabel conda channel; pybel itself is imported in a later cell
!conda install -y -c openbabel openbabel;
# Direct openbabel/pybel imports are left disabled in this cell
#import openbabel as ob
#import pybel as pb

In [None]:
# import data
# train/test are the frames that DecomposeType and ExtendDataframe operate on below
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
# structures.csv is not loaded; atom data is read from the per-molecule .xyz files instead
#structures = pd.read_csv("../input/structures.csv")

In [None]:
# Write an empty DataFrame to magic.csv before importing pybel.
# NOTE(review): why this file is needed is not evident from this notebook -- presumably an
# environment workaround; confirm before removing.
pd.DataFrame().to_csv('magic.csv',index=False)
import pybel as pb

We can extract features from the coupling type. See my previous kernel [here](https://www.kaggle.com/edeanf/190621-predmolprop-exploration).

In [None]:
def PrintDataframe(df):
    """Print the full, untruncated text rendering of df followed by a blank line."""
    rendered = df.to_string()
    print(rendered, '\n')
    
def DecomposeType(df):
    """
    Split the coupling type label into two new columns, modifying df in place.

    num_bonds     - integer value of the first character of `type`
    atom_end_type - last character of `type`
    """
    def leading_count(label):
        return int(label[0])

    def trailing_symbol(label):
        return label[-1]

    df['num_bonds'] = df.type.map(leading_count)
    df['atom_end_type'] = df.type.map(trailing_symbol)

# DecomposeType adds num_bonds and atom_end_type to each frame in place
DecomposeType(train)
DecomposeType(test)
# Show the first rows in full to check the new columns
PrintDataframe(train.head())

Although some features can be extracted from structures.csv, using the .xyz files is more powerful. I tried using the following API from jensengroup on [GitHub](https://github.com/jensengroup/xyz2mol) to import .xyz files into RDKit's Mol object, but it produced errors. Jensen also [mentions](https://proteinsandwavefunctions.blogspot.com/2018/01/xyz2mol-converting-xyz-file-to-rdkit.html) that one can use Open Babel to convert to SMILES and then convert that to an RDKit Mol object, but the ordering of the atoms can change, which is bad since we need to be able to reference atoms by their original indices. So I wrote my own code, which uses Open Babel.

In [None]:
from collections import Counter

def ProcessXYZ(filepath):
    '''
    Reads an .xyz file with Open Babel (pybel) and extracts distance, bond and
    per-atom data for the first molecule in the file.

    Parameters
    ----------
    filepath : path of the .xyz file to read

    Returns
    -------
    A 10-tuple, in this order:
        DistMat      - 3D distance matrix computed by RDKit from the atom coordinates
        AdjMat       - (num_atoms, num_atoms) integer matrix holding the bond order
                       of every bond Open Babel reports, 0 elsewhere
        atoms        - list of pybel atom.type values
        HybOrb       - list of int(atom.hyb)
        num_bonds    - number of bonds in the Open Babel molecule
        PCList       - list of atom.partialcharge
        atom_coord   - list of numpy arrays built from atom.coords
        atom_spins   - list of int(atom.spin)
        atom_valence - list of int(atom.valence)
        atom_fc      - list of atom.formalcharge
    All per-atom lists follow the atom order of the file.
    '''

    # load as openbabel molecule (only the first molecule of the file is used)
    pbMol = next(pb.readfile("xyz", filepath))
    obMol = pbMol.OBMol
    num_atoms = len(pbMol.atoms)

    # extract atomic properties; the RDKit molecule/conformer exist only so that
    # RDKit can compute the distance matrix below
    rwMol = Chem.RWMol()
    conf = Chem.Conformer(num_atoms)
    HybOrb = []
    atoms = []
    PCList = []
    atom_coord = []
    atom_spins = []
    atom_valence = []
    atom_fc = []
    for i, atom in enumerate(pbMol.atoms):
        rwMol.AddAtom(Chem.Atom(atom.atomicnum))
        conf.SetAtomPosition(i, atom.coords)
        atom_coord.append(np.array(atom.coords))
        atoms.append(atom.type)
        HybOrb.append(int(atom.hyb))
        PCList.append(atom.partialcharge)
        atom_spins.append(int(atom.spin))
        atom_valence.append(int(atom.valence))
        atom_fc.append(atom.formalcharge)

    # make RDKit Mol Object
    mol = rwMol.GetMol()
    mol.AddConformer(conf)

    # Get 3D Distance Matrix
    DistMat = np.array(Chem.Get3DDistanceMatrix(mol))

    # Now make the (symmetric) adjacency matrix of bond orders
    AdjMat = np.zeros((num_atoms, num_atoms), dtype=int)
    num_bonds = obMol.NumBonds()
    for bond_idx in range(num_bonds):
        bond = obMol.GetBond(bond_idx)
        # the bond's atom indices are shifted by one to get zero-based matrix positions
        i, j = bond.GetBeginAtomIdx()-1, bond.GetEndAtomIdx()-1
        BO = bond.GetBO()
        AdjMat[i][j] = BO
        AdjMat[j][i] = BO

    return DistMat, AdjMat, atoms, HybOrb, num_bonds, PCList, atom_coord, atom_spins, atom_valence, atom_fc

Because a single molecule is used multiple times, it is more straightforward, if not more efficient, to iterate through the DataFrame instead of using assign. I imagine the efficiency depends on how expensive it is to load the .xyz file repeatedly and how much space it would take to store all the molecules in memory. In any case, we can optimize the code later. Let's just get it to work.

In [None]:
def unit_vector(vector):
    """ Returns 'vector' divided by its norm (np.linalg.norm) """
    length = np.linalg.norm(vector)
    return vector / length

def angle_between(v1, v2):
    """ Returns the angle in degrees (0 to 180) between vectors 'v1' and 'v2' """
    v1_u = unit_vector(v1)
    v2_u = unit_vector(v2)
    # clip keeps round-off from pushing the cosine outside arccos's [-1, 1] domain
    cosine = np.clip(np.dot(v1_u, v2_u), -1.0, 1.0)
    return np.degrees(np.arccos(cosine))

def angle_2bond(atom1,atom2,atom3):
    """
    Returns the angle at atom2 between the directions to atom1 and atom3,
    as computed by angle_between, given the coordinates of the three atoms
    """
    toward_first = atom1 - atom2
    toward_third = atom3 - atom2
    return angle_between(toward_third, toward_first)

def dihedral_angle(atom1, atom2, atom3, atom4):
    """
    Returns the angle between the 1-2 bond and the plane containing
    bonds 2-3 and 3-4, given the x, y, z coordinates of all four atoms.

    The value is angle_between(bond 1-2, plane normal) minus 90, so it is
    signed. NOTE(review): this is a bond-to-plane angle rather than the
    angle between the 1-2-3 and 2-3-4 planes -- confirm this is intended.
    """
    # Vector along the first bond
    bond_12 = atom1 - atom2

    # Normal of the plane spanned by the second and third bonds
    plane_normal = np.cross(atom2 - atom3, atom3 - atom4)

    # Angle to the normal, shifted by 90 to measure against the plane itself
    return angle_between(bond_12, plane_normal) - 90


# Dictionary of distance features for each atom
def GetFeatureDict(DistMat,atoms):
    """
    Summarise one atom's distances to the other atoms.

    DistMat is the row of distances for that atom and atoms the matching list
    of atom type labels. Zero distances are ignored. The result holds
    'min'/'mean'/'max' over all non-zero distances, plus the same three
    statistics restricted to atoms whose label starts with C, O, N or F
    (keys such as 'Cmin'); a group with no non-zero distances gets -1 for all three.
    """
    summary_names = ('min', 'mean', 'max')
    features = {}

    # statistics over every non-zero distance
    nonzero_dists = DistMat[np.nonzero(DistMat)]
    features['min'] = nonzero_dists.min()
    features['mean'] = nonzero_dists.mean()
    features['max'] = nonzero_dists.max()

    # the same statistics, one element at a time
    for element in ('C', 'O', 'N', 'F'):
        is_element = [label[0] == element for label in atoms]
        selected = DistMat[np.nonzero(DistMat * is_element)]
        if selected.size > 0:
            values = (selected.min(), selected.mean(), selected.max())
        else:
            values = (-1, -1, -1)
        for suffix, value in zip(summary_names, values):
            features[element + suffix] = value

    return features

In [None]:
import scipy.sparse as sparse
def _FindBondPaths(SparseAdjMat, start, end, num_bonds):
    """
    Depth-first search for every path of exactly num_bonds bonds that starts at
    atom index `start`, never revisits an atom, and finishes at atom index `end`.

    Returns a list of paths, each a list of atom indices, in discovery order.
    """
    paths = []

    def DFS(path):
        m = path[-1]
        if len(path) == num_bonds+1:
            # right length: keep the path only if it lands on the target atom
            if m == end:
                paths.append(path.copy())
        else:
            # neighbours are the non-zero columns of row m of the adjacency matrix
            for _,n in zip(*SparseAdjMat[m].nonzero()):
                if n not in path:
                    path.append(n)
                    DFS(path)
                    path.pop()

    DFS([start])
    return paths


def ExtendDataframe(df):
    """
    Returns a copy of df with molecule, atom and bond-path features appended.

    df must provide the columns molecule_name, atom_index_0, atom_index_1 and
    num_bonds. For each row the molecule's structure is read from
    '../input/structures/<molecule_name>.xyz' via ProcessXYZ; the parsed
    molecule is reused while consecutive rows share the same molecule_name.

    Features that do not apply to a row use placeholders: -1 for numeric values
    and '' for atom types (e.g. bond_3 for a 2-bond coupling).

    Raises IndexError when num_bonds is not 1 and no path of that many bonds
    joins the two atoms in the perceived bond graph.
    """
    dist_stats = ['min', 'mean', 'max',
                  'Cmin', 'Cmean', 'Cmax',
                  'Omin', 'Omean', 'Omax',
                  'Nmin', 'Nmean', 'Nmax',
                  'Fmin', 'Fmean', 'Fmax']

    # Order of the new columns in the returned frame
    column_order = (
        ['num_mol_bonds', 'min_d', 'mean_d', 'max_d',
         'space_dr', 'bond_dr', 'bond_1', 'bond_2', 'bond_3',
         'atom_0_pc', 'atom_end_pc', 'atom_0_fc', 'atom_end_fc',
         'atom_0_val', 'atom_end_val', 'atom_0_sm', 'atom_end_sm',
         'atom_0_type2', 'atom_2_type', 'atom_3_type', 'atom_end_type2',
         'atom_2_hyb', 'atom_3_hyb', 'atom_end_hyb', 'path_count']
        + ['atom_0_' + stat for stat in dist_stats]
        + ['atom_end_' + stat for stat in dist_stats]
        + ['bond2_angle', 'bond3_angle']
    )
    cols = {name: [] for name in column_order}

    # name of the molecule currently loaded (None until the first row)
    loaded_name = None

    for row in df.itertuples(index=False):
        mol_name = row.molecule_name
        i = row.atom_index_0
        j = row.atom_index_1
        num_bonds = row.num_bonds

        # construct molecule properties (only when the molecule changes)
        if loaded_name != mol_name:
            loaded_name = mol_name
            filepath = '../input/structures/'+loaded_name+'.xyz'
            (DistMat, AdjMat, atoms, HybOrb, bond_count, PCList,
             atom_coord, atom_spins, atom_val, atom_fc) = ProcessXYZ(filepath)
            SparseAdjMat = sparse.csr_matrix(AdjMat)

            pair_dists = DistMat[np.nonzero(DistMat)]
            mol_features = {'num_mol_bonds': bond_count,
                            'min_d': pair_dists.min(),
                            'mean_d': pair_dists.mean(),
                            'max_d': DistMat.max()}
            # per-atom distance summaries, cached for the current molecule
            atom_dist_dict = {}

        record = dict(mol_features)

        # through-space distance and atom properties
        record['space_dr'] = DistMat[i][j]
        record['atom_0_type2'] = atoms[i]
        record['atom_end_type2'] = atoms[j]
        record['atom_0_pc'] = PCList[i]
        record['atom_end_pc'] = PCList[j]
        record['atom_0_fc'] = atom_fc[i]
        record['atom_end_fc'] = atom_fc[j]
        record['atom_0_val'] = atom_val[i]
        record['atom_end_val'] = atom_val[j]
        record['atom_0_sm'] = atom_spins[i]
        record['atom_end_sm'] = atom_spins[j]
        record['atom_end_hyb'] = HybOrb[j]

        # bond-path features between atom_0 and atom_1
        if num_bonds == 1:
            record['path_count'] = 1
            record['bond_dr'] = DistMat[i][j]
            record['bond_1'] = AdjMat[i][j]
            record['bond_2'] = -1
            record['bond_3'] = -1
            record['atom_2_type'] = ''
            record['atom_3_type'] = ''
            record['atom_2_hyb'] = -1
            record['atom_3_hyb'] = -1
            record['bond2_angle'] = -1
            record['bond3_angle'] = -1
        else:
            paths = _FindBondPaths(SparseAdjMat, i, j, num_bonds) if num_bonds > 1 else []
            if not paths:
                raise IndexError(
                    'no path of {} bonds between atoms {} and {} in molecule {}'.format(
                        num_bonds, i, j, mol_name))
            record['path_count'] = len(paths)
            # when several paths exist, the first one found is used
            path = paths[0]

            if num_bonds == 2:
                p0, p1, p2 = path[0], path[1], path[2]
                record['bond_dr'] = DistMat[p0][p1]+DistMat[p1][p2]
                record['bond_1'] = AdjMat[p0][p1]
                record['bond_2'] = AdjMat[p1][p2]
                record['bond_3'] = -1
                record['atom_2_type'] = atoms[p1]
                record['atom_3_type'] = ''
                record['atom_2_hyb'] = HybOrb[p1]
                record['atom_3_hyb'] = -1
                record['bond2_angle'] = abs(angle_2bond(atom_coord[p0], atom_coord[p1], atom_coord[p2]))
                record['bond3_angle'] = -1
            else:
                p0, p1, p2, p3 = path[:4]
                record['bond_dr'] = DistMat[p0][p1]+DistMat[p1][p2]+DistMat[p2][p3]
                record['bond_1'] = AdjMat[p0][p1]
                record['bond_2'] = AdjMat[p1][p2]
                record['bond_3'] = AdjMat[p2][p3]
                record['atom_2_type'] = atoms[p1]
                record['atom_3_type'] = atoms[p2]
                record['atom_2_hyb'] = HybOrb[p1]
                record['atom_3_hyb'] = HybOrb[p2]
                record['bond2_angle'] = -1
                record['bond3_angle'] = float(abs(dihedral_angle(
                    atom_coord[p0], atom_coord[p1], atom_coord[p2], atom_coord[p3])))

        # distance features for atom_0 and for the end atom
        for prefix, atom_idx in (('atom_0_', i), ('atom_end_', j)):
            if atom_idx not in atom_dist_dict:
                atom_dist_dict[atom_idx] = GetFeatureDict(DistMat[atom_idx][:], atoms)
            for stat in dist_stats:
                record[prefix + stat] = atom_dist_dict[atom_idx][stat]

        # every column receives exactly one value per row
        for name in column_order:
            cols[name].append(record[name])

    return df.assign(**cols)

In [None]:
#train_sample = train.sample(10000)
import time
# Time the feature extraction over both frames
start = time.time()
#train_extend = ExtendDataframe(train_sample)
train_extend = ExtendDataframe(train)
test_extend = ExtendDataframe(test)
end = time.time()
# elapsed wall-clock time in minutes
print((end-start)/60)

In [None]:
# Column names after feature extraction
train_extend.columns

In [None]:
# Distinct pybel atom types recorded for the atom at atom_index_0
train_extend.atom_0_type2.unique()

In [None]:
# Distinct pybel atom types recorded for the atom at atom_index_1
train_extend.atom_end_type2.unique()

In [None]:
# Summary statistics over the first 20 columns
train_extend[train_extend.columns[:20]].describe()

In [None]:
# Summary statistics over columns 6-20 (this slice overlaps the previous cell's [:20])
train_extend[train_extend.columns[6:21]].describe()

In [None]:
# Summary statistics over columns 21-35
train_extend[train_extend.columns[21:36]].describe()

In [None]:
# Distinct values of atom_end_sm (filled from pybel's atom.spin for the atom at atom_index_1)
train_extend.atom_end_sm.unique()

In [None]:
# Summary statistics over columns 36-50
train_extend[train_extend.columns[36:51]].describe()

In [None]:
# Summary statistics over column 51 onward
train_extend[train_extend.columns[51:]].describe()

In [None]:
import seaborn as sns

In [None]:
# Distribution of bond2_angle, leaving out rows that carry the -1 placeholder
sns.distplot(train_extend[train_extend.bond2_angle!=-1].bond2_angle)

In [None]:
# Distribution of bond3_angle, leaving out rows that carry the -1 placeholder
sns.distplot(train_extend[train_extend.bond3_angle!=-1].bond3_angle)

In [None]:
# save for model building
# written to the current directory without the index column
train_extend.to_csv('train_extend.csv', index=False)
test_extend.to_csv('test_extend.csv', index=False)