In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time 

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the input tables.
# The raw train/test tables are not read here (left commented out below); the
# pre-computed "extended" feature tables from another kernel's output are used instead.
#train = pd.read_csv("../input/champs-scalar-coupling/train.csv")
#test = pd.read_csv("../input/champs-scalar-coupling/test.csv")
# Per-atom table; later cells use its molecule_name, atom_index, atom, x, y, z columns.
structures = pd.read_csv("../input/champs-scalar-coupling/structures.csv")
train_extend = pd.read_csv("../input/predmolprop-featureengineering-slow/train_extend.csv")
test_extend = pd.read_csv("../input/predmolprop-featureengineering-slow/test_extend.csv")

In [None]:
# Display the column labels of the extended training table as loaded.
train_extend.columns

In [None]:
# Some 3-bond couplings have no defined dihedral angle (bond3_angle is NaN)
# because they are in linear molecules. Those rows are flagged with an
# indicator column and the missing angle is replaced by the sentinel -2.
def FindNan(x):
    """Return 1 if ``x`` is a missing value, otherwise 0.

    ``pd.isna`` is used instead of comparing ``str(x)`` with ``'nan'`` so that
    float NaN, ``None`` and ``pd.NA`` are all recognised as missing, and the
    literal string ``'nan'`` is not mistaken for a missing value.

    Parameters
    ----------
    x : scalar
        A single cell value (e.g. as passed by ``Series.map``).

    Returns
    -------
    int
        1 when ``x`` is missing, 0 otherwise.
    """
    return int(pd.isna(x))

def ReplaceNan(x):
    """Return the sentinel ``-2`` for a missing value, otherwise ``x`` unchanged.

    ``pd.isna`` is used instead of comparing ``str(x)`` with ``'nan'`` so that
    float NaN, ``None`` and ``pd.NA`` are all replaced, and a literal string
    ``'nan'`` is left untouched.

    Parameters
    ----------
    x : scalar
        A single cell value (e.g. as passed by ``Series.map``).

    Returns
    -------
    scalar
        ``-2`` when ``x`` is missing, else ``x`` itself.
    """
    return -2 if pd.isna(x) else x

# For both data sets: build the indicator first (it needs the original NaNs),
# then overwrite the missing angles with the sentinel value.
for frame in (train_extend, test_extend):
    frame['is_linear'] = frame.bond3_angle.map(FindNan)
    frame['bond3_angle'] = frame.bond3_angle.map(ReplaceNan)

Count the atoms of each type

In [None]:
def CleanColsAndIdx(df):
    """Strip the axis names from ``df`` in place and return the same object.

    Both the column index and the row index are replaced by copies whose
    name is ``None``; labels and data are untouched.
    """
    unnamed_columns = df.columns.set_names(None)
    unnamed_index = df.index.set_names(None)
    df.columns, df.index = unnamed_columns, unnamed_index
    return df

In [None]:
# Per-molecule element counts:
#   1. group by (molecule_name, atom) and take size() to count each element,
#   2. unstack the 'atom' index level so every element becomes a column,
#   3. fill with 0 where a molecule lacks an element,
#   4. prefix the element columns with 'num_',
#   5. drop the axis names (optional, cosmetic),
#   6. store as int8 (sufficient while no single count exceeds 127).
AtomTypes = (
    structures
    .groupby(['molecule_name', 'atom'], sort=False)
    .size()
    .unstack('atom')
    .fillna(0)
    .rename(columns={element: 'num_' + element for element in ('C', 'H', 'N', 'O', 'F')})
    .pipe(CleanColsAndIdx)
    .astype('int8')
)

Measurements from the Center of Mass (COM)

In [None]:
# Number of atoms in each molecule.
gb = structures.groupby('molecule_name', sort=False)
NumAtoms = pd.DataFrame(gb.size(), columns=['total_atoms'])

# "Center of mass" here is the unweighted mean of the atom coordinates
# (no atomic masses are involved).
# Only the coordinate columns are averaged: taking the mean of every column
# would include the string column 'atom', which raises a TypeError on
# pandas >= 2.0 (numeric_only defaults to False), and would also pull in
# atom_index, which then had to be popped again.
axis = ['x', 'y', 'z']
COM = gb[axis].mean()
COM = COM.rename(columns={'x': 'COM_x', 'y': 'COM_y', 'z': 'COM_z'})
COM = CleanColsAndIdx(COM)

# Attach each molecule's mean position to every one of its atoms.
structures = pd.merge(structures, COM, how='left', left_on='molecule_name', right_index=True)

# Per-atom displacement from the molecule's mean position, one column per axis.
for ax in axis:
    structures['d' + ax] = structures[ax] - structures['COM_' + ax]

# Euclidean distance of each atom from the molecule's mean position.
structures = structures.assign(COM_dr=lambda s: np.sqrt(s.dx**2 + s.dy**2 + s.dz**2))

# Per-molecule min / mean / max of that distance, each as a one-column frame
# indexed by molecule name.
gb = structures[['molecule_name', 'COM_dr']].groupby('molecule_name', sort=False)

Dmin_COM = gb.min()
Dmin_COM.columns = ['Dmin_COM']
Dmin_COM = CleanColsAndIdx(Dmin_COM)

Dmean_COM = gb.mean()
Dmean_COM.columns = ['Dmean_COM']
Dmean_COM = CleanColsAndIdx(Dmean_COM)

Dmax_COM = gb.max()
Dmax_COM.columns = ['Dmax_COM']
Dmax_COM = CleanColsAndIdx(Dmax_COM)

In [None]:
# Remove the intermediate per-axis columns from the atom table; the raw
# x / y / z coordinates and COM_dr are kept.
poplist = ['COM_x', 'COM_y', 'COM_z', 'dx', 'dy', 'dz']
for col in poplist:
    del structures[col]

structures.head(20)

In [None]:
# Gather all per-molecule features (element counts, atom total and the
# min / mean / max distance statistics) into a single table.
MolProps = AtomTypes.copy().join([NumAtoms, Dmin_COM, Dmean_COM, Dmax_COM])

# The intermediate tables are no longer needed once joined.
del AtomTypes, NumAtoms, COM, Dmin_COM, Dmean_COM, Dmax_COM

MolProps.head(20)

In [None]:
def map_atom_info(df, atom_idx):
    """Attach the per-atom ``COM_dr`` value for one end of each coupling.

    Left-joins the module-level ``structures`` table onto ``df`` by matching
    ``molecule_name`` and ``atom_index_{atom_idx}`` against ``molecule_name``
    and ``atom_index``. The joined ``atom_index``, ``atom``, ``x``, ``y`` and
    ``z`` columns are dropped again, and ``COM_dr`` is renamed to
    ``COM_dr_{atom_idx}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``molecule_name`` and ``atom_index_{atom_idx}`` columns.
    atom_idx : int
        Which atom of the pair to look up; used as the column-name suffix.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``COM_dr_{atom_idx}`` column.
    """
    index_col = f'atom_index_{atom_idx}'
    merged = df.merge(
        structures,
        how='left',
        left_on=['molecule_name', index_col],
        right_on=['molecule_name', 'atom_index'],
    )
    trimmed = merged.drop(columns=['atom_index', 'atom', 'x', 'y', 'z'])
    return trimmed.rename(columns={'COM_dr': f'COM_dr_{atom_idx}'})

# Add COM_dr_0 and COM_dr_1 (one per atom of the coupled pair) to both sets.
for atom_idx in (0, 1):
    train_extend = map_atom_info(train_extend, atom_idx)
    test_extend = map_atom_info(test_extend, atom_idx)

In [None]:
def map_mol_info(df):
    """Left-join the module-level ``MolProps`` table onto ``df``.

    Rows of ``df`` are matched through their ``molecule_name`` column against
    the index of ``MolProps``; a new frame with the added molecule-level
    columns is returned.
    """
    return df.merge(
        MolProps,
        how='left',
        left_on=['molecule_name'],
        right_index=True,
    )

# Attach the molecule-level features to both data sets. Each name is rebound
# immediately, so the previous frame can be released before the next merge.
train_extend = map_mol_info(train_extend)
test_extend = map_mol_info(test_extend)

In [None]:
# Display the column labels after the atom- and molecule-level merges.
train_extend.columns

In [None]:
# Number of columns in the extended training table.
train_extend.shape[1]

In [None]:
# Distinct atom_2_type values present in the test set.
test_extend.atom_2_type.unique()

In [None]:
# Distinct atom_3_type values present in the training set.
train_extend.atom_3_type.unique()

In [None]:
# Check that train and test contain the same categories.
# A symmetric difference is used so that a value present on only one side
# (either side) is counted; a one-directional set difference would miss
# values that appear only in the other data set.
# Prints one count per column; 0 means the category sets are identical.
for col in ('atom_2_type', 'atom_3_type'):
    mismatched = set(train_extend[col].unique()) ^ set(test_extend[col].unique())
    print(len(mismatched))

In [None]:
# Summary statistics for the columns at positions 6-25 (slice 6:26).
train_extend[train_extend.columns[6:26]].describe()

In [None]:
# Summary statistics for the columns at positions 26-45 (slice 26:46).
train_extend[train_extend.columns[26:46]].describe()

In [None]:
# Summary statistics for the remaining columns, from position 46 onward.
train_extend[train_extend.columns[46:]].describe()

In [None]:
from collections import namedtuple

# Nuclear spin properties of one element.
# Reference: https://en.wikipedia.org/wiki/Nuclear_magnetic_moment
#   mu:   magnetic dipole moment
#   spin: nuclear spin number
#   NMR:  NMR sensitivity relative to H
SpinProp = namedtuple('SpinProp', ['mu', 'spin', 'NMR'])

# Lookup table from element symbol to its SpinProp record.
AtomicSpinProp = {
    'H': SpinProp(2.79284734, 0.5, 1),
    'C': SpinProp(0.7024118, 0.5, 0.016),
    'O': SpinProp(-1.89379, 2.5, 0.037),
    'N': SpinProp(0.40376100, 1, 0.001),
    'F': SpinProp(2.628868, 0.5, 0.83),
}

def AddSpinProp(df):
    """Return a copy of ``df`` with ``mu``, ``spin`` and ``NMR`` columns added.

    Each value is looked up in ``AtomicSpinProp`` using the element symbol in
    the row's ``atom_end_type`` column. An element missing from the table
    raises ``KeyError``.
    """
    end_atoms = df.atom_end_type
    return df.assign(
        mu=end_atoms.map(lambda element: AtomicSpinProp[element].mu),
        spin=end_atoms.map(lambda element: AtomicSpinProp[element].spin),
        NMR=end_atoms.map(lambda element: AtomicSpinProp[element].NMR),
    )

In [None]:
#train = AddSpinProp(train)
#test=AddSpinProp(test)
#PrintDataframe(train.head())

In [None]:
# Persist the results to the working directory.
# Only the extended training table is written; the other exports are disabled.
# NOTE(review): the test_extend export is commented out even though test_extend
# received the same new columns above — confirm this is intentional.
#MolProps.to_csv('MolecularProperties.csv')
#structures.to_csv('structures_extended.csv', index=False)
train_extend.to_csv('train_extend.csv', index=False)
#test_extend.to_csv('test_extend.csv', index=False)

In [None]:
# Upper bound on the number of (bond_2, bond_3) combinations.
# dropna=False keeps NaN as a distinct value, matching len(unique()).
train_extend.bond_2.nunique(dropna=False) * train_extend.bond_3.nunique(dropna=False)

In [None]:
# Upper bound on the number of (atom_0_type2, atom_end_type2, atom_2_type)
# combinations. dropna=False keeps NaN as a distinct value, matching len(unique()).
(
    train_extend.atom_0_type2.nunique(dropna=False)
    * train_extend.atom_end_type2.nunique(dropna=False)
    * train_extend.atom_2_type.nunique(dropna=False)
)

In [None]:
# Upper bound on the number of (atom_0_type2, atom_end_type2, atom_2_type,
# atom_3_type) combinations. dropna=False keeps NaN as a distinct value,
# matching len(unique()).
(
    train_extend.atom_0_type2.nunique(dropna=False)
    * train_extend.atom_end_type2.nunique(dropna=False)
    * train_extend.atom_2_type.nunique(dropna=False)
    * train_extend.atom_3_type.nunique(dropna=False)
)

In [None]:
# Number of rows in the extended training table.
train_extend.shape[0]