In this notebook, we compute the distances between all pairs of atoms within each molecule and save the result to a file called 'CHAMPS_rel_pos.csv'.

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Show the current working directory; the relative CSV paths used in this notebook are resolved against it.
os.getcwd()

'C:\\Users\\agarw\\udacity-mlnd\\Capstone project'

In [3]:
# List the contents of the current working directory (no argument means '.'),
# to confirm that the input file structures_sample.csv is available.
os.listdir()

['.ipynb_checkpoints',
 'CHAMPS_rel_pos.csv',
 'interatomic_distances.ipynb',
 'nearest_neighbors.ipynb',
 'predicting_scalar_couplings_II.ipynb',
 'Proposal Review.pdf',
 'Proposal.pdf',
 'Report.pdf',
 'structures_sample.csv',
 'test_sample.csv',
 'tex files',
 'train_sample.csv']

In [4]:
import gc

In [5]:
def interatomic_distances(structures_path='structures_sample.csv',
                          output_path='CHAMPS_rel_pos.csv'):
  """Compute relative positions and distances for all ordered atom pairs.

  Reads a structures table, pairs every atom with every other atom of the
  same molecule (both (i, j) and (j, i) are kept, i != j), and writes one row
  per pair with the relative position vector (atom 0 minus atom 1) and its
  Euclidean norm.

  The result is meant to be computed once and saved; later runs can simply
  load the saved CSV instead of calling this function again.

  Parameters
  ----------
  structures_path : str
      CSV file with at least the columns 'molecule_name', 'atom_index',
      'x', 'y' and 'z'. Any other columns are carried through with the
      suffixes '_0' and '_1'.
  output_path : str
      Destination CSV file. It is written without the index column.

  Returns
  -------
  None
      The result is only written to `output_path`; progress (the number of
      distinct molecules after each step) is printed.

  Raises
  ------
  KeyError
      If a required column is missing from the structures file.
  """
  coords_0 = ['x_0', 'y_0', 'z_0']
  coords_1 = ['x_1', 'y_1', 'z_1']

  structures = pd.read_csv(structures_path)
  missing = [col for col in ('molecule_name', 'atom_index', 'x', 'y', 'z')
             if col not in structures.columns]
  if missing:
    raise KeyError('structures file is missing required columns: {}'.format(missing))
  print('step 1: {} molecules'.format(len(structures['molecule_name'].unique())))

  # self-merge on the molecule name to obtain all possible atomic pairs per molecule
  pairs = structures.merge(structures, on='molecule_name', suffixes=('_0', '_1'))
  # the un-paired table is no longer needed; release it before the heavy steps
  del structures
  print('step 2: {} molecules'.format(len(pairs['molecule_name'].unique())))

  # remove rows where both atoms of the pair are the same atom
  pairs = pairs.loc[pairs['atom_index_0'] != pairs['atom_index_1']]
  print('step 3: {} molecules'.format(len(pairs['molecule_name'].unique())))

  # the filter above leaves gaps in the index; renumber it 0..n-1 so that it
  # matches the positional order of the arrays computed below
  pairs = pairs.reset_index(drop=True)
  print('step 4: {} molecules'.format(len(pairs['molecule_name'].unique())))

  # relative coordinates: position of atom 0 minus position of atom 1
  rel = pairs[coords_0].values - pairs[coords_1].values

  # translational invariance means only relative positions matter, so the
  # absolute coordinates are dropped to reduce the memory used by the table
  pairs = pairs.drop(columns=coords_0 + coords_1)
  print('step 5: {} molecules'.format(len(pairs['molecule_name'].unique())))

  # attach the relative coordinates column by column; this avoids building a
  # temporary DataFrame and the full copy made by pd.concat
  pairs['x_rel'] = rel[:, 0]
  pairs['y_rel'] = rel[:, 1]
  pairs['z_rel'] = rel[:, 2]
  print('step 6: {} molecules'.format(len(pairs['molecule_name'].unique())))
  print('Null values present: {}'.format(pairs.isnull().values.any()))

  # the norm of the relative position is the distance between the two atoms
  pairs['distance'] = np.linalg.norm(rel, axis=1)
  print('step 7: {} molecules'.format(len(pairs['molecule_name'].unique())))

  # save once; repeated runs should import the results from this file
  pairs.to_csv(output_path, index=False)

  # verify that all molecules are still present in what was saved
  mol = len(pairs['molecule_name'].unique())
  print('saved info about {} molecules'.format(mol))

  return

In [6]:
import time

In [7]:
%%time
# Run the pair-distance computation: it prints the molecule count after each
# step and writes the result to CHAMPS_rel_pos.csv (the function returns nothing).
interatomic_distances()

step 1: 1600 molecules
step 2: 1600 molecules
step 3: 1600 molecules
step 4: 1600 molecules
step 5: 1600 molecules
step 6: 1600 molecules
Null values present: False
step 7: 1600 molecules
saved info about 1600 molecules
Wall time: 6.69 s


In [8]:
# Reload the saved pair table from disk to check what was actually written.
rel_pos=pd.read_csv('CHAMPS_rel_pos.csv')

In [9]:
# Preview the first rows of the reloaded table (relative coordinates and distance per atom pair).
rel_pos.head()

Unnamed: 0,molecule_name,atom_index_0,atom_0,atom_index_1,atom_1,x_rel,y_rel,z_rel,distance
0,dsgdb9nsd_000658,0,C,1,C,-1.429828,-0.557103,0.058681,1.535648
1,dsgdb9nsd_000658,0,C,2,C,-2.186052,-0.036131,1.289821,2.538458
2,dsgdb9nsd_000658,0,C,3,C,-2.178763,1.498805,1.345985,2.967339
3,dsgdb9nsd_000658,0,C,4,C,-0.748934,2.055909,1.287305,2.538665
4,dsgdb9nsd_000658,0,C,5,C,0.00729,1.534939,0.056167,1.535984
