## Imports and Settings

In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import os
from scipy.spatial.distance import squareform, cdist
import time

In [2]:
# Directory holding one .xyz file per molecular configuration.
path_to_files = "./../Dataset/dsC7O2H10nsd.xyz/"
# os.listdir returns entries in arbitrary order; sort them so that the load
# order (and everything displayed from it) is the same on every run.
filenames = sorted(os.listdir(path_to_files))
print('number of files: {}'.format(len(filenames)))

number of files: 6095


## Create a Dataframe from Input files

In [3]:
# Parse every input file into its own frame and stack the frames.
# The parser arguments below skip two leading lines and three trailing lines
# and read the remaining rows as tab-separated (atomtype, x, y, z, charge).
list_ = []
for file in filenames:
    filepath = os.path.join(path_to_files, file)
    try:
        df_single = pd.read_csv(filepath, skiprows=2,
                               skipfooter=3, delimiter='\t',
                               names=['atomtype', 'x', 'y', 'z', 'charge'],
                               dtype=dict(atomtype=str, x=float, y=float, z=float, charge=float),
                               # skipfooter is only supported by the python engine
                               engine='python')
    except (OSError, ValueError):
        # Report the unreadable file and skip it. Falling through here would
        # relabel the previous file's frame with this file's name and append
        # it a second time (or raise NameError if the first file fails).
        # NOTE(review): the cause of the parse failures is not visible here -
        # inspect the reported files before relying on the reduced data set.
        print(file)
        continue
    df_single['file'] = file
    list_.append(df_single)
df_all = pd.concat(list_)
df_all.head(5)

  


dsC7O2H10nsd_0249.xyz
dsC7O2H10nsd_2258.xyz
dsC7O2H10nsd_2556.xyz


Unnamed: 0,atomtype,x,y,z,charge,file
0,C,-1.839613,0.529296,3.182068,-0.414977,dsC7O2H10nsd_0001.xyz
1,C,-2.146536,-0.164467,1.842511,0.232941,dsC7O2H10nsd_0001.xyz
2,C,-3.5866,0.132659,1.386682,-0.415379,dsC7O2H10nsd_0001.xyz
3,O,-1.945267,-1.573642,1.958069,-0.461014,dsC7O2H10nsd_0001.xyz
4,C,-1.164462,0.268959,0.791005,0.388553,dsC7O2H10nsd_0001.xyz


## Prepare raw Data for Transformation

In [4]:
# Atoms per configuration and how many of them are hydrogen.
n_atoms = 19
h_atoms = 10
# Rename hydrogen so that it sorts after every other atom type.
mask_H = {'H': 'ZZZ_H'}
# Relabel H, then order the rows by file and, within a file, by atom type.
df_all = (
    df_all
    .replace({'atomtype': mask_H})
    .sort_values(['file', 'atomtype'])
    .reset_index(drop=True)
)
# Consecutive 1-based id per block of n_atoms rows.
# NOTE(review): this assumes every file contributes exactly n_atoms rows.
df_all['file_id'] = df_all.index // n_atoms + 1

In [5]:
# Show the first 25 rows to check the per-file ordering by atom type.
df_all.head(25)

Unnamed: 0,atomtype,x,y,z,charge,file,file_id
0,C,-1.839613,0.529296,3.182068,-0.414977,dsC7O2H10nsd_0001.xyz,1
1,C,-2.146536,-0.164467,1.842511,0.232941,dsC7O2H10nsd_0001.xyz,1
2,C,-3.5866,0.132659,1.386682,-0.415379,dsC7O2H10nsd_0001.xyz,1
3,C,-1.164462,0.268959,0.791005,0.388553,dsC7O2H10nsd_0001.xyz,1
4,C,-0.247116,-0.397562,0.042898,-0.278918,dsC7O2H10nsd_0001.xyz,1
5,C,0.38521,0.587028,-0.788368,-0.178412,dsC7O2H10nsd_0001.xyz,1
6,C,-0.19714,1.772475,-0.48016,-0.025883,dsC7O2H10nsd_0001.xyz,1
7,O,-1.945267,-1.573642,1.958069,-0.461014,dsC7O2H10nsd_0001.xyz,1
8,O,-1.146429,1.597741,0.48276,-0.201531,dsC7O2H10nsd_0001.xyz,1
9,ZZZ_H,-2.56555,0.22277,3.944423,0.100196,dsC7O2H10nsd_0001.xyz,1


## Transform Dataframe to Numpy Array for faster Calculations

In [6]:
# Column order matters: get_input_data addresses these columns by position
# (0 = file_id, 1 = atomtype, 2:5 = x, y, z, 5 = charge).
raw_matrix = df_all[['file_id', 'atomtype', 'x', 'y', 'z', 'charge']].values

## Transformation Functions

In [7]:
def get_spherical(positions):
    """
    Convert 3D cartesian vectors into the spherical features used as
    neural network input.

    Parameters
    ----------
    positions : ndarray
        Array of shape (N, 3) with one cartesian vector (x, y, z) per row.

    Returns
    -------
    ndarray
        Array of shape (4, N): one row per feature and one column per input
        vector. The rows are, in order, 1/r, cos(theta), cos(phi) and
        sin(phi), with r the vector length, theta = arcsin(z / r) and
        phi = arctan2(x, y).

    """
    xyz = positions.astype(float)
    x_comp, y_comp, z_comp = xyz[:, 0], xyz[:, 1], xyz[:, 2]
    radius = np.linalg.norm(xyz, axis=1)
    elevation = np.arcsin(z_comp / radius)
    azimuth = np.arctan2(x_comp, y_comp)
    features = (1 / radius, np.cos(elevation), np.cos(azimuth), np.sin(azimuth))
    return np.array(features)

In [8]:
def change_base(positions, x, y, z, o):
    """
    Express positions given in the standard basis in the coordinate system
    with origin o and axes x, y, z.

    Parameters
    ----------
    positions : np.array
        3D atom positions in the standard basis, shape (N, 3).
        The array is not modified.
    x : np.array
        new x-axis
    y : np.array
        new y-axis
    z : np.array
        new z-axis
    o : np.array
        new origin

    Returns
    -------
    new_positions : np.array
        3D atom positions in the new basis.
        Same shape as positions.

    Raises
    ------
    numpy.linalg.LinAlgError
        If x, y and z are linearly dependent (singular basis).

    """
    # Shift into a fresh array so the caller's positions stay untouched.
    shifted = np.asarray(positions, dtype=float) - o
    # The axes are stacked as rows, so a point p with new coordinates c
    # satisfies p = basis.T @ c; the coordinates are therefore
    # inv(basis.T) @ p (inv(basis) would give the transposed map).
    basis = np.vstack((x, y, z))
    new_positions = np.linalg.inv(basis.T).dot(shifted.T).T
    return new_positions


In [9]:
def get_input_data(raw_matrix):
    """
    Calculate the training input for the sub-networks from the molecular configurations.

    For every atom of every configuration a local coordinate system is built
    from the atom itself (origin) and its two nearest non-hydrogen
    neighbours. All other atoms are ordered by atom type and then by
    distance, transformed into that system and converted to spherical
    features.

    Parameters
    ----------
    raw_matrix : np.array
        Matrix of the raw input data for all files and all atoms with the
        columns (file_id, atomtype, x, y, z, charge). The rows of one
        configuration must form a contiguous block of ``n_atoms`` rows with
        the non-hydrogen atoms first, and the file ids must count up from 1,
        because block ``i`` is sliced as rows ``(i-1)*n_atoms`` to ``i*n_atoms``.

    Returns
    -------
    network_inputs : list
        One entry per atom, in row order. Each entry is a single-element list
        wrapping the flattened output of ``get_spherical`` for all other
        atoms of the same configuration, so ``np.array(network_inputs)`` has
        shape (n_samples, 1, n_features).

    """
    not_H_atoms = n_atoms - h_atoms
    network_inputs = []
    start = 0
    # loop over all configurations
    for i in np.unique(raw_matrix[:, 0]):
        stop = i * n_atoms
        molecule = raw_matrix[start:stop]
        start = stop
        # convert the columns once per configuration instead of once per atom
        atomtypes = molecule[:, 1].astype(str)
        coords = molecule[:, 2:5].astype(float)
        row_ids = np.arange(len(molecule))
        for atom in range(len(molecule)):
            # every atom of the configuration except the focus atom
            keep = row_ids != atom
            other_types = atomtypes[keep]
            other_coords = coords[keep]
            zero = coords[atom].copy()
            # get distances from focus atom to other atoms
            distances = cdist(zero.reshape(1, 3), other_coords)[0]
            # get nearest atoms that are not H; they occupy the leading rows,
            # one fewer of them when the focus atom itself is not H
            if atom < not_H_atoms:
                n_candidates = not_H_atoms - 1
            else:
                n_candidates = not_H_atoms
            one_id, two_id = distances[:n_candidates].argsort()[0:2]
            one = other_coords[one_id]
            two = other_coords[two_id]
            # get new basis vectors
            # NOTE(review): if the focus atom and both neighbours are
            # collinear, new_z is the zero vector and the normalisation
            # below yields NaN - confirm this cannot occur in the data.
            new_x = one - zero
            new_z = np.cross(new_x, two - zero)
            new_y = np.cross(new_x, new_z)
            # normalize basis vectors
            new_x /= np.linalg.norm(new_x)
            new_y /= np.linalg.norm(new_y)
            new_z /= np.linalg.norm(new_z)
            # order by atom type first and by distance to the focus atom second
            sorted_ids = np.lexsort((distances, other_types))
            cart_coords = other_coords[sorted_ids]
            trans_coords = change_base(cart_coords, new_x, new_y,
                                       new_z, zero)
            spherical_coords = get_spherical(trans_coords)
            net_in_coords = spherical_coords.reshape(-1).tolist()
            network_inputs.append([net_in_coords])

    return network_inputs

## Run Calculations

In [29]:
# Build the network inputs for the whole raw matrix and report the
# wall-clock time in seconds.
start = time.time()
network_in = np.array(get_input_data(raw_matrix))
print('time: {}'.format(time.time()-start))



time: 68.55561971664429


In [30]:
# get_input_data wraps every sample in a single-element list; reshape to a
# 2D array that keeps only axis 0 and axis 2 of network_in.
training_data = network_in.reshape(network_in.shape[0], network_in.shape[2])

## Get Y-labels

In [31]:
# Space-separated table read into the columns (file_id, energy).
# header=0 combined with names= replaces the file's own first line with the
# column names given here.
energy_file = './../Dataset/c7o2h10_md/c7o2h10_equilibrium.dat'
df_Y = pd.read_csv(energy_file, names=['file_id', 'energy'], sep=' ', header=0, dtype=dict(file_id=float, energy=float))

In [32]:
# Energy column as a plain NumPy array, used as the training labels.
training_labels = df_Y['energy'].values

## Save arrays to file

In [34]:
# Target paths for the two arrays written by the np.save calls.
data_path = '../Dataset/network_inputs'
label_path = '../Dataset/network_labels'

In [35]:
# Write both arrays in NumPy's binary format; np.save appends the ".npy"
# extension because the paths do not carry one.
np.save(data_path, training_data)
np.save(label_path, training_labels)

NameError: name 'torch' is not defined