# Imports and Settings

In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import os
from scipy.spatial.distance import squareform, cdist
import time

In [2]:
from Code.DataGeneration.printer import ProgressTimer
from Code.DataGeneration.saver import create_path
from Code.DataGeneration.transform import get_spherical, change_base, get_input_data

In [3]:
# Relative path of the database file that is opened with ase.db.connect
# when the molecules and energies are loaded.
path_to_db = "./Dataset/iso17/reference.db"

In [4]:
from ase.db import connect

# Walk every row of the database once. Each molecule is stored as a
# (19, 4) array whose first column holds the atomic numbers and whose
# remaining columns hold the positions; the row's total energy is
# collected in parallel so both lists share the same ordering.
molecules, energies = [], []
with connect(path_to_db) as conn:
    for row in conn.select():
        # Turn the flat vector of atomic numbers into a single column so it
        # can be placed next to the position columns.
        atomic_numbers = row['numbers'].reshape((19, 1))
        molecule = np.hstack((atomic_numbers, row['positions']))
        molecules.append(molecule)
        energies.append(row['total_energy'])

In [5]:
# Stack the per-molecule arrays to inspect the overall shape:
# (number of molecules, atoms per molecule, [atomic number + position columns]).
np.array(molecules).shape

(404000, 19, 4)

In [6]:
# Preview only the first molecule: rendering the whole list would dump
# hundreds of thousands of arrays into the notebook output.
molecules[:1]

[array([[ 8.        , -3.16802635,  1.96300466,  3.01543855],
        [ 6.        , -2.21710243,  1.84786771,  1.94347761],
        [ 6.        , -0.76454791,  1.39939939,  2.11114829],
        [ 6.        , -0.73312225,  1.24456354,  0.45901193],
        [ 8.        , -0.0912899 ,  0.05377263,  0.02773424],
        [ 6.        , -0.21177398,  2.55254873, -0.20350005],
        [ 6.        , -1.06702881,  3.6645756 ,  0.39346274],
        [ 6.        , -2.30319002,  2.96225401,  0.8978978 ],
        [ 6.        , -2.25073071,  1.50323845,  0.51032201],
        [ 1.        , -3.37392563,  1.14685333,  3.58438005],
        [ 1.        , -0.13862503,  2.21370994,  2.59524003],
        [ 1.        , -0.56851969,  0.46955003,  2.66912103],
        [ 1.        , -0.18954314, -0.00864805, -0.96520476],
        [ 1.        ,  0.87373244,  2.62633871, -0.0443089 ],
        [ 1.        , -0.36143923,  2.29531431, -1.24133153],
        [ 1.        , -0.40297738,  4.16346966,  1.13897915],
        

## Create a DataFrame from Input Files

In [None]:
#list_ = []
#for file in filenames:
#    filepath = os.path.join(path_to_files, file)
#    try:
#        df_single = pd.read_csv(filepath, skiprows=2,
#                               skipfooter=3, delimiter='\t',
#                               names=['atomtype', 'x', 'y', 'z', 'charge'], 
#                               dtype=dict(atomtype=str, x=float, y=float, z=float, charge=float))
#    except:
#        print(file)
#    df_single['file'] = file
#    list_.append(df_single)
#df_all = pd.concat(list_)
#df_all.head(5)

## Prepare Raw Data for Transformation

In [None]:
#n_atoms = 19
#h_atoms = 10
#mask_H = dict(H='ZZZ_H')
#df_all = df_all.replace(dict(atomtype=mask_H))
## sort by file and atomtype
#df_all = df_all.sort_values(['file', 'atomtype']).reset_index(drop=True)
## create file id column
#df_all['file_id'] = (df_all.index) // n_atoms + 1

In [None]:
#df_all.head(25)

## Transform DataFrame to NumPy Array for Faster Calculations

In [None]:
#raw_matrix = df_all[['file_id', 'atomtype', 'x', 'y', 'z', 'charge']].values

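# Transformation Functions

The transformation helpers (`get_spherical`, `change_base`, `get_input_data`) are imported from `Code.DataGeneration.transform` in the imports section at the top of this notebook.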

# Run Calculations

In [None]:
# Measure how long get_input_data takes on the full list of molecules.
# time.perf_counter() is a monotonic, high-resolution clock meant for
# measuring durations; time.time() is wall-clock time and can jump when
# the system clock is adjusted.
start = time.perf_counter()
network_in = np.array(get_input_data(molecules))
print('time: {}'.format(time.perf_counter()-start))

In [None]:
network_in.shape

## Get Y-labels

In [None]:
len(energies)

## Save arrays to file

In [None]:
# Target paths for the two arrays written by np.save:
# data_path receives network_in, label_path receives energies.
data_path = './Dataset/c702h10_X'
label_path = './Dataset/c702h10_Y'

In [None]:
# Persist the computed inputs and the energy labels.
# Note: the paths carry no extension; np.save appends ".npy" when it is missing.
np.save(data_path, network_in)
np.save(label_path, energies)

# Testing

## Test Functions

In [None]:
# Import only the assertion helper the tests below use; a star import would
# pull every public numpy.testing name into the notebook namespace.
from numpy.testing import assert_array_almost_equal

In [None]:
def test_get_spherical():
    """Compare ``get_spherical`` with hand-computed values for three points.

    For every Cartesian test point the expected row is
    ``[1/r, cos(theta), cos(phi), sin(phi)]``, where ``r`` is the distance
    from the origin, ``theta`` the angle to the z-axis and ``phi`` the
    azimuth in the x-y plane. Fails with an ``AssertionError`` when the
    result differs beyond the tolerance of ``assert_array_almost_equal``.
    """
    cartesian = np.array([[0, 1, 2],
                          [1, 1, 1],
                          [-1, 2, 1]])
    # One (r, theta, phi) triple per point above; the third point lies at
    # negative x, hence the +pi correction on arctan.
    reference = [
        (np.sqrt(5), np.arccos(2/np.sqrt(5)), np.pi/2),
        (np.sqrt(3), np.arccos(1/np.sqrt(3)), np.arctan(1)),
        (np.sqrt(6), np.arccos(1/np.sqrt(6)), np.arctan(-2) + np.pi),
    ]
    expected = np.array([[1/r, np.cos(theta), np.cos(phi), np.sin(phi)]
                         for r, theta, phi in reference])
    assert_array_almost_equal(expected, get_spherical(cartesian))

In [None]:
def test_change_base():
    """Compare ``change_base`` with a precomputed result for three points.

    The call passes the points, three basis vectors (deliberately not
    orthogonal) and a fifth vector that is presumably the new origin
    (TODO confirm against ``change_base``). Fails with an
    ``AssertionError`` when the output differs from the expected matrix.
    """
    points = np.array([[0, 1, 2],
                       [1, 1, 1],
                       [-1, 2, 1]])
    basis_x = np.array([1, 1, 0])
    basis_y = np.array([0, 0, 1])
    basis_z = np.array([2, 1, 3])
    origin = np.array([-1, 4, 3])
    expected = np.array([[-7., -13., 4.],
                         [-8., -17., 5.],
                         [-4., -8., 2.]])
    transformed = change_base(points, basis_x, basis_y, basis_z, origin)
    assert_array_almost_equal(expected, transformed)

In [None]:
def test_get_input_data():
    """Run ``get_input_data`` on a four-atom fixture and return its output.

    Nothing is asserted here; the result is returned so it can be inspected
    in the notebook.

    NOTE(review): labels and coordinates share one array, so NumPy stores
    every entry as a string; confirm ``get_input_data`` expects that.
    NOTE(review): the second argument (4) equals the number of atoms in the
    fixture; its meaning is defined by ``get_input_data`` (TODO confirm).
    """
    atom_rows = [['C', 1, 1, 1],
                 ['O', 1, 0, 0],
                 ['O', 0, 3, 0],
                 ['ZZZ_H', 0, 2, 0]]
    tiny_molecule = np.array(atom_rows)
    return get_input_data(tiny_molecule, 4)

## Run Tests

In [None]:
test_get_spherical()

In [None]:
test_change_base()

In [None]:
test_get_input_data()

In [None]:
# Scratch check of a cross product.
# NOTE(review): the two vectors equal [0, 3, 0] - [1, 1, 1] and
# [1, 0, 0] - [1, 1, 1], i.e. differences between rows of the
# test_get_input_data fixture; presumably used to verify a basis by hand.
np.cross(np.array([-1, 2, -1]), np.array([0, -1, -1]))

In [None]:
# The same four coordinate rows as the test_get_input_data fixture, without the atom labels.
x = np.array([[1, 1, 1], [1, 0, 0], [0, 3, 0], [0, 2, 0]])

In [None]:
# Pairwise distance matrix between the rows of x (cdist's default metric is Euclidean).
# The trailing commented-out .argsort() is presumably a leftover from exploring neighbour order.
cdist(x, x)#.argsort()