# Imports and Settings

In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import os
from scipy.spatial.distance import squareform, cdist
import time
from numpy.testing import *

In [2]:
from Code.DataGeneration.printer import ProgressTimer
from Code.DataGeneration.saver import create_path
from Code.DataGeneration.transform import get_spherical, change_base, get_input_data

In [3]:
path_to_db = "./Dataset/iso17/reference.db"

In [4]:
from ase.db import connect

molecules = []
energies = []
with connect(path_to_db) as conn:
    for row in conn.select(5):
        molecules.append(np.hstack((row['numbers'].reshape((19, 1)), row['positions'])))
        energies.append(row['total_energy'])

In [5]:
np.array(molecules).shape

(1, 19, 4)

In [6]:
molecules

[array([[ 8.        , -3.13311987,  1.88354817,  3.12766679],
        [ 6.        , -2.24705161,  1.93852065,  1.97802995],
        [ 6.        , -0.782912  ,  1.58546259,  2.15014356],
        [ 6.        , -0.65984825,  1.29204325,  0.51449211],
        [ 8.        , -0.27095204,  0.04593806, -0.04992813],
        [ 6.        , -0.11965932,  2.44181756, -0.23724109],
        [ 6.        , -1.08130266,  3.68052005,  0.27874998],
        [ 6.        , -2.31743849,  2.80620132,  0.86798379],
        [ 6.        , -2.21857731,  1.35554898,  0.55055286],
        [ 1.        , -2.96925792,  2.73661176,  3.54130577],
        [ 1.        , -0.27153608,  2.40714079,  2.58637797],
        [ 1.        , -0.64130634,  0.66409026,  2.81151566],
        [ 1.        ,  0.17818655,  0.27267158, -0.90326441],
        [ 1.        ,  0.93555285,  2.60279125, -0.05197857],
        [ 1.        , -0.35827703,  2.41891948, -1.26786949],
        [ 1.        , -0.57402327,  4.26807961,  1.08342976],
        

## Create a Dataframe from Input files

In [7]:
#list_ = []
#for file in filenames:
#    filepath = os.path.join(path_to_files, file)
#    try:
#        df_single = pd.read_csv(filepath, skiprows=2,
#                               skipfooter=3, delimiter='\t',
#                               names=['atomtype', 'x', 'y', 'z', 'charge'], 
#                               dtype=dict(atomtype=str, x=float, y=float, z=float, charge=float))
#    except:
#        print(file)
#    df_single['file'] = file
#    list_.append(df_single)
#df_all = pd.concat(list_)
#df_all.head(5)

## Prepare raw Data for Transformation

In [8]:
#n_atoms = 19
#h_atoms = 10
#mask_H = dict(H='ZZZ_H')
#df_all = df_all.replace(dict(atomtype=mask_H))
## sort by file and atomtype
#df_all = df_all.sort_values(['file', 'atomtype']).reset_index(drop=True)
## create file id column
#df_all['file_id'] = (df_all.index) // n_atoms + 1

In [9]:
#df_all.head(25)

## Transform Dataframe to Numpy Array for faster Calculations

In [10]:
#raw_matrix = df_all[['file_id', 'atomtype', 'x', 'y', 'z', 'charge']].values

# Transformation Functions

# Run Calculations

In [11]:
start = time.time()
network_in = np.array(get_input_data(molecules))
print('time: {}'.format(time.time()-start))

0.962115083042 1.31659759833 1.36844086694
1.06158412591 1.61168170039 1.51818556915
1.14297514763 1.83736603674 1.60752929804
0.990614137232 1.3455923376 1.35834154493
1.08337779833 1.59890234777 1.47584928382
1.05813875861 1.81499136066 1.71526781897
1.11806447238 1.88928192066 1.68977904882
1.11808510373 1.97031049532 1.7622187155
1.0861999907 1.43400504548 1.32020351478
1.13569600328 1.68539172882 1.48401660651
1.41068300312 2.27781905463 1.61469235086
1.51590827099 3.82914630694 2.52597494204
1.42217742002 2.76122288957 1.94154600594
1.47610563579 3.51522703191 2.38141969428
1.62470351593 4.30660332967 2.65070105864
1.41068300312 2.66676984611 1.89041041836
1.4882634645 2.81342875857 1.89041041836
1.45251636783 2.7870803989 1.91879448702
1.42217742002 2.76122288957 1.94154600594
time: 0.18990206718444824


In [12]:
network_in.shape

(1, 19, 72)

## Get Y-labels

In [13]:
len(energies)

1

## Save arrays to file

In [14]:
data_path = './Dataset/c702h10_X'
label_path = './Dataset/c702h10_Y'

In [15]:
np.save(data_path, network_in)
np.save(label_path, energies)

# Testing

## Test Functions

In [16]:
def test_get_spherical():
    test_positions = np.array([[0, 1, 2],
                               [1, 1, 1],
                               [-1, 2, 1]])
    val_result = np.array([[1/np.sqrt(5), np.cos(np.arccos(2/np.sqrt(5))),
                            np.cos(np.pi/2), np.sin(np.pi/2)],
                           [1/np.sqrt(3), np.cos(np.arccos(1/np.sqrt(3))),
                            np.cos(np.arctan(1)), np.sin(np.arctan(1))],
                           [1/np.sqrt(6), np.cos(np.arccos(1/np.sqrt(6))),
                            np.cos(np.arctan(-2) + np.pi), np.sin(np.arctan(-2) + np.pi)]])
    assert_array_almost_equal(val_result, get_spherical(test_positions)) 

In [17]:
def test_change_base():
    test_positions = np.array([[0, 1, 2],
                               [1, 1, 1],
                               [-1, 2, 1]])
    x = np.array([1, 1, 0])
    y = np.array([0, 0, 1])
    z = np.array([2, 1, 3])
    val_result = np.array([[-7.,-13.,4.],
                           [-8.,-17.,5.],
                           [-4., -8.,2.]])
    o = np.array([-1, 4, 3])
    assert_array_almost_equal(val_result, change_base(test_positions, x, y, z, o))

In [18]:
def test_get_input_data():
    test_mol = np.array([['C', 1, 1, 1],
                         ['O', 1, 0, 0],
                         ['O', 0, 3, 0],
                         ['ZZZ_H', 0, 2, 0]])
    return get_input_data(test_mol, 4)

## Run Tests

In [19]:
test_mol = molecules[0]

In [20]:
test_mol = test_mol[test_mol[:,0].argsort(kind='mergesort')]

In [21]:
results = np.zeros((19, 72))

In [50]:
positions = test_mol[:, 1:]
for i in range(10):
    dists = cdist(positions[np.newaxis, i], positions[10:])
    one, two = dists.argsort().reshape(9)[:2] + 10
    zero = i
    x = positions[one] - positions[zero]
    z = np.cross(x, positions[two] - positions[zero])
    y = np.cross(z, x)
    x /= np.linalg.norm(x)
    y /= np.linalg.norm(y)
    z /= np.linalg.norm(z)
    print(i, one, two, positions[one], positions[two])
    others = np.vstack((test_mol[:i], test_mol[i+1:]))
    ch_b_others = change_base(others[:, 1:], x, y, z, positions[zero])
    sph_others = get_spherical(ch_b_others)
    ids = sph_others[:, 0].argsort(kind='mergesort')
    sph_others = sph_others[ids]
    others = others[ids]
    sph_others = sph_others[others[:, 0].argsort(kind='mergesort')]
    results[i] = sph_others.reshape(72)

0 17 10 [-3.13311987  1.88354817  3.12766679] [-2.24705161  1.93852065  1.97802995]
1 11 10 [-0.782912    1.58546259  2.15014356] [-2.24705161  1.93852065  1.97802995]
2 11 10 [-0.782912    1.58546259  2.15014356] [-2.24705161  1.93852065  1.97802995]
3 18 12 [-0.27095204  0.04593806 -0.04992813] [-0.65984825  1.29204325  0.51449211]
4 13 12 [-0.11965932  2.44181756 -0.23724109] [-0.65984825  1.29204325  0.51449211]
5 13 14 [-0.11965932  2.44181756 -0.23724109] [-1.08130266  3.68052005  0.27874998]
6 14 15 [-1.08130266  3.68052005  0.27874998] [-2.31743849  2.80620132  0.86798379]
7 14 13 [-1.08130266  3.68052005  0.27874998] [-0.11965932  2.44181756 -0.23724109]
8 15 10 [-2.31743849  2.80620132  0.86798379] [-2.24705161  1.93852065  1.97802995]
9 16 12 [-2.21857731  1.35554898  0.55055286] [-0.65984825  1.29204325  0.51449211]


In [51]:
val_result = results[:10]

In [52]:
nearest = cdist(positions[np.newaxis, 0], positions[10:])#.argsort().reshape(9) + 10

In [53]:
nearest

array([[ 1.89798911,  2.83559256,  4.07206976,  4.74178576,  3.88582143,
         2.75251915,  3.37867707,  0.96211508,  5.23617613]])

In [54]:
test_result = np.array(get_input_data(test_mol[np.newaxis])).reshape(19, 72)[:10]

0.962115083042 1.31659759833 1.36844086694
1.06158412591 1.61168170039 1.51818556915
1.14297514763 1.83736603674 1.60752929804
0.990614137232 1.3455923376 1.35834154493
1.08337779833 1.59890234777 1.47584928382
1.05813875861 1.81499136066 1.71526781897
1.11806447238 1.88928192066 1.68977904882
1.11808510373 1.97031049532 1.7622187155
1.0861999907 1.43400504548 1.32020351478
1.13569600328 1.68539172882 1.48401660651
1.41068300312 2.27781905463 1.61469235086
1.51590827099 3.82914630694 2.52597494204
1.42217742002 2.76122288957 1.94154600594
1.47610563579 3.51522703191 2.38141969428
1.62470351593 4.30660332967 2.65070105864
1.41068300312 2.66676984611 1.89041041836
1.4882634645 2.81342875857 1.89041041836
1.45251636783 2.7870803989 1.91879448702
1.42217742002 2.76122288957 1.94154600594


In [55]:
test_result.sum(axis=1)

array([ 28.40736695,  30.53797926,  19.76962284,  29.59129429,
        30.68651161,  30.96789707,  18.54641092,  20.56990474,
        20.35913868,  24.21324893])

In [56]:
val_result.sum(axis=1)

array([ 28.40736695,  30.53797926,  19.76962284,  29.59129429,
        30.68651161,  30.96789707,  18.54641092,  20.56990474,
        20.35913868,  24.21324893])

In [57]:
test_result[0]

array([  1.67291431e-01,   1.47438366e-01,   6.02044772e-01,
         7.98462330e-01,   1.82433413e-01,  -1.06055026e-01,
         3.49431343e-01,   9.36961972e-01,   1.88387242e-01,
         1.80183575e-01,   1.91226248e-01,   9.81545986e-01,
         2.09412529e-01,  -3.40970623e-01,   3.19776996e-03,
         9.99994887e-01,   2.41688270e-01,  -2.48215034e-01,
         8.43845839e-01,   5.36585687e-01,   2.66088313e-01,
        -5.48338724e-02,  -1.88972553e-01,   9.81982370e-01,
         3.12388708e-01,   6.71365330e-01,   7.39692399e-01,
         6.72945135e-01,   3.40763355e-01,  -6.81100254e-01,
         3.11006809e-01,   9.50407683e-01,   3.47143938e-01,
         5.52170252e-01,   1.01268480e-01,   9.94859133e-01,
         2.10891013e-01,   3.09688104e-02,   2.95505454e-01,
         9.55341052e-01,   2.45575361e-01,   1.54035312e-01,
         5.44011784e-01,   8.39077576e-01,   2.57345845e-01,
        -2.00466531e-01,   6.41443237e-02,   9.97940632e-01,
         2.95973832e-01,

In [58]:
val_result[0]

array([  1.67291431e-01,   1.47438366e-01,   6.02044772e-01,
         7.98462330e-01,   1.82433413e-01,  -1.06055026e-01,
         3.49431343e-01,   9.36961972e-01,   1.88387242e-01,
         1.80183575e-01,   1.91226248e-01,   9.81545986e-01,
         2.09412529e-01,  -3.40970623e-01,   3.19776996e-03,
         9.99994887e-01,   2.41688270e-01,  -2.48215034e-01,
         8.43845839e-01,   5.36585687e-01,   2.66088313e-01,
        -5.48338724e-02,  -1.88972553e-01,   9.81982370e-01,
         3.12388708e-01,   6.71365330e-01,   7.39692399e-01,
         6.72945135e-01,   3.40763355e-01,  -6.81100254e-01,
         3.11006809e-01,   9.50407683e-01,   3.47143938e-01,
         5.52170252e-01,   1.01268480e-01,   9.94859133e-01,
         2.10891013e-01,   3.09688104e-02,   2.95505454e-01,
         9.55341052e-01,   2.45575361e-01,   1.54035312e-01,
         5.44011784e-01,   8.39077576e-01,   2.57345845e-01,
        -2.00466531e-01,   6.41443237e-02,   9.97940632e-01,
         2.95973832e-01,

In [59]:
assert_array_almost_equal(test_result, val_result)

In [60]:
test = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
test.reshape(8)

array([1, 2, 3, 4, 5, 6, 7, 8])

In [61]:
test_result.sum()

253.64937530350235

In [62]:
np.isnan(test_result)

array([[False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
     

In [63]:
val_result.sum()

253.64937530350235

In [64]:
diff = test_result-val_result

In [65]:
for i in range(10):
    for j in range(72):
        if np.abs(diff[i, j]) <= 0.00001:
            print(i, j)

0 0
0 1
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0 9
0 10
0 11
0 12
0 13
0 14
0 15
0 16
0 17
0 18
0 19
0 20
0 21
0 22
0 23
0 24
0 25
0 26
0 27
0 28
0 29
0 30
0 31
0 32
0 33
0 34
0 35
0 36
0 37
0 38
0 39
0 40
0 41
0 42
0 43
0 44
0 45
0 46
0 47
0 48
0 49
0 50
0 51
0 52
0 53
0 54
0 55
0 56
0 57
0 58
0 59
0 60
0 61
0 62
0 63
0 64
0 65
0 66
0 67
0 68
0 69
0 70
0 71
1 0
1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
1 17
1 18
1 19
1 20
1 21
1 22
1 23
1 24
1 25
1 26
1 27
1 28
1 29
1 30
1 31
1 32
1 33
1 34
1 35
1 36
1 37
1 38
1 39
1 40
1 41
1 42
1 43
1 44
1 45
1 46
1 47
1 48
1 49
1 50
1 51
1 52
1 53
1 54
1 55
1 56
1 57
1 58
1 59
1 60
1 61
1 62
1 63
1 64
1 65
1 66
1 67
1 68
1 69
1 70
1 71
2 0
2 1
2 2
2 3
2 4
2 5
2 6
2 7
2 8
2 9
2 10
2 11
2 12
2 13
2 14
2 15
2 16
2 17
2 18
2 19
2 20
2 21
2 22
2 23
2 24
2 25
2 26
2 27
2 28
2 29
2 30
2 31
2 32
2 33
2 34
2 35
2 36
2 37
2 38
2 39
2 40
2 41
2 42
2 43
2 44
2 45
2 46
2 47
2 48
2 49
2 50
2 51
2 52
2 53
2 54
2 55
2 56
2 57
2 58
2 59
2 60
2 61


In [66]:
test_result[:, 1]

array([ 0.14743837,  0.60592196, -0.11561125,  0.08339485,  0.69834739,
        0.4760433 , -0.70573705, -0.08598434,  0.38159225,  0.31132745])

In [67]:
val_result[:, 1]

array([ 0.14743837,  0.60592196, -0.11561125,  0.08339485,  0.69834739,
        0.4760433 , -0.70573705, -0.08598434,  0.38159225,  0.31132745])

In [68]:
test_get_spherical()

In [69]:
test_change_base()

In [70]:
test_get_input_data()

TypeError: get_input_data() takes 1 positional argument but 2 were given

In [71]:
np.cross(np.array([-1, 2, -1]), np.array([0, -1, -1]))

array([-3, -1,  1])

In [72]:
x = np.array([[1, 1, 1], [1, 0, 0], [0, 3, 0], [0, 2, 0]])

In [73]:
cdist(x, x)#.argsort()

array([[ 0.        ,  1.41421356,  2.44948974,  1.73205081],
       [ 1.41421356,  0.        ,  3.16227766,  2.23606798],
       [ 2.44948974,  3.16227766,  0.        ,  1.        ],
       [ 1.73205081,  2.23606798,  1.        ,  0.        ]])

In [74]:
test_mol = mol

NameError: name 'mol' is not defined