In [2]:
import copy
import json
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from icecream import ic
from natsort import natsorted

# RDKit
import rdkit
from rdkit import Chem
from rdkit.Chem import rdDetermineBonds, rdFMCS
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.rdmolfiles import MolFromXYZFile
# Notebook-wide RDKit drawing options
IPythonConsole.drawOptions.addAtomIndices = True      # annotate atoms with their indices
IPythonConsole.drawOptions.annotationFontScale = 0.8  # scale factor for annotation text
IPythonConsole.molSize = (400, 400)                   # size used for inline depictions
# Last expression of the cell: show which RDKit version is in use
rdkit.__version__

'2023.09.1'

In [3]:
# Main data folder; every sub-folder inside it belongs to one molecule
data_path = '../GNN_XAS_Node/raw_data/OPT_output'

# Join each directory entry onto data_path and keep only the ones that are folders
subdirs = list(filter(
    os.path.isdir,
    (os.path.join(data_path, entry) for entry in os.listdir(data_path)),
))

# Size of the data set (number of molecule folders) ...
print(len(subdirs))
# ... followed by the path of every molecule folder
print(subdirs)

91
['../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_2_iEPOXY_0_ieEPOXY_2_2', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_0_iEPOXY_3_ieEPOXY_0_2', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__1_eEPOXY_0_eCHO_2_eKETO_0_iOH_1_iEPOXY_1_ieEPOXY_0_1', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_2_eKETO_0_iOH_1_iEPOXY_1_ieEPOXY_0_1', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_1_iOH_2_iEPOXY_1_ieEPOXY_0_0', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_1_eKETO_1_iOH_1_iEPOXY_2_ieEPOXY_0_0', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__2_eEPOXY_0_eCHO_0_eKETO_0_iOH_3_iEPOXY_0_ieEPOXY_0_1', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__1_eEPOXY_0_eCHO_0_eKETO_0_iOH_0_iEPOXY_2_ieEPOXY_1_1

In [4]:
# Collect the bare folder name (last path component) of every molecule directory
mol_name = []
# Split up into the directory path and list of all the folder/molecule names
for subdir in subdirs:
    # os.path.split returns (parent directory, final path component)
    dir_path, path = os.path.split(subdir)
    mol_name.append(path)

# Should equal len(subdirs): one name is appended per folder
print(len(mol_name))
# List of all the molecule names
print(mol_name)

91
['cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_2_iEPOXY_0_ieEPOXY_2_2', 'cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0', 'cir_R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_0_iEPOXY_3_ieEPOXY_0_2', 'cir_R_eOH_0_eCOOH__1_eEPOXY_0_eCHO_2_eKETO_0_iOH_1_iEPOXY_1_ieEPOXY_0_1', 'cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_2_eKETO_0_iOH_1_iEPOXY_1_ieEPOXY_0_1', 'cir_R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_1_iOH_2_iEPOXY_1_ieEPOXY_0_0', 'cir_R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_1_eKETO_1_iOH_1_iEPOXY_2_ieEPOXY_0_0', 'cir_R_eOH_0_eCOOH__2_eEPOXY_0_eCHO_0_eKETO_0_iOH_3_iEPOXY_0_ieEPOXY_0_1', 'cir_R_eOH_0_eCOOH__1_eEPOXY_0_eCHO_0_eKETO_0_iOH_0_iEPOXY_2_ieEPOXY_1_1', 'cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_1_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_2_2', 'cir_R_eOH_0_eCOOH__1_eEPOXY_0_eCHO_1_eKETO_3_iOH_0_iEPOXY_0_ieEPOXY_0_2', 'cir_R_eOH_0_eCOOH__1_eEPOXY_0_eCHO_0_eKETO_1_iOH_1_iEPOXY_2_ieEPOXY_0_0', 'cir_R_eOH_1_eCOOH__1_eEPOXY_0_eCHO_0_eKETO_0_iOH_0_iEPOXY_0_ieEPOXY_0_1', 'cir_R_eOH_0_eCOOH__0

In [6]:
# Show the second molecule folder; the following cells use it as the worked example
subdirs[1]

'../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0'

In [8]:
# Every regular file directly inside the example molecule folder.
# Despite the name, this is not restricted to .xyz files.
xyz_files = [
    entry
    for entry in os.listdir(subdirs[1])
    if os.path.isfile(os.path.join(subdirs[1], entry))
]
xyz_files


['slurm-7198315.out',
 'OPT_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0.sh',
 'R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0.xyz',
 'OPT_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0.xyz',
 'OPT_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0.inp']

In [9]:
# File names of the form "OPT_<word characters>.xyz".
# re.match anchors at the start of the name and `$` at the end,
# so names without the "OPT_" prefix are not matched.
pattern = r'OPT_[\w]+\.xyz$'

# Take the first file whose name matches the pattern
for filename in xyz_files:
    if re.match(pattern, filename):
        xyz_name = filename
        break
else:
    # Without this guard a miss would leave xyz_name undefined (or stale from an
    # earlier run) and the problem would only surface in a later cell.
    raise FileNotFoundError(
        f"no file matching {pattern!r} found among {xyz_files!r}"
    )

# Display the selected file (the result), not the loop variable
xyz_name

'OPT_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0.xyz'

In [11]:
# Full path of the selected xyz file inside the example molecule folder
xyz_path = f"{subdirs[1]}/{xyz_name}"
xyz_path

'../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/OPT_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0.xyz'

In [12]:
# Names of the sub-folders directly inside the example molecule folder
loc_names = [d for d in os.listdir(subdirs[1]) if os.path.isdir(os.path.join(subdirs[1], d))]
# Build the full paths from loc_names so that loc_dirs[i] and loc_names[i] always
# refer to the same folder (later cells index both lists with the same i).
loc_dirs = [os.path.join(subdirs[1], d) for d in loc_names]
# Same sub-folder names as reported by os.walk. Only the first (top-level) entry is
# needed, so take it with next() instead of walking the whole tree.
name_dirs = next(os.walk(subdirs[1]))[1]

In [16]:
# Inspect the sub-folder paths; they appear in directory-listing order, not numeric order
print(loc_dirs)

['../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/34', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/37', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/53', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/7', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/56', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/51', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/21', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/24', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_

In [31]:
# Natural sort orders the numeric folder names by value ('4' before '10').
# natsorted comes from the notebook's top-level import cell, so this cell also
# works on a fresh kernel when cells are run top to bottom.
sort_names = natsorted(loc_names)

# Unsorted names on the first line, naturally sorted names on the second
print(loc_names, sort_names, sep='\n')

['34', '37', '53', '7', '56', '51', '21', '24', '27', '43', '6', '46', '41', '11', '49', '19', '14', '17', '33', '5', '36', '31', '39', '55', '50', '23', '4', '26', '29', '45', '40', '48', '13', '16', '32', '35', '30', '38', '54', '57', '52', '22', '25', '20', '9', '28', '44', '47', '42', '12', '15', '10', '8', '18']
['4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57']


In [32]:
# Same natural sort, applied to the folder names obtained through os.walk
sort_dirs = natsorted(name_dirs)

# Before / after, one list per line
print(name_dirs, sort_dirs, sep='\n')

['34', '37', '53', '7', '56', '51', '21', '24', '27', '43', '6', '46', '41', '11', '49', '19', '14', '17', '33', '5', '36', '31', '39', '55', '50', '23', '4', '26', '29', '45', '40', '48', '13', '16', '32', '35', '30', '38', '54', '57', '52', '22', '25', '20', '9', '28', '44', '47', '42', '12', '15', '10', '8', '18']
['4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57']


In [30]:
# Inspect the sub-folder names as returned by os.walk (unsorted)
print(name_dirs)

['34', '37', '53', '7', '56', '51', '21', '24', '27', '43', '6', '46', '41', '11', '49', '19', '14', '17', '33', '5', '36', '31', '39', '55', '50', '23', '4', '26', '29', '45', '40', '48', '13', '16', '32', '35', '30', '38', '54', '57', '52', '22', '25', '20', '9', '28', '44', '47', '42', '12', '15', '10', '8', '18']


In [38]:
# Consecutive zero-based ids, one per entry in name_dirs
new_id = [*range(len(name_dirs))]
print(new_id)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]


In [39]:
 # Visit every sub-folder path together with its position in loc_dirs
 for i, loc_dir in enumerate(loc_dirs):
     # Regular files sitting directly inside this sub-folder.
     # Not used further in this cell; kept because other cells may read it.
     loc_files = [
         entry
         for entry in os.listdir(loc_dir)
         if os.path.isfile(os.path.join(loc_dir, entry))
     ]
     # Round-trip the folder name through int(); raises ValueError if it is not an integer
     dict_ind = str(int(loc_names[i]))
     print(dict_ind)

34
37
53
7
56
51
21
24
27
43
6
46
41
11
49
19
14
17
33
5
36
31
39
55
50
23
4
26
29
45
40
48
13
16
32
35
30
38
54
57
52
22
25
20
9
28
44
47
42
12
15
10
8
18


In [21]:
# dict_ind keeps the value from the last loop iteration
print(dict_ind)

18


In [28]:
# natsorted performs a natural sort, so numeric folder names are ordered by value
from natsort import natsorted

# Sub-folder paths in natural order (.../4, .../5, ..., .../10, ...)
print(natsorted(loc_dirs))

['../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/4', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/5', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/6', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/7', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/8', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/9', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/10', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_3_0/11', '../GNN_XAS_Node/raw_data/OPT_output/cir_R_eOH_0_eCOOH__0_eEPOXY_1_eCHO_0_eKETO_0_iOH_0_iEPOX