In [1]:
# Standard modules
import re
import json
import pickle as pkl
import numpy as np
from tabulate import tabulate
# RDKit
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
# Matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
# Bokeh
from bokeh.plotting import figure
from bokeh.layouts import gridplot
from bokeh.io import output_notebook, show
output_notebook()

In [7]:
# Read the raw dataset from disk and parse it from JSON.
# NOTE(review): the parsed object is bound to `dict`, which shadows the Python
# builtin for the rest of the notebook. The name is kept because the other
# cells index it as dict[0] / dict[1]; rename it everywhere in one pass.
with open('raw/data_coronene_4sets_0.6.json', 'rb') as json_file:
    dict = json.loads(json_file.read())

In [9]:
# Gather every molecule name: they are the keys of the first entry of the
# loaded data, and iterating a mapping yields its keys in insertion order.
tot_ids = list(dict[0])
print(tot_ids)
print(f'Total number of molecules {len(tot_ids)}')

['R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_0_iEPOXY_0_ieEPOXY_3_2', 'R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_1_1', 'R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_0_iEPOXY_1_ieEPOXY_4_1', 'R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_0_iEPOXY_2_ieEPOXY_1_0', 'R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_0_iEPOXY_3_ieEPOXY_1_2', 'R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_1_iEPOXY_0_ieEPOXY_1_2', 'R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_1_iEPOXY_0_ieEPOXY_4_0', 'R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_1_iEPOXY_2_ieEPOXY_0_0', 'R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_1_iEPOXY_2_ieEPOXY_1_2', 'R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_2_iEPOXY_3_ieEPOXY_0_2', 'R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_3_iEPOXY_0_ieEPOXY_1_1', 'R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_3_iEPOXY_1_ieEPOXY_0_0', 'R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_3_iEPOXY_1_ieEPOXY_0_1', 'R_eOH_0_eCOOH__0_eEPOXY_0_eCHO_0_eKETO_0_iOH_3_iEPOXY_1_ieEPOXY_1_2', 'R_eO

In [13]:
# Count the carbon atoms of every molecule in the dataset.
# Result: atom_count[k] is the number of carbon atoms in molecule tot_ids[k].
CARBON_ATOMIC_NUM = 6  # atomic number of the element being counted (carbon)

atom_count = []
# For each molecule in the dataset
for mol_id in tot_ids:
    # Build the RDKit molecule from the SMILES string stored under this id
    mol = Chem.MolFromSmiles(dict[0][mol_id])
    # MolFromSmiles returns None (it does not raise) when the SMILES cannot be
    # parsed; fail here, naming the offending molecule, instead of hitting an
    # opaque AttributeError on mol.GetAtoms() below.
    if mol is None:
        raise ValueError(f'Could not parse SMILES for molecule {mol_id!r}')
    # Number of atoms whose atomic number matches the element being counted
    tot_atoms = sum(
        1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == CARBON_ATOMIC_NUM
    )
    atom_count.append(tot_atoms)

print(atom_count)

[24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 26, 25, 26, 26, 24, 24, 24, 24, 25, 24, 25, 24, 24, 26, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 24, 25, 24, 24, 24, 26, 26, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 27, 26, 25, 26, 26, 26, 26, 25, 26, 25, 25, 26, 26, 26, 26, 25, 25, 27, 27, 27, 25, 27, 25, 27, 28, 27, 28, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 25, 28, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 28, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 24, 24, 24, 24, 25, 24, 24, 24, 24, 25, 25, 24, 25, 25, 25, 25, 24, 24, 24, 26, 26, 26, 26, 26, 26, 27, 24, 27, 28, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 24, 25, 24, 27, 24, 24, 25, 24, 24, 26, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 25, 27, 25, 25, 25, 25, 25, 25, 26, 26, 25, 26,

In [14]:
# Pick one molecule to inspect: the last entry of tot_ids.
# Looping over every molecule while overwriting test_mol / test_spec on each
# pass ends with exactly these values, but parses every SMILES string only to
# discard the result, so the last molecule is selected directly instead.
if not tot_ids:
    raise ValueError('tot_ids is empty: there is no molecule to inspect')
# Index of the inspected molecule; bound to `i` at notebook level in case
# other cells use it to index tot_ids.
i = len(tot_ids) - 1
# Get the molecular structure from dictionary
test_mol = Chem.MolFromSmiles(dict[0][tot_ids[i]])
# Get all spectra data from dictionary
test_spec = dict[1][tot_ids[i]]

In [16]:
# Display the contents of test_spec (a bare last expression is rendered as the
# cell output). The visible output starts with the string key '5' mapping to a
# list of floats; what the key means is not shown here — TODO confirm.
test_spec

{'5': [2.0901699947601542e-29,
  3.624348981311875e-28,
  5.9416605966400876e-27,
  9.20905579538005e-26,
  1.3494344736353783e-24,
  1.869467543613103e-23,
  2.448575450262485e-22,
  3.032064841095572e-21,
  3.5497102980231987e-20,
  3.9289524141699763e-19,
  4.1114031054537275e-18,
  4.067548899635251e-17,
  3.804564385430328e-16,
  3.3643914916391646e-15,
  2.812791533777509e-14,
  2.2232996458010307e-13,
  1.66145232478493e-12,
  1.1738353958104247e-11,
  7.840720103648555e-11,
  4.951469780680222e-10,
  2.956254010767768e-09,
  1.6687020293595998e-08,
  8.905232718841688e-08,
  4.4930493095590905e-07,
  2.143218891702042e-06,
  9.665432785933572e-06,
  4.121027510511962e-05,
  0.00016611894135464965,
  0.000633085298092665,
  0.002281049626870224,
  0.007770280059733332,
  0.025024651761594106,
  0.07619541618143079,
  0.21934062709913846,
  0.5969510884864764,
  1.5359880908613124,
  3.7365119956532165,
  8.593584380251873,
  18.685802012741622,
  38.413030650699106,
  74.6577394