In [1]:
import numpy as np
import pandas as pd
import os
import subprocess
import time
import matplotlib.pyplot as plt
import itertools
from copy import deepcopy
import sys

from rdkit import Chem
from rdkit.Chem import AllChem
import ase
from ase.io import read, write
from ase.units import Hartree
from ase.visualize import view

from ase.io.trajectory import Trajectory
import glob
import pickle

from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.drawOptions.addAtomIndices = True

from photocatalysis.learners_treesearch import get_population_completed

In [2]:
### Automatically reload imported modules before each cell runs, so edits to the source files take effect in Jupyter without restarting the kernel
%load_ext autoreload
%autoreload 2

In [3]:
### Location of DF_COMPLETE.json.
### Set the DF_COMPLETE_PATH environment variable to point at the file explicitly;
### otherwise the first known location that exists on this machine is used.
_df_candidates = [
    '/home/btpq/bt308495/Thesis/DF_COMPLETE.json',  # REMOTE
    os.path.join('/home/scakolli/Thesis/photocatalysis/workspace', 'DF_COMPLETE.json'),  # LOCAL
]
# Fall back to the remote path when nothing exists, so a missing file is still
# reported by pd.read_json against that path.
df_path = os.environ.get('DF_COMPLETE_PATH') or next(
    (candidate for candidate in _df_candidates if os.path.exists(candidate)),
    _df_candidates[0],
)
df = pd.read_json(df_path, orient='split')

In [4]:
# Report how many calculations ended with calc_status == 'fizzled' out of all rows.
n_fizzled = int((df.calc_status == 'fizzled').sum())
n_total = len(df)
print("Fizzled / Total")
print(f"{n_fizzled} / {n_total}")

Fizzled / Total
1410 / 65552


In [5]:
# Keep only the successfully completed calculations, as returned by get_population_completed.
# NOTE: this rebinds `df`, so re-running the cell passes the already-filtered frame back in,
# and the fizzled count printed above can no longer be recomputed without reloading the data.
df = get_population_completed(df)

------

# GPR MODEL

In [6]:
# Fraction of the completed population held out for testing. With 0.95 only 5% of the
# rows are used for fitting (the fitting log further down reports 3207 training points).
test_size = 0.95
# Seed passed to train_test_split so the train/test partition is reproducible.
random_state = 42

In [7]:
from photocatalysis.learners_treesearch import generate_ml_vectors, get_ML_model
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split

In [None]:
### With np.arrays()
# X = generate_ml_vectors(df).morgan_fp_bitvect.values

# Y_IP = df.IP.values
# Y_dGmax = df.dGmax.values

# X_train_IP, X_test_IP, y_train_IP, y_test_IP = train_test_split(X, Y_IP, test_size=test_size, random_state=random_state)
# X_train_dGmax, X_test_dGmax, y_train_dGmax, y_test_dGmax = train_test_split(X, Y_dGmax, test_size=test_size, random_state=random_state)

In [8]:
### With pd.DataFrames()
# Split first, then run generate_ml_vectors on the held-out part only; the training
# frame is handed to get_ML_model as-is further down.
held_in, held_out = train_test_split(df, test_size=test_size, random_state=random_state)
df_training = held_in
df_test = generate_ml_vectors(held_out)

In [None]:
# Histogram of the 'utility_function' column. Use the returned Axes to label the
# figure so it stands alone; the trailing semicolon hides the object repr.
ax = df.plot(y='utility_function', kind='hist', bins=40)
ax.set(xlabel='utility_function', title='Distribution of utility_function');

----

In [9]:
# Fit the model for the 'IP' property on the training frame and report the wall time.
ip_fit_options = dict(
    multiprocess=1,
    D_scratch_dir='/run/user/1308495/scratch_distance_matrix',
    niter_local=1,
)
start = time.perf_counter()
gpr_ip, xtrain_ip, kip = get_ML_model(df_training, 'IP', **ip_fit_options)
ip_elapsed = time.perf_counter() - start
print('IP Fitting Took:', ip_elapsed)


Fitting property: IP
Size of fitting set for ML model (get_ML_model): 3207
Number of cores to be used: 1
Pbounds {'c': (0.1, 3.0), 'rbf': (1.0, 1.0), 'alpha': (0.001, 1.0)}

initial guess [1.0, 1.0, 0.1]
localopt 1 [1.  1.  0.1] -1856.724615042311 [-2117.35270942     0.          -984.08100701]
localopt 2 [0.1   1.    0.001] -51049.948691713405 [  11116.44073025       0.         3184027.558594  ]
localopt 3 [0.75694191 1.         0.07326361] -1367.1587168856922 [ -986.09436665     0.         -1047.18707977]
localopt 4 [0.5371002 1.        0.001    ] -2762.0835318967083 [2.04701496e+03 0.00000000e+00 3.16137511e+06]
localopt 5 [0.67461149 1.         0.04620099] -1227.62197604435 [-529.95722157    0.         -727.97773579]
localopt 6 [0.58178001 1.         0.001     ] -2733.514208276787 [1.88597625e+03 0.00000000e+00 3.16137373e+06]
localopt 7 [0.64280506 1.         0.03071398] -1187.843811789411 [-337.30912305    0.         -430.42844902]
localopt 8 [0.58765782 1.         0.001     ] -2

In [20]:
# Scratch directory and file name used to store the pickled IP model.
main_scratch = '/run/user/1308495'
pickle_name = 'ML_IP_95_1.pckl'
save_path = os.path.join(main_scratch, pickle_name)

In [22]:
# Persist the three objects returned by the IP fit as one pickled list.
ip_model_bundle = [gpr_ip, xtrain_ip, kip]
with open(save_path, 'wb') as fh:
    pickle.dump(ip_model_bundle, fh)

In [23]:
# Reload the pickled [gpr_ip, xtrain_ip, kip] list and rebind the model names from it,
# so the prediction cells below can run from the cached file instead of a fresh fit.
# NOTE: pickle.load can execute arbitrary code from the file -- only load pickles you wrote.
with open(save_path, 'rb') as p:
    out = pickle.load(p)
gpr_ip, xtrain_ip, kip = out

In [26]:
# Pull the 'morgan_fp_bitvect' column of the held-out frame out as an array.
X_test = df_test['morgan_fp_bitvect'].values

In [27]:
# Predict on the held-out inputs; with return_std=True the call yields two values,
# unpacked here as yip and stdip.
# NOTE(review): presumably yip is the predictive mean and stdip its standard deviation,
# as in scikit-learn's GaussianProcessRegressor.predict -- confirm against get_ML_model.
yip, stdip = gpr_ip.predict(X_test, return_std=True)

---

In [35]:
# Fit the model for the 'dGmax' property on the training frame and report the wall time.
rdg_fit_options = dict(
    multiprocess=1,
    D_scratch_dir='/run/user/1308495/scratch_distance_matrix',
    niter_local=1,
)
start = time.perf_counter()
gpr_rdg, xtrain_rdg, krdg = get_ML_model(df_training, 'dGmax', **rdg_fit_options)
rdg_elapsed = time.perf_counter() - start
print('RDG Fitting Took:', rdg_elapsed)


Fitting property: dGmax
Size of fitting set for ML model (get_ML_model): 3207
Number of cores to be used: 1
Pbounds {'c': (0.1, 3.0), 'rbf': (1.0, 1.0), 'alpha': (0.001, 1.0)}

initial guess [1.0, 1.0, 0.1]
localopt 1 [1.  1.  0.1] -2656.1921271330734 [-518.41768524    0.         -279.07466735]
localopt 2 [0.1   1.    0.001] -141274.25658232556 [   29161.30230839        0.         16701678.86625968]
localopt 3 [0.7288803  1.         0.07017683] -2830.049902060685 [1360.4850591    0.         731.1134981]


KeyboardInterrupt: 