In [1]:
import os
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import time


from photocatalysis.learners_treesearch import *

In [2]:
### Enable IPython's autoreload extension so that edited project modules are
### re-imported automatically; code changes take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Active Learning

In [3]:
from sklearn.metrics import mean_absolute_error

In [None]:
### Read in dataframe of testspaces
base = '/home/btpq/bt308495/Thesis/'
path = '/home/btpq/bt308495/Thesis/osc_discovery/data/'
run_dir = '/home/btpq/bt308495/Thesis/run'
# reference = pd.read_json(path+'df_chemical_space_chons_4rings.json', orient='split')
# `limited` has to be loaded here: the following cell slices it to build `testspace`,
# so leaving this read commented out raises NameError on a fresh kernel.
limited = pd.read_json(path+'df_initial_gfn1_testspace.json', orient='split') #limited testspace, gfn1
# unlimited = pd.read_json(path+'df_initial_b3lyp_unlimited.json', orient='split') #unlimited testspace, b3lyp

In [None]:
### Create a small custom test-space for troubleshooting/experimenting
# limited[['XTB1_lamda_h', 'ehomo_gfn1_b3lyp']] = np.nan * np.ones((limited.shape[0], 2))
Ntest = 4
columns_to_drop = ['XTB1_lamda_h', 'ehomo_gfn1_b3lyp']
# Rows 1..Ntest of the limited space; row 0 is skipped (no benzene).
testspace = limited[1:Ntest+1].copy().drop(columns=columns_to_drop)
# Empty (NaN) target columns, placed at column positions 5 and 6.
for column_position, column_name in enumerate(('IP', 'dGmax'), start=5):
    testspace.insert(column_position, column_name, np.nan)

In [None]:
# frame = pd.read_json('/home/btpq/bt308495/Thesis/run/initialized_testspace/df_population.json', orient='split')
# frame.loc[frame.calc_status == 'fizzled', 'calc_status'] = 'not written'
# frame.to_json('/home/btpq/bt308495/Thesis/run/df_incomplete_intialized.json', orient='split')

In [4]:
### READ initialized completed frame
# main_frame = pd.read_json('/home/btpq/bt308495/Thesis/df_initial_gfn1_testspace_photocatalysis.json', orient='split')
main_frame = pd.read_json(
    '/home/btpq/bt308495/Thesis/run/df_population_runstep5.json',
    orient='split',
)
# Split on the round in which each row was added: rounds <= 2 train, later rounds test.
added_round = main_frame['added_in_round']
training_frame = main_frame.loc[added_round <= 2]
test_frame = main_frame.loc[added_round > 2]

In [None]:
# Accumulators, one entry per learning step:
#   mip / mrdg   -> mean absolute error for IP / dGmax
#   rips / rrdgs -> residual arrays (true - predicted) for IP / dGmax
mip, mrdg, rips, rrdgs = [], [], [], []

first_round = main_frame.added_in_round.min()
last_round = main_frame.added_in_round.max()
# range() stops before the last round, so the held-out set (rows added after
# `round_added`) always contains at least the final round.
for round_added in range(first_round, last_round):
    print(f'Learning Step: {round_added}')

    # Train on everything added up to this round, test on everything added later
    in_training = main_frame.added_in_round <= round_added
    in_test = main_frame.added_in_round > round_added
    training_frame = main_frame.loc[in_training]
    test_frame = main_frame.loc[in_test]
    test_frame_uniq = get_unique_population(get_population_completed(test_frame))

    # Get test data
    Xtest = generate_ml_vectors(test_frame_uniq).morgan_fp_bitvect.values

    # Fit Model on training data (only the first returned value, the model, is used)
    gpr_ip, _, _ = get_ML_model(training_frame, 'IP')
    gpr_rdg, _, _ = get_ML_model(training_frame, 'dGmax')

    # Ground truth on the test data
    yip_true = test_frame_uniq.IP.values
    yrdg_true = test_frame_uniq.dGmax.values

    # Predict on test data
    yip, stdip = gpr_ip.predict(Xtest, return_std=True)
    yrdg, stdrdg = gpr_rdg.predict(Xtest, return_std=True)

    # Evaluate Performance: mean absolute error and residuals
    mad_ip = mean_absolute_error(yip_true, yip)
    mad_rdg = mean_absolute_error(yrdg_true, yrdg)
    rip = yip_true - yip
    rrdg = yrdg_true - yrdg

    # store
    mip.append(mad_ip)
    mrdg.append(mad_rdg)
    rips.append(rip)
    rrdgs.append(rrdg)

In [None]:
# Mean absolute error per learning step: IP curve first, then dGmax
for mae_per_step in (mip, mrdg):
    plt.plot(mae_per_step)

In [None]:
# Residual distributions per learning step: IP on the left, dGmax on the right
fig, ax = plt.subplots(1, 2)

for j, (r0, r1) in enumerate(zip(rips, rrdgs)):
    ax[0].hist(r0, label=j, density=True, alpha=0.5)
    ax[1].hist(r1, label=j, density=True, alpha=0.5)

ax[0].set_title('IP residuals')
ax[1].set_title('dGmax residuals')
# plt.legend() only acts on the current axes (the last one created), which left
# the first panel without a legend; request one on every axes explicitly.
for panel in ax:
    panel.legend(title='step index')

In [None]:
# Utility values of the rows that were added in round 1
main_frame.loc[main_frame['added_in_round'] == 1, 'utility_function'].values

In [None]:
# Median utility of each `added_in_round` group, in groupby iteration order
med_utility_by_round = [
    round_frame['utility_function'].median()
    for _, round_frame in main_frame.groupby('added_in_round')
]

In [None]:
# Plot the per-round median utility computed in the previous cell.
# (`mean_utility_by_round` was never defined anywhere in this notebook.)
plt.plot(med_utility_by_round)

In [None]:
# Training on initial set: one model per target property.
# get_ML_model returns three values; only the first (the model) is kept.
ip_fit_result = get_ML_model(training_frame, 'IP')
rdg_fit_result = get_ML_model(training_frame, 'dGmax')
gpr_ip, _, _ = ip_fit_result
gpr_rdg, _, _ = rdg_fit_result

In [None]:
# Filter the test split through the completed/unique helpers, then pull out
# the morgan_fp_bitvect column as the model input
completed_test_frame = get_population_completed(test_frame)
test_frame_uniq = get_unique_population(completed_test_frame)
test_vectors = generate_ml_vectors(test_frame_uniq)
Xtest = test_vectors.morgan_fp_bitvect.values

In [None]:
## Predicting (each call is unpacked into a prediction and its standard deviation)
yip, stdip = gpr_ip.predict(Xtest, return_std=True)
yrdg, stdrdg = gpr_rdg.predict(Xtest, return_std=True)

# Real
yip_true = test_frame_uniq['IP'].values
yrdg_true = test_frame_uniq['dGmax'].values

In [None]:
# Mean absolute error on the held-out set, IP first and dGmax second
mad_ip, mad_rdg = (
    mean_absolute_error(truth, predicted)
    for truth, predicted in ((yip_true, yip), (yrdg_true, yrdg))
)
print('Mean Abs Errors:', mad_ip, mad_rdg)

In [None]:
# Residuals (true - predicted) for IP and dGmax
rip, rrdg = yip_true - yip, yrdg_true - yrdg

In [None]:
# Histogram of the IP residuals, then the dGmax residuals, on the same axes
for residuals in (rip, rrdg):
    plt.hist(residuals)

In [None]:
## Evaluation
# Parity plot for IP: predictions against truth, with the y = x reference in red
plt.scatter(x=yip_true, y=yip)
plt.scatter(x=yip_true, y=yip_true, c='r')

In [None]:
# Parity plot for dGmax: predictions against truth, with the y = x reference in red
plt.scatter(x=yrdg_true, y=yrdg)
plt.scatter(x=yrdg_true, y=yrdg_true, c='r')

----

In [None]:
# Problem Children: SMILES of the rows whose calc_status is 'fizzled'
is_fizzled = main_frame['calc_status'] == 'fizzled'
main_frame.loc[is_fizzled, 'molecule_smiles']

In [None]:
# Build a learner whose initial population is the full main_frame.
# NOTE(review): AL_params is only assigned in a later cell of this notebook, so
# on a fresh kernel that cell has to be executed before this one.
al = active_learner(
    df_initial_population=main_frame,
    **AL_params,
)

In [None]:
### FIT GPR to initial frame
# One call per target property. Each call is unpacked into a model and a second
# value that the next cell passes back into predict() (presumably the training
# feature vectors — confirm against active_learner._get_ML_model).
gpr_ip, xtrain_ip = al._get_ML_model('IP')
gpr_rdg, xtrain_rdg = al._get_ML_model('dGmax')

In [None]:
# GPR predictions on the same inputs the models were fitted with
yip, stdip = gpr_ip.predict(xtrain_ip, return_std=True)
yrdg, stdrdg = gpr_rdg.predict(xtrain_rdg, return_std=True)

# Ground truth labels: completed rows of the learner's unique population
completed_population = al.df_population_unique[al.df_population_unique.calc_status == 'completed']
yip_true = completed_population.IP.values
yrdg_true = completed_population.dGmax.values

# Mean absolute deviation on the training set. The previous expression,
# sqrt(sum(residual**2)), is the Euclidean norm of the residuals: it grows with
# the number of samples and is not comparable with the other `mad_*` values in
# this notebook, which are mean absolute errors.
mad_training_ip = np.mean(np.abs(yip_true - yip))
mad_training_rdg = np.mean(np.abs(yrdg_true - yrdg))

print('Perfect fit to training data, as expected')
print(mad_training_ip, mad_training_rdg)

print('Limited Variance near training points, as expected')
print(stdip.max(), stdrdg.max())

In [None]:
# A candidate counts as water splitting when predicted IP exceeds predicted dGmax
initial_splits_water = (yip - yrdg) > 0
print('Initial Candidates predicted to water split:', np.sum(initial_splits_water))

In [None]:
# Ask the learner to select and add new candidates; the return value is kept
# and vectorised in the next cell (presumably a DataFrame of the new molecules —
# confirm against active_learner.select_and_add_new_candidates).
df_new_candidates = al.select_and_add_new_candidates()

In [None]:
# Replace the candidate frame with the output of the learner's ML-vector
# generation; the next cell reads its `morgan_fp_bitvect` column.
df_new_candidates = al._generate_ml_vectors(df_new_candidates)

In [None]:
# Model input for the new candidates: the values of the morgan_fp_bitvect column
Xtest = df_new_candidates.morgan_fp_bitvect.values

In [None]:
# Predict IP and dGmax for the new candidates; each call is unpacked into a
# prediction array and its standard deviation (return_std=True).
yip, stdip = gpr_ip.predict(Xtest, return_std=True)
yrdg, stdrdg = gpr_rdg.predict(Xtest, return_std=True)

In [None]:
# Distribution of predicted IP minus predicted dGmax over the new candidates
predicted_margin = yip - yrdg
plt.hist(predicted_margin, bins=20)
plt.title('Utility Hist')

In [None]:
# Same criterion as for the initial population: predicted IP above predicted dGmax
generated_splits_water = (yip - yrdg) > 0
print('Generated Candidates predicted to water split:', np.sum(generated_splits_water))

In [None]:
# Predictive standard deviations for both targets on shared axes
for std_values, std_label in ((stdip, 'IP'), (stdrdg, 'RDG')):
    plt.hist(std_values, bins=20, label=std_label)
plt.title('Stdevs Hist')
plt.legend()

In [None]:
# u = al._get_utility(np.stack((yip, yrdg)))

-----

#### Production Run (use_reference_frame=False)

In [None]:
# Other args with default values
# kappa = 2.5 # regarding two-fold in article
# n_batch = 100 # candidates to select at every step
# two_fold = 0 # if 1, two-time application of morphing ops at each step used
# use_reference_frame = 1 # Gfn1 limited testspace
# n_learning_steps = 50
# suffix = '' # Namespace
# random_state = 42
# reduced_search_space = 0 # 1 for turning on search space reduction.. not sure what this means yet
# depth_search = 3 # d_search as discussed in the article
# Ndeep = 500 # N_deep as discussed in article

# # This is the evaluation in a predefined chemical space, without dft evaluation

# # initial generation dataframe, contains mols and descriptors, already contains B3LYP corrected xTB-GFN1
# df_initial_population = pd.read_json(path+'df_initial_gfn1_testspace.json', orient='split')
# df_reference = pd.read_json(path+'df_chemical_space_chons_4rings.json', orient='split') #reference frame

# preset_chemical_space_boundaries = "test_osc_space"
# n_worker_submit = 1 # How many workers to submit
# n_select_every_step = n_batch
# n_execute_every_step = 0 # 0 means all cases are always found

In [None]:
### Args into active_learner_run
# Target properties the learner works with
properties = ['IP', 'dGmax']

# Worker-HPC stuff (local, not computing cluster)
run_mode = 'local'
# worker_submit_file = 'submit_local.sh'
worker_submit_file = '/home/btpq/bt308495/Thesis/worker/worker_xtb.py'

system = 'ARTHUR'
dir_scratch = '/home/btpq/bt308495/Thesis/scratch/'
# Queue submission / status commands
submit_command, queue_status_command = 'qsub', 'qstat'

# Directory name for saved learner results
dir_save_results = 'learner_results'

In [None]:
# Space defining settings
df_reference = list()
# df_initial_population = testspace.copy()
# Alternative: preset_chemical_space_boundaries = '' # unlimited
preset_chemical_space_boundaries = 'test_osc_space' # limited size of molecules to 4rings, etc. Space comprises of 65,552 mols

# Search space reduction settings
reduced_search_space = 0 # 1 for reduction
depth_search, Ndeep = 3, 500 # both are reduced-search-space settings

# Learner Settings
kappa = 0 #Exploitative, 2.5 offers a better balance between exploitation and exploration
two_fold = 0
n_learning_steps = 10
n_select_every_step = 50 # N_batch_first: queries per learning step; choose the Nbatch best Fitness mols and proceed expansion with them
n_execute_every_step = 0 # HPC to avoid idles times #int(args.n_batch) # 0 means all cases are always found before proceeding to next step

# Misc settings
n_worker_submit = 1 #8 for HPC... need SLURM
random_state = 42
suffix = ''

In [None]:
### Unique molecules generated by exhaustively performing all molecular morphing operations
### 65,552 Molecules
# df_reference.copy().drop_duplicates(subset='molecule_smiles').shape

### Initialization population
# df_initial_population.copy().drop_duplicates(subset='molecule_smiles').shape

In [None]:
def learning_step(active_learner_obj, run_only=False):
    """Drive one learning step of an active-learner object.

    Calls ``run_calculations_population()`` repeatedly until
    ``check_all_calculations_finished()`` returns a truthy value. Unless
    ``run_only`` is set, it then calls ``select_and_add_new_candidates()``
    followed by one more ``run_calculations_population()``.

    Parameters
    ----------
    active_learner_obj : object
        Any object exposing ``check_all_calculations_finished``,
        ``run_calculations_population`` and ``select_and_add_new_candidates``.
    run_only : bool, default False
        When True, only the outstanding calculations are run; no new
        candidates are selected.

    Returns
    -------
    None
        Progress is reported through ``print``.
    """
    # Keep launching calculations until the learner reports that all are done
    while not active_learner_obj.check_all_calculations_finished():
        print('Running Calculations')
        active_learner_obj.run_calculations_population()
        print('###############')

    print('Finished calculations... moving on')
    print('###############')

    if run_only:
        print('###############')
        return

    # Extend the population, then start the calculations for the new entries
    active_learner_obj.select_and_add_new_candidates()
    active_learner_obj.run_calculations_population()
    print('###############')

In [None]:
# Keyword arguments forwarded to active_learner(...); the values come from the
# configuration cells above.
AL_params = dict(
    properties=properties,
    n_worker_submit=n_worker_submit,
    Nbatch_first=n_select_every_step,
    Nbatch_finish_successful=n_execute_every_step,
    run_mode=run_mode,
    worker_submit_file=worker_submit_file,
    submit_command=submit_command,
    queue_status_command=queue_status_command,
    two_generations_search=two_fold,
    df_reference_chemical_space=[],  # reference,
    kappa=kappa,
    reduced_search_space=reduced_search_space,
    depth_search=depth_search,
    Ndeep=Ndeep,
    preset=preset_chemical_space_boundaries,
    dir_save_results=dir_save_results,
    dir_scratch=dir_scratch,
    ml_model='gpr_tanimoto_kernel',
    random_state=int(random_state),
    suffix=suffix,
)

In [None]:
# Switch the working directory to the run directory before building the learner.
os.chdir(run_dir)
os.getcwd()  # value is discarded, exactly as in the original tuple expression
AL = active_learner(
    df_initial_population=testspace.copy(),
    **AL_params,
)

In [None]:
# Only drive the outstanding calculations to completion: with run_only=True,
# learning_step skips the candidate-selection branch.
learning_step(AL, run_only=True)

In [None]:
# gapr, xtrain = tal._get_ML_model('IP')
# xtests = tal._generate_ml_vectors(limited.copy())['morgan_fp_bitvect'].values
# out = gapr.predict(xtests, return_std=True)

-----

In [None]:
# with open('/home/btpq/bt308495/Thesis/run/AL_pickled.pckl', 'wb') as pick:
#     pickle.dump(AL, pick)

# with open('/home/btpq/bt308495/Thesis/run/AL_pickled.pckl', 'rb') as pick:
#     al = pickle.load(pick)

In [None]:
# Print a confirmation when AL.check_all_calculations_finished() is truthy;
# nothing is printed otherwise.
if AL.check_all_calculations_finished():
    print('All Calcs Performed')

In [None]:
# Manually trigger candidate selection on the learner; the return value is
# shown as the cell output rather than stored.
AL.select_and_add_new_candidates()

In [None]:
# NOTE(review): presumably scores the learner against the reference chemical
# space; AL_params passes an empty list for it, so confirm this call is meaningful here.
AL.evaluate_performance_external_set()

In [None]:
# Property columns (AL.properties) of the learner's unique reference space,
# transposed so that each row of `ytest` holds one property.
# NOTE(review): AL_params sets 'df_reference_chemical_space' to [], so verify
# that df_reference_chemical_space_unique is populated before relying on this.
ytest = AL.df_reference_chemical_space_unique[AL.properties].values.T

In [None]:
# Sanity check: after the transpose the first axis has one entry per property.
ytest.shape

In [None]:
# a, b = AL._predict_Fi_scores(AL.df_population_unique)

In [None]:
# Evaluate the learner's utility on the property matrix built above; the result
# is displayed as the cell output.
AL._get_utility(ytest)

In [None]:
# Display the 'utility_function' column of the learner's unique reference space.
AL.df_reference_chemical_space_unique['utility_function']