# Import

In [None]:
import pandas as pd

import numpy as np

from matplotlib import pyplot as plt

import math

%matplotlib inline
# Shared font size for tick labels and axis labels (presumably for the
# matplotlib figures; neither name is used in the cells shown here).
tickfontsize = labelfontsize = 20

import importlib
import efrc_ml_production as ml
importlib.reload(ml)

from rdkit import Chem

import xgboost as xgb

import datetime
import time

# Begin hp opt

In [None]:
# ---------------------------------------------------------------------------
# Settings for the hyper-parameter search. Every name in this cell must be
# defined before the cells below are run.
# ---------------------------------------------------------------------------

# --- learner and data subset ---
algo = 'nn'  # which learner to use: XGBoost ('xgb') or neural net ('nn')
total_frac_hp = .05  # fraction of the whole data set used for the search
training_pct = .7  # share of that fraction that goes to the training split
random_split = True  # True: choose the training rows at random
n_remote = 10000  # if random_split is False, the n_remote most remote points are added to the training set

# --- feature handling ---
USE_PCA = True  # apply PCA to the features?
N_COMPONENTS = 400  # number of PCA components to use
del_defective_mofs = False  # True: remove every MOF that has a '0' value for at least one geometric property
cat_si_sd = False  # True: concatenate the size-independent and size-dependent fingerprints
add_size_fp = False  # True: add 20 feature columns, each holding the number of atoms in a linker
size_dependent = False  # True: the ML-ready input holds a fingerprint that does not normalize each PG feature (the original note was cut off at this point)
del_geometric_fp = False  # True: ignore the geometric features
cat_col_names = ['oh_1', 'oh_2', 'oh_3', 'oh_4']  # names of the interpenetration columns

# --- input files ---
stacked = True  # True: the ML-ready input carries pressure as a feature
n_core = 18  # number of cores to use
if stacked:
    SD_ML_DATA_PATH = '/data/rgur/efrc/prep_data/all_no_norm/stacked.csv'  # size-dependent data
    SI_ML_DATA_PATH = '/data/rgur/efrc/prep_data/all_v1/stacked.csv'  # size-independent data
    start_str_sd = 'Density'
else:
    SD_ML_DATA_PATH = '/data/rgur/efrc/prep_data/all_no_norm/ml_data.csv'  # size-dependent data
    SI_ML_DATA_PATH = '/data/rgur/efrc/prep_data/all_v1/ml_data.csv'  # size-independent data
    start_str_sd = 'CH4_v/v_248_bar'
# The start/end strings are handed to ml.prepToSplit; presumably they name the
# first and last column of each fingerprint range -- TODO confirm there.
end_str_sd = 'norm_Dom._Pore_(ang.)'  # identical for stacked and unstacked input
start_str_si = 'filename'
end_str_si = 'valence_pa'
Y_DATA_PATH = '/data/rgur/efrc/data_DONOTTOUCH/hMOF_allData_March25_2013.xlsx'  # original hMOF data

# --- model defaults ---
# default_params is not referenced by any cell shown here.
# NOTE(review): newer XGBoost releases call this objective
# 'reg:squarederror' -- confirm against the installed version before use.
default_params = {
    'objective': 'reg:linear',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 15,
    'alpha': 10,
    'n_estimators': 10,
}
n_trees = 50  # number of weak learners. Bigger is better until 5000
save_pp = False  # True: save the parity plot
#########################################################################

# Steps before hp_opt

In [None]:
# Prepare the ML-ready table for the search subset (total_frac_hp of the data).
# The arguments are the same for stacked and unstacked input; only the number
# of returned items differs, so call once and unpack according to `stacked`.
_prep_out_hp = ml.prepToSplit(
    cat_si_sd, SD_ML_DATA_PATH, SI_ML_DATA_PATH,
    start_str_sd, end_str_sd, start_str_si, end_str_si,
    total_frac_hp, del_defective_mofs, add_size_fp, size_dependent,
    stacked, n_core, del_geometric_fp, cat_col_names, Y_DATA_PATH)
if stacked:
    # Stacked input yields one extra trailing item, p_info.
    (ml_data_hp, property_used, target_mean, target_std,
     features, p_info) = _prep_out_hp
else:
    (ml_data_hp, property_used, target_mean, target_std,
     features) = _prep_out_hp
del _prep_out_hp  # keep the notebook namespace as it was

In [None]:
# Split the search subset into a training frame and a test frame. The split
# fraction, the random/remote selection settings and the PCA settings are all
# forwarded to ml.trainTestSplit.
train_df_hp, test_df_hp= ml.trainTestSplit(ml_data_hp, property_used, training_pct, stacked, 
                                     n_core, random_split, n_remote, features, USE_PCA, N_COMPONENTS)

In [None]:
# Turn both frames into the data/label objects that ml.run_model and
# ml.model_rmse consume in `objective`; `algo` is passed so the helper can
# pick the representation for the chosen learner.
train_d_hp, test_d_hp, train_label_hp, test_label_hp = ml.alter_dtype(train_df_hp, test_df_hp, 
                                                                      property_used, n_core, algo, features)

In [None]:
# Sanity check: total number of labels across the train and test splits.
len(train_label_hp) + len(test_label_hp)

# Write functions

In [None]:
def objective(params):
    """Score one hyper-parameter candidate for ``gp_minimize``.

    Trains a model on the search subset with ``params`` (forwarded unchanged
    to ``ml.run_model``) and returns the value of ``ml.model_rmse`` for it,
    presumably the RMSE that the optimizer minimizes.

    Reads the notebook globals ``algo``, ``n_trees``, ``stacked``,
    ``target_mean``, ``target_std``, ``property_used`` and the ``*_hp``
    data/label objects, so those cells must have been run first.
    """
    print("Size of training set %s" % len(train_label_hp))
    candidate_model = ml.run_model(algo, train_d_hp, n_trees, params)
    # Nothing is written to disk while searching: save=False, fname=None.
    score = ml.model_rmse(
        candidate_model, train_d_hp, test_d_hp, stacked, algo,
        target_mean, target_std, property_used,
        test_label_hp, train_label_hp,
        save=False, fname=None, subset_inds=None)
    return score

# Perform optimization

Experiment with five hyperparameters in the model:<br>
<br>
1)Number of units in the first dense layer<br>
2)Learning rate<br>
3)Patience<br>
4)Batch size<br>
5)Validation split

In [None]:
from skopt import gp_minimize

In [None]:
# Search bounds given to gp_minimize: one (low, high) pair per hyper-parameter.
# The order here is the order of the values in the `params` list that
# `objective` receives.
space = [
    (100, 400),    # n_units
    (.001, .002),  # learning rate
    (2, 15),       # patience
    (4, 128),      # batch size
    (.01, .6),     # validation split
]

In [None]:
# Run the search: 20 evaluations of `objective` over `space`, timed with
# wall-clock time.
start = time.time()
# Variant that also passes n_jobs=n_core, left disabled by the author:
#r = gp_minimize(objective, space, n_calls=20, n_jobs=n_core)
r = gp_minimize(objective, space, n_calls=20)
end = time.time()
# Elapsed time is reported in seconds (difference of two time.time() values).
print("\nTime elapsed for hp opt: %s" %(end-start))

In [None]:
# Best point found by the search; values are ordered like the entries of `space`.
r.x

# Plot hp opt results

In [None]:
%matplotlib inline
from skopt.plots import plot_convergence
# Convergence trace of the search result, drawn with a logarithmic y axis.
plot_convergence(r, yscale="log")

# Use best hps to train single model

In [None]:
# ---------------------------------------------------------------------------
# Settings for the single model trained with the chosen hyper-parameters.
# Every name in this cell must be defined before the cells below are run.
# The values repeat the hyper-parameter-search configuration; the data
# fraction is set here through total_frac (the whole data set) instead of
# total_frac_hp.
# ---------------------------------------------------------------------------

# --- learner and data subset ---
algo = 'nn'  # which learner to use: XGBoost ('xgb') or neural net ('nn')
total_frac = 1  # fraction of the whole data set to work with
training_pct = .7  # share of that fraction that goes to the training split
random_split = True  # True: choose the training rows at random
n_remote = 10000  # if random_split is False, the n_remote most remote points are added to the training set

# --- feature handling ---
USE_PCA = True  # apply PCA to the features?
N_COMPONENTS = 400  # number of PCA components to use
del_defective_mofs = False  # True: remove every MOF that has a '0' value for at least one geometric property
cat_si_sd = False  # True: concatenate the size-independent and size-dependent fingerprints
add_size_fp = False  # True: add 20 feature columns, each holding the number of atoms in a linker
size_dependent = False  # True: the ML-ready input holds a fingerprint that does not normalize each PG feature (the original note was cut off at this point)
del_geometric_fp = False  # True: ignore the geometric features
cat_col_names = ['oh_1', 'oh_2', 'oh_3', 'oh_4']  # names of the interpenetration columns

# --- input files ---
stacked = True  # True: the ML-ready input carries pressure as a feature
n_core = 18  # number of cores to use
if stacked:
    SD_ML_DATA_PATH = '/data/rgur/efrc/prep_data/all_no_norm/stacked.csv'  # size-dependent data
    SI_ML_DATA_PATH = '/data/rgur/efrc/prep_data/all_v1/stacked.csv'  # size-independent data
    start_str_sd = 'Density'
else:
    SD_ML_DATA_PATH = '/data/rgur/efrc/prep_data/all_no_norm/ml_data.csv'  # size-dependent data
    SI_ML_DATA_PATH = '/data/rgur/efrc/prep_data/all_v1/ml_data.csv'  # size-independent data
    start_str_sd = 'CH4_v/v_248_bar'
# The start/end strings are handed to ml.prepToSplit; presumably they name the
# first and last column of each fingerprint range -- TODO confirm there.
end_str_sd = 'norm_Dom._Pore_(ang.)'  # identical for stacked and unstacked input
start_str_si = 'filename'
end_str_si = 'valence_pa'
Y_DATA_PATH = '/data/rgur/efrc/data_DONOTTOUCH/hMOF_allData_March25_2013.xlsx'  # original hMOF data

# --- model defaults ---
# default_params is not referenced by any cell shown here.
# NOTE(review): newer XGBoost releases call this objective
# 'reg:squarederror' -- confirm against the installed version before use.
default_params = {
    'objective': 'reg:linear',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 15,
    'alpha': 10,
    'n_estimators': 10,
}
n_trees = 50  # number of weak learners. Bigger is better until 5000
save_pp = False  # True: save the parity plot
#########################################################################

In [None]:
# Prepare the ML-ready table for the full run (total_frac of the data).
# The arguments are the same for stacked and unstacked input; only the number
# of returned items differs, so call once and unpack according to `stacked`.
_prep_out = ml.prepToSplit(
    cat_si_sd, SD_ML_DATA_PATH, SI_ML_DATA_PATH,
    start_str_sd, end_str_sd, start_str_si, end_str_si,
    total_frac, del_defective_mofs, add_size_fp, size_dependent,
    stacked, n_core, del_geometric_fp, cat_col_names, Y_DATA_PATH)
if stacked:
    # Stacked input yields one extra trailing item, p_info.
    (ml_data, property_used, target_mean, target_std,
     features, p_info) = _prep_out
else:
    (ml_data, property_used, target_mean, target_std,
     features) = _prep_out
del _prep_out  # keep the notebook namespace as it was

ml_data.head()

# Split into training and test frames with the configured fraction,
# selection mode and PCA settings.
train_df, test_df = ml.trainTestSplit(
    ml_data, property_used, training_pct, stacked,
    n_core, random_split, n_remote, features, USE_PCA, N_COMPONENTS)

# `algo` is passed straight to alter_dtype, so a single call serves both the
# 'xgb' and the neural-net case.
train_d, test_d, train_label, test_label = ml.alter_dtype(
    train_df, test_df, property_used, n_core, algo, features)

# Sanity check: total number of labels across both splits.
len(train_label) + len(test_label)

# Run Single Model

#Good parameters

# Flag forwarded as save= to ml.parity_plot; False keeps the plot on screen only.
SAVE_FIG = False

In [None]:
# Hyper-parameters for the single model, ordered like the entries of `space`.
# To take them directly from the search result, use this line instead:
#params = r.x
params = [
    204,    # h_units
    0.001,  # lr
    15,     # patience
    4,      # batch size
    0.01,   # validation split
]
BATCH_IND = 3  # position of the batch size inside params
SCALE_BATCH = False  # True: rescale the batch size by total_frac / total_frac_hp before training

In [None]:
# Optionally scale the batch size by the ratio of the data fraction used now
# to the fraction used during the search.
# NOTE(review): params is modified in place, so re-running this cell with
# SCALE_BATCH = True multiplies the batch size again.
if SCALE_BATCH:
    params[BATCH_IND] = int(params[BATCH_IND] * (total_frac/ total_frac_hp))

# This first parity plot is not saved.
SAVE_FIG = False

# Train with the chosen hyper-parameters, then draw the parity plot for the
# training and test data.
MODEL = ml.run_model(algo, train_d, n_trees, params)
ml.parity_plot(MODEL, train_d, test_d, stacked, algo, target_mean, target_std, property_used, test_label, train_label, save=SAVE_FIG)

# Save model

In [None]:
# Timestamp label for this run, e.g. "03:45PM_on_March_25_2020". The cells
# below use it both as the directory name and as the file stem of everything
# they save.
now = datetime.datetime.now().strftime("%I:%M%p_on_%B_%d_%Y")
now

In [None]:
# Echo the hyper-parameters of the model that is about to be saved, one per
# line, in params order.
for _line, _pos in (("h_units %s", 0),
                    ("lr %s", 1),
                    ("patience %s", 2),
                    ("batch size %s", 3),
                    ("validation split %s", 4)):
    print(_line % params[_pos])
del _line, _pos  # loop helpers are not meant to stay in the notebook namespace

In [None]:
# Switch saving on for the parity plot drawn in the next cell.
SAVE_FIG = True

In [None]:
# Draw the parity plot again, this time passing save=SAVE_FIG and the run's
# timestamp label as fname.
ml.parity_plot(MODEL, train_d, test_d, stacked, algo, target_mean, target_std, property_used, test_label, train_label, save=SAVE_FIG, fname=now)

In [None]:
#only run below to save

In [None]:
import os

# Everything saved for this run goes into a directory named after the
# timestamp label. Create it first so the save cannot fail on a missing path;
# exist_ok makes this a no-op when an earlier step already created it.
_model_dir = '/data/rgur/efrc/ml/models/%s' % now
os.makedirs(_model_dir, exist_ok=True)

# The two learners expose different save methods and file extensions.
if algo == 'xgb':
    MODEL.save_model('/data/rgur/efrc/ml/models/%s/%s.xgb' % (now, now))
else:
    MODEL.save('/data/rgur/efrc/ml/models/%s/%s.h5' % (now, now), save_format='h5')

In [None]:
# Store the 'filename' column of the training split next to the saved model,
# so the rows used for training can be identified later.
train_df['filename'].to_csv('/data/rgur/efrc/ml/models/%s/train_%s.csv' %(now, now))

In [None]:
# Store the 'filename' column of the test split next to the saved model.
test_df['filename'].to_csv('/data/rgur/efrc/ml/models/%s/test_%s.csv' %(now, now))

In [None]:
import pickle

In [None]:
# Pickle `features` (as returned by ml.prepToSplit) into the run directory.
# protocol=3 pins the pickle format version that is written.
with open('/data/rgur/efrc/ml/models/%s/features_%s.pkl' %(now, now), 'wb') as f:
    pickle.dump(features, f, protocol=3)