# Constructing a support vector regression (SVR) model for OER adsorption energies
---


### Import Modules

In [1]:
import os
print(os.getcwd())
import sys
import time; ti = time.time()

import pickle

import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

import plotly.graph_objects as go

# #########################################################
from methods import (
    get_df_features_targets,
    get_df_features_targets_seoin,
    )

from methods_models import ModelAgent, GP_Regression

from proj_data import adsorbates
from proj_data import layout_shared
from proj_data import scatter_marker_props

/mnt/f/Dropbox/01_norskov/00_git_repos/PROJ_IrOx_OER/workflow/model_building/SVR
RegressionModel_2 will eventually replace  RegressionModel_1


In [2]:
from methods import isnotebook

# `isnotebook()` (project helper) decides which front end we are running in.
isnotebook_i = isnotebook()

# Choose the tqdm flavour that matches the front end.
if isnotebook_i:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

# Printing and plotting are switched on only for the notebook front end.
verbose = show_plot = bool(isnotebook_i)

In [3]:
# Data directory built from the PROJ_irox_oer environment variable
# (a KeyError is raised here if that variable is not set).
# NOTE(review): the sub-path points at `gaussian_process/...` although this
# notebook trains an SVR model, and `root_dir` is not referenced in the
# visible cells — confirm the path is intended.
root_dir = os.path.join(
    os.environ["PROJ_irox_oer"],
    "workflow/model_building/gaussian_process/my_data/all_features_mine")

### Script Inputs

In [4]:
# Whether Seoin's data set is merged with mine before model building.
use_seoin_data = False

# Adsorbate labels for the target and for the features.
target_ads_i, feature_ads_i = "oh", "o"

# With Seoin's data the feature adsorbate is forced to "o"
# (currently identical to the default chosen above).
if use_seoin_data:
    feature_ads_i = "o"

In [5]:
# Toggle between a quick run and the full scan.
quick_easy_settings = True

# (k_fold_partition_size, do_every_nth_pca_comp):
#   quick -> partitions of 170 rows, every 8th PCA component count
#   full  -> partitions of 30 rows, every PCA component count
k_fold_partition_size, do_every_nth_pca_comp = (
    (170, 8) if quick_easy_settings else (30, 1)
)

### Read Data

In [6]:
# #########################################################
# Features/targets dataframe for my own data (project helper from `methods`).
df_features_targets = get_df_features_targets()

# #########################################################
# Features/targets dataframe for Seoin's data; in the visible cells it is
# only consumed when `use_seoin_data` is True.
df_seoin = get_df_features_targets_seoin()

### Combine my data with Seoin's data

In [7]:
if not use_seoin_data:
    # Only my own data is used.
    df_data = df_features_targets
else:
    # Re-assign each index from its plain list of entries; per the original
    # note this swaps the MultiIndex for an index of tuples so that my data
    # and Seoin's data can be combined.
    df_features_targets.index = df_features_targets.index.tolist()
    df_seoin.index = df_seoin.index.tolist()

    # Columns present in both data sets, de-duplicated, in the order in
    # which they first appear in my data.
    shared_cols = [
        col_i
        for col_i in dict.fromkeys(df_features_targets.columns.tolist())
        if col_i in df_seoin.columns
    ]

    # Stack the two data sets row-wise on the shared columns.
    df_data = pd.concat(
        [df_features_targets[shared_cols], df_seoin[shared_cols]],
        axis=0,
    )

### Choosing feature columns

In [8]:
# # TEMP
# print(111 * "TEMP | ")

# Column groups that appear in both selections below.
_shared_o_cols = [
    ('features', 'o', 'active_o_metal_dist'),
    ('features', 'o', 'angle_O_Ir_surf_norm'),
    ('features', 'o', 'ir_o_mean'),
    ('features', 'o', 'ir_o_std'),
    ('features', 'o', 'octa_vol'),
]
_shared_top_level_cols = [
    ('features', 'dH_bulk', ''),
    ('features', 'volume_pa', ''),
    ('features', 'bulk_oxid_state', ''),
    ('features', 'effective_ox_state', ''),
]

# Disabled alternatives kept for reference:
#   ('targets', 'g_o', '')
#   ('features_pre_dft', 'active_o_metal_dist__pre', '')
#   ('features_pre_dft', 'ir_o_mean__pre', '')
#   ('features_pre_dft', 'ir_o_std__pre', '')
#   ('features_pre_dft', 'octa_vol__pre', '')
if use_seoin_data:
    # Reduced column set used for the combined data.
    _selected_cols = [
        ('targets', 'g_oh', ''),
        ('data', 'stoich', ''),
        *_shared_o_cols,
        *_shared_top_level_cols,
    ]
else:
    # Full column set for my data: target, bookkeeping columns, then features.
    _selected_cols = [
        ('targets', 'g_oh', ''),
        ('data', 'stoich', ''),
        ('data', 'job_id_o', ''),
        ('data', 'job_id_oh', ''),
        ('data', 'job_id_bare', ''),
        ('features', 'o', 'O_magmom'),
        ('features', 'o', 'Ir_magmom'),
        ('features', 'o', 'Ir*O_bader'),
        ('features', 'o', 'Ir_bader'),
        ('features', 'o', 'O_bader'),
        *_shared_o_cols,
        ('features', 'o', 'p_band_center'),
        ('features', 'o', 'Ir*O_bader/ir_o_mean'),
        *_shared_top_level_cols,
    ]

# Restrict the working dataframe to the chosen columns, in this order.
df_data = df_data[_selected_cols]

In [9]:
# Kernel specification passed to the Gaussian-process regression wrapper:
# one "gaussian" kernel entry with its initial width/scaling and the bounds
# on the scaling.
kdict = [
    dict(
        type="gaussian",
        dimension="single",
        width=1.8,
        scaling=0.5,
        scaling_bounds=((0.0001, 10.),),
    ),
]

# NOTE: GP_R is constructed here but the scan in the visible cells passes
# the SVR regression object to ModelAgent, not this one.
GP_R = GP_Regression(
    kernel_list=kdict,
    regularization=0.01,
    optimize_hyperparameters=True,
    scale_data=False,
)

In [10]:
# SVR regression wrapper from the project's `methods_models` module.
from methods_models import SVR_Regression

# Instance constructed with default arguments; it is handed to ModelAgent as
# `Regression` (together with the class itself) in the PCA scan.
SVR_R = SVR_Regression()

In [13]:
# Scan over the number of PCA components: for each value, build a ModelAgent
# around the SVR regression, run its k-fold cross-validation workflow and
# record the resulting MAE together with the agent itself.
data_dict_list = []
num_feat_cols = df_data.features.shape[1]
for num_pca_i in range(4, num_feat_cols + 1, do_every_nth_pca_comp):

    if verbose:
        print("")
        print(40 * "*")
        print(num_pca_i)

    MA = ModelAgent(
        df_features_targets=df_data,
        Regression=SVR_R,
        Regression_class=SVR_Regression,
        use_pca=True,
        num_pca=num_pca_i,
        adsorbates=adsorbates,
        stand_targets=False,  # True was giving much worse errors, keep False
        )

    MA.run_kfold_cv_workflow(
        k_fold_partition_size=k_fold_partition_size,
        )

    if verbose:
        print("MAE:", np.round(MA.mae, 4))
        print("MA.r2:", np.round(MA.r2, 4))
        print("MAE (in_fold):", np.round(MA.mae_infold, 4))

    data_dict_i = {
        "num_pca": num_pca_i,
        "MAE": MA.mae,
        "ModelAgent": MA,
        }
    data_dict_list.append(data_dict_i)

# The scan starts at 4 components, so with fewer than 4 feature columns no
# model is trained. Fail with a clear message here; otherwise `set_index`
# on the empty frame raises an opaque KeyError about a missing column.
if not data_dict_list:
    raise ValueError(
        "No models were trained: the PCA scan starts at 4 components but "
        "only {} feature column(s) are available.".format(num_feat_cols)
        )

# One row per scanned component count, indexed by `num_pca`.
df_models = pd.DataFrame(data_dict_list)
df_models = df_models.set_index("num_pca")


# #########################################################
# Finding best performing model (lowest cross-validated MAE)
row_models_i = df_models.sort_values("MAE").iloc[0]

MA_best = row_models_i.ModelAgent

# The blank-line spacer is printed regardless of `verbose`.
print(4 * "\n")
if verbose:
    print(
        row_models_i.name,
        " PCA components are ideal with an MAE of ",
        np.round(row_models_i.MAE, 4),
        sep="")


****************************************
4
MAE: 0.1939
MA.r2: 0.7289
MAE (in_fold): 0.1633

****************************************
12
MAE: 0.2133
MA.r2: 0.6777
MAE (in_fold): 0.105





4 PCA components are ideal with an MAE of 0.1939


In [12]:
# Deliberate stop: the AssertionError raised here halts a "run all" before
# the scratch cells that follow (presumably exploratory; they were not run).
assert False

AssertionError: 

In [None]:
# Scratch cell: body of `run_kfold_cv_workflow` (called on MA in the scan
# above) unrolled at top level for interactive debugging. `self` is bound to
# the last ModelAgent created by the scan loop.
self = MA
# Overrides the partition size chosen in the script-inputs cell.
k_fold_partition_size=30

# def run_kfold_cv_workflow(self,
#     k_fold_partition_size=None,
#     ):
"""Wed Jun  9 20:48:08 PDT 2021
"""
# | - run_kfold_cv_workflow
# #################################################
# Pull the three sub-step methods off the agent; none of them is actually
# invoked in this cell (the calls below are commented out).
_run_kfold_cv_workflow__get_cv_data = \
    self._run_kfold_cv_workflow__get_cv_data
_run_kfold_cv_workflow__process_df_predict = \
    self._run_kfold_cv_workflow__process_df_predict
_run_kfold_cv_workflow__run_infold = \
    self._run_kfold_cv_workflow__run_infold
# #################################################

# _run_kfold_cv_workflow__run_infold()

# _run_kfold_cv_workflow__get_cv_data(
#     k_fold_partition_size=k_fold_partition_size,
#     )

# _run_kfold_cv_workflow__process_df_predict()
# # __|

In [None]:
# Scratch cell: body of `_run_kfold_cv_workflow__run_infold` unrolled at top
# level. It trains the regression on the full data set ("in-fold": train and
# test frames are the same dataframe).
self = MA
# Set here but not used anywhere in this cell.
k_fold_partition_size=100
from methods_models import RegressionWorkflow

# def _run_kfold_cv_workflow__run_infold(self):
"""Thu Jun 10 16:03:57 PDT 2021
"""
# | - _run_kfold_cv_workflow__run_infold
# #################################################
df_features_targets = self.df_features_targets
Regression = self.Regression
Regression_class = self.Regression_class
use_pca = self._use_pca
num_pca = self.num_pca
stand_targets = self._stand_targets
# #################################################
_standardize_train_test = self._standardize_train_test
# #################################################
# Constructor arguments of the regression object, reused below to build a
# fresh instance of the same class.
init_params = Regression.init_params
# #################################################


# NOTE: this rebinds the notebook-level `df_data` to the agent's dataframe.
df_data = df_features_targets

# Train and test on the same rows.
df_train = df_data
df_test = df_data

df_train_std, df_test_std = \
    _standardize_train_test(
        df_train,
        df_test=df_test,
        stand_targets=stand_targets,
        )
df_train_final = df_train_std
df_test_final = df_test_std


# NOTE(review): when PCA is enabled the standardized frames above are
# discarded and `_run_pca` receives the raw `df_train`/`df_test` —
# presumably `_run_pca` standardizes internally; confirm.
if use_pca:
    df_pca_train, df_pca_test, PCA = \
        self._run_pca(df_train, df_test=df_test, num_pca=num_pca)
    df_train_final = df_pca_train
    df_test_final = df_pca_test

# #############################################
# Running regression workflow
# Fresh regression instance built from the stored constructor arguments.
RC = Regression_class(**init_params)

RW_infold = RegressionWorkflow(
    df_data=df_train_final,
    Regression=RC,
    )
RW_infold.run_Regression()

# RW_infold.predict(df_test_final.features, df_test_final.targets)

# df_predict = RW_infold.df_predict
# mae_infold = df_predict.error.abs().mean()


# # #################################################
# self.RW_infold = RW_infold
# self.mae_infold = mae_infold
# self.PCA_infold = PCA
# # #################################################
# # __|

In [None]:
# Scratch cell: body of `run_Regression` (called on RW_infold in the cell
# above) unrolled at top level, with `self` rebound to that workflow object.
self = RW_infold

# def run_Regression(self):
"""Wed Jun  9 00:30:03 PDT 2021
"""
# | - run_Regression
# #################################################
# NOTE: rebinds the notebook-level `df_data` and `Regression` names to the
# workflow's own training data and regression object.
df_data = self.df_data
Regression = self.Regression
# #################################################


# Run regression (train model)
Regression.run_regression(
    train_features=df_data.features,
    train_targets=df_data.targets,
    )
# __|

In [None]:
# Scratch cell: body of the regression object's `run_regression` unrolled at
# top level, fitting a scikit-learn SVR directly on the current `df_data`.
train_features=df_data.features
train_targets=df_data.targets

# def run_regression(self, train_features, train_targets):
"""
"""
# | - run_regression
# #################################################
# kernel_list = self.kernel_list
# regularization = self.regularization
# optimize_hyperparameters = self.optimize_hyperparameters
# scale_data = self.scale_data
# #################################################


from sklearn import svm


# RBF-kernel support vector regressor. Per the scikit-learn docs, `degree`
# and `coef0` are ignored by the 'rbf' kernel, and gamma='auto' uses
# 1 / n_features (whereas 'scale' also divides by the feature variance).
model_SVR = svm.SVR(
    kernel='rbf',
    degree=3,
    # gamma='scale',
    gamma='auto',
    coef0=0.0,
    tol=0.001,
    C=1.0,
    epsilon=0.1,
    shrinking=True,
    cache_size=200,
    verbose=False,
    max_iter=-1,
    )

# model_SVR.fit(train_features, train_targets["y"].to_numpy())

# model_SVR.fit(train_features, train_targets)

# Flatten the targets frame to a 1-D array before fitting.
model_SVR.fit(train_features, train_targets.values.ravel())


# | - __old__
# GP = GaussianProcess(
#     kernel_list=kernel_list,
#     regularization=regularization,
#     train_fp=train_features,
#     train_target=train_targets,
#     scale_data=False,
#     )
#
# if optimize_hyperparameters:
#     GP.optimize_hyperparameters(
#         global_opt=False,
#         algomin='L-BFGS-B',
#         eval_jac=False,
#         loss_function='lml',
#         # loss_function='rmse',
#         )
# __|

# NOTE(review): if the cells are run in order, `self` is still RW_infold
# here (not the regression object), so the fitted model is attached to the
# workflow object — confirm this is what the scratch code intends.
self.model = model_SVR
# __|

In [None]:
# Deliberate stop: raises AssertionError so execution does not continue
# into the cells below.
assert False

In [None]:
# Display the in-fold regression workflow object.
RW_infold

In [None]:
# Scratch cell: body of the workflow's `predict` unrolled at top level. Only
# the inputs are set up here; the prediction call itself is commented out.
self = RW_infold
df_features=df_test_final.features
df_targets=df_test_final.targets


# def predict(self, df_features, df_targets=None):
"""Wed Jun  9 10:11:15 PDT 2021
"""
# | - run_Regression
# #################################################
Regression = self.Regression
# #################################################


# df_predict = Regression.predict(df_features, df_targets=df_targets)

# self.df_predict = df_predict
# # __|

In [None]:
# Scratch cell: body of the regression object's `predict` unrolled at top
# level, with `self` rebound to the regression object from the cell above.
self = Regression
# Bare expression: no effect here (it is not the last line of the cell).
df_features
# Targets are disabled, so the "attach actual values" branch below is
# skipped in this cell.
df_targets=None

# def predict(self,
#     df_features,
#     df_targets=None,
#     ):
"""
"""
# | - predict
#################################################
predict_wrap = self.predict_wrap
#################################################


df_predict = predict_wrap(df_features)

# | - Attach actual target values if test_targets is given
if df_targets is not None:
    # NOTE: renames the column on the passed-in object in place.
    df_targets.columns = ["actual"]

    df_predict = pd.concat([df_predict, df_targets], axis=1)
    df_predict["error"] = df_predict.prediction - df_predict.actual

    # Reorder so prediction/actual/error come first, followed by any
    # remaining columns in their existing order.
    df_predict_cols = df_predict.columns.tolist()
    cols_to_keep_together = ["prediction", "actual", "error", ]
    for col_i in cols_to_keep_together:
        df_predict_cols.remove(col_i)

    new_cols = cols_to_keep_together + df_predict_cols
    df_predict = df_predict[new_cols]
# __|

# return(df_predict)
# __|

In [None]:
# Scratch cell: body of `predict_wrap` unrolled at top level. It relies on
# `self` and `df_features` as left behind by the previous cells.
# NOTE: `test_features` / `test_targets` are assigned but not used in this
# cell; the prediction below runs on `df_features`.
test_features=df_data.features
test_targets=df_data.targets

# def predict_wrap(self,
#     df_features,
#     # df_targets=None,
#     ):
"""
"""
# | - predict_wrap
# #################################################
model = self.model
# #################################################

# prediction = model.predict(
#     test_fp=df_features,
#     uncertainty=True,
#     )

# Features are passed positionally; the keyword form above belongs to the
# earlier (now commented-out) model interface.
prediction = model.predict(
    # test_fp=df_features,
    df_features,
    # uncertainty=True,
    )

# Construct dataframe of predictions
df_predict = pd.DataFrame()
df_predict["prediction"] = prediction
# df_predict["prediction"] = prediction["prediction"].flatten()
# df_predict["uncertainty"] = prediction["uncertainty"]
# df_predict["uncertainty_with_reg"] = prediction["uncertainty"]

# Re-label the rows with the index of the input features.
df_predict.index = df_features.index

# return(df_predict)
# __|

In [None]:
# Display the prediction dataframe built in the previous cell.
df_predict

In [None]:
# Deliberate stop: raises AssertionError so execution does not continue
# into the remaining (commented-out) scratch cells.
assert False

In [None]:
# SVR_R.run_regression()

In [None]:
# assert False

In [None]:
# predict_wrap?

In [None]:
# Regression

In [None]:
# # prediction = 

# model.predict(
#     # test_fp=df_features,
#     df_features,
#     # uncertainty=True,
#     )

In [None]:
# model.predict?

In [None]:
# prediction.shape

In [None]:
# Regression.model.