# New ML Active Learning Workflow
---

A baseline model that always predicts the mean (~ -6.05 eV/atom) has an MAE of ~0.3 eV/atom.

# Import Modules

In [None]:
%%capture
import os
import sys

import numpy as np
import pandas as pd

# #########################################################
# Python Utils
import itertools
import time

# #########################################################
# Project Imports

sys.path.insert(0, os.path.join(
    os.environ["PROJ_irox"],
    "workflow/ml_modelling/00_ml_workflow/190611_new_workflow/02_gaus_proc"))
from gp_methods import (
    gp_model_catlearn,
    gp_workflow,
    )

# Script Inputs

In [None]:
# Stoichiometry used to filter `df_dft` further down (alternative: "AB2").
stoich_i = "AB3"

# Alternative previously tried here: gp_model_gpflow.
gp_model = gp_model_catlearn

aqs_bin_size = 5

# Alternative previously tried here: "form_e_chris".
output_key = "energy_pa"

verbosity_level = 6  # 1-10 scale

# Each key maps to the list of candidate values for that GP parameter.
params_dict = dict(
    noise=[0.0001],
    sigma_l=[10.],
    sigma_f=[5],
    alpha=[0.1],
    )

# Cartesian product of the candidate values: one row per combination,
# one column per parameter name.
c = list(itertools.product(*params_dict.values()))
df_gp_params = pd.DataFrame(c, columns=list(params_dict))

# Reading Data

In [None]:
# Make the project's `ml_modelling` directory importable, then load the
# dataframes through its `get_ml_dataframes` helper (called with defaults).
ml_modelling_dir = os.path.join(
    os.environ["PROJ_irox"], "workflow/ml_modelling")
sys.path.insert(0, ml_modelling_dir)
from ml_methods import get_ml_dataframes

DF_dict = get_ml_dataframes()

# Pull out the individual tables used by the rest of the notebook.
df_dft = DF_dict["df_dft_final_final"]
df_feat_pre = DF_dict["df_features_pre_opt"]
df_feat_post = DF_dict["df_features_post_opt"]

df_ids = DF_dict["unique_ids"]

# Preprocessing Dataframes

In [None]:
# Keep only the DFT rows of the selected stoichiometry.
df_dft = df_dft[df_dft["stoich"] == stoich_i]

# Keep only the post-opt feature rows whose data/source entry is "raul",
# then discard the "data" columns.
# NOTE(review): presumably the feature frames use a column MultiIndex with
# top-level groups such as "data" and "voronoi" -- confirm.
is_raul = df_feat_post["data"]["source"] == "raul"
df_feat_post = df_feat_post[is_raul].drop(columns=["data"])

# Align both feature frames to the DFT index, keeping only the "voronoi"
# columns.
df_feat_post = df_feat_post.loc[df_dft.index]["voronoi"]
df_feat_pre = df_feat_pre.loc[df_dft.index]["voronoi"]

# Preparing CV Folds

In [None]:
# Number of equally sized cross-validation folds requested from `run_cv`.
n_fold_cv: int = 15
# Default number of PCA components.
pca_comp: int = 11

In [None]:
def _split_into_folds(ids, n_fold_cv):
    """Split `ids` into `n_fold_cv` equal folds plus an optional leftover fold.

    The first ``n_fold_cv * (len(ids) // n_fold_cv)`` ids are divided into
    `n_fold_cv` equally sized folds; any remaining ids form one extra,
    smaller fold.

    The folds are returned as a plain list of 1-D arrays. They are not
    stacked with ``np.array(...)`` because the leftover fold has a different
    length, and building a ragged array raises ``ValueError`` on
    NumPy >= 1.24.

    Raises:
        ValueError: If `n_fold_cv` is smaller than 1 or larger than the
            number of ids (which would give empty folds and a leftover fold
            holding every id).
    """
    ids = np.asarray(ids)

    if n_fold_cv < 1:
        raise ValueError("n_fold_cv must be at least 1")

    fold_size = len(ids) // n_fold_cv
    if fold_size < 1:
        raise ValueError(
            "n_fold_cv (%d) exceeds the number of rows (%d)"
            % (n_fold_cv, len(ids)))

    n_even = n_fold_cv * fold_size
    folds = list(np.split(ids[:n_even], n_fold_cv))

    ids_leftover = ids[n_even:]
    if ids_leftover.shape[0] > 0:
        folds.append(ids_leftover)

    return folds


def run_cv(
    df_dft=None,
    n_fold_cv=None,
    pca_comp=None,
    random_state=None,
    ):
    """Run Cross-validation runs.

    The rows of `df_dft` are shuffled and split into folds. For each fold,
    `gp_workflow` is called with that fold's rows removed from the training
    data and passed as the test features, and the mean absolute error between
    the "prediction_unstandardized" and "energy_pa" columns is recorded.

    Besides its arguments, this function reads the module-level names
    `df_gp_params` (only its first row is used), `df_feat_post`, `df_ids`,
    `gp_workflow` and `gp_model_catlearn`.

    Args:
        df_dft: DataFrame of DFT data; its index labels must exist in
            `df_feat_post`.
        n_fold_cv: Number of equally sized folds. Rows left over after the
            even split form one additional, smaller fold.
        pca_comp: Passed to `gp_workflow` as `pca_comp` (with
            ``pca_mode="num_comp"``).
        random_state: Forwarded to `DataFrame.sample` for the shuffle; leave
            as None for a different shuffle on every call.

    Returns:
        dict with the single key "df_cv": a DataFrame with one row per fold
        and a "mae" column.

    Raises:
        ValueError: If `n_fold_cv` is smaller than 1 or larger than the
            number of rows in `df_dft`.
    """
    out_dict = dict()

    # Shuffle the rows so that fold membership is random.
    df_dft = df_dft.sample(
        frac=1.,
        replace=False,
        random_state=random_state)

    folds = _split_into_folds(df_dft.index, n_fold_cv)

    data_dict_list = []
    for fold_i in folds:
        # Rebuilt on every iteration so each `gp_workflow` call receives its
        # own fresh parameter dict.
        gp_params_i = df_gp_params.iloc[0].to_dict()

        # Train on everything outside the current fold.
        df_train_dft = df_dft.drop(
            labels=fold_i,
            axis=0)

        df_train_feat = df_feat_post.loc[df_train_dft.index]
        df_test_feat = df_feat_post.loc[fold_i]

        # #####################################################
        # Running GP Model ####################################
        out = gp_workflow(
            df_features_post=df_train_feat, df_test=df_test_feat,
            df_bulk_dft=df_train_dft, df_bulk_dft_all=df_dft,

            df_ids=df_ids,
            gp_model=gp_model_catlearn,
            opt_hyperparameters=True,
            gp_params=gp_params_i,
            y_train_key="energy_pa",

            verbose=False,
            clean_variance_flag=True, clean_skewness_flag=True,
            clean_infinite_flag=True, standardize_data_flag=True,

            pca_comp=pca_comp,
            pca_mode="num_comp",
            )

        model_i = out["model"]

        # Score only the rows that actually received a prediction.
        # NOTE(review): presumably these are the held-out rows -- confirm
        # against `gp_workflow`.
        test_row_i = model_i[model_i["prediction"].notnull()]

        mae_i = abs(
            test_row_i["prediction_unstandardized"] - test_row_i["energy_pa"]
            ).mean()

        # #################################################
        data_dict_list.append({"mae": mae_i})

    # #####################################################
    out_dict["df_cv"] = pd.DataFrame(data_dict_list)

    return out_dict

In [None]:
# Sweep the number of PCA components and record the mean cross-validated MAE
# for each value. A failing value is reported and skipped, so one bad
# setting does not abort the whole sweep.
data_dict_list = []
for pca_comp_i in range(1, 35, 1):
    print("pca_comp_i:", pca_comp_i)
    data_dict_i = dict()

    try:
        data_dict_i["pca_comp"] = pca_comp_i

        out_dict = run_cv(
            df_dft=df_dft,
            n_fold_cv=n_fold_cv,
            pca_comp=pca_comp_i,
            )
        df_cv = out_dict["df_cv"]
        mae_cv = df_cv.mae.mean()
        data_dict_i["mae_cv"] = mae_cv

        data_dict_list.append(data_dict_i)
    except Exception as exc:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit can still stop this long-running loop; the error is
        # printed so the reason for the failure is not lost.
        print("Didn't work:", repr(exc))

    print("")

# Explicit columns keep `df.pca_comp` / `df.mae_cv` available even when every
# run failed and the list is empty.
df = pd.DataFrame(data_dict_list, columns=["pca_comp", "mae_cv"])

# Save Data

In [None]:
# Pickling data ###########################################
import os
import pickle

directory = "out_data"
# `exist_ok=True` replaces the separate exists() check, which could race with
# another process creating the directory between the check and the call.
os.makedirs(directory, exist_ok=True)
with open(os.path.join(directory, "data.pickle"), "wb") as fle:
    pickle.dump(df, fle)
# #########################################################

# Plotting

In [None]:
import plotly.graph_objs as go

# Mean cross-validated MAE (y) against the number of PCA components (x).
x_array, y_array = df["pca_comp"], df["mae_cv"]

trace = go.Scatter(x=x_array, y=y_array)
data = [trace]

fig = go.Figure(data=data)
fig.show()

In [None]:
# Deliberate stop: `assert False` always raises AssertionError (unless Python
# runs with -O), which halts a top-to-bottom run before the cells below.
# NOTE(review): presumably the cells below are scratch/debug copies of the
# logic in `run_cv` -- confirm.
assert False

In [None]:
# Stand-alone version of the fold construction used in `run_cv`, with 3 folds.
n_fold_cv = 3

fold_size = int(df_dft.shape[0] / n_fold_cv)
print("fold_size:", fold_size)

# Shuffling training data
df_dft = df_dft.sample(
    frac=1.,
    replace=False)

print("n_fold_cv * fold_size:", n_fold_cv * fold_size)

# The first n_fold_cv * fold_size ids split evenly; the rest are leftover.
ids_0 = df_dft.index[:n_fold_cv * fold_size]
ids_leftover = df_dft.index[n_fold_cv * fold_size:]

# Keep the folds as a plain list. The leftover fold is shorter than the
# others, and stacking unequal-length folds with np.array(...) raises
# ValueError on NumPy >= 1.24 (ragged array).
folds = list(np.split(ids_0, n_fold_cv))
if ids_leftover.shape[0] > 0:
    folds.append(ids_leftover)

In [None]:
# Stand-alone cross-validation loop over `folds`: one GP fit per fold, with
# the per-fold mean absolute error collected in `data_dict_list`.
data_dict_list = []
for i_cnt, fold_i in enumerate(folds):
    # Only the first row of the hyperparameter grid is used.
    row_i = df_gp_params.iloc[0]
    gp_params_i = row_i.to_dict()

    # Train on everything outside the current fold.
    df_train_dft = df_dft.drop(labels=fold_i, axis=0)
    df_train_feat = df_feat_post.loc[df_train_dft.index]
    df_test_feat = df_feat_post.loc[fold_i]

    # #####################################################
    # Running GP Model ####################################
    gp_kwargs = dict(
        # Data
        df_features_post=df_train_feat,
        df_test=df_test_feat,
        df_bulk_dft=df_train_dft,
        df_bulk_dft_all=df_dft,
        df_ids=df_ids,
        # Model
        gp_model=gp_model_catlearn,
        opt_hyperparameters=True,
        gp_params=gp_params_i,
        y_train_key="energy_pa",
        verbose=False,
        # Feature cleaning
        clean_variance_flag=True,
        clean_skewness_flag=True,
        clean_infinite_flag=True,
        standardize_data_flag=True,
        # PCA
        pca_comp=11,
        pca_perc=0.99,
        pca_mode="num_comp",
        )
    out = gp_workflow(**gp_kwargs)

    model_i, model_inst = out["model"], out["model_inst"]

    # Score only the rows that received a prediction.
    has_prediction = model_i["prediction"].notnull()
    test_row_i = model_i[has_prediction]

    abs_err = (
        test_row_i["prediction_unstandardized"] - test_row_i["energy_pa"]
        ).abs()
    mae_i = abs_err.mean()

    # #####################################################
    data_dict_i = {"mae": mae_i}
    data_dict_list.append(data_dict_i)