# Train Gaussian Process ML Model
___

# Notebook Setup

## Import Modules

In [1]:
import os
import pickle
import numpy as np

from catlearn.regression import GaussianProcess

In [2]:
import pandas as pd

pd.__version__

'0.24.2'

## Script Inputs

In [3]:
file_i = os.path.join(
    os.environ["PROJ_irox"],
    "workflow/ml_modelling/00_ml_workflow/outdata",
    "02_data_featurized.pickle",
    )

## Load Data

In [4]:
with open(file_i, "rb") as fle:
    df_m = pickle.load(fle)

## Pre-process Data

Only train and predict on AB3 structures

In [5]:
df_m = df_m[df_m["default_columns"]["stoich"] == "AB3"]

# df_m[0:3]

In [6]:
df_m.head()

Unnamed: 0_level_0,default_columns,default_columns,default_columns,default_columns,default_columns,prototype_info,prototype_info,prototype_info,prototype_info,prototype_info,...,features_pca,features_pca,features_pca,features_pca,features_pca,features_pca,features_pca,features_pca,features_pca,features_pca
Unnamed: 0_level_1,atoms,formation_e,id,oqmd_id,stoich,name_i,parameter_values_i,spacegroup_i,species_i,wyckoff_i,...,10,11,12,13,14,15,16,17,18,19
697,"(Atom('Ir', [0.0, 0.0, 0.0], tag=0, index=0), ...",,0,,AB3,AB3_1_a_bc_191,"[{'name': 'a', 'value': 5.552499976395137}, {'...",191,"[Ir, O, O]","[a, b, c]",...,0.009133,-0.047166,0.036836,-0.007277,0.097195,-0.004311,0.045155,-0.014266,0.008509,-0.005023
698,"(Atom('O', [0.0, 0.0, 3.853165], tag=0, index=...",,100,,AB3,AB3_1_a_bh_123,"[{'name': 'a', 'value': 2.79322}, {'name': 'b/...",123,"[Ir, O, O]","[a, b, h]",...,-0.085653,0.050189,0.081931,-0.013496,-0.023988,0.039874,-0.025906,-0.006926,0.000122,-0.00353
699,"(Atom('O', [0.0, 0.0, 25.833282699999998], tag...",,101,,AB3,AB3_7_a3b4_a4b3c7_99,"[{'name': 'a', 'value': 3.72125}, {'name': 'b/...",99,"[Ir, Ir, Ir, Ir, Ir, Ir, Ir, O, O, O, O, O, O,...","[a, a, a, b, b, b, b, a, a, a, a, b, b, b, c, ...",...,0.028219,0.047584,0.050935,0.003089,0.027579,0.01128,-0.013778,0.003758,0.005796,-0.003372
700,"(Atom('O', [0.9539667806000002, 1.648481928199...",,102,,AB3,AB3_8_adgi2_l4_149,"[{'name': 'a', 'value': 5.821114940147411}, {'...",149,"[Ir, Ir, Ir, Ir, Ir, O, O, O, O]","[a, d, g, i, i, l, l, l, l]",...,-0.03859,0.029052,0.006647,0.009825,0.002921,-0.007218,-0.001823,0.000468,-0.003188,0.000672
701,"(Atom('Ir', [2.209615, 0.0, 4.641573129], tag=...",,103,,AB3,AB3_3_ac_a2b3c2_99,"[{'name': 'a', 'value': 4.41923}, {'name': 'b/...",99,"[Ir, Ir, O, O, O, O, O, O, O]","[a, c, a, a, b, b, b, c, c]",...,-0.048612,0.024397,0.045955,-0.006091,0.035545,0.020443,-0.002692,-0.003254,-0.009536,-0.009198


# Training Data

In [7]:
df_train = df_m[df_m["default_columns"]["formation_e"].notnull()]
train_x = df_train["features_pca"].values
train_y = df_train["default_columns"]["formation_e"]

In [8]:
# Define initial prediction parameters.
# noise = 0.0042  # Regularisation parameter.

# sigma_l = 2.3917  # Length scale parameter.
# sigma_f = 0.5120  # Scaling parameter.
# alpha = 0.8907  # Alpha parameter.

noise = 0.0042  # Regularisation parameter.
sigma_l = 6.3917  # Length scale parameter.
sigma_f = 0.5120  # Scaling parameter.
alpha = 0.3907  # Alpha parameter.


kdict = [
    {
        'type': 'quadratic',
        'dimension': 'single',
        # 'dimension': 'features',
        'slope': sigma_l,
        'scaling': sigma_f,
        'degree': alpha,
        }
    ]

gp = GaussianProcess(
    kernel_list=kdict, regularization=noise, train_fp=train_x,
    train_target=train_y, optimize_hyperparameters=False,
    scale_data=False)

print('Optimized kernel:', gp.kernel_list)

# Optimize hyperparameters:
gp.optimize_hyperparameters(global_opt=True)

Optimized kernel: [{'type': 'quadratic', 'dimension': 'single', 'slope': array([6.3917]), 'scaling': 0.512, 'degree': array([0.3907])}]


# Predictions

Do the optimized predictions.

In [9]:
pred = gp.predict(
    test_fp=df_m["features_pca"].values,
    # test_fp=test_data[:249],
    uncertainty=True,
    )

# prediction = np.array(pred['prediction'][:, 0])
prediction = pred["prediction"]

# Calculate the uncertainty of the predictions.
uncertainty = np.array(pred['uncertainty'])

# Sort predictions (ascending):
sorted_x = np.arange(0, len(prediction))
sorted_y = prediction[np.argsort(prediction)]
sorted_unc = uncertainty[np.argsort(prediction)]

  uncertainty = np.sqrt(scale - var)


# Plotting

In [10]:
import plotly.plotly as py
import plotly.graph_objs as go

import os

x_array = [0, 1, 2, 3]
y_array = [0, 1, 2, 3]

trace = go.Scatter(
    x=sorted_x,
    y=sorted_y,

    error_y={
        "type": 'data',
        "array": sorted_unc,
        "visible": True,
        },

    mode="markers",
    )

data = [trace]

py.iplot(data,
    filename=os.path.join(
        "__temp__",
        "temp_plot_0"
        )
    )