# Gaussian Process Regression on IrO3

# Import Modules

In [None]:
import os
import sys

import chart_studio.plotly as py
import plotly.graph_objs as go

import numpy as np
import pandas as pd
from catlearn.regression import GaussianProcess
import matplotlib.pyplot as plt

In [None]:
sys.path.insert(0, os.path.join(os.environ["PROJ_irox"], "data"))
from proj_data_irox import unique_ids_path

# Table of structure ids, read from the path provided by proj_data_irox.
df_id = pd.read_csv(unique_ids_path)

# For each stoichiometry, map the "id" column onto the "unique_ids" column.
# The row selection is done once per stoichiometry and reused for both
# columns.
rows_ab2 = df_id[df_id["stoich"] == "AB2"]
id_mapp_iro2 = dict(zip(rows_ab2["id"], rows_ab2["unique_ids"]))

rows_ab3 = df_id[df_id["stoich"] == "AB3"]
id_mapp_iro3 = dict(zip(rows_ab3["id"], rows_ab3["unique_ids"]))

# Script Inputs

In [None]:
# Stoichiometry selector for this run.
# NOTE(review): this value is not referenced by any other visible cell, and
# the id mapping applied to the test data uses the AB3 table (id_mapp_iro3)
# while this is set to "AB2" -- confirm which one is intended.
stoich_i = "AB2"

# Read Data

In [None]:
# Load the training features/targets and the candidate (test) features.
# The files are read with header=None, so columns are labelled 0..N-1.
train_x = pd.read_csv('./RAW/train_x.csv', sep=',', header=None)
train_y = pd.read_csv('./RAW/train_y.csv', sep=',', header=None)

test_data = pd.read_csv(
    './RAW/test_features.csv',
    sep=',',
    header=None,
    )

# Column 0 holds the legacy structure id: order the rows by it and give the
# column an explicit name so later cells can refer to it.
test_data = test_data.sort_values(0)
test_data = test_data.rename(columns={0: "id_old"})

# Discard the rows whose legacy id lies in 259..268 (inclusive).
# NOTE(review): presumably these are OQMD-sourced entries, judging by the
# variable name -- confirm.
oqmd_ids = list(range(259, 268 + 1))
mask = ~test_data["id_old"].isin(oqmd_ids)
test_data = test_data[mask]

# Removing duplicates: keep the first row found for every legacy id.
# drop_duplicates does this in a single pass (the previous per-id filtering
# loop rescanned the whole frame for every unique id), preserves the column
# dtypes, and keeps the columns intact when no rows remain.
# NOTE(review): the sort above uses pandas' default algorithm, which is not
# guaranteed to be stable, so which of several duplicate rows comes "first"
# may not follow file order -- confirm duplicates carry identical features.
# The trailing .copy() makes the result an independent frame so that adding
# columns to it later does not trigger a SettingWithCopyWarning.
test_data = test_data.drop_duplicates(subset="id_old", keep="first").copy()

In [None]:
def method(row_i, id_mapp):
    """Return the unique id mapped to a row's legacy id.

    Args:
        row_i: Row-like object (e.g. a pandas Series or dict) that can be
            indexed with "id_old".
        id_mapp: Mapping whose keys are integer legacy ids.

    Returns:
        The value stored in ``id_mapp`` under ``int(row_i["id_old"])``.

    Raises:
        KeyError: If ``id_mapp`` is a dict and has no entry for that id.
    """
    legacy_key = int(row_i["id_old"])
    return id_mapp[legacy_key]

# Attach the unique id of every row (looked up from its legacy "id_old") as
# a new "id" column, then use that column as the index of test_data.
# NOTE(review): the AB3 mapping (id_mapp_iro3) is hard-coded here regardless
# of the stoich_i script input -- confirm this is intended.
df_i = test_data
df_i["id"] = df_i.apply(lambda row: method(row, id_mapp_iro3), axis=1)
test_data = df_i.set_index("id")

# Gaussian Process

In [None]:
# Starting values for the model hyperparameters.
noise = 0.0042  # Regularisation parameter.
sigma_l = 6.3917  # Length scale parameter.
sigma_f = 0.5120  # Scaling parameter.
alpha = 0.3907  # Alpha parameter.

# Kernel description handed to CatLearn: a list holding one 'quadratic'
# kernel entry seeded with the starting values above.
kdict = [
    dict(
        type='quadratic',
        dimension='single',
        slope=sigma_l,
        scaling=sigma_f,
        degree=alpha,
    ),
]

# Build the Gaussian process on the training set; the raw (unscaled) data is
# used and hyperparameter optimization is requested at construction.
gp = GaussianProcess(
    train_fp=train_x,
    train_target=train_y,
    kernel_list=kdict,
    regularization=noise,
    optimize_hyperparameters=True,
    scale_data=False,
)

print('Optimized kernel:', gp.kernel_list)

# Optimize hyperparameters:
# NOTE(review): the constructor above is already given
# optimize_hyperparameters=True, so this explicit call presumably runs a
# second optimization pass after the kernel was printed -- confirm intent.
gp.optimize_hyperparameters(global_opt=False)

In [None]:
# Display the kernel list currently held by the Gaussian process (in cell
# order, this is its state after the explicit optimize_hyperparameters call).
gp.kernel_list

# Predictions

In [None]:
# Predict with the optimized model. Only the feature columns are passed in:
# the legacy id column is dropped first. Uncertainties are requested too.
pred = gp.predict(
    test_fp=test_data.drop(columns="id_old"),
    uncertainty=True,
)

# First column of the returned predictions, as a 1-D array.
prediction = np.array(pred['prediction'][:, 0])

# Uncertainty reported for each prediction.
uncertainty = np.array(pred['uncertainty'])

# Ascending-order views of the results: a running rank, the sorted predicted
# values, and the uncertainties rearranged to match that ordering.
sorted_x = np.arange(len(prediction))
sorted_y = np.sort(prediction)
sorted_unc = uncertainty[np.argsort(prediction)]

In [None]:
# Collect the model output in one table: predicted value, its uncertainty
# and the legacy id (as an integer) of the structure it belongs to.
model = pd.DataFrame()
model["prediction"] = prediction
model["uncertainty"] = uncertainty
model["id_old"] = test_data["id_old"].astype(int).to_numpy()

# Rows are in the same order as test_data, so reuse its index as row labels.
model.index = test_data.index

# Same table ordered by ascending predicted value.
model_sorted = model.sort_values(by="prediction")

# Plotting

In [None]:
# Scatter plot of the predictions in ascending order. Each marker carries
# its uncertainty as a vertical error bar and its legacy id as hover text.
trace = go.Scatter(
    y=model_sorted["prediction"],
    error_y=dict(
        type='data',
        array=model_sorted["uncertainty"],
        visible=True,
    ),
    text=model_sorted["id_old"],
    mode="markers",
)
data = [trace]

fig = go.Figure(data=data)
fig.show()

In [None]:
# train_x = (pd.read_csv('./RAW/train_x.csv', sep=',', header=None)).as_matrix()
# train_y = (pd.read_csv('./RAW/train_y.csv', sep=',', header=None)).as_matrix()

# test_data = (pd.read_csv('./RAW/test_features.csv', sep=',',
#              header=None)).drop([0], axis=1).as_matrix()

# train_x = (pd.read_csv('./RAW/train_x.csv', sep=',', header=None)).as_matrix()
# train_y = (pd.read_csv('./RAW/train_y.csv', sep=',', header=None)).as_matrix()

# test_data = (pd.read_csv('./RAW/test_features.csv', sep=',',
#              header=None)).drop([0], axis=1).as_matrix()

# # (pd.read_csv('./RAW/test_features.csv', sep=',',
# #              header=None)).drop([0], axis=1)

# # test_data.drop(["id_old"])

# test_data.drop("id_old", axis=1)

# # test_data.drop?

In [None]:
# 132, 4, 118, 72, 174
# NOTE(review): the ids above are presumably a record from an earlier run --
# confirm they are still meaningful.

# Legacy ids of the first six rows of model_sorted, i.e. the six entries with
# the lowest predicted values (model_sorted is ordered by ascending
# "prediction").
model_sorted[0:6]["id_old"].tolist()

In [None]:
# Summary statistics of the test feature columns (legacy id column excluded).
test_data.drop("id_old", axis=1).describe()

In [None]:
# Summary statistics of the training feature columns.
train_x.describe()