# <div style="text-align: center">Random Forests - KS Mapping</div> 

# Gaussian Potential-to-Energy

## Setup

In [5]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os, sys
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from KRR_reproduce import *

In [6]:
# setup: run configuration for the Gaussian-potential -> energy experiment
SIM_NO = 150           # number of stored simulations to load (file indices 0 .. SIM_NO-1)
SEED = 42              # random_state shared by the train/test split and the forest
TEST_SIZE = 0.1        # handed to train_test_split as test_size
GRID_SPACE = 0.08      # handed to pot_rep as grid_space

N_ESTIMATORS = 200     # more is computationally infeasible for grid_space = 0.08
MAX_DEPTH = 10

# path to data
# A PROJDIR that is already set in the environment takes precedence, so the
# notebook can run on another machine without editing this cell; the original
# author's checkout is only the fallback.
os.environ.setdefault('PROJDIR', '/Users/simonbatzner1/Desktop/Research/Research_Code/ML-electron-density')
# Keep the trailing slash: the loading cell builds file names by plain string concatenation.
STR_PREF = os.environ['PROJDIR'] + '/data/H2_DFT/temp_data/store/'

print("Number of estimators: {}".format(N_ESTIMATORS))
print("Maximum depth: {}".format(MAX_DEPTH))

Number of estimators: 200
Maximum depth: 10


## Load data

In [7]:
# LOAD DATA
# Read the stored separation, energy and density for every simulation index
# 0 .. SIM_NO-1 into three parallel lists (same index == same simulation).
ens = []
seps = []
fours = []

for n in range(SIM_NO):
    # load separation, energy, and density
    sep = np.load(STR_PREF + 'sep_store/sep' + str(n) + '.npy')
    # NOTE(review): the ['energy'] lookup below suggests each en file holds a
    # pickled record (presumably a dict -- confirm against the code that wrote
    # it). NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True.
    # Unpickling can execute code, so only load files you trust.
    en = np.load(STR_PREF + 'en_store/en' + str(n) + '.npy', allow_pickle=True)
    four = np.load(STR_PREF + 'four_store/four' + str(n) + '.npy')

    # put results in a nicer format
    sep = np.reshape(sep, (1,))[0]            # unwrap the single stored value
    en = np.reshape(en, (1,))[0]['energy']    # unwrap, then keep only the 'energy' entry
    four = np.real(four)                      # discard the imaginary part

    # store quantities
    ens.append(en)
    seps.append(sep)
    fours.append(four)
    

## Build Gaussian potentials

In [8]:
# Length handed to pot_rep as its second positional argument.
grid_len = 2 * 5.29177

# One flattened potential per simulation, in the same order as seps.
pots = [
    pot_rep(seps[idx], grid_len, grid_space=GRID_SPACE).flatten()
    for idx in range(SIM_NO)
]

## Set up training and test data

In [9]:
# Features are the flattened potentials, targets are the energies.
data, labels = pots, ens

# Seeded split so the same simulations land in the test set on every run.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=TEST_SIZE,
    random_state=SEED,
)

# train_test_split hands back lists here; turn each part into a numpy array
x_train, x_test = np.array(x_train), np.array(x_test)
y_train, y_test = np.array(y_train), np.array(y_test)

## Train Random Forest

In [10]:
# Fit the forest on the flattened potentials.
# The captured run of this cell ended in a KeyboardInterrupt, i.e. the fit is
# slow on these inputs. n_jobs=-1 lets scikit-learn build the trees on all
# available cores; with random_state fixed this should not change the fitted model.
estimator = RandomForestRegressor(
    random_state=SEED,
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    n_jobs=-1,
)
estimator.fit(x_train, y_train)

KeyboardInterrupt: 

## Results

In [None]:
# Evaluate the fitted forest on both splits and report the mean squared error.
# mean_squared_error comes from sklearn.metrics and must be imported in the
# notebook's import cell.

# eval on training data
y_true_train, y_pred_train = y_train, estimator.predict(x_train)

# eval on test data
y_true, y_pred = y_test, estimator.predict(x_test)

# Compute both metrics before printing so a failure cannot leave a half-printed report.
mse_train = mean_squared_error(y_true_train, y_pred_train)
mse_test = mean_squared_error(y_true, y_pred)

# print results
print("\nNumber of estimators:\t {}".format(N_ESTIMATORS))
print("Maximum depth:\t\t {}".format(MAX_DEPTH))
print("\nPrediction on test data:\n")
print("\n\tPred   |   True\n")
print(np.c_[y_pred, y_true])    # column 0: prediction, column 1: ground truth
print("\n\nMSE on training data: \t{}".format(mse_train))
print("MSE on test data: \t{}".format(mse_test))

# Separation-to-Energy

In [None]:
# setup: run configuration for the separation -> energy experiment
SIM_NO = 150           # number of stored simulations to load (file indices 0 .. SIM_NO-1)
SEED = 42              # random_state shared by the train/test split and the forest
TEST_SIZE = 0.1        # handed to train_test_split as test_size
GRID_SPACE = 0.08

# params found from hyperparameter optimization
N_ESTIMATORS = 500
MAX_DEPTH = 50

# path to data
# A PROJDIR that is already set in the environment takes precedence, so the
# notebook can run on another machine without editing this cell; the original
# author's checkout is only the fallback.
os.environ.setdefault('PROJDIR', '/Users/simonbatzner1/Desktop/Research/Research_Code/ML-electron-density')
# Keep the trailing slash: the loading cell builds file names by plain string concatenation.
STR_PREF = os.environ['PROJDIR'] + '/data/H2_DFT/temp_data/store/'

print("Number of estimators: {}".format(N_ESTIMATORS))
print("Maximum depth: {}".format(MAX_DEPTH))

## Load data

In [None]:
# LOAD DATA
# Read the stored separation, energy and density for every simulation index
# 0 .. SIM_NO-1 into three parallel lists (same index == same simulation).
ens = []
seps = []
fours = []

for n in range(SIM_NO):
    # load separation, energy, and density
    sep = np.load(STR_PREF + 'sep_store/sep' + str(n) + '.npy')
    # NOTE(review): the ['energy'] lookup below suggests each en file holds a
    # pickled record (presumably a dict -- confirm against the code that wrote
    # it). NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True.
    # Unpickling can execute code, so only load files you trust.
    en = np.load(STR_PREF + 'en_store/en' + str(n) + '.npy', allow_pickle=True)
    four = np.load(STR_PREF + 'four_store/four' + str(n) + '.npy')

    # put results in a nicer format
    sep = np.reshape(sep, (1,))[0]            # unwrap the single stored value
    en = np.reshape(en, (1,))[0]['energy']    # unwrap, then keep only the 'energy' entry
    four = np.real(four)                      # discard the imaginary part

    # store quantities
    ens.append(en)
    seps.append(sep)
    fours.append(four)

## Set up training and test data

In [None]:
# Build the training and test sets: separations are the features, energies the targets.
data, labels = seps, ens
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=TEST_SIZE,
    random_state=SEED,
)

# Each sample is a single separation value, so the feature arrays are
# reshaped into one column (n_samples, 1) before being passed to the estimator.
x_train = np.array(x_train).reshape(-1, 1)
x_test = np.array(x_test).reshape(-1, 1)
y_train, y_test = np.array(y_train), np.array(y_test)

## Train Random Forest

In [None]:
# Fit a random forest on the separation feature using the configured hyperparameters.
forest_kwargs = dict(
    random_state=SEED,
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
)
estimator = RandomForestRegressor(**forest_kwargs)
estimator.fit(x_train, y_train)

## Results

In [None]:
# Evaluate the fitted forest on both splits and report the mean squared error.
# mean_squared_error comes from sklearn.metrics and must be imported in the
# notebook's import cell.

# eval on training data
y_true_train, y_pred_train = y_train, estimator.predict(x_train)

# eval on test data
y_true, y_pred = y_test, estimator.predict(x_test)

# Compute both metrics before printing so a failure cannot leave a half-printed report.
mse_train = mean_squared_error(y_true_train, y_pred_train)
mse_test = mean_squared_error(y_true, y_pred)

# print results
print("\nNumber of estimators:\t {}".format(N_ESTIMATORS))
print("Maximum depth:\t\t {}".format(MAX_DEPTH))
print("\nPrediction on test data:\n")
print("\n\tPred   |   True\n")
print(np.c_[y_pred, y_true])    # column 0: prediction, column 1: ground truth
print("\n\nMSE on training data: \t{}".format(mse_train))
print("MSE on test data: \t{}".format(mse_test))