# <div style="text-align: center">Random Forests - KS Mapping</div> 

# Gaussian Potential-to-Energy

## Setup

In [38]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os, sys
import numpy as np

from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from KRR_reproduce import *

In [39]:
# setup
SIM_NO = 150       # number of simulations iterated over when loading data
SEED = 42          # random_state for the train/test split and the forest
TEST_SIZE = 0.1    # test_size handed to train_test_split
GRID_SPACE = 0.8   # grid_space handed to pot_rep

# params found from hyperparameter optimization (see RF_hyperparam.py)
# Compare with a tolerance instead of `==`: a GRID_SPACE produced by arithmetic
# may differ from the literal 0.8 by a rounding error and would otherwise fall
# silently into the fine-grid branch.
if abs(GRID_SPACE - 0.8) < 1e-9:
    N_ESTIMATORS = 1000    # more = better
    MAX_DEPTH = 20
else:
    N_ESTIMATORS = 200     # more is computationally infeasible for grid_space = 0.08
    MAX_DEPTH = 10

# path to data
# setdefault keeps a PROJDIR that is already set in the environment, so the
# notebook can be pointed at another checkout without editing this cell; the
# original author's location is only the fallback.
os.environ.setdefault('PROJDIR', '/Users/simonbatzner1/Desktop/Research/Research_Code/ML-electron-density')
# Trailing slash is required: the loading cell appends sub-paths by plain
# string concatenation.
STR_PREF = os.environ['PROJDIR'] + '/data/H2_DFT/temp_data/store/'

print("Number of estimators: {}".format(N_ESTIMATORS))
print("Maximum depth: {}".format(MAX_DEPTH))

Number of estimators: 1000
Maximum depth: 20


## Load data

In [40]:
# LOAD DATA
# One entry per simulation index, kept in index order so the three lists
# stay aligned with each other.
ens = []
seps = []
fours = []   # NOTE(review): not consumed by the cells visible here; kept as in the original

for n in range(SIM_NO):
    # load separation, energy, and density
    sep = np.load(STR_PREF + 'sep_store/sep' + str(n) + '.npy')
    # The energy record is indexed by key ('energy') below, so it is presumably
    # a pickled object (e.g. a dict) wrapped in an array. NumPy >= 1.16.3
    # refuses to load such files unless allow_pickle=True is passed.
    # Only load files you trust: unpickling can execute arbitrary code.
    en = np.load(STR_PREF + 'en_store/en' + str(n) + '.npy', allow_pickle=True)
    four = np.load(STR_PREF + 'four_store/four' + str(n) + '.npy')

    # put results in a nicer format
    sep = np.reshape(sep, (1,))[0]            # unwrap the single stored value
    en = np.reshape(en, (1,))[0]['energy']    # unwrap, then pick the 'energy' entry
    four = np.real(four)                      # keep only the real part

    # store quantities
    ens.append(en)
    seps.append(sep)
    fours.append(four)
    

## Build Gaussian potentials

In [41]:
# Side length handed to pot_rep.
# NOTE(review): 5.29177 looks like 10 Bohr radii in Angstrom - confirm units.
grid_len = 2 * 5.29177

# One flattened potential per simulation, in the same order as `seps`.
pots = [
    pot_rep(seps[n], grid_len, grid_space=GRID_SPACE).flatten()
    for n in range(SIM_NO)
]

## Set up training and test data

In [42]:
# Features are the flattened potentials, targets are the energies.
data, labels = pots, ens

# Split once with a fixed seed, then turn each of the four returned pieces
# (x_train, x_test, y_train, y_test) into a NumPy array.
x_train, x_test, y_train, y_test = map(
    np.array,
    train_test_split(data, labels, test_size=TEST_SIZE, random_state=SEED),
)

## Train Random Forest

In [43]:
# Forest configured from the constants chosen in the setup cell; the fixed
# random_state makes the fit repeatable.
estimator = RandomForestRegressor(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    random_state=SEED,
)
# Left as the last expression so the fitted estimator is displayed by the cell.
estimator.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

## Results

In [44]:
# Reference values for both splits.
y_true_train, y_true = y_train, y_test

# Model predictions: training split first, then the held-out split.
y_pred_train = estimator.predict(x_train)
y_pred = estimator.predict(x_test)

# Mean absolute error on each split.
mae_train = mean_absolute_error(y_true_train, y_pred_train)
mae_test = mean_absolute_error(y_true, y_pred)

# Report: hyperparameters, side-by-side predictions, then the two errors.
print("\nNumber of estimators:\t {}".format(N_ESTIMATORS))
print("Maximum depth:\t\t {}".format(MAX_DEPTH))
print("\nPrediction on test data:\n")
print("\n\tPred   |   True\n")
print(np.column_stack((y_pred, y_true)))   # column 0 = predicted, column 1 = true
print("\n\nMAE on training data: \t{}".format(mae_train))
print("MAE on test data: \t{}".format(mae_test))


Number of estimators:	 1000
Maximum depth:		 20

Prediction on test data:


	Pred   |   True

[[-31.13959469 -31.12002121]
 [-31.27238819 -31.35579816]
 [-29.74821619 -29.74172427]
 [-30.98198223 -30.97719376]
 [-31.05339684 -31.03509757]
 [-31.70891254 -31.71298488]
 [-31.36251182 -31.35711097]
 [-29.05914636 -29.06048683]
 [-31.26216004 -31.25555287]
 [-30.86624421 -30.85878473]
 [-29.99645033 -29.98936226]
 [-31.009631   -31.00192919]
 [-31.74402118 -31.74565502]
 [-30.72678891 -30.7632198 ]
 [-31.38780598 -31.40113248]]


MAE on training data: 	0.004606007421824827
MAE on test data: 	0.014908301232305377


# Separation-to-Energy

In [50]:
# setup
SIM_NO = 150       # number of simulations iterated over when loading data
SEED = 42          # random_state for the train/test split and the forest
TEST_SIZE = 0.1    # test_size handed to train_test_split

# params found from hyperparameter optimization (see RF_hyperparam_gaussian.py)
# NOTE(review): this section maps separation -> energy, yet the script named
# here is the "_gaussian" one - confirm the reference is not swapped with the
# Gaussian-potential section's.
N_ESTIMATORS = 500
MAX_DEPTH = 50

# path to data
# setdefault keeps a PROJDIR that is already set in the environment, so the
# notebook can be pointed at another checkout without editing this cell; the
# original author's location is only the fallback.
os.environ.setdefault('PROJDIR', '/Users/simonbatzner1/Desktop/Research/Research_Code/ML-electron-density')
# Trailing slash is required: the loading cell appends sub-paths by plain
# string concatenation.
STR_PREF = os.environ['PROJDIR'] + '/data/H2_DFT/temp_data/store/'

print("Number of estimators: {}".format(N_ESTIMATORS))
print("Maximum depth: {}".format(MAX_DEPTH))

Number of estimators: 500
Maximum depth: 50


## Load data

In [51]:
# LOAD DATA
# One entry per simulation index, kept in index order so the three lists
# stay aligned with each other.
ens = []
seps = []
fours = []   # NOTE(review): not consumed by the cells visible here; kept as in the original

for n in range(SIM_NO):
    # load separation, energy, and density
    sep = np.load(STR_PREF + 'sep_store/sep' + str(n) + '.npy')
    # The energy record is indexed by key ('energy') below, so it is presumably
    # a pickled object (e.g. a dict) wrapped in an array. NumPy >= 1.16.3
    # refuses to load such files unless allow_pickle=True is passed.
    # Only load files you trust: unpickling can execute arbitrary code.
    en = np.load(STR_PREF + 'en_store/en' + str(n) + '.npy', allow_pickle=True)
    four = np.load(STR_PREF + 'four_store/four' + str(n) + '.npy')

    # put results in a nicer format
    sep = np.reshape(sep, (1,))[0]            # unwrap the single stored value
    en = np.reshape(en, (1,))[0]['energy']    # unwrap, then pick the 'energy' entry
    four = np.real(four)                      # keep only the real part

    # store quantities
    ens.append(en)
    seps.append(sep)
    fours.append(four)

## Set up training and test data

In [52]:
# set up training and test data
# Features are the separations, targets are the energies.
data, labels = seps, ens

# Split once with a fixed seed, then turn each of the four returned pieces
# (x_train, x_test, y_train, y_test) into a NumPy array.
x_train, x_test, y_train, y_test = map(
    np.array,
    train_test_split(data, labels, test_size=TEST_SIZE, random_state=SEED),
)

# Each sample is a single number, so reshape the feature arrays into
# one-column 2-D arrays before handing them to the estimator.
x_train = x_train.reshape(-1, 1)
x_test = x_test.reshape(-1, 1)

## Train Random Forest

In [53]:
# train random forest
# Configured from the constants chosen in the setup cell; the fixed
# random_state makes the fit repeatable.
estimator = RandomForestRegressor(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    random_state=SEED,
)
# Left as the last expression so the fitted estimator is displayed by the cell.
estimator.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

## Results

In [54]:
# Reference values for both splits.
y_true_train, y_true = y_train, y_test

# Model predictions: training split first, then the held-out split.
y_pred_train = estimator.predict(x_train)
y_pred = estimator.predict(x_test)

# Mean absolute error on each split.
mae_train = mean_absolute_error(y_true_train, y_pred_train)
mae_test = mean_absolute_error(y_true, y_pred)

# Report: hyperparameters, side-by-side predictions, then the two errors.
print("\nNumber of estimators:\t {}".format(N_ESTIMATORS))
print("Maximum depth:\t\t {}".format(MAX_DEPTH))
print("\nPrediction on test data:\n")
print("\n\tPred   |   True\n")
print(np.column_stack((y_pred, y_true)))   # column 0 = predicted, column 1 = true
print("\n\nMAE on training data: \t{}".format(mae_train))
print("MAE on test data: \t{}".format(mae_test))


Number of estimators:	 500
Maximum depth:		 50

Prediction on test data:


	Pred   |   True

[[-31.13702872 -31.12002121]
 [-31.30242427 -31.35579816]
 [-29.75641263 -29.74172427]
 [-30.98343299 -30.97719376]
 [-31.05078668 -31.03509757]
 [-31.70595213 -31.71298488]
 [-31.36462983 -31.35711097]
 [-29.07510937 -29.06048683]
 [-31.26962127 -31.25555287]
 [-30.87384393 -30.85878473]
 [-30.00471222 -29.98936226]
 [-30.96375655 -31.00192919]
 [-31.74334771 -31.74565502]
 [-30.7346724  -30.7632198 ]
 [-31.44538361 -31.40113248]]


MAE on training data: 	0.007022504041392458
MAE on test data: 	0.019595220766003982
