# <div style="text-align: center">Random Forests - KS Mapping</div> 

# Gaussian Potential-to-Energy

## Setup

In [46]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os, sys
import numpy as np

from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from KRR_reproduce import *

In [47]:
# setup: run configuration for the Gaussian-potential -> energy model
SIM_NO = 150       # number of stored simulations read by the loading loop
SEED = 42          # random_state for both the train/test split and the forest
TEST_SIZE = 0.1    # test_size passed to train_test_split
GRID_SPACE = 0.8   # grid_space passed to pot_rep

# random forest hyperparameters
N_ESTIMATORS = 1000
MAX_DEPTH = 20

# path to data
# Keep a PROJDIR that is already exported in the environment; the original
# author's local checkout is only a fallback, so the notebook can be pointed
# at another machine's data without editing this cell.
os.environ.setdefault('PROJDIR', '/Users/simonbatzner1/Desktop/Research/Research_Code/ML-electron-density')
STR_PREF = os.environ['PROJDIR'] + '/data/H2_DFT/temp_data/store/'

print("Number of estimators: {}".format(N_ESTIMATORS))
print("Maximum depth: {}".format(MAX_DEPTH))

Number of estimators: 1000
Maximum depth: 20


## Load data

In [36]:
# LOAD DATA
# Read the stored separation, energy and 'four' arrays of every simulation
# (indices 0 .. SIM_NO-1) from the sub-folders of STR_PREF.
ens = []
seps = []
fours = []

for n in range(SIM_NO):
    # load separation, energy, and density
    sep = np.load(STR_PREF + 'sep_store/sep' + str(n) + '.npy')
    # allow_pickle=True: NumPy >= 1.16.3 refuses to load object arrays by
    # default. The energy record is indexed with the key 'energy' below, so it
    # is presumably a pickled dict-like object -- TODO confirm.
    # Unpickling can execute arbitrary code: only load files you trust.
    en = np.load(STR_PREF + 'en_store/en' + str(n) + '.npy', allow_pickle=True)
    four = np.load(STR_PREF + 'four_store/four' + str(n) + '.npy')

    # put results in a nicer format
    sep = np.reshape(sep, (1,))[0]          # one-element array -> its single entry
    en = np.reshape(en, (1,))[0]['energy']  # keep only the 'energy' entry
    four = np.real(four)                    # keep only the real part

    # store quantities
    ens.append(en)
    seps.append(sep)
    fours.append(four)
    

## Build Gaussian potentials

In [37]:
# Build one flattened potential per simulation from its stored separation.
grid_len = 2 * 5.29177
pots = []

for n in range(SIM_NO):
    dist = seps[n]
    # pot_rep evaluates the potential for this separation; flatten() turns the
    # result into a 1-D feature vector
    pot = pot_rep(dist, grid_len, grid_space=GRID_SPACE).flatten()
    pots.append(pot)

## Set up training and test data

In [38]:
# Features are the flattened potentials, targets are the energies.
data = pots
labels = ens

# Seeded train/test split; each of the four returned partitions is then
# converted to a numpy array.
x_train, x_test, y_train, y_test = map(
    np.array,
    train_test_split(data, labels, test_size=TEST_SIZE, random_state=SEED)
)

## Train Random Forest

In [39]:
# Seeded random forest with the hyperparameters from the setup cell.
estimator = RandomForestRegressor(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    random_state=SEED
)
# fit() stays the last expression so the cell still displays its return value
estimator.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

## Results

In [40]:
# Evaluate the fitted forest on both partitions and print the predictions
# next to the reference energies, followed by the mean absolute errors.

# eval on training data
y_true_train, y_pred_train = y_train, estimator.predict(x_train)

# eval on test data
y_true, y_pred = y_test, estimator.predict(x_test)

# print results
print("\nNumber of estimators:\t {}".format(N_ESTIMATORS))
print("Maximum depth:\t\t {}".format(MAX_DEPTH))
print("\nPrediction on test data:\n")
print("\n\tPred   |   True\n")
print(np.c_[y_pred, y_true])  # column 0: prediction, column 1: reference
# The reported metric is mean_absolute_error, so it is labelled MAE (not MSE).
print("\n\nMAE on training data: \t{}".format(mean_absolute_error(y_true_train, y_pred_train)))
print("MAE on test data: \t{}".format(mean_absolute_error(y_true, y_pred)))


Number of estimators:	 1000
Maximum depth:		 20

Prediction on test data:


	Pred   |   True

[[-31.10484257 -31.12002121]
 [-31.25849535 -31.35579816]
 [-29.81542525 -29.74172427]
 [-31.07269758 -30.97719376]
 [-31.08837153 -31.03509757]
 [-31.28729479 -31.71298488]
 [-31.1468641  -31.35711097]
 [-29.15884242 -29.06048683]
 [-31.12472963 -31.25555287]
 [-30.82628468 -30.85878473]
 [-30.17013048 -29.98936226]
 [-31.17554309 -31.00192919]
 [-31.28594489 -31.74565502]
 [-31.17251618 -30.7632198 ]
 [-31.28757504 -31.40113248]
 [-31.20796875 -31.53602468]
 [-30.22606755 -30.17712973]
 [-31.12472963 -31.22918843]
 [-31.20796875 -31.55562192]
 [-29.69338739 -29.32045729]
 [-31.28179395 -31.68621403]
 [-29.73082057 -29.46878253]
 [-31.2850592  -31.62907309]
 [-29.73020837 -29.43887626]
 [-29.70024819 -29.34985962]
 [-29.15884242 -28.94799546]
 [-30.17458986 -30.05179323]
 [-29.15884242 -29.00395066]
 [-31.25254909 -31.70583223]
 [-31.28179395 -31.70064478]
 [-31.2850592  -31.51679575]
 [-31.

# Separation-to-Energy

In [41]:
# setup: run configuration for the separation -> energy model
SIM_NO = 150       # number of stored simulations read by the loading loop
SEED = 42          # random_state for both the train/test split and the forest
# test_size passed to train_test_split; defined here so this section does not
# depend on the Gaussian-potential setup cell having been run first
TEST_SIZE = 0.1
# NOTE(review): no visible cell in this section reads GRID_SPACE, and the value
# differs from the 0.8 used in the Gaussian-potential section -- confirm.
GRID_SPACE = 0.08

# params found from hyperparameter optimization
N_ESTIMATORS = 1000
MAX_DEPTH = 20

# path to data
# Keep a PROJDIR that is already exported in the environment; the original
# author's local checkout is only a fallback, so the notebook can be pointed
# at another machine's data without editing this cell.
os.environ.setdefault('PROJDIR', '/Users/simonbatzner1/Desktop/Research/Research_Code/ML-electron-density')
STR_PREF = os.environ['PROJDIR'] + '/data/H2_DFT/temp_data/store/'

print("Number of estimators: {}".format(N_ESTIMATORS))
print("Maximum depth: {}".format(MAX_DEPTH))

Number of estimators: 1000
Maximum depth: 20


## Load data

In [42]:
# LOAD DATA
# Read the stored separation, energy and 'four' arrays of every simulation
# (indices 0 .. SIM_NO-1) from the sub-folders of STR_PREF.
ens = []
seps = []
fours = []

for n in range(SIM_NO):
    # load separation, energy, and density
    sep = np.load(STR_PREF + 'sep_store/sep' + str(n) + '.npy')
    # allow_pickle=True: NumPy >= 1.16.3 refuses to load object arrays by
    # default. The energy record is indexed with the key 'energy' below, so it
    # is presumably a pickled dict-like object -- TODO confirm.
    # Unpickling can execute arbitrary code: only load files you trust.
    en = np.load(STR_PREF + 'en_store/en' + str(n) + '.npy', allow_pickle=True)
    four = np.load(STR_PREF + 'four_store/four' + str(n) + '.npy')

    # put results in a nicer format
    sep = np.reshape(sep, (1,))[0]          # one-element array -> its single entry
    en = np.reshape(en, (1,))[0]['energy']  # keep only the 'energy' entry
    four = np.real(four)                    # keep only the real part

    # store quantities
    ens.append(en)
    seps.append(sep)
    fours.append(four)

## Set up training and test data

In [43]:
# Features are the separations, targets are the energies.
data = seps
labels = ens
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=TEST_SIZE, random_state=SEED
)

# Each sample is a single separation value, so the features are reshaped
# into one column; the targets stay 1-D.
x_train = np.array(x_train).reshape(-1, 1)
x_test = np.array(x_test).reshape(-1, 1)
y_train = np.array(y_train)
y_test = np.array(y_test)

## Train Random Forest

In [44]:
# Seeded random forest with the hyperparameters from the setup cell.
estimator = RandomForestRegressor(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    random_state=SEED
)
# fit() stays the last expression so the cell still displays its return value
estimator.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

## Results

In [45]:
# Predict on both partitions; the *_true names alias the reference energies.
y_true_train = y_train
y_pred_train = estimator.predict(x_train)
y_true = y_test
y_pred = estimator.predict(x_test)

# Report the hyperparameters, then predictions beside the references
# (column 0: prediction, column 1: reference).
print("\nNumber of estimators:\t {}".format(N_ESTIMATORS))
print("Maximum depth:\t\t {}".format(MAX_DEPTH))
print("\nPrediction on test data:\n")
print("\n\tPred   |   True\n")
print(np.c_[y_pred, y_true])

# Mean absolute error, training partition first, then test partition.
for template, truth, pred in (
    ("\n\nMAE on training data: \t{}", y_true_train, y_pred_train),
    ("MAE on test data: \t{}", y_true, y_pred),
):
    print(template.format(mean_absolute_error(truth, pred)))


Number of estimators:	 1000
Maximum depth:		 20

Prediction on test data:


	Pred   |   True

[[-31.06620264 -31.12002121]
 [-31.33341422 -31.35579816]
 [-29.83266435 -29.74172427]
 [-31.06620264 -30.97719376]
 [-31.06620264 -31.03509757]
 [-31.33341422 -31.71298488]
 [-31.09757261 -31.35711097]
 [-29.15376972 -29.06048683]
 [-31.09757261 -31.25555287]
 [-30.79490398 -30.85878473]
 [-30.12323566 -29.98936226]
 [-31.20821852 -31.00192919]
 [-31.33341422 -31.74565502]
 [-31.20821852 -30.7632198 ]
 [-31.33341422 -31.40113248]
 [-31.10022944 -31.53602468]
 [-30.2469448  -30.17712973]
 [-31.09757261 -31.22918843]
 [-31.11269032 -31.55562192]
 [-29.49332342 -29.32045729]
 [-31.33341422 -31.68621403]
 [-29.71421788 -29.46878253]
 [-31.33341422 -31.62907309]
 [-29.49332342 -29.43887626]
 [-29.49332342 -29.34985962]
 [-29.15376972 -28.94799546]
 [-30.12323566 -30.05179323]
 [-29.15376972 -29.00395066]
 [-31.33709913 -31.70583223]
 [-31.33341422 -31.70064478]
 [-31.33341422 -31.51679575]
 [-31.