# <div style="text-align: center">Random Forests - KS Mapping</div> 

# Gaussian Potential-to-Energy

## Setup

In [6]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os, sys
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from KRR_reproduce import *

In [7]:
# setup
# Run configuration for the Gaussian-potential -> energy model.
SIM_NO = 150         # number of stored simulations loaded (files are indexed 0..SIM_NO-1)
SEED = 42            # random_state passed to train_test_split
TEST_SIZE = 0.1      # fraction of the samples held out for testing
GRID_SPACE = 0.08    # grid spacing handed to pot_rep when building the potentials

# params found from hyperparameter optimization (see RF_hyperparam.py)
# np.isclose rather than `==` so that a computed spacing (e.g. 8 * 0.1) still
# selects the coarse-grid branch instead of silently falling through.
if np.isclose(GRID_SPACE, 0.8):
    N_ESTIMATORS = 1000    # more = better
    MAX_DEPTH = 20
else:
    N_ESTIMATORS = 200     # more is computationally infeasible for grid_space = 0.08
    MAX_DEPTH = 10

# path to data
# Respect a PROJDIR that is already set in the environment; only fall back to the
# original author's local checkout when it is not set.
os.environ.setdefault('PROJDIR', '/Users/simonbatzner1/Desktop/Research/Research_Code/ML-electron-density')
# STR_PREF keeps its trailing slash: the loading code appends sub-paths by string concatenation.
STR_PREF = os.environ['PROJDIR'] + '/data/H2_DFT/temp_data/store/'

print("Number of estimators: {}".format(N_ESTIMATORS))
print("Maximum depth: {}".format(MAX_DEPTH))

Number of estimators: 200
Maximum depth: 10


## Load data

In [8]:
# LOAD DATA
# One entry per simulation, appended in index order 0..SIM_NO-1.
ens, seps, fours = [], [], []

for n in range(SIM_NO):
    # every store shares the same "<index>.npy" file-name tail
    tail = str(n) + '.npy'

    # load separation, energy, and density
    sep_raw = np.load(STR_PREF + 'sep_store/sep' + tail)
    en_raw = np.load(STR_PREF + 'en_store/en' + tail)
    four_raw = np.load(STR_PREF + 'four_store/four' + tail)

    # unwrap the single stored element of the separation and energy files;
    # the energy element is additionally indexed by its 'energy' key
    sep = np.reshape(sep_raw, (1,))[0]
    en = np.reshape(en_raw, (1,))[0]['energy']
    # keep only the real part of the 'four' array
    four = np.real(four_raw)

    # store quantities
    seps.append(sep)
    ens.append(en)
    fours.append(four)
    

## Build Gaussian potentials

In [9]:
# Second positional argument passed to pot_rep for every sample.
# NOTE(review): 5.29177 looks like 10 Bohr radii in Angstrom -- confirm the intended units.
grid_len = 2 * 5.29177

# One flattened potential per simulation, in the same order as `seps`.
pots = []
for n in range(SIM_NO):
    dist = seps[n]
    # build the potential for this separation and flatten it into a feature vector
    pot = pot_rep(dist, grid_len, grid_space=GRID_SPACE).flatten()
    pots.append(pot)

## Set up training and test data

In [10]:
# features are the flattened potentials, targets are the energies
data, labels = pots, ens

# hold out TEST_SIZE of the samples; SEED makes the split repeatable
split = train_test_split(data, labels, test_size=TEST_SIZE, random_state=SEED)

# convert to np arrays (train_test_split returns x_train, x_test, y_train, y_test in that order)
x_train, x_test, y_train, y_test = (np.array(part) for part in split)

## Train Random Forest

In [11]:
# Forest size and depth come from the setup cell; random_state=0 makes the fit repeatable.
estimator = RandomForestRegressor(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    random_state=0,
)
# fit() stays the last expression so the fitted estimator's repr is the cell output
estimator.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

## Results

In [12]:
# predictions on the samples the forest was fitted on
y_true_train = y_train
y_pred_train = estimator.predict(x_train)

# predictions on the held-out samples
y_true = y_test
y_pred = estimator.predict(x_test)

# quantities shown in the report below
mse_train = mean_squared_error(y_true_train, y_pred_train)
mse_test = mean_squared_error(y_true, y_pred)
pred_vs_true = np.c_[y_pred, y_true]   # two columns: prediction, ground truth

# print results
print("\nNumber of estimators:\t {}".format(N_ESTIMATORS))
print("Maximum depth:\t\t {}".format(MAX_DEPTH))
print("\nPrediction on test data:\n")
print("\n\tPred   |   True\n")
print(pred_vs_true)
print("\n\nMSE on training data: \t{}".format(mse_train))
print("MSE on test data: \t{}".format(mse_test))


Number of estimators:	 200
Maximum depth:		 10

Prediction on test data:


	Pred   |   True

[[-31.11718574 -31.12002121]
 [-31.34169204 -31.35579816]
 [-29.74511183 -29.74172427]
 [-30.97446798 -30.97719376]
 [-31.04191826 -31.03509757]
 [-31.70184994 -31.71298488]
 [-31.36278651 -31.35711097]
 [-29.06394419 -29.06048683]
 [-31.23463319 -31.25555287]
 [-30.87968543 -30.85878473]
 [-29.98734904 -29.98936226]
 [-30.98686661 -31.00192919]
 [-31.74296216 -31.74565502]
 [-30.7605959  -30.7632198 ]
 [-31.42646553 -31.40113248]]


MSE on training data: 	0.00017295610044738293
MSE on test data: 	0.0001467938428517112


# Separation-to-Energy

In [21]:
# setup
# Run configuration for the separation -> energy model.
SIM_NO = 150         # number of stored simulations loaded (files are indexed 0..SIM_NO-1)
SEED = 42            # random_state passed to train_test_split
TEST_SIZE = 0.1      # fraction of the samples held out for testing
GRID_SPACE = 0.08    # not referenced by the separation-to-energy cells below

# params found from hyperparameter optimization (see RF_hyperparam_gaussian.py)
# NOTE(review): the Gaussian section cites RF_hyperparam.py while this section cites
# RF_hyperparam_gaussian.py -- the two script names look swapped; confirm.
N_ESTIMATORS = 500
MAX_DEPTH = 50

# path to data
# Respect a PROJDIR that is already set in the environment; only fall back to the
# original author's local checkout when it is not set.
os.environ.setdefault('PROJDIR', '/Users/simonbatzner1/Desktop/Research/Research_Code/ML-electron-density')
# STR_PREF keeps its trailing slash: the loading code appends sub-paths by string concatenation.
STR_PREF = os.environ['PROJDIR'] + '/data/H2_DFT/temp_data/store/'

print("Number of estimators: {}".format(N_ESTIMATORS))
print("Maximum depth: {}".format(MAX_DEPTH))

Number of estimators: 500
Maximum depth: 50


## Load data

In [22]:
# LOAD DATA
# Fresh accumulators for this section: one entry per simulation, in index order.
ens, seps, fours = [], [], []

for n in range(SIM_NO):
    # every store shares the same "<index>.npy" file-name tail
    tail = str(n) + '.npy'

    # load separation, energy, and density
    sep_raw = np.load(STR_PREF + 'sep_store/sep' + tail)
    en_raw = np.load(STR_PREF + 'en_store/en' + tail)
    four_raw = np.load(STR_PREF + 'four_store/four' + tail)

    # unwrap the single stored element of the separation and energy files;
    # the energy element is additionally indexed by its 'energy' key
    sep = np.reshape(sep_raw, (1,))[0]
    en = np.reshape(en_raw, (1,))[0]['energy']
    # keep only the real part of the 'four' array
    four = np.real(four_raw)

    # store quantities
    seps.append(sep)
    ens.append(en)
    fours.append(four)

## Set up training and test data

In [23]:
# set up training and test data: the separation is the only feature, the energy is the target
data, labels = seps, ens

# hold out TEST_SIZE of the samples; SEED makes the split repeatable
split = train_test_split(data, labels, test_size=TEST_SIZE, random_state=SEED)

# features become single-column 2-D arrays, which is the layout fit()/predict() are given below
x_train = np.array(split[0]).reshape(-1, 1)
x_test = np.array(split[1]).reshape(-1, 1)

# targets stay as plain arrays
y_train = np.array(split[2])
y_test = np.array(split[3])

## Train Random Forest

In [24]:
# train random forest
# Forest size and depth come from this section's setup cell; random_state=0 makes the fit repeatable.
rf_params = dict(n_estimators=N_ESTIMATORS, max_depth=MAX_DEPTH, random_state=0)
estimator = RandomForestRegressor(**rf_params)
# fit() stays the last expression so the fitted estimator's repr is the cell output
estimator.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

## Results

In [25]:
# predictions on the samples the forest was fitted on
y_true_train = y_train
y_pred_train = estimator.predict(x_train)

# predictions on the held-out samples
y_true = y_test
y_pred = estimator.predict(x_test)

# quantities shown in the report below
mse_train = mean_squared_error(y_true_train, y_pred_train)
mse_test = mean_squared_error(y_true, y_pred)
pred_vs_true = np.c_[y_pred, y_true]   # two columns: prediction, ground truth

# print results
print("\nNumber of estimators:\t {}".format(N_ESTIMATORS))
print("Maximum depth:\t\t {}".format(MAX_DEPTH))
print("\nPrediction on test data:\n")
print("\n\tPred   |   True\n")
print(pred_vs_true)
print("\n\nMSE on training data: \t{}".format(mse_train))
print("MSE on test data: \t{}".format(mse_test))


Number of estimators:	 500
Maximum depth:		 50

Prediction on test data:


	Pred   |   True

[[-31.13294575 -31.12002121]
 [-31.30402356 -31.35579816]
 [-29.75886236 -29.74172427]
 [-30.98547369 -30.97719376]
 [-31.04845149 -31.03509757]
 [-31.70581939 -31.71298488]
 [-31.36598456 -31.35711097]
 [-29.0776006  -29.06048683]
 [-31.26807838 -31.25555287]
 [-30.87145413 -30.85878473]
 [-30.00602195 -29.98936226]
 [-30.96744233 -31.00192919]
 [-31.74354007 -31.74565502]
 [-30.73455082 -30.7632198 ]
 [-31.4390302  -31.40113248]]


MSE on training data: 	0.0001307784138965707
MSE on test data: 	0.0005238752942314319
