# Create the design space to take data

The paper uses Latin hypercube sampling to build a space-filling design of 20 experiments within conservative boundaries.

In [1]:
# import all packages and set plots to be embedded inline
import numpy as np
import matplotlib.pyplot as plt
#from scipy.optimize import minimize
#from scipy.optimize import Bounds
from pyDOE import lhs
#from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

%matplotlib inline

In [2]:
# Boundaries / candidate levels for every formulation feature.

# Continuous features: [lower, upper] limits of the interval to sample from.
NaCl_boundaries = [0, 140]
LArginine_boundaries, LLysine_boundaries, LProline_boundaries = [0, 150], [0, 150], [0, 150]

# Discrete features: the explicit list of levels a feature may take
# (pH has four levels, the remaining features two each).
pH_boundaries = [5, 5.5, 6, 6.5]
Trehalose_boundaries = [0, 8]
Mannitol_boundaries = [0, 4]
Tween20_boundaries = [0, 0.01]

In [3]:
# Generate candidate values for the continuous features.
#
# Each vector holds N_CANDIDATES independent draws from a uniform distribution
# spanning the feature's [lower, upper] boundaries. The draws come from a
# seeded generator so that "Restart Kernel -> Run All" reproduces the same
# candidate values (the previous unseeded draws changed on every run).

from numpy.random import uniform  # kept in scope in case later cells call `uniform` directly

RANDOM_SEED = 0    # fixes the sequence of random draws
N_CANDIDATES = 10  # number of candidate values drawn per continuous feature
rng = np.random.default_rng(RANDOM_SEED)

NaCl_vector = rng.uniform(NaCl_boundaries[0], NaCl_boundaries[1], N_CANDIDATES)
LArginine_vector = rng.uniform(LArginine_boundaries[0], LArginine_boundaries[1], N_CANDIDATES)
LLysine_vector = rng.uniform(LLysine_boundaries[0], LLysine_boundaries[1], N_CANDIDATES)
LProline_vector = rng.uniform(LProline_boundaries[0], LProline_boundaries[1], N_CANDIDATES)

## Initial Design Space

### Using RandomizedSearchCV

In [9]:
# Set up a Gaussian-process regressor and the candidate values per design variable.

from scipy import stats
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.model_selection import RandomizedSearchCV

# Gaussian-process regressor with a Matern kernel (nu = 1.5); random_state is
# fixed to 0.
model = GaussianProcessRegressor(
    kernel=Matern(
        length_scale=1.0,
        length_scale_bounds=(1e-05, 100000.0),
        nu=1.5,
    ),
    random_state=0,
)

# Candidate values for each design variable. Note that the keys are
# formulation features, not hyperparameters of `model`.
# The amino-acid vectors are currently left out of the grid:
#   'L-Arginine' : LArginine_vector
#   'L-Lysine'   : LLysine_vector
#   'L-Proline'  : LProline_vector
grid = {
    'NaCl': NaCl_vector,
    'PH': pH_boundaries,
    'Trehalose': Trehalose_boundaries,
    'Mannitol': Mannitol_boundaries,
    'Tween20': Tween20_boundaries,
}

grid

{'NaCl': array([ 47.31772185,  73.61055802,   6.08459591,   3.95405791,
         49.13478951,  57.04140297,  58.73188013, 114.49138458,
        104.805946  , 126.800807  ]),
 'PH': [5, 5.5, 6, 6.5],
 'Trehalose': [0, 8],
 'Mannitol': [0, 4],
 'Tween20': [0, 0.01]}

In [10]:
# Instantiate the randomized search over `grid` with 20 random draws.
# Options that were tried and are left disabled (library defaults apply):
#   cv=1, scoring='r2', n_jobs=-1
# NOTE(review): the keys of `grid` are design variables rather than parameters
# of the estimator, so fitting this search is not expected to work as-is.
search = RandomizedSearchCV(estimator=model, param_distributions=grid, n_iter=20)
search

RandomizedSearchCV(estimator=GaussianProcessRegressor(kernel=Matern(length_scale=1, nu=1.5),
                                                      random_state=0),
                   n_iter=20,
                   param_distributions={'Mannitol': [0, 4],
                                        'NaCl': array([ 47.31772185,  73.61055802,   6.08459591,   3.95405791,
        49.13478951,  57.04140297,  58.73188013, 114.49138458,
       104.805946  , 126.800807  ]),
                                        'PH': [5, 5.5, 6, 6.5],
                                        'Trehalose': [0, 8],
                                        'Tween20': [0, 0.01]})

In [11]:
# Read the Excel workbook with the experimental data into a DataFrame and
# list its column labels.
import pandas as pd

data = pd.read_excel(io='mp1c00469_si_002.xlsx')
data.columns

Index(['Round', 'Exp No.', 'pH', 'NaCl (mM)', 'Trehalose (w/v %)',
       'Mannitol (w/v %)', 'Tween 20 (v/v%)', 'Tm', 'NP', 'NP_std'],
      dtype='object')

In [12]:
# Reference point: the first row of `data`, restricted to the five formulation
# columns plus the measured 'Tm' column.
ref_data = data.loc[
    :,
    [
        'pH',
        'NaCl (mM)',
        'Trehalose (w/v %)',
        'Mannitol (w/v %)',
        'Tween 20 (v/v%)',
        'Tm',
    ],
].iloc[0]
ref_data

pH                    5.000000
NaCl (mM)            15.000000
Trehalose (w/v %)     7.000000
Mannitol (w/v %)      0.000000
Tween 20 (v/v%)       0.010000
Tm                   59.347822
Name: 0, dtype: float64

**Small detail...** `RandomizedSearchCV` is meant to search over the tunable hyperparameters of the model being fitted (i.e. theta, nu, etc. of the Matern kernel), not to build a grid over the input variables X...

### Using Randomized Designs

In [16]:
# Latin hypercube design: 20 sample points (rows) in 5 dimensions (columns),
# presumably one column per formulation factor. The values are in normalized
# units and still have to be mapped onto the actual feature boundaries.
from pyDOE import *  # star import kept: later cells may rely on other pyDOE names

lhs(n=5, samples=20)

array([[0.85176229, 0.76466907, 0.32725789, 0.38394626, 0.76355036],
       [0.83517292, 0.94149338, 0.02721199, 0.7454621 , 0.63753012],
       [0.11846173, 0.03778195, 0.37296061, 0.97125767, 0.21566037],
       [0.50016787, 0.80232746, 0.59691478, 0.68766896, 0.08535166],
       [0.28346902, 0.52976659, 0.8848308 , 0.86240224, 0.54567265],
       [0.22731454, 0.98668752, 0.5418105 , 0.76539719, 0.58209296],
       [0.37707744, 0.56536673, 0.19731255, 0.90599076, 0.49699489],
       [0.93367454, 0.34148738, 0.41123377, 0.31204469, 0.90953995],
       [0.66067437, 0.60624296, 0.72449065, 0.12344633, 0.34063908],
       [0.45123004, 0.20611134, 0.27562335, 0.1640197 , 0.1091121 ],
       [0.99781535, 0.36242038, 0.6033091 , 0.08097054, 0.8980762 ],
       [0.06056769, 0.42459858, 0.83268274, 0.25653423, 0.71024087],
       [0.03519366, 0.73842122, 0.11202546, 0.44244973, 0.28151636],
       [0.43714332, 0.27957788, 0.77355223, 0.57595547, 0.15980365],
       [0.70011887, 0.45231016, 0.