# Create the design space to take data

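The paper uses Latin hypercube sampling to build a space-filling design of 20 experiments within conservative feature boundaries. Note that the cells below draw independent uniform random values for the continuous features rather than a true Latin hypercube.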

In [4]:
# Boundaries of every feature in the design space.

# Continuous features: each list is [lower_bound, upper_bound], with the
# lower bound fixed at 0.
NaCl_boundaries = [0, 140]
LArginine_boundaries = [0, 150]
LLysine_boundaries = [0, 150]
LProline_boundaries = [0, 150]

# Discrete features: each list enumerates the allowed levels.
# pH has four levels; the remaining ones are two-level [0, upper_bound].
pH_boundaries = [5, 5.5, 6, 6.5]
Trehalose_boundaries = [0, 8]
Mannitol_boundaries = [0, 4]
Tween20_boundaries = [0, 0.01]

In [23]:
#Generate data for continuous features

from numpy.random import default_rng, uniform

# Fixed seed so that "Restart Kernel -> Run All" reproduces the same candidate
# values (the original unseeded draws changed on every run).
DESIGN_SEED = 0
# Number of candidate values drawn for each continuous feature.
N_CONTINUOUS_SAMPLES = 10

design_rng = default_rng(DESIGN_SEED)

# Each vector holds N_CONTINUOUS_SAMPLES independent uniform draws from
# [lower_bound, upper_bound) of the corresponding feature.
NaCl_vector = design_rng.uniform(NaCl_boundaries[0], NaCl_boundaries[1], N_CONTINUOUS_SAMPLES)
LArginine_vector = design_rng.uniform(LArginine_boundaries[0], LArginine_boundaries[1], N_CONTINUOUS_SAMPLES)
LLysine_vector = design_rng.uniform(LLysine_boundaries[0], LLysine_boundaries[1], N_CONTINUOUS_SAMPLES)
LProline_vector = design_rng.uniform(LProline_boundaries[0], LProline_boundaries[1], N_CONTINUOUS_SAMPLES)

In [24]:
#Set up the Gaussian-process model and the grid describing the design space

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.gaussian_process.kernels import Matern
from scipy import stats

# Instantiate the regressor with a Matern kernel (nu=1.5)
model = GaussianProcessRegressor(
    kernel=Matern(
        length_scale=1.0,
        length_scale_bounds=(1e-05, 100000.0),
        nu=1.5,
    ),
    random_state=0,
)

# Candidate values per feature: sampled vectors for the continuous features,
# enumerated levels for the discrete ones. Insertion order is kept as-is.
grid = {}
grid['NaCl'] = NaCl_vector
grid['L-Arginine'] = LArginine_vector
grid['L-Lysine'] = LLysine_vector
grid['L-Proline'] = LProline_vector
grid['PH'] = pH_boundaries
grid['Trehalose'] = Trehalose_boundaries
grid['Mannitol'] = Mannitol_boundaries
grid['Tween20'] = Tween20_boundaries

grid

{'NaCl': array([ 46.34923381, 119.75826496,  88.82490897, 106.35126251,
         53.35396234, 130.30085835,  98.56182141,  90.78022769,
         48.55229403,  14.15457452]),
 'L-Arginine': array([134.51426878, 105.08662769,  99.17278548, 128.24559857,
        130.53003787,  48.04938712, 148.73379452, 106.6514834 ,
        118.46397788,  31.17709225]),
 'L-Lysine': array([ 37.2181626 ,  53.46338128,  33.77386778,  99.97664244,
        122.91265539,  37.05759249, 143.65840528,   9.97638422,
         41.50778613, 125.52797934]),
 'L-Proline': array([ 87.13548444,  64.17542572,  52.54418662,  35.17550124,
        112.72626702,  62.62502666, 129.50521166, 110.86029673,
         95.3680621 , 113.20293423]),
 'PH': [5, 5.5, 6, 6.5],
 'Trehalose': [0, 8],
 'Mannitol': [0, 4],
 'Tween20': [0, 0.01]}

In [25]:
# Instantiate the randomized search that draws 20 combinations from `grid`.
# cv, scoring and n_jobs are not passed, so they stay at the library defaults.
# NOTE(review): the keys of `grid` look like feature names rather than
# hyperparameters of `model`; fitting this search would presumably reject
# them -- confirm whether a plain parameter sampler is what is wanted here.
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=grid,
    n_iter=20,  # number of draws
)
search

RandomizedSearchCV(cv=1,
                   estimator=GaussianProcessRegressor(kernel=Matern(length_scale=1, nu=1.5),
                                                      random_state=0),
                   n_iter=20, n_jobs=-1,
                   param_distributions={'L-Arginine': array([134.51426878, 105.08662769,  99.17278548, 128.24559857,
       130.53003787,  48.04938712, 148.73379452, 106.6514834 ,
       118.46397788,  31.17709225]),
                                        'L-Lysine': array([ 37.2181626 ,  53.46338128,  33.77386...
                                        'L-Proline': array([ 87.13548444,  64.17542572,  52.54418662,  35.17550124,
       112.72626702,  62.62502666, 129.50521166, 110.86029673,
        95.3680621 , 113.20293423]),
                                        'Mannitol': [0, 4],
                                        'NaCl': array([ 46.34923381, 119.75826496,  88.82490897, 106.35126251,
        53.35396234, 130.30085835,  98.56182141,  90.78022769,
   

In [43]:
import pandas as pd

# Read the spreadsheet into a DataFrame and show its column labels
data = pd.read_excel('mp1c00469_si_002.xlsx')
data.columns

Index(['Round', 'Exp No.', 'pH', 'NaCl (mM)', 'Trehalose (w/v %)',
       'Mannitol (w/v %)', 'Tween 20 (v/v%)', 'Tm', 'NP', 'NP_std'],
      dtype='object')

In [44]:
# First row of the spreadsheet, restricted to the formulation columns and Tm.
# Columns are selected before the row so the result keeps a numeric dtype.
ref_data = data.loc[
    :,
    [
        'pH',
        'NaCl (mM)',
        'Trehalose (w/v %)',
        'Mannitol (w/v %)',
        'Tween 20 (v/v%)',
        'Tm',
    ],
].iloc[0]
ref_data

pH                    5.000000
NaCl (mM)            15.000000
Trehalose (w/v %)     7.000000
Mannitol (w/v %)      0.000000
Tween 20 (v/v%)       0.010000
Tm                   59.347822
Name: 0, dtype: float64