# Create the design space to take data

The paper uses a Latin hypercube to do a space filling design with conservative boundaries for 20 experiments.

In [2]:
#Define Boundaries for each feature

##Continuous features should be distributed between [0, upper_bound]
NaCl_boundaries = [0,140]
LArginine_boundaries = [0,150]
LLysine_boundaries = [0,150]
LProline_boundaries = [0,150]

#Discrete features are [0, upper_bound]
pH_boundaries = [5,5.5,6,6.5]
Trehalose_boundaries = [0,8]
Mannitol_boundaries = [0,4]
Tween20_boundaries = [0,0.01]

## Initial Design Space

### Construct a Latin hypercube

In [34]:
#using SMT: Surrogate Modeling Toolbox
from smt.sampling_methods import LHS

##Generate the continuous variables data points
continuous_variables_boundaries = np.array([NaCl_boundaries, 
                                            LArginine_boundaries, 
                                            LLysine_boundaries, 
                                            LProline_boundaries])
sampling = LHS(xlimits=continuous_variables_boundaries)

num = 20    #initial 20 experiments
continuous_variables_ = sampling(num)
continuous_variables_

array([[ 38.5 ,  41.25,  18.75, 101.25],
       [101.5 ,  93.75, 108.75, 146.25],
       [ 87.5 , 146.25,   3.75,  56.25],
       [ 31.5 ,  26.25,  41.25,  26.25],
       [ 52.5 ,  86.25, 146.25,  78.75],
       [115.5 ,  63.75,  11.25,  63.75],
       [ 59.5 , 101.25,  56.25,  86.25],
       [129.5 , 138.75,  63.75,  18.75],
       [ 80.5 ,  56.25,  86.25,   3.75],
       [ 94.5 , 108.75, 131.25,  11.25],
       [ 10.5 ,  78.75, 116.25, 138.75],
       [ 45.5 ,  18.75, 101.25, 108.75],
       [ 73.5 ,  33.75, 123.75,  33.75],
       [ 66.5 ,  11.25,  33.75,  41.25],
       [108.5 , 116.25,  48.75, 131.25],
       [122.5 ,  71.25,  78.75,  71.25],
       [ 24.5 ,  48.75,  26.25, 123.75],
       [  3.5 , 131.25, 138.75, 116.25],
       [ 17.5 ,   3.75,  93.75,  48.75],
       [136.5 , 123.75,  71.25,  93.75]])

In [35]:
import random

##Generate the discrete variables data points
discrete_variables_ = np.array([random.choices(pH_boundaries, k=20), 
                                random.choices(Mannitol_boundaries, k=20),
                                random.choices(Tween20_boundaries, k=20),
                                random.choices(Trehalose_boundaries, k=20)]).T

discrete_variables_

array([[6.  , 0.  , 0.01, 8.  ],
       [6.  , 0.  , 0.01, 8.  ],
       [5.5 , 4.  , 0.  , 8.  ],
       [6.5 , 0.  , 0.  , 0.  ],
       [5.  , 0.  , 0.  , 8.  ],
       [6.  , 0.  , 0.01, 8.  ],
       [6.  , 0.  , 0.  , 8.  ],
       [6.5 , 0.  , 0.  , 0.  ],
       [6.5 , 4.  , 0.  , 0.  ],
       [5.5 , 0.  , 0.  , 0.  ],
       [5.5 , 0.  , 0.01, 8.  ],
       [5.5 , 4.  , 0.  , 0.  ],
       [5.5 , 4.  , 0.  , 0.  ],
       [5.5 , 0.  , 0.01, 0.  ],
       [5.5 , 0.  , 0.01, 8.  ],
       [5.  , 4.  , 0.  , 8.  ],
       [6.  , 0.  , 0.  , 0.  ],
       [5.  , 0.  , 0.  , 8.  ],
       [6.  , 4.  , 0.  , 0.  ],
       [5.  , 4.  , 0.01, 8.  ]])

In [38]:
import pandas as pd

#merge both, continuous and discrete data points
initial_data_points = np.concatenate((continuous_variables_, discrete_variables_), axis=1)

#create a pandas dataframe
column_values = ['NaCl', 'LArginine', 'LLysine', 'LProline_boundaries', 'pH', 
                 'Mannitol', '']
df_initial = pd.DataFrame(data = initial_data_points,
                          columns = column_values)

ValueError: Shape of passed values is (20, 8), indices imply (20, 4)