In [332]:
%run function_dbs.py
%run MLRegression_dbs.py
%matplotlib inline

# presumably font size and colour for figures - neither is used in the visible cells
fs = 10
color = 'k'
# date stamp of the current run as YYYYMMDD
today = datetime.today().strftime('%Y%m%d')

## load data set and split into training and validation data

In [354]:
# training-data files, one per analyte; only the O2 file is loaded below,
# file_ph is kept so the analyte can be switched by changing the read_csv argument
file_ox = '460nm/20220825_trainingData-O2_balanced_absoluteInt.csv'
file_ph = '460nm/20220921_trainingData-pH_balanced_absoluteInt.csv'

# tab-separated file; the first column becomes the index and serves as ground truth
df_features = pd.read_csv(file_ox, index_col=0, sep='\t')

# ----------------------------------------------------------------------------------------
# split the data into training and validation data
# rows are samples, columns are features (n_samples, n_features)
X = df_features
y_truth = df_features.index              # target value of the chosen analyte, one per row
# hold back 10 % of the rows for validation; fixed seed keeps the split reproducible
X_train, X_valid, truth_train, truth_valid = train_test_split(
    X, y_truth, test_size=0.1, random_state=42)

print('in ML a rule of thumbs says that a suitable size for a data set is >1,000 samples',
      'size of data sets:', sep='\n')
print('> training data  ', X_train.shape, '\n> validation data', X_valid.shape)

in ML a rule of thumbs says that a suitable size for a data set is >1,000 samples
size of data sets:
> training data   (2255, 151) 
> validation data (251, 151)


In [355]:
# candidate regressors and their hyper-parameter search spaces (HPO)

# Random Forest
RF = RandomForestRegressor()

# values the randomised search may draw from for each Random Forest parameter
rf_random = dict(
    n_estimators=[1, 25, 50, 75, 100, 150, 200],
    max_features=[1, 2, 3, 4, 5, 7, 10, 20],
    max_depth=[None, 1, 3, 5, 7, 9, 11, 20, 50],
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[1, 2, 4, 6, 8, 10],
    bootstrap=[True, False],
    max_leaf_nodes=[None, 10, 20, 30, 40, 50, 60, 70, 80, 90],
    min_weight_fraction_leaf=[0, 0.1, 0.2, 0.3, 0.4, 0.45],
)


# Decision Tree
DT = DecisionTreeRegressor()

# values the randomised search may draw from for each Decision Tree parameter
# NOTE: max_features="auto" was deprecated in scikit-learn 1.1 and removed in 1.3;
# 1.0 (use all features) is the documented replacement for regressors and keeps
# the number of candidate values unchanged.
dt_random = {"splitter": ["best", "random"],
             "max_depth": [None, 1, 3, 5, 7, 9, 11, 12],
             "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
             "min_weight_fraction_leaf": [0, 0.1, 0.2, 0.3, 0.4, 0.45],
             "max_features": [1.0, "log2", "sqrt", None],
             "max_leaf_nodes": [None, 10, 20, 30, 40, 50, 60, 70, 80, 90]}


# XGBoost
XG = XGBRegressor()

# values the randomised search may draw from for each XGBoost parameter
# NOTE(review): learning_rate=0 is among the candidates - confirm this is intended
xg_random = dict(
    n_estimators=[1, 100, 200, 400],
    max_depth=[1, 5, 7, 9, 13],
    learning_rate=[0, 0.05, 0.1, 0.5],
    min_child_weight=[0, 5, 10, 15, 20],
)


# ----------------------------------------------------------------------------------------
# ask interactively which regressor goes into the search; the answer is kept as a string
model_prompt = 'Which ML regressor shall be optimized - (1) Random Forest, (2) Decision tree, or (3) XGBoost? '
model_id = input(model_prompt)

Which ML regressor shall be optimized - (1) Random Forest, (2) Decision tree, or (3) XGBoost? 1


In [356]:
# map the entered number to the estimator and its search space
model_choices = {
    '1': (RF, rf_random),
    '2': (DT, dt_random),
    '3': (XG, xg_random),
}

if model_id in model_choices:
    model, random_grid = model_choices[model_id]
else:
    # unknown selection: tell the user and leave both names set to None
    print('Please specify the model via its number: (1) Random Forest, (2) Decision tree, or (3) XGBoost')
    model, random_grid = None, None

In [357]:
# Randomised hyper-parameter search: 100 sampled parameter combinations, each scored
# with 3-fold cross-validation (cv), using all available cores (n_jobs=-1).
# Skipped when no valid model was selected; ML_random is then left undefined.
if model is not None:
    ML_random = RandomizedSearchCV(
        estimator=model,
        param_distributions=random_grid,
        n_iter=100,
        cv=3,
        verbose=2,
        random_state=42,   # fixed seed so the sampled candidates are reproducible
        n_jobs=-1,
    )

    # fit the random search on the training split only
    ML_random.fit(X_train, truth_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [358]:
# Report the winning parameter set. The class name is taken from the type rather than
# by trimming the last two characters of the repr, which only works when the repr
# is exactly "Name()".
print('>> Best estimation of hyperparameter for ' + type(model).__name__)
ML_random.best_params_

>> Best estimation of hyperparameter for RandomForestRegressor


{'n_estimators': 150,
 'min_weight_fraction_leaf': 0,
 'min_samples_split': 3,
 'min_samples_leaf': 4,
 'max_leaf_nodes': 70,
 'max_features': 10,
 'max_depth': 50,
 'bootstrap': False}

#### Assess model performance with the hyperparameters identified by the HPO

In [359]:
# Transfer the best hyper-parameters found by the search onto the selected estimator.
# Every key in best_params_ comes from the search grid of that estimator, so
# set_params applies exactly the tuned values. The previous attribute-by-attribute
# version assigned the tuned n_estimators to `estimator_params`, so the tuned number
# of trees was never applied to the Random Forest.
if model_id in ('1', '2', '3'):
    model.set_params(**ML_random.best_params_)

# standardise the features before they reach the tuned regressor
regressor = make_pipeline(preprocessing.StandardScaler(), model)

In [361]:
# evaluate the tuned pipeline: _fitPredict receives the training and validation splits
# and returns four values (presumably fitted model, predictions for training and
# validation data, and the error metrics - confirm against function_dbs / MLRegression_dbs)
model_, y_predT, y_pred, para = _fitPredict(
    model=regressor,
    X_train=X_train,
    truth_train=truth_train,
    X_valid=X_valid,
    truth_valid=truth_valid,
)

# ---------------------------------------------------------
# show the metrics as a table with one row each for mae, rmse and sdz
para = pd.DataFrame(para, index=['mae', 'rmse', 'sdz'])
para