## Import relavant libraries

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

## Reading dataset

In [63]:
concrete = pd.read_excel("Concrete.xlsx")
concrete.head()

Unnamed: 0,Cement,BF_slag,Fly_ash,Water,Super_plasticizer,Coarse_aggregate,Fine_aggregate,Age,Comp_str
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [64]:
concrete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Cement             1030 non-null   float64
 1   BF_slag            1030 non-null   float64
 2   Fly_ash            1030 non-null   float64
 3   Water              1030 non-null   float64
 4   Super_plasticizer  1030 non-null   float64
 5   Coarse_aggregate   1030 non-null   float64
 6   Fine_aggregate     1030 non-null   float64
 7   Age                1030 non-null   int64  
 8   Comp_str           1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [65]:
concrete.describe()

Unnamed: 0,Cement,BF_slag,Fly_ash,Water,Super_plasticizer,Coarse_aggregate,Fine_aggregate,Age,Comp_str
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.165631,73.895485,54.187136,181.566359,6.203112,972.918592,773.578883,45.662136,35.817836
std,104.507142,86.279104,63.996469,21.355567,5.973492,77.753818,80.175427,63.169912,16.705679
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,2.331808
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.707115
50%,272.9,22.0,0.0,185.0,6.35,968.0,779.51,28.0,34.442774
75%,350.0,142.95,118.27,192.0,10.16,1029.4,824.0,56.0,46.136287
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.599225


## Train test split

In [66]:
X = concrete.drop("Comp_str", axis = 1)
y = concrete["Comp_str"]

print(X.shape)
print(y.shape)

(1030, 8)
(1030,)


In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [72]:
scaler = StandardScaler()
train_X = scaler.fit_transform(X_train)
test_X = scaler.transform(X_test)

## Grid search

## Preparing grid

In [73]:
# Create a Random Forest Classifier with specified criterion
gb_regressor = GradientBoostingRegressor(criterion = "mae")

# Create the parameter grid
param_grid = {'max_depth' : [2, 4, 8, 10, 12],
              'n_estimators' : [100, 200, 300],
              'max_features' : ['auto', 'sqrt'],
              "criterion" : ["friedman_mse", "mse", "mae"]} 

# Create a GridSearchCV object
grid_gb = GridSearchCV(
    estimator = gb_regressor,
    param_grid = param_grid,
    scoring = 'neg_mean_absolute_error',
    n_jobs = 4,
    cv = 10,
    refit = True,
    return_train_score = True)

print(grid_gb)

GridSearchCV(cv=10, estimator=GradientBoostingRegressor(criterion='mae'),
             n_jobs=4,
             param_grid={'criterion': ['friedman_mse', 'mse', 'mae'],
                         'max_depth': [2, 4, 8, 10, 12],
                         'max_features': ['auto', 'sqrt'],
                         'n_estimators': [100, 200, 300]},
             return_train_score=True, scoring='neg_mean_absolute_error')


## Train model

In [74]:
grid_gb.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=GradientBoostingRegressor(criterion='mae'),
             n_jobs=4,
             param_grid={'criterion': ['friedman_mse', 'mse', 'mae'],
                         'max_depth': [2, 4, 8, 10, 12],
                         'max_features': ['auto', 'sqrt'],
                         'n_estimators': [100, 200, 300]},
             return_train_score=True, scoring='neg_mean_absolute_error')

### Best parameters

In [77]:
grid_gb.best_params_

{'criterion': 'friedman_mse',
 'max_depth': 4,
 'max_features': 'sqrt',
 'n_estimators': 300}

### Best score

In [78]:
grid_gb.best_score_

-2.8922804066590815

In [79]:
cv_results = pd.DataFrame(grid_gb.cv_results_)

# Extract and print the row that had the best mean test score
best_row = cv_results[cv_results['rank_test_score'] == 1]
print(best_row)

    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
11       0.171043      0.010576         0.002992        0.000631   

   param_criterion param_max_depth param_max_features param_n_estimators  \
11    friedman_mse               4               sqrt                300   

                                               params  split0_test_score  ...  \
11  {'criterion': 'friedman_mse', 'max_depth': 4, ...          -2.908899  ...   

    split2_train_score  split3_train_score  split4_train_score  \
11           -1.173298           -1.105502           -1.171747   

    split5_train_score  split6_train_score  split7_train_score  \
11           -1.113335           -1.171388           -1.164973   

    split8_train_score  split9_train_score  mean_train_score  std_train_score  
11           -1.113733           -1.171978         -1.154877         0.029265  

[1 rows x 34 columns]


In [80]:
cv_results.loc[[grid_gb.best_index_]]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_n_estimators,params,split0_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
11,0.171043,0.010576,0.002992,0.000631,friedman_mse,4,sqrt,300,"{'criterion': 'friedman_mse', 'max_depth': 4, ...",-2.908899,...,-1.173298,-1.105502,-1.171747,-1.113335,-1.171388,-1.164973,-1.113733,-1.171978,-1.154877,0.029265


In [81]:
grid_gb.best_params_["n_estimators"]

300

## Predict test score

In [82]:
predictions = grid_gb.best_estimator_.predict(X_test)

In [83]:
mean_absolute_error(predictions, y_test)

2.7527702025120906

# Random Search

In [84]:
# Create a Random Forest Classifier with specified criterion
gb_regressor = GradientBoostingRegressor()

# Create the parameter grid
param_grid = {'max_depth' : [2, 4, 8, 10, 12],
              'n_estimators' : [100, 200, 300],
              'max_features' : ['auto', 'sqrt'],
              "criterion" : ["friedman_mse", "mse", "mae"]} 

# Create a GridSearchCV object
random_grid_gb = RandomizedSearchCV(
    estimator = gb_regressor,
    param_distributions = param_grid,
    scoring = 'neg_mean_absolute_error',
    n_jobs = 4,
    cv = 5,
    refit = True, return_train_score = True)
print(random_grid_gb)

RandomizedSearchCV(cv=5, estimator=GradientBoostingRegressor(), n_jobs=4,
                   param_distributions={'criterion': ['friedman_mse', 'mse',
                                                      'mae'],
                                        'max_depth': [2, 4, 8, 10, 12],
                                        'max_features': ['auto', 'sqrt'],
                                        'n_estimators': [100, 200, 300]},
                   return_train_score=True, scoring='neg_mean_absolute_error')


In [85]:
random_grid_gb.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=GradientBoostingRegressor(), n_jobs=4,
                   param_distributions={'criterion': ['friedman_mse', 'mse',
                                                      'mae'],
                                        'max_depth': [2, 4, 8, 10, 12],
                                        'max_features': ['auto', 'sqrt'],
                                        'n_estimators': [100, 200, 300]},
                   return_train_score=True, scoring='neg_mean_absolute_error')

In [86]:
print(random_grid_gb.cv_results_['param_max_depth'])
print(random_grid_gb.cv_results_['param_n_estimators'])

[2 2 2 2 10 10 8 10 10 12]
[100 100 300 200 100 100 300 300 300 200]


In [87]:
random_grid_gb.best_params_

{'n_estimators': 300,
 'max_features': 'sqrt',
 'max_depth': 8,
 'criterion': 'mse'}

In [88]:
random_grid_gb.best_score_

-3.214888746558168

In [89]:
predictions = random_grid_gb.best_estimator_.predict(X_test)

In [90]:
mean_absolute_error(predictions, y_test)

2.892431814888138

## Genetic hyperparameters tuning

In [92]:
from tpot import TPOTRegressor

In [104]:
# Assign the values outlined to the inputs
number_generations = 20
population_size = 10
offspring_size = 10
scoring_function = 'neg_mean_absolute_error'

# Create the tpot classifier
tpot_clf = TPOTRegressor(generations = number_generations,
                         population_size = population_size,
                         offspring_size = offspring_size,
                         scoring = scoring_function,
                         verbosity = 2,
                         random_state = 2,
                         cv = 10,
                         n_jobs = -1)

# Fit the classifier to the training data
tpot_clf.fit(X_train, y_train)

# Score on the test set
print(tpot_clf.score(X_test, y_test))

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=210.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: -3.0332376203892255
Generation 2 - Current best internal CV score: -3.0232139854289315
Generation 3 - Current best internal CV score: -3.0232139854289315
Generation 4 - Current best internal CV score: -3.0232139854289315
Generation 5 - Current best internal CV score: -3.0232139854289315
Generation 6 - Current best internal CV score: -2.9932558899473256
Generation 7 - Current best internal CV score: -2.989680286433173
Generation 8 - Current best internal CV score: -2.989680286433173
Generation 9 - Current best internal CV score: -2.9644307849216744
Generation 10 - Current best internal CV score: -2.9644307849216744
Generation 11 - Current best internal CV score: -2.954487632500787
Generation 12 - Current best internal CV score: -2.954487632500787
Generation 13 - Current best internal CV score: -2.8990675290908134
Generation 14 - Current best internal CV score: -2.8990675290908134
Generation 15 - Current best internal CV score: -2.899067529