## Import relavant libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

## Reading dataset

In [8]:
concrete = pd.read_excel("Concrete.xlsx")
concrete.head()

Unnamed: 0,Cement,BF_slag,Fly_ash,Water,Super_plasticizer,Coarse_aggregate,Fine_aggregate,Age,Comp_str
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [9]:
concrete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Cement             1030 non-null   float64
 1   BF_slag            1030 non-null   float64
 2   Fly_ash            1030 non-null   float64
 3   Water              1030 non-null   float64
 4   Super_plasticizer  1030 non-null   float64
 5   Coarse_aggregate   1030 non-null   float64
 6   Fine_aggregate     1030 non-null   float64
 7   Age                1030 non-null   int64  
 8   Comp_str           1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [10]:
concrete.describe()

Unnamed: 0,Cement,BF_slag,Fly_ash,Water,Super_plasticizer,Coarse_aggregate,Fine_aggregate,Age,Comp_str
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.165631,73.895485,54.187136,181.566359,6.203112,972.918592,773.578883,45.662136,35.817836
std,104.507142,86.279104,63.996469,21.355567,5.973492,77.753818,80.175427,63.169912,16.705679
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,2.331808
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.707115
50%,272.9,22.0,0.0,185.0,6.35,968.0,779.51,28.0,34.442774
75%,350.0,142.95,118.27,192.0,10.16,1029.4,824.0,56.0,46.136287
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.599225


## Train test split

In [11]:
X = concrete.drop("Comp_str", axis = 1)
y = concrete["Comp_str"]

print(X.shape)
print(y.shape)

(1030, 8)
(1030,)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [14]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Grid search

## Preparing grid

In [15]:
# Create a Random Forest Classifier with specified criterion
gb_regressor = GradientBoostingRegressor(criterion = "mae")

# Create the parameter grid
param_grid = {'max_depth' : [2, 4, 8, 10, 12],
              'n_estimators' : [100, 200, 300],
              'max_features' : ['auto', 'sqrt'],
              "criterion" : ["friedman_mse", "mse", "mae"]} 

# Create a GridSearchCV object
grid_gb = GridSearchCV(
    estimator = gb_regressor,
    param_grid = param_grid,
    scoring = 'neg_mean_absolute_error',
    n_jobs = 4,
    cv = 10,
    refit = True,
    return_train_score = True)

print(grid_gb)

GridSearchCV(cv=10, estimator=GradientBoostingRegressor(criterion='mae'),
             n_jobs=4,
             param_grid={'criterion': ['friedman_mse', 'mse', 'mae'],
                         'max_depth': [2, 4, 8, 10, 12],
                         'max_features': ['auto', 'sqrt'],
                         'n_estimators': [100, 200, 300]},
             return_train_score=True, scoring='neg_mean_absolute_error')


## Train model

In [16]:
grid_gb.fit(X_train_scaled, y_train)

GridSearchCV(cv=10, estimator=GradientBoostingRegressor(criterion='mae'),
             n_jobs=4,
             param_grid={'criterion': ['friedman_mse', 'mse', 'mae'],
                         'max_depth': [2, 4, 8, 10, 12],
                         'max_features': ['auto', 'sqrt'],
                         'n_estimators': [100, 200, 300]},
             return_train_score=True, scoring='neg_mean_absolute_error')

### Best parameters

In [18]:
grid_gb.best_params_

{'criterion': 'friedman_mse',
 'max_depth': 4,
 'max_features': 'sqrt',
 'n_estimators': 300}

### Best score

In [19]:
grid_gb.best_score_

-2.9084712007676776

In [20]:
cv_results = pd.DataFrame(grid_gb.cv_results_)

# Extract and print the row that had the best mean test score
best_row = cv_results[cv_results['rank_test_score'] == 1]
print(best_row)

    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
11       0.300672      0.011414         0.002689        0.001341   

   param_criterion param_max_depth param_max_features param_n_estimators  \
11    friedman_mse               4               sqrt                300   

                                               params  split0_test_score  ...  \
11  {'criterion': 'friedman_mse', 'max_depth': 4, ...          -2.968202  ...   

    split2_train_score  split3_train_score  split4_train_score  \
11           -1.161659           -1.121162           -1.126631   

    split5_train_score  split6_train_score  split7_train_score  \
11           -1.145858           -1.162898           -1.197257   

    split8_train_score  split9_train_score  mean_train_score  std_train_score  
11           -1.140079           -1.176149         -1.161242         0.027253  

[1 rows x 34 columns]


In [21]:
cv_results.loc[[grid_gb.best_index_]]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_n_estimators,params,split0_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
11,0.300672,0.011414,0.002689,0.001341,friedman_mse,4,sqrt,300,"{'criterion': 'friedman_mse', 'max_depth': 4, ...",-2.968202,...,-1.161659,-1.121162,-1.126631,-1.145858,-1.162898,-1.197257,-1.140079,-1.176149,-1.161242,0.027253


In [22]:
grid_gb.best_params_["n_estimators"]

300

## Predict test score

In [24]:
predictions = grid_gb.best_estimator_.predict(X_test_scaled)

In [25]:
mean_absolute_error(predictions, y_test)

2.8632340005272474

# Random Search

In [26]:
# Create a Random Forest Classifier with specified criterion
gb_regressor = GradientBoostingRegressor(criterion = "mae")

# Create the parameter grid
param_grid = {'max_depth' : [2, 4, 8, 10, 12],
              'n_estimators' : [100, 200, 300],
              'max_features' : ['auto', 'sqrt'],
              "criterion" : ["friedman_mse", "mse", "mae"]} 

# Create a GridSearchCV object
random_grid_gb = RandomizedSearchCV(
    estimator = gb_regressor,
    param_distributions = param_grid,
    scoring = 'neg_mean_absolute_error',
    n_jobs = 4,
    cv = 10,
    refit = True, return_train_score = True)
print(random_grid_gb)

RandomizedSearchCV(cv=10, estimator=GradientBoostingRegressor(criterion='mae'),
                   n_jobs=4,
                   param_distributions={'criterion': ['friedman_mse', 'mse',
                                                      'mae'],
                                        'max_depth': [2, 4, 8, 10, 12],
                                        'max_features': ['auto', 'sqrt'],
                                        'n_estimators': [100, 200, 300]},
                   return_train_score=True, scoring='neg_mean_absolute_error')


In [27]:
random_grid_gb.fit(X_train_scaled, y_train)

RandomizedSearchCV(cv=10, estimator=GradientBoostingRegressor(criterion='mae'),
                   n_jobs=4,
                   param_distributions={'criterion': ['friedman_mse', 'mse',
                                                      'mae'],
                                        'max_depth': [2, 4, 8, 10, 12],
                                        'max_features': ['auto', 'sqrt'],
                                        'n_estimators': [100, 200, 300]},
                   return_train_score=True, scoring='neg_mean_absolute_error')

In [28]:
print(random_grid_gb.cv_results_['param_max_depth'])
print(random_grid_gb.cv_results_['param_n_estimators'])

[4 8 2 12 10 12 2 12 10 8]
[300 300 200 100 300 300 300 100 300 300]


In [29]:
random_grid_gb.best_params_

{'n_estimators': 300,
 'max_features': 'sqrt',
 'max_depth': 8,
 'criterion': 'mae'}

In [31]:
random_grid_gb.best_score_

-2.9905477926610717

In [32]:
predictions = random_grid_gb.best_estimator_.predict(X_test_scaled)

In [33]:
mean_absolute_error(predictions, y_test)

2.994156945716245

## Genetic hyperparameters tuning

In [34]:
from tpot import TPOTRegressor

In [35]:
# Assign the values outlined to the inputs
number_generations = 20
population_size = 10
offspring_size = 10
scoring_function = 'neg_mean_absolute_error'

# Create the tpot classifier
tpot_clf = TPOTRegressor(generations = number_generations,
                         population_size = population_size,
                         offspring_size = offspring_size,
                         scoring = scoring_function,
                         verbosity = 2,
                         random_state = 2,
                         cv = 10,
                         n_jobs = -1)

# Fit the classifier to the training data
tpot_clf.fit(X_train_scaled, y_train)

# Score on the test set
print(tpot_clf.score(X_test_scaled, y_test))

Optimization Progress:   5%|▌         | 11/210 [00:16<22:17,  6.72s/pipeline]
Optimization Progress:  10%|█         | 22/210 [00:21<12:26,  3.97s/pipeline]
Optimization Progress:  16%|█▌        | 33/210 [00:39<10:58,  3.72s/pipeline]
Optimization Progress:  20%|██        | 42/210 [00:55<11:20,  4.05s/pipeline]
Optimization Progress:  25%|██▌       | 53/210 [01:00<06:22,  2.44s/pipeline]
Optimization Progress:  30%|██▉       | 62/210 [01:09<06:12,  2.52s/pipeline]
Optimization Progress:  34%|███▍      | 72/210 [01:14<04:13,  1.84s/pipeline]
Optimization Progress:  39%|███▉      | 82/210 [01:30<06:59,  3.27s/pipeline]
Optimization Progress:  43%|████▎     | 91/210 [01:40<08:51,  4.47s/pipeline]
Optimization Progress:  48%|████▊     | 101/210 [01:47<07:22,  4.06s/pipeline]
Optimization Progress:  53%|█████▎    | 111/210 [01:52<05:28,  3.32s/pipeline]
Optimization Progress:  58%|█████▊    | 121/210 [01:58<04:39,  3.14s/pipeline]
Optimization Progress:  63%|██████▎   | 132/210 [02:02<02:45,