In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv('../cases/Concrete Strength/Concrete_Data.csv')
df.head()

Unnamed: 0,Cement,Blast,Fly,Water,Superplasticizer,Coarse,Fine,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
df.isnull().sum()

Cement              0
Blast               0
Fly                 0
Water               0
Superplasticizer    0
Coarse              0
Fine                0
Age                 0
Strength            0
dtype: int64

In [4]:
df.shape

(1030, 9)

In [7]:
X = df.drop('Strength', axis=1)
y = df['Strength']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
gbr = GradientBoostingRegressor(random_state=42, verbose=1)
params = {
    'learning_rate': [0.001, 0.01, 0.1],
    'n_estimators': [100, 200, 300, 500],
    'min_samples_split': [2, 3, 4, 5],
    'max_depth': [3, 4, 5]
}
gcv = GridSearchCV(gbr, param_grid=params, cv=5, n_jobs=-1,
                   scoring='neg_mean_squared_error')
gcv.fit(X, y)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1         237.9132          37.1346            1.49s
         2         213.4422          40.3123            1.38s
         3         159.0646          -2.3599            1.48s
         4         145.8802          30.1738            1.40s
         5         139.2695          32.1612            1.41s
         6         127.7283          21.7915            1.51s
         7         113.9092          13.8328            1.50s
         8          90.2346          -0.1604            1.48s
         9          88.3406          19.2086            1.52s
        10          79.9901           8.9684            1.53s
        20          35.4960          -0.0202            1.44s
        30          24.1291          -0.2883            1.41s
        40          17.8366          -0.2582            1.36s
        50          14.3787          -1.4676            1.30s
        60          13.4785           1.6916            1.34s
       

In [12]:
print(gcv.best_params_)
print(gcv.best_score_)

{'learning_rate': 0.1, 'max_depth': 4, 'min_samples_split': 5, 'n_estimators': 300, 'subsample': 0.5}
-87.54020862592007


In [15]:
bm = gcv.best_estimator_
y_pred = bm.predict(X_test)
r2_score(y_test, y_pred)



-3.551598094159708