# Context: Concrete strength dataset

In this notebook, we predict concrete compressive strength with a gradient boosting regressor, using the following features: Cement, Blast Furnace Slag, Fly Ash, Water, Superplasticizer, Coarse Aggregate, Fine Aggregate, and Age.

In [None]:
# Importing necessary libraries

import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
# Read the concrete dataset CSV into a DataFrame and preview its first rows.
DATA_PATH = '/content/drive/MyDrive/Gradient boosting regression/concrete_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [None]:
# Summarizing the dataset
#
# Only the last expression of a cell is rendered automatically, so the shape
# must be printed explicitly; as a bare statement its value was discarded.
print(df.shape)

# info() prints its own report (column names, non-null counts, dtypes).
df.info()

# Descriptive statistics are the cell's final expression and render as a table.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Cement              1030 non-null   float64
 1   Blast Furnace Slag  1030 non-null   float64
 2   Fly Ash             1030 non-null   float64
 3   Water               1030 non-null   float64
 4   Superplasticizer    1030 non-null   float64
 5   Coarse Aggregate    1030 non-null   float64
 6   Fine Aggregate      1030 non-null   float64
 7   Age                 1030 non-null   int64  
 8   Strength            1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


It seems there are no null values present in the dataset and all the columns are numeric.

# Data Pre-processing

In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

In [None]:
# Flag rows that repeat an earlier row, then report the shape of that subset.
duplicate_mask = df.duplicated()
df.loc[duplicate_mask].shape

(25, 9)

# Separating the dataset into features (X) and target (y)

In [None]:
# Every column except the last forms the feature matrix; the last column is the target.
feature_frame = df.iloc[:, :-1]
target_series = df.iloc[:, -1]

x = feature_frame.to_numpy()
y = target_series.to_numpy()

# Splitting the dataset for training and testing

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=45,
)

In [None]:
# Shapes of the four split arrays, in the order: x_train, x_test, y_train, y_test.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((824, 8), (206, 8), (824,), (206,))

In [None]:
# (rows, columns) of the full dataset, for comparison with the split sizes.
(len(df), len(df.columns))

(1030, 9)

# Creating the *model*

In [None]:
# Build the gradient boosting regressor with default hyperparameters.
# random_state is fixed (same seed as the train/test split) so that the fitted
# ensemble and the R² score reported below are reproducible on a fresh
# Restart & Run All; without it the estimator draws its seed from global state.
model = GradientBoostingRegressor(random_state=45)
model

# Training the model

In [None]:
# Fit the regressor on the training features and their target values.
model.fit(X=x_train, y=y_train)

# Predicting the result using trained model

In [None]:
# Predict the target for every row of the held-out test features and show the result.
y_pred = model.predict(X=x_test)
y_pred

array([16.34825889,  9.2808359 , 33.3603171 , 70.95513983, 38.93306413,
       26.21752251, 27.31863928, 35.07293786, 33.54279449, 19.38189132,
       44.99875293, 27.87333323, 13.36261161, 14.06116284, 16.88356329,
       40.30851805, 60.2683929 ,  8.74389493, 31.10946687, 40.01495909,
       27.45602436, 30.109577  , 48.25055499, 46.32960431, 47.14969822,
       72.02295136, 25.56880917, 22.02801743, 34.21666142, 35.02793215,
       17.03917688, 13.0657306 , 31.16846794, 57.19536686, 30.94220938,
       33.62249002, 27.10190893, 34.46329882, 40.46107896, 29.64297197,
       15.37129203, 42.8579124 , 22.2516449 , 60.11087302, 34.8352127 ,
       21.52628772, 43.4349893 , 34.32881527, 34.14757433, 45.41698229,
       55.25729649, 39.10367109, 58.39334812, 43.08486477, 17.96103358,
       32.85821796, 24.41183441, 26.82344142, 37.09339319, 38.7371936 ,
       24.37438006, 17.2174431 , 51.68203904, 25.95899613, 62.34102018,
        5.67691661, 42.20140221, 25.92941368, 37.8250699 , 27.13

# Calculating the score

In [None]:
# Coefficient of determination (R²) of the test-set predictions against the true values.
r2_score(y_true=y_test, y_pred=y_pred)

0.8957293805681384

In [None]:
# Predicting the result for a single sample
#
# Indexing with a one-element list keeps the row two-dimensional (1 row x n features),
# which is the input layout predict() is given everywhere else in this notebook.
sample_index = 10
single_sample = x_test[[sample_index]]

model.predict(single_sample)

array([44.99875293])