In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

# import datetime class from datetime module

from datetime import datetime

In [2]:
# load the Concrete dataset 

concrete = pd.read_csv(r"C:\Users\Admin\Downloads\Raju Sir DLS\Ensemble Learning\Concrete_Data_V1.0.csv", header=0)

# copy the file to back-up file

concrete_bk = concrete.copy()

# display first 5 records

concrete.head()

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
# Display concrete data information

concrete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Cement            1030 non-null   float64
 1   Slag              1030 non-null   float64
 2   Flyash            1030 non-null   float64
 3   Water             1030 non-null   float64
 4   SuperPlasticizer  1030 non-null   float64
 5   CoarseAggregate   1030 non-null   float64
 6   FineAggregate     1030 non-null   float64
 7   Age               1030 non-null   int64  
 8   CSinMPa           1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [4]:
# Prepare cols1 for scaling

cols1 = ['Cement', 'Slag', 'Flyash', 'Water', 'SuperPlasticizer', 'CoarseAggregate', 'FineAggregate', 'Age']

In [5]:
# Identify the independent and Target variables

IndepVar = []
for col in concrete.columns:
    if col != 'CSinMPa':
        IndepVar.append(col)

TargetVar = 'CSinMPa'

x = concrete[IndepVar]
y = concrete[TargetVar]

In [6]:
# Split the data into train and test

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=143)

# Display the shape of the train_data and test_data

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((721, 8), (309, 8), (721,), (309,))

In [7]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train[cols1] = mmscaler.fit_transform(x_train[cols1])
x_train = pd.DataFrame(x_train)

x_test[cols1] = mmscaler.fit_transform(x_test[cols1])
x_test = pd.DataFrame(x_test)

# Gradient Boosting Regressor Algorithm

In [8]:
# Build the model with Gradient Boosting Regressor

from sklearn.ensemble import GradientBoostingRegressor  

modelGBR = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, 
                                     subsample=1.0,
                                     criterion='friedman_mse', 
                                     min_samples_split=2, 
                                     min_samples_leaf=1,
                                     min_weight_fraction_leaf=0.0, 
                                     max_depth=3, 
                                     min_impurity_decrease=0.0,
                                     init=None, 
                                     random_state=None, max_features=None,
                                     alpha=0.9, verbose=0, 
                                     max_leaf_nodes=None, warm_start=False,
                                     validation_fraction=0.1, 
                                     n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)
modelGBR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred = modelGBR.predict(x_test)

# Evaluation metrics for Regression analysis

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

# Define the function to calculate the MAPE - Mean Absolute Percentage Error

def MAPE (y_test, y_pred): 
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Evaluation of MAPE 

result = MAPE(y_test, y_pred)
print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 4.822
Mean Squared Error (MSE): 42.295
Root Mean Squared Error (RMSE): 6.503
R2_score: 0.838561
Root Mean Squared Log Error (RMSLE): 1.872
Mean Absolute Percentage Error (MAPE): 15.844 %
Adj R Square:  0.837296


In [9]:
Results = pd.DataFrame({'CSinMPa_A':y_test, 'CSinMPa_P':y_pred})

# Merge two Dataframes on index of both the dataframes

ResultsFinal = concrete_bk.merge(Results, left_index=True, right_index=True)
ResultsFinal.sample(10)

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa,CSinMPa_A,CSinMPa_P
425,167.0,75.4,167.0,164.0,7.9,1007.3,770.1,14,32.9,32.9,29.713209
823,322.0,0.0,0.0,203.0,0.0,974.0,800.0,180,29.59,29.59,37.425121
265,212.0,0.0,124.8,159.0,7.8,1085.4,799.5,14,31.35,31.35,21.622967
117,313.3,262.2,0.0,175.5,8.6,1046.9,611.8,28,59.8,59.8,51.393427
435,167.4,129.9,128.6,175.5,7.8,1006.3,746.6,28,41.2,41.2,28.223175
858,322.0,149.0,0.0,186.0,8.0,951.0,709.0,28,52.42,52.42,45.978973
197,194.7,0.0,100.5,165.6,7.5,1006.4,905.9,56,33.96,33.96,36.56103
495,387.0,20.0,94.0,157.0,13.9,938.0,845.0,56,47.97,47.97,57.070984
544,289.0,0.0,0.0,192.0,0.0,913.2,895.3,7,14.6,14.6,14.027524
128,401.8,94.7,0.0,147.4,11.4,946.8,852.1,28,68.5,68.5,59.245099


In [10]:
# Calculate the Max, Min and ave values of target variable

print('Min of CSinMPa:', concrete.CSinMPa.min())
print('Max of CSinMPa:', concrete.CSinMPa.max())
print('Ave of CSinMPa:', concrete.CSinMPa.mean())

Min of CSinMPa: 2.33
Max of CSinMPa: 82.6
Ave of CSinMPa: 35.81796116504851


In [11]:
# Load the result dataset

RGRResults = pd.read_csv(r"C:\Users\Admin\Downloads\Raju Sir DLS\Ensemble Learning\RGRResults.csv", header=0)
RGRResults.head()

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score


# Gradient Boosting Regressor & Compare with all regressors

In [18]:
# Build the Regression / Regressor models

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Create objects of Regression / Regressor models with default hyper-parameters

modelmlg = LinearRegression()
modeldcr = DecisionTreeRegressor()
modelrfr = RandomForestRegressor()
modelABR = AdaBoostRegressor(n_estimators=50, base_estimator=None ,learning_rate=1)
modelSVR = SVR()
modelXGR = xgb()
modelKNN = KNeighborsRegressor(n_neighbors=5)
modelETR = ExtraTreesRegressor()
modelMLPR = MLPRegressor()

modelGBR = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
                                     criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
                                     min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                     init=None, random_state=None, max_features=None,
                                     alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False,
                                     validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)

# Evalution matrix for all the algorithms

MM = [modelmlg, modeldcr, modelrfr, modelABR, modelSVR, modelXGR, modelKNN, modelETR, modelMLPR, modelGBR]

for models in MM:
    
    # Fit the model with train data
    
    models.fit(x_train, y_train)
    
    # Predict the model with test data

    y_pred = models.predict(x_test)
    
    # Print the model name
    
    print('Model Name: ', models)
    
    # Evaluation metrics for Regression analysis

    from sklearn import metrics

    print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
    print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
    print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
    print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
    print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
    
    # Define the function to calculate the MAPE - Mean Absolute Percentage Error

    def MAPE (y_test, y_pred):
        y_test, y_pred = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Evaluation of MAPE 

    result = MAPE(y_test, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
    
    # Calculate Adjusted R squared values 

    r_squared = round(metrics.r2_score(y_test, y_pred),6)
    adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
    print('Adj R Square: ', adjusted_r_squared)
    print('------------------------------------------------------------------------------------------------------------')
    #-------------------------------------------------------------------------------------------
    new_row = {'Model Name' : models,
               'Mean_Absolute_Error_MAE' : metrics.mean_absolute_error(y_test, y_pred),
               'Adj_R_Square' : adjusted_r_squared,
               'Root_Mean_Squared_Error_RMSE' : np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
               'Mean_Absolute_Percentage_Error_MAPE' : result,
               'Mean_Squared_Error_MSE' : metrics.mean_squared_error(y_test, y_pred),
               'Root_Mean_Squared_Log_Error_RMSLE': np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
               'R2_score' : metrics.r2_score(y_test, y_pred)}
    RGRResults = RGRResults.append(new_row, ignore_index=True)
    #-------------------------------------------------------------------------------------------

Model Name:  LinearRegression()
Mean Absolute Error (MAE): 8.386
Mean Squared Error (MSE): 115.568
Root Mean Squared Error (RMSE): 10.75
R2_score: 0.558885
Root Mean Squared Log Error (RMSLE): 2.375
Mean Absolute Percentage Error (MAPE): 31.33 %
Adj R Square:  0.555429
------------------------------------------------------------------------------------------------------------
Model Name:  DecisionTreeRegressor()
Mean Absolute Error (MAE): 5.978
Mean Squared Error (MSE): 76.467
Root Mean Squared Error (RMSE): 8.745
R2_score: 0.70813
Root Mean Squared Log Error (RMSLE): 2.168
Mean Absolute Percentage Error (MAPE): 19.22 %
Adj R Square:  0.705843
------------------------------------------------------------------------------------------------------------
Model Name:  RandomForestRegressor()
Mean Absolute Error (MAE): 4.641
Mean Squared Error (MSE): 40.903
Root Mean Squared Error (RMSE): 6.396
R2_score: 0.843878
Root Mean Squared Log Error (RMSLE): 1.856
Mean Absolute Percentage Error (MAPE

In [20]:
# Results with comparing the all the algorithms 

RGRResults.to_csv("C://Users//Admin//Downloads//Raju Sir DLS//Ensemble Learning//RGRResults.csv")

RGRResults.head(15)

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score
0,LinearRegression(),8.385699,0.555429,10.750258,31.326908,115.568039,2.37493,0.558885
1,DecisionTreeRegressor(),6.022853,0.695044,8.903613,19.140255,79.274317,2.186457,0.697415
2,"(DecisionTreeRegressor(max_features='auto', ra...",4.838488,0.830056,6.646629,14.773628,44.177675,1.89411,0.831377
3,"(DecisionTreeRegressor(max_depth=3, random_sta...",6.687897,0.733656,8.320872,25.151152,69.236915,2.118767,0.735727
4,SVR(),8.445729,0.537147,10.969064,32.725504,120.320371,2.395079,0.540745
5,"XGBRegressor(base_score=0.5, booster='gbtree',...",5.066323,0.822676,6.789394,16.231458,46.095873,1.915362,0.824055
6,KNeighborsRegressor(),7.276524,0.650777,9.527948,26.921693,90.78179,2.254229,0.653492
7,"(ExtraTreeRegressor(random_state=1753356582), ...",4.385799,0.853519,6.170752,13.443835,38.078176,1.819821,0.854658
8,MLPRegressor(),10.10084,0.401075,12.477693,43.759423,155.692812,2.523942,0.405731
9,([DecisionTreeRegressor(criterion='friedman_ms...,4.8177,0.837539,6.498646,15.813317,42.232394,1.871594,0.838802
