In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

In [2]:
# Load the 'Concrete' data

concrete = pd.read_csv(r"Concrete_Data_V1.0.csv", header=0)

# Copy the file to back-up

concrete_bk = concrete.copy()

# Display first 5 rows in the dataset

concrete.head()

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
# Display the information of dataset

concrete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Cement            1030 non-null   float64
 1   Slag              1030 non-null   float64
 2   Flyash            1030 non-null   float64
 3   Water             1030 non-null   float64
 4   SuperPlasticizer  1030 non-null   float64
 5   CoarseAggregate   1030 non-null   float64
 6   FineAggregate     1030 non-null   float64
 7   Age               1030 non-null   int64  
 8   CSinMPa           1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [4]:
# Display the columns
concrete.columns

Index(['Cement', 'Slag', 'Flyash', 'Water', 'SuperPlasticizer',
       'CoarseAggregate', 'FineAggregate', 'Age', 'CSinMPa'],
      dtype='object')

In [5]:
# prepare columns for scaling
cols1 = ['Cement', 'Slag', 'Flyash', 'Water', 'SuperPlasticizer',
       'CoarseAggregate', 'FineAggregate', 'Age']

In [7]:
# Identify the Independent and Target variables

IndepVar = []
for col in concrete.columns:
    if col != 'CSinMPa':
        IndepVar.append(col)

TargetVar = 'CSinMPa'

x = concrete[IndepVar]
y = concrete[TargetVar]

In [9]:
# Splitting the dataset into train and test 

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 42)

# Display the shape of train and test data 

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((721, 8), (309, 8), (721,), (309,))

In [10]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train[cols1] = mmscaler.fit_transform(x_train[cols1])
x_train = pd.DataFrame(x_train)

x_test[cols1] = mmscaler.fit_transform(x_test[cols1])
x_test = pd.DataFrame(x_test)

In [11]:
# Train the algorithm and build the model with train dataset

from sklearn.linear_model import LinearRegression

# Create model object

MulRGR = LinearRegression()

#LinearRegression(fit_intercept=True, normalize='deprecated', copy_X=True, n_jobs=None, positive=False)

# Train the model with training dataset (70%)

MulRGR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred = MulRGR.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

Mean Absolute Error (MAE): 8.355
Mean Squared Error (MSE): 111.763
Root Mean Squared Error (RMSE): 10.572
R2_score: 0.58694
Root Mean Squared Log Error (RMSLE): 2.358


In [12]:
# Define the function to calculate the MAPE - Mean Absolute Percentage Error

def MAPE (y_test, y_pred): 
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Evaluation of MAPE 

result = MAPE(y_test, y_pred)
print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')

Mean Absolute Percentage Error (MAPE): 32.463 %


In [13]:
# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Adj R Square:  0.583703


In [14]:
Results = pd.DataFrame({'CSinMPa_A':y_test, 'CSinMPa_P':y_pred})

# Merge two Dataframes on index of both the dataframes

ResultsFinal = concrete_bk.merge(Results, left_index=True, right_index=True)
ResultsFinal.sample(5)

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa,CSinMPa_A,CSinMPa_P
682,183.9,122.6,0.0,203.5,0.0,959.2,800.0,28,24.05,24.05,18.786191
580,290.2,193.5,0.0,185.7,0.0,998.2,704.3,7,21.86,21.86,37.915192
950,326.5,0.0,137.9,199.0,10.8,801.1,792.5,28,38.63,38.63,33.555912
453,190.3,0.0,125.2,166.6,9.9,1079.0,798.9,56,31.72,31.72,29.945226
371,218.9,0.0,124.1,158.5,11.3,1078.7,794.9,28,30.22,30.22,31.464104


# Compare withh all Regression / Regressors

In [16]:
# Load the results dataset

RegResults = pd.read_csv(r"E:\SDP-2\S7\RGRResults.csv", header=0)

# Display the first 5 records

RegResults.head()

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score


In [18]:
# Build the Regression / Regressor models

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

# Create objects of Regression / Regressor models with default hyper-parameters

ModelMLR= LinearRegression()
ModelDCR = DecisionTreeRegressor()
ModelRFR = RandomForestRegressor()
ModelETR = ExtraTreesRegressor()

MM = [ModelMLR, ModelDCR, ModelRFR, ModelETR]

for models in MM:
    
 # Fit the model with train data
    
    models.fit(x_train, y_train)
    
 # Predict the model with test data

    y_pred = models.predict(x_test)
    
 # Print the model name
    
    print('Model Name: ', models)
    
# Evaluation metrics for Regression analysis

    from sklearn import metrics

    print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
    print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
    print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
    print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
    print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
                                                
# Define the function to calculate the MAPE - Mean Absolute Percentage Error

    def MAPE (y_test, y_pred):
        y_test, y_pred = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Evaluation of MAPE 

    result = MAPE(y_test, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
# Calculate Adjusted R squared values 

    r_squared = round(metrics.r2_score(y_test, y_pred),6)
    adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
    print('Adj R Square: ', adjusted_r_squared)
    print('------------------------------------------------------------------------------------------------------------')
    
                                                        
#-------------------------------------------------------------------------------------------
    new_row = {'Model Name' : models,
               'Mean_Absolute_Error_MAE' : metrics.mean_absolute_error(y_test, y_pred),
               'Adj_R_Square' : adjusted_r_squared,
               'Root_Mean_Squared_Error_RMSE' : np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
               'Mean_Absolute_Percentage_Error_MAPE' : result,
               'Mean_Squared_Error_MSE' : metrics.mean_squared_error(y_test, y_pred),
               'Root_Mean_Squared_Log_Error_RMSLE': np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
               'R2_score' : metrics.r2_score(y_test, y_pred)}
    RegResults = RegResults.append(new_row, ignore_index=True)
    #-------------------------------------------------------------------------------------------

Model Name:  LinearRegression()
Mean Absolute Error (MAE): 8.355
Mean Squared Error (MSE): 111.763
Root Mean Squared Error (RMSE): 10.572
R2_score: 0.58694
Root Mean Squared Log Error (RMSLE): 2.358
Mean Absolute Percentage Error (MAPE): 32.46 %
Adj R Square:  0.583703
------------------------------------------------------------------------------------------------------------
Model Name:  DecisionTreeRegressor()
Mean Absolute Error (MAE): 4.974
Mean Squared Error (MSE): 54.098
Root Mean Squared Error (RMSE): 7.355
R2_score: 0.800064
Root Mean Squared Log Error (RMSLE): 1.995
Mean Absolute Percentage Error (MAPE): 17.22 %
Adj R Square:  0.798497
------------------------------------------------------------------------------------------------------------
Model Name:  RandomForestRegressor()
Mean Absolute Error (MAE): 3.965
Mean Squared Error (MSE): 32.891
Root Mean Squared Error (RMSE): 5.735
R2_score: 0.878438
Root Mean Squared Log Error (RMSLE): 1.747
Mean Absolute Percentage Error (MAP

In [19]:
# Results with comparing the all the algorithms 

#RGRResults.to_csv("D://00 Henotic//SRKR//Datasets//REsults//RGRResults.csv")

RegResults.head()

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score
0,LinearRegression(),8.354868,0.583703,10.57182,32.462977,111.763381,2.358192,0.58694
1,DecisionTreeRegressor(),4.974045,0.798497,7.355101,17.215273,54.097509,1.995394,0.800064
2,"(DecisionTreeRegressor(max_features=1.0, rando...",3.964597,0.877486,5.735109,13.286224,32.891476,1.746607,0.878438
3,"(ExtraTreeRegressor(random_state=298641149), E...",4.016977,0.878424,5.713119,13.294039,32.639729,1.742765,0.879369


In [20]:
# predict the values with Et Algorithm

y_predET = ModelETR.predict(x_test)

In [23]:
Results = pd.DataFrame({'CSinMPa_A':y_test, 'CSinMPa_P_F':y_predET})

# Merge two Dataframes on index of both the dataframes

ResultsFinal = concrete_bk.merge(Results, left_index=True, right_index=True)
ResultsFinal.sample(5)

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa,CSinMPa_A,CSinMPa_P_F
523,359.0,19.0,141.0,154.0,10.9,942.0,801.0,28,62.94,62.94,56.8246
394,405.0,0.0,0.0,175.0,0.0,1120.0,695.0,28,52.3,52.3,44.3584
9,475.0,0.0,0.0,228.0,0.0,932.0,594.0,28,39.29,39.29,40.2955
629,225.0,0.0,0.0,181.0,0.0,1113.0,833.0,7,11.17,11.17,10.7476
870,149.0,139.0,109.0,193.0,6.0,892.0,780.0,28,23.69,23.69,25.0817


In [24]:
# calculate % of error

ResultsFinal['%Error'] = round(((ResultsFinal['CSinMPa_A']-ResultsFinal['CSinMPa_P_F'])/ResultsFinal['CSinMPa_A'])*100,3)

In [25]:
ResultsFinal.sample(5)

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa,CSinMPa_A,CSinMPa_P_F,%Error
453,190.3,0.0,125.2,166.6,9.9,1079.0,798.9,56,31.72,31.72,38.5277,-21.462
902,331.0,170.0,0.0,195.0,8.0,811.0,802.0,28,56.61,56.61,54.7387,3.306
210,230.0,0.0,118.3,195.5,4.6,1029.4,758.6,14,20.08,20.08,20.4842,-2.013
86,362.6,189.0,0.0,164.9,11.6,944.7,755.8,3,35.3,35.3,34.6388,1.873
66,139.6,209.4,0.0,192.0,0.0,1047.0,806.9,360,44.7,44.7,40.5236,9.343
