In [15]:
# Basic Import
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [16]:
# Load the concrete mix dataset from a CSV expected in the notebook's working directory.
df = pd.read_csv('Concrete_Data.csv')

In [17]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [18]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
Cement (component 1)(kg in a m^3 mixture)                1030 non-null float64
Blast Furnace Slag (component 2)(kg in a m^3 mixture)    1030 non-null float64
Fly Ash (component 3)(kg in a m^3 mixture)               1030 non-null float64
Water  (component 4)(kg in a m^3 mixture)                1030 non-null float64
Superplasticizer (component 5)(kg in a m^3 mixture)      1030 non-null float64
Coarse Aggregate  (component 6)(kg in a m^3 mixture)     1030 non-null float64
Fine Aggregate (component 7)(kg in a m^3 mixture)        1030 non-null float64
Age (day)                                                1030 non-null int64
Concrete compressive strength(MPa, megapascals)          1030 non-null float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [21]:
# Count the rows that exactly repeat an earlier row.
df.duplicated().sum()

25

In [28]:
# Drop exact duplicate rows. Rebinding `df` instead of using inplace=True
# leaves the originally loaded frame object untouched and keeps the step chainable.
df = df.drop_duplicates()

Preparing X and Y variables

In [29]:
# Feature matrix: every column except the compressive-strength target.
# The column key ends with a space; it is reproduced exactly as the original cell wrote it.
X = df.drop(columns=['Concrete compressive strength(MPa, megapascals) '])

In [30]:
# Target vector: the compressive-strength column (key kept byte-for-byte, trailing space included).
y = df.loc[:, 'Concrete compressive strength(MPa, megapascals) ']

In [31]:
# Feature matrix dimensions as (rows, columns).
X.shape

(1005, 8)

In [32]:
# Target vector dimensions; the row count should match X.
y.shape

(1005,)

In [33]:
# Separate the dataset into train and test partitions: 20% of the rows are held
# out for testing, and the fixed random_state keeps the split reproducible.
# train_test_split is imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((804, 8), (201, 8))

In [34]:
def evaluate_model(true, predicted):
    """Score regression predictions against the observed targets.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error, and the R2 coefficient of determination, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # RMSE is the square root of the MSE; the MSE itself is not returned,
    # so it is computed once here rather than kept in a separate unused local.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [35]:
# Seed shared by every stochastic estimator so that re-running the notebook
# reproduces the same scores and the same model ranking.
RANDOM_STATE = 42

# Candidate regressors, left at library defaults apart from the seed and
# CatBoost's verbose=False.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest Regressor": RandomForestRegressor(random_state=RANDOM_STATE),
    "XGBRegressor": XGBRegressor(random_state=RANDOM_STATE),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False, random_state=RANDOM_STATE),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=RANDOM_STATE),
}

# Parallel lists consumed by the results table: model names and their test-set R2.
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    # Only the held-out R2 is kept for the ranking.
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 10.0018
- Mean Absolute Error: 7.9599
- R2 Score: 0.6099
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 11.1913
- Mean Absolute Error: 8.8953
- R2 Score: 0.5802


Lasso
Model performance for Training set
- Root Mean Squared Error: 10.0049
- Mean Absolute Error: 7.9641
- R2 Score: 0.6096
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 11.1846
- Mean Absolute Error: 8.8945
- R2 Score: 0.5807


Ridge
Model performance for Training set
- Root Mean Squared Error: 10.0018
- Mean Absolute Error: 7.9599
- R2 Score: 0.6099
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 11.1913
- Mean Absolute Error: 8.8953
- R2 Score: 0.5802


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 7.4639
- Mean Absolute Error: 5.5137
- R2 Score: 0.7827
-----------------

  if getattr(data, 'base', None) is not None and \


XGBRegressor
Model performance for Training set
- Root Mean Squared Error: 3.8650
- Mean Absolute Error: 2.8722
- R2 Score: 0.9417
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.4985
- Mean Absolute Error: 4.0289
- R2 Score: 0.8987


CatBoosting Regressor
Model performance for Training set
- Root Mean Squared Error: 1.8651
- Mean Absolute Error: 1.2405
- R2 Score: 0.9864
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4.2333
- Mean Absolute Error: 2.5480
- R2 Score: 0.9399


AdaBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 7.0157
- Mean Absolute Error: 5.8683
- R2 Score: 0.8080
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 7.9753
- Mean Absolute Error: 6.4897
- R2 Score: 0.7868




Results

In [36]:
# Rank the models by their test-set R2, best first.
(
    pd.DataFrame({'Model Name': model_list, 'R2_Score': r2_list})
    .sort_values(by='R2_Score', ascending=False)
)

Unnamed: 0,Model Name,R2_Score
7,CatBoosting Regressor,0.939929
5,Random Forest Regressor,0.909198
6,XGBRegressor,0.898657
4,Decision Tree,0.882245
8,AdaBoost Regressor,0.78679
3,K-Neighbors Regressor,0.697188
1,Lasso,0.580673
2,Ridge,0.580171
0,Linear Regression,0.58017


Linear Regression

In [38]:
# Refit a linear regression on the training split and report its test-set R2
# scaled to a percentage.
# NOTE(review): the printed message labels this figure "Accuracy", but the value
# is r2_score * 100, not a classification accuracy.
lin_model = LinearRegression(fit_intercept=True).fit(X_train, y_train)
y_pred = lin_model.predict(X_test)
score = 100 * r2_score(y_test, y_pred)
print(" Accuracy of the model is %.2f" % score)

 Accuracy of the model is 58.02


Difference between Actual and Predicted Values

In [39]:
# Side-by-side view of observed vs. predicted test targets; the residual column
# is actual minus predicted, so a positive value means the model under-predicted.
pred_df = pd.DataFrame({'Actual Value': y_test, 'Predicted Value': y_pred})
pred_df['Difference'] = pred_df['Actual Value'] - pred_df['Predicted Value']
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
951,19.01,23.460192,-4.450192
654,24.29,16.703043,7.586957
706,26.32,22.452090,3.867910
538,34.57,37.396243,-2.826243
389,44.13,42.885008,1.244992
...,...,...,...
232,50.77,30.605747,20.164253
802,31.65,26.974240,4.675760
358,66.95,46.913969,20.036031
234,13.18,24.692249,-11.512249
