# Structural Concrete Strength Prediction Model with 85% Accuracy 

In [None]:
#Main Library Imports 
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
plt.style.use('ggplot')
%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    paths = [os.path.join(dirname, filename) for filename in filenames]
    if paths:
        print('\n'.join(paths))

In [None]:
# Load the data set and preview the first 5 rows (units: kg/m^3, days and MPa).
concCompStrength = pd.read_csv(
    '/kaggle/input/concrete-compressive-strength/Concrete Compressive Strength.csv'
)

# Short column names are easier to reference in the cells below.
concCompStrength.columns = [
    'cement', 'slag', 'fly ash', 'water', 'superplasticizer',
    'coarse agg', 'fine agg', 'age', 'strength',
]

# Derived feature: water-to-cement ratio, inserted at position 7 (just before 'age').
concCompStrength.insert(
    loc=7,
    column='wc ratio',
    value=concCompStrength['water'] / concCompStrength['cement'],
)
concCompStrength.head()

I added one new column to the data based on my experience with concrete mix design. The ratio of water to cement is often a better indicator of strength than water content or cement content alone. 

# Explore The Data 

In [None]:
# Remember the row count before any filtering, then display (rows, columns).
originalRows = len(concCompStrength)
concCompStrength.shape

We have 1030 concrete mix samples and 9 variables to predict the strength

In [None]:
# Per-column flag: True if the column contains at least one missing value.
concCompStrength.isna().any()

None of the values are missing which is good

In [None]:
# Descriptive statistics of every column, computed on the full data set
# (no rows have been filtered out yet at this point).
concCompStrength.describe()

I noticed that the minimum age here is 1 day which is not nearly enough for structural concrete to gain its full strength. 
"Concrete gains 16% strength in one day, 40% in 3 days, 65% in 7 days, 90% in 14 days, and 99% strength in 28 days" (from theconstructor.org). Based on this I decided to remove all samples with strength measured before 28 days.

In [None]:
# Keep only mixes tested at 28 days or later: build a mask of the "too young"
# rows and select its complement.
tooYoung = concCompStrength['age'] < 28
concCompStrength = concCompStrength[~tooYoung]
concCompStrength.head(10)

In [None]:
# Descriptive statistics again, now that mixes younger than 28 days are removed.
concCompStrength.describe()

When I removed the mixes with age less than 28 days, this got rid of 324 samples. The min strength is now a bit more reasonable and the variation in the overall strength is lower 

# Exploratory Data Analysis 

In [None]:
# Box plot of the whole DataFrame on a 15x10 figure, used to spot outliers.
# The returned axes object is kept in `box`.
plt.figure(figsize = (15,10))
box = sns.boxplot(data=concCompStrength)

The water, superplasticizer, fine aggregate, wc ratio, and age variables all contain some outliers, so I decided to remove the outliers, defined as values more than 3 standard deviations away from the mean.

In [None]:
# Outlier removal: a row is kept only if every one of its columns lies within
# 3 standard deviations of that column's mean, i.e. |z| < 3 where
# z = (value - mean) / STD.
z_scores = stats.zscore(concCompStrength)
abs_z_scores = np.absolute(z_scores)

filtered_entries = (abs_z_scores < 3).all(axis=1)
concCompStrength = concCompStrength.loc[filtered_entries]

concCompStrength.head(10)

In [None]:
# Descriptive statistics after the outlier rows have been removed.
concCompStrength.describe()

Removing outliers got rid of 32 samples. However, most of the descriptive statistics for strength remain the same. Also, some rows with very high age were removed. This is okay because an older concrete will not be as representative of its original strength. 

In [None]:
from collections import Counter

# Mixes with a strength below this threshold (MPa) are treated as "weak"
# (usually non-structural); everything else is "normal".
# Question to explore: is there any feature that makes a mix weaker?
weakVal = 15
strengthColumn = concCompStrength['strength']

weakMixes = concCompStrength[strengthColumn < weakVal]
normalMixes = concCompStrength[strengthColumn >= weakVal]

# Label every row, count the two groups, then plot and print the counts.
weakOrNorm = np.where(strengthColumn < weakVal, 'Weak', 'Normal')
counts = Counter(weakOrNorm)
plt.bar(counts.keys(), counts.values())
print(counts)

At this point I was concerned with the minimum strength of the samples being only 8 MPa. Structural concrete typically has a strength greater than 15 MPa, and anything less than that is mainly used as backfill or for purposes where the strength is not so important. In this dataset there are 17 weak mixes with strength less than 15 MPa. 

In [None]:
# 3x3 grid of scatter plots: each of the first nine columns of the weak mixes
# plotted against strength.
plt.figure(figsize=(15, 15))
for idx, col in enumerate(weakMixes.columns[:9], start=1):
    plt.subplot(3, 3, idx)
    sns.scatterplot(data=weakMixes, x=col, y="strength")

It is clear that all the weak mixes have no slag content. Although 15 samples is not really enough to draw any conclusion from this, it is still an interesting find that could be investigated with more data. Another note to make here is that the w/c ratios are all greater than 1 except for one sample. This is expected, since having more water than cement typically weakens a mix.

In [None]:
# Annotated heatmap of the pairwise correlations between the weak-mix columns.
plt.figure(figsize=(8, 8))
weakCorrAxes = sns.heatmap(weakMixes.corr(), annot=True)
weakCorrAxes.set_title(f'Weak Mixes (<{weakVal}MPa)')

The slag content and age do not show up here because their values do not change. Every weak mix has the same age and the same slag content. Also, water content has a high negative correlation. This means that more water = weaker mix. Coarse aggregate has a pretty high positive correlation, meaning more coarse agg = stronger mix.

In [None]:
# Histograms of the normal-strength mixes on a 3x3 grid.
# The 9th column is skipped so that the 10th takes the last grid slot.
plt.figure(figsize=(15, 15))
histColumns = list(normalMixes.columns[:8]) + list(normalMixes.columns[9:])
for idx, col in enumerate(histColumns, start=1):
    plt.subplot(3, 3, idx)
    sns.histplot(data=normalMixes, x=col)

Note that for the variables slag, fly ash, and superplasticizer, we have a lot of mixes with a very small amount of them (i.e., close to 0). In general, these components are the most expensive in concrete mixes and can be thought of as "add-ons", so it is reasonable that most mixes will not contain a lot of these components. I also notice that for fly ash there is not much data between 25-75 kg/m^3, so I expect predictions for mixes with fly ash content in this range will not be very good.

In [None]:
# 3x3 grid of scatter plots: each of the first nine columns of the normal mixes
# (strength >= 15 MPa) plotted against strength.
plt.figure(figsize=(15, 15))
for idx, col in enumerate(normalMixes.columns[:9], start=1):
    plt.subplot(3, 3, idx)
    sns.scatterplot(data=normalMixes, x=col, y="strength")

In [None]:
# Annotated heatmap of the pairwise correlations between the normal-mix columns.
plt.figure(figsize=(8, 8))
normalCorrAxes = sns.heatmap(normalMixes.corr(), annot=True)
normalCorrAxes.set_title(f'Normal Mixes (>{weakVal}MPa)')

In general, for normal strength mixes, the WC ratio is correlated most with the strength at -0.6. We can see here that the new variable of WC ratio is probably more useful than cement or water content alone. Coarse aggregate seems to be mostly irrelevant in normal mix strength. In weak mixes this was not the case.

# Setting up Numerical Model 

In [None]:
# Standardise the predictors for the ML algorithms: StandardScaler centres
# every column on a mean of 0.
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split performed in the next cell.
from sklearn.preprocessing import StandardScaler

# Target and predictors of the normal-strength mixes.
strength = normalMixes.strength
variables = normalMixes.drop('strength', axis=1)

sc = StandardScaler()
scaledVars = pd.DataFrame(sc.fit_transform(variables), columns=variables.columns)
scaledVars.head()

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split the same from run to run.
from sklearn.model_selection import train_test_split

trainVars, testVars, trainStrength, testStrength = train_test_split(
    scaledVars,
    strength,
    test_size=0.30,
    random_state=0,
)

In [None]:
#define the ML regression models to use 
from sklearn.linear_model import SGDRegressor,GammaRegressor,Lasso,GammaRegressor,ElasticNet,Ridge,LinearRegression
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor  
from sklearn.tree import DecisionTreeRegressor 
from sklearn.neighbors import KNeighborsRegressor
# Import model evaluation Tools 
from sklearn.model_selection import learning_curve, validation_curve, GridSearchCV 

# One default-configured instance of every candidate regressor.

# Linear models
lr = LinearRegression()
lasso = Lasso()
ridge = Ridge()
sgd = SGDRegressor()

# Neighbour- and single-tree models
knn = KNeighborsRegressor()
dt = DecisionTreeRegressor()

# Ensemble models
rf = RandomForestRegressor()
etr = ExtraTreesRegressor()
bagging = BaggingRegressor()
gboost = GradientBoostingRegressor()
adboost = AdaBoostRegressor()

# Train All The Models

In [None]:
# Fit every candidate regressor on the training split and record the r-squared
# score returned by .score() on both the training and the test split.
models=[lr,knn,rf,dt,lasso,sgd,ridge,gboost,bagging,adboost,etr]
modelNames = ['LR','KNN','RF','DT','Lasso','SGD','Ridge','GBoost','Bagging','ADBoost','ETR']

trainAccuracy=[]
testAccuracy=[]
for model in models:
    #fit the model to the data
    model.fit(trainVars,trainStrength)
    #get the r-squared score
    trainAccuracy.append(model.score(trainVars,trainStrength))
    testAccuracy.append(model.score(testVars,testStrength))

# Build the summary table from a dict of columns so the two score columns keep
# a numeric dtype. Transposing a list of rows that mixes strings and floats
# would leave every column as object dtype.
mod = pd.DataFrame({
    'model': modelNames,
    'train score': trainAccuracy,
    'test score': testAccuracy,
})

In [None]:
# Bar chart of the test score of every model, with the rounded score written
# at the top of each bar.
plt.figure(figsize=(20, 8))
bars = sns.barplot(data=mod, x='model', y='test score')
for position, value in enumerate(mod['test score']):
    bars.text(position, value, round(value, 4), ha='center')
# Show only the 0.5-1.0 score range on the y-axis.
plt.ylim(0.50, 1)
plt.show()

The Extra Trees Regressor is the most accurate but Random Forest, Gradient Boost, and Bagging are also intriguing. However, to further simplify the model, it may be useful to remove some variables such as water & cement since the w/c ratio is already included. Also since coarse aggregate has a very low correlation with strength (-0.06) it is probably okay to try and remove this variable as well.

In [None]:
# Simplified feature set: drop water and cement (the w/c ratio column is kept)
# together with coarse aggregate.
droppedColumns = ['water', 'cement', 'coarse agg']
variables2 = variables.drop(columns=droppedColumns)

# Re-fit the existing scaler `sc` on the reduced feature set and wrap the
# result back into a DataFrame with the remaining column names.
scaledVars2 = pd.DataFrame(sc.fit_transform(variables2), columns=variables2.columns)

In [None]:
# Split the reduced feature set 70/30 (same random_state as the first split),
# refit every model in `models` on it and record the r-squared scores.
trainVars2,testVars2,trainStrength2,testStrength2=train_test_split(scaledVars2,strength,test_size=.30,random_state=0)

trainAccuracy2=[]
testAccuracy2=[]
for model in models:
    #fit the model to the data
    model.fit(trainVars2,trainStrength2)
    #get the r-squared score
    trainAccuracy2.append(model.score(trainVars2,trainStrength2))
    testAccuracy2.append(model.score(testVars2,testStrength2))

# Build the summary table from a dict of columns so the two score columns keep
# a numeric dtype. Transposing a list of rows that mixes strings and floats
# would leave every column as object dtype.
mod2 = pd.DataFrame({
    'model': modelNames,
    'train score': trainAccuracy2,
    'test score': testAccuracy2,
})

In [None]:
# Bar chart of the test scores obtained with the reduced feature set, with the
# rounded score written at the top of each bar.
plt.figure(figsize=(20, 8))
bars2 = sns.barplot(data=mod2, x='model', y='test score')
for position, value in enumerate(mod2['test score']):
    bars2.text(position, value, round(value, 4), ha='center')
# Show only the 0.5-1.0 score range on the y-axis.
plt.ylim(0.50, 1)
plt.show()

After removing the water, cement, and coarse aggregate variables, the test score accuracy does not change significantly (less than 1%), so it is better to use this simpler model. Since RF and GBoost performed the best in this quick test, I will use a grid search and cross validation to validate results and optimize parameters.

# Model Optimization and Validation

In [None]:
# Hyper-parameter tuning of the gradient boosting regressor: exhaustive grid
# search with 5-fold cross-validation on the reduced-feature training split.
# NOTE(review): the accepted 'loss' and 'criterion' names depend on the
# installed scikit-learn version ('ls'/'lad' and 'mse'/'mae' are older
# spellings) - confirm them against the version in use.
grid_params = {
    'loss': ['ls', 'lad', 'huber', 'quantile'],
    'n_estimators': [1,2,5,10,20,50,100],
    'criterion' : ['friedman_mse', 'mse', 'mae'],
    # An integer min_samples_split must be at least 2, so 1 is not a valid
    # candidate; it only produced failed fits.
    'min_samples_split' : [2,3,4,5],
    'min_samples_leaf' : [1,2,3,4,5]
}

# n_jobs=-1 runs the fits on all available cores.
grid_search = GridSearchCV(gboost, grid_params, cv = 5, n_jobs = -1)
grid_search.fit(trainVars2, trainStrength2)

# best parameters and best score
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')

In [None]:
# Hyper-parameter tuning of the random forest regressor: exhaustive grid
# search with 5-fold cross-validation on the reduced-feature training split.
# This overwrites the `grid_params` and `grid_search` names of the previous
# search.
grid_params = {
    'n_estimators': [1,2,5,10,20,50,100],
    'max_depth' : [None,3,5,7,9,10,20],
    # An integer min_samples_split must be at least 2, so 1 is not a valid
    # candidate; it only produced failed fits.
    'min_samples_split' : [2,3,4,5],
    'min_samples_leaf' : [1,2,3,4,5]
}

# n_jobs=-1 runs the fits on all available cores.
grid_search = GridSearchCV(rf, grid_params, cv = 5, n_jobs = -1)
grid_search.fit(trainVars2, trainStrength2)

# best parameters and best score
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')

The Gradient Boost regressor has a higher accuracy of 84.2% so this model will be trained with those selected parameters from the grid search.

In [None]:
# Train the final gradient boosting model with the hyper-parameters selected
# from the grid search, then report its r-squared score on the held-out test set.
strengthPredictionModel = GradientBoostingRegressor(criterion='friedman_mse', loss='huber', min_samples_leaf=4, min_samples_split=3, n_estimators=100)
strengthPredictionModel.fit(trainVars2, trainStrength2)
# Score the model that was just trained. `model` is only the leftover loop
# variable of the earlier training loops (the last estimator in `models`).
finalTestScore = strengthPredictionModel.score(testVars2,testStrength2)
print(f'Final Test Score: {finalTestScore}')

In [None]:
# Compare the final model's predictions with the measured test strengths:
# plot the first `rowsToPlot` samples and print r-squared and RMSE.
from numpy import arange
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
plt.figure(figsize = (20, 8))
actual = testStrength2
predictions = strengthPredictionModel.predict(testVars2)

# Only the first few samples are plotted so individual points stay readable.
rowsToPlot = 30
plt.scatter(arange(len(predictions[:rowsToPlot])),predictions[:rowsToPlot])
plt.scatter(arange(len(actual[:rowsToPlot])),actual[:rowsToPlot],marker='^')
plt.legend(['prediction','actual'])
# The strength column is in MPa (see the units noted where the data is loaded).
plt.ylabel('Strength (MPa)')

r2 = r2_score(actual,predictions)
# Take the square root of the MSE explicitly instead of passing squared=False,
# which newer scikit-learn releases deprecate and then remove.
rmse = np.sqrt(mean_squared_error(actual,predictions))
print(f'r2:{r2}')
print(f'rmse:{rmse}')

The RMSE indicates that the typical error in the prediction is about 5.6 MPa, which is not bad. The final accuracy (the R² score on the test set) is 85%.