# Concrete Compressive Strength Prediction

*Dataset :* [link](https://www.kaggle.com/maajdl/yeh-concret-data)

## Abstract:
- Concrete is the most important material in civil engineering.
- The concrete compressive strength is a highly nonlinear function of age and ingredients.

## Data Attributes Information:

Each attribute is listed with its name, data type, measurement unit and a brief description. Predicting the concrete compressive strength is a regression problem. The order of this listing corresponds to the order of the values within each row of the dataset.

Name -- Data Type -- Measurement -- Description

- Cement (component 1) -- quantitative -- kg in a m3 mixture -- Input Variable
- Blast Furnace Slag (component 2) -- quantitative -- kg in a m3 mixture -- Input Variable
- Fly Ash (component 3) -- quantitative -- kg in a m3 mixture -- Input Variable
- Water (component 4) -- quantitative -- kg in a m3 mixture -- Input Variable
- Superplasticizer (component 5) -- quantitative -- kg in a m3 mixture -- Input Variable
- Coarse Aggregate (component 6) -- quantitative -- kg in a m3 mixture -- Input Variable
- Fine Aggregate (component 7) -- quantitative -- kg in a m3 mixture -- Input Variable
- Age -- quantitative -- Day (1~365) -- Input Variable
- Concrete compressive strength -- quantitative -- MPa -- Output Variable

# ***Plan of attack***

***- Imputation***

***- Feature Selection***

***- Dimensionality Reduction***

***- Exploratory Data Analysis***

***- Dealing with outliers***

***- Feature Transformation*** (Standardization/Normalization)

***- Model Building*** (Iterative process)

***- Cross Validating*** (Iterative process)

***- Building secure deployment-ready pipelines***

***- Exporting binary file for production*** (Testing Environment)

# Imports

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import set_config
set_config(display='diagram')
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, StackingRegressor, BaggingRegressor
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
import missingno
from xgboost import XGBRegressor
from matplotlib import rcParams
import pickle



In [None]:
# Load the concrete dataset CSV from the Kaggle input directory and preview the first rows.
dataset = pd.read_csv('../input/yeh-concret-data/Concrete_Data_Yeh.csv')
dataset.head()

In [None]:
# Rename the 'csMPa' column to 'strength', then report the table size and preview it.
dataset = dataset.rename(columns={'csMPa': 'strength'})
print(dataset.shape)
dataset.head()

# Any missing values ?

In [None]:
# Use a large font so labels stay legible on the wide (30x10) figure.
plt.rcParams.update({'font.size': 30})
# Draw the missingno matrix for the whole dataset; the trailing semicolons
# suppress the notebook's text output for these calls.
missingno.matrix(dataset, figsize = (30,10));
# NOTE(review): the title is hard-coded, not computed from the data.
plt.title("No missing values");

# Splitting dataset into training set and test set

In [None]:
# Features are every column except the last; the last column is the target.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [None]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle seed.
# .values is passed, so the four results are plain NumPy arrays without column labels.
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size = 0.2, random_state=15)

In [None]:
# Show the dimensions of the training feature matrix.
X_train.shape

In [None]:
# Build a DataFrame holding the held-out rows (features + target) so the test
# split can be exported for use outside the notebook.
#
# reshape(-1, 1) returns a new column-shaped view, so y_test itself keeps its
# original shape, and the row count is inferred instead of being hard-coded
# to the size of one particular split.
temp = y_test.reshape(-1, 1)
testcsv = np.concatenate((X_test, temp), axis=1)
# The target is the last column of `dataset`, so its column order matches.
testcsv = pd.DataFrame(testcsv, columns=dataset.columns)
testcsv.head()

In [None]:
#testcsv.to_csv('concrete_strength_testing.csv', index=False)

In [None]:
# Wrap the training features in a DataFrame so columns can be addressed by name.
df = pd.DataFrame(X_train, columns = X.columns)
df.head()

# Feature Selection

In [None]:
# Annotated correlation heatmap over every column of the dataset.
plt.rcParams.update({'font.size': 18})
fig, ax = plt.subplots(figsize=(15, 15))
corr_all = dataset.corr()
sns.heatmap(
    corr_all,
    annot=True,
    vmin=-1,
    vmax=1,
    center=0,
    cmap='coolwarm',
    ax=ax,
);

In [None]:
# Fit an extra-trees regressor with default settings on the training split;
# its feature_importances_ are used to rank the input features.
# NOTE(review): no random_state is passed, so the ranking may differ between runs.
model = ExtraTreesRegressor()
model.fit(X_train, y_train)

In [None]:
# Horizontal bar chart of the fitted model's feature importances (largest 8).
fig = plt.figure(figsize=(10, 7))
plt.style.use('fivethirtyeight')
feature_imps = pd.Series(model.feature_importances_, index=df.columns)
top_imps = feature_imps.nlargest(8)
top_imps.plot(kind='barh')
plt.grid(True)
plt.title("Feature Selection", pad=50)
plt.xlabel('Feature Importance')
plt.show()

# How many features to extract/select ?

In [None]:
# Plot the cumulative explained-variance ratio of a PCA fitted on the training
# features, with guide lines at the 99%, 95% and 90% levels.
plt.figure(figsize=(7,5))
pca_dummy = PCA(n_components=None)  # no limit requested on the number of components
X_dummy = pca_dummy.fit_transform(X_train)
plt.plot(np.cumsum(pca_dummy.explained_variance_ratio_))
# NOTE(review): the curve is plotted against the 0-based array index, so x=5 is
# the sixth cumulative value — confirm that the guide positions and the title
# below refer to the intended number of components.
# Hand-placed guide lines: (variance level, x position, colour).
for level, x_at, colour in ((0.99, 5, 'r'), (0.95, 3.7, 'g'), (0.90, 3.3, 'm')):
    plt.axhline(y=level, c=colour, linewidth=1, linestyle='--')
    plt.axvline(x=x_at, c=colour, linestyle='--', linewidth=1)
plt.margins(0.1)
plt.title('Number of feature to select : 5', pad=30)
# Render the figure (an argument-less plt.plot() draws nothing).
plt.show()

### ***We select cement, slag, water, superplasticizer and age. Hence we eliminate flyash, fine aggregate and coarse aggregate***

In [None]:
# Correlation heatmap restricted to the five selected features plus the target.
plt.rcParams.update({'font.size': 15})
fig, ax = plt.subplots(figsize=(12, 10))
kept_cols = ['cement', 'slag', 'water', 'superplasticizer', 'age', 'strength']
corr_kept = dataset[kept_cols].corr()
sns.heatmap(corr_kept, annot=True, vmin=-1, vmax=1, center=0, cmap='coolwarm', ax=ax);

In [None]:
# Keep only the five selected features. Take an explicit copy: the columns of
# this frame are overwritten later (outlier capping), and assigning into a
# selection without .copy() is what pandas flags with SettingWithCopyWarning.
df = df[['cement', 'slag', 'water', 'superplasticizer', 'age']].copy()

In [None]:
# Names of the selected feature columns, as a plain list (reused by later cells).
columns = df.columns.tolist()
columns

# Exploratory Data Analysis

In [None]:
# For every selected feature draw a histogram (left) and a box plot (right).
plt.figure(figsize=(17,45))
plt.style.use('fivethirtyeight')
# The mix components are measured in kg per m3 of mixture; age is measured in days.
units = {'age': 'days'}
for row, col in enumerate(columns):
    # Left column of the grid: frequency distribution.
    plt.subplot(8, 2, 2*row + 1)
    plt.hist(df[col], alpha=0.75, edgecolor='black', linewidth=2)
    plt.title(f'{col} frequency distribution plot', fontsize=25)
    plt.xlabel(f"{col} {units.get(col, 'kg/m3')}", fontsize=15)
    plt.ylabel('Count', fontsize=15)
    plt.grid(False)

    # Right column of the grid: box plot of the same feature.
    plt.subplot(8, 2, 2*row + 2)
    # Pass the series as x= explicitly; newer seaborn releases treat the first
    # positional argument as `data`, which can change how the box is drawn.
    sns.boxplot(x=df[col], color='tab:orange', linewidth=1)
    plt.title(f'{col} box plot', fontsize=25)

# Lay the grid out once, after all subplots exist.
plt.tight_layout(pad=5.0)

## ***Inferences from EDA***

- The data is heavily skewed. We need to transform it to a nearly normal distribution.
- Water, Superplasticizer and Age columns have some outliers. We need to fix them before feature transformation

# Dealing with Outliers

In [None]:
def _iqr_fences(series):
    """Return (p25, p75, iqr, upper, lower) for a numeric Series.

    upper = p75 + 1.5 * iqr and lower = p25 - 1.5 * iqr.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = q3 - q1
    return q1, q3, spread, q3 + 1.5*spread, q1 - 1.5*spread


# cement: upper limit from the IQR rule, lower limit fixed at 0
p25_cement, p75_cement, iqr_cement, upper_cement = _iqr_fences(df['cement'])[:4]
lower_cement = 0

# slag: upper limit from the IQR rule, lower limit fixed at 0
p25_slag, p75_slag, iqr_slag, upper_slag = _iqr_fences(df['slag'])[:4]
lower_slag = 0

# water: both limits come from the IQR rule
p25_water, p75_water, iqr_water, upper_water, lower_water = _iqr_fences(df['water'])

# superplasticizer: upper limit from the IQR rule, lower limit fixed at 0
p25_sup, p75_sup, iqr_sup, upper_superplasticizer = _iqr_fences(df['superplasticizer'])[:4]
lower_superplasticizer = 0

# age: upper limit from the IQR rule, lower limit fixed at 0
p25_age, p75_age, iqr_age, upper_age = _iqr_fences(df['age'])[:4]
lower_age = 0

# flyash, coarseaggregate and fineaggregate are no longer columns of `df`
# after feature selection, so no limits are computed for them.


### Outliers in water column

In [None]:
# Show the upper and lower limits computed for the water column.
print(upper_water, lower_water)

In [None]:
# Rows whose water value lies above the upper limit: print their shape, then list them.
water_above = df[df.water > upper_water]
print(water_above.shape)
water_above

In [None]:
# Rows whose water value lies below the lower limit: print their shape, then list them.
water_below = df[df.water < lower_water]
print(water_below.shape)
water_below

In [None]:
# Cap water at its limits: values above the upper limit become the upper limit,
# values below the lower limit become the lower limit, everything else is kept.
water_too_high = df.water > upper_water
water_too_low = df.water < lower_water
df.water = np.where(
    water_too_high,
    upper_water,
    np.where(water_too_low, lower_water, df.water),
)

In [None]:
# Sanity check: list any rows still above the upper water limit after capping.
df[df.water > upper_water]

In [None]:
# Sanity check: list any rows still below the lower water limit after capping.
df[df.water < lower_water]

### Outliers in Superplasticizer column

In [None]:
# Show the upper and lower limits used for the superplasticizer column.
print(upper_superplasticizer, lower_superplasticizer)

In [None]:
# Rows whose superplasticizer value exceeds the upper limit: print their shape, then list them.
sup_above = df[df.superplasticizer > upper_superplasticizer]
print(sup_above.shape)
sup_above

In [None]:
# Rows whose superplasticizer value is under the lower limit: print their shape, then list them.
sup_below = df[df.superplasticizer < lower_superplasticizer]
print(sup_below.shape)
sup_below

In [None]:
# Cap superplasticizer at its upper limit; values at or below the limit are kept.
sup_too_high = df.superplasticizer > upper_superplasticizer
df.superplasticizer = np.where(
    sup_too_high,
    upper_superplasticizer,
    df.superplasticizer,
)

In [None]:
# Sanity check: list any rows still above the upper superplasticizer limit after capping.
df[df.superplasticizer > upper_superplasticizer]

### Outliers in Fine aggregate column

In [None]:
#print(upper_fineaggregate, lower_fineaggregate)

In [None]:
#print(df[df.fineaggregate > upper_fineaggregate].shape)
#df[df.fineaggregate > upper_fineaggregate]

In [None]:
#print(df[df.fineaggregate < lower_fineaggregate].shape)
#df[df.fineaggregate < lower_fineaggregate]

In [None]:
#df.fineaggregate = np.where(df.fineaggregate>upper_fineaggregate, upper_fineaggregate, df.fineaggregate)

In [None]:
#df[df.fineaggregate > upper_fineaggregate]

### Outliers in Age column

In [None]:
# Show the upper and lower limits used for the age column.
print(upper_age, lower_age)

In [None]:
# Rows whose age exceeds the upper limit: print their shape, then list them.
age_above = df[df.age > upper_age]
print(age_above.shape)
age_above

In [None]:
# Rows whose age is under the lower limit: print their shape, then list them.
age_below = df[df.age < lower_age]
print(age_below.shape)
age_below

In [None]:
# Cap age at its upper limit; values at or below the limit are kept.
age_too_high = df.age > upper_age
df.age = np.where(age_too_high, upper_age, df.age)

In [None]:
# Sanity check: list any rows still above the upper age limit after capping.
df[df.age > upper_age]

## Distribution/Shape of data after dealing with outliers

In [None]:
# Re-draw the histogram (left) and box plot (right) of every selected feature
# to inspect the distributions after the outlier capping.
plt.figure(figsize=(17,45))
plt.style.use('fivethirtyeight')
# The mix components are measured in kg per m3 of mixture; age is measured in days.
units = {'age': 'days'}
for row, col in enumerate(columns):
    # Left column of the grid: frequency distribution.
    plt.subplot(8, 2, 2*row + 1)
    plt.hist(df[col], alpha=0.75, edgecolor='black', linewidth=2)
    plt.title(f'{col} frequency distribution plot', fontsize=25)
    plt.xlabel(f"{col} {units.get(col, 'kg/m3')}", fontsize=15)
    plt.ylabel('Count', fontsize=15)
    plt.grid(False)

    # Right column of the grid: box plot of the same feature.
    plt.subplot(8, 2, 2*row + 2)
    # Pass the series as x= explicitly; newer seaborn releases treat the first
    # positional argument as `data`, which can change how the box is drawn.
    sns.boxplot(x=df[col], color='tab:orange', linewidth=1)
    plt.title(f'{col} box plot', fontsize=25)

# Lay the grid out once, after all subplots exist.
plt.tight_layout(pad=5.0)

#### We still need to transform the data into a nearly normal distribution

# Feature Transformation

In [None]:
# Take every row and column of the cleaned frame as the training features;
# X_train_processed and X_train refer to the same object.
X_train_processed = df.iloc[:, :]
X_train = X_train_processed

In [None]:
# Fit a PowerTransformer (default settings) on the training features and replace
# X_train with the transformed values. `pt` is kept so the same fitted transform
# can be applied to other data with pt.transform.
pt = PowerTransformer()
X_train = pt.fit_transform(X_train)

In [None]:
# Put the transformed training features back into a DataFrame with the selected
# column names so they can be plotted by name, and preview the first rows.
df = pd.DataFrame(X_train, columns = columns)
df.head()

## Distribution/Shape of data after Feature Transformation

In [None]:
# Draw a histogram (left) and a box plot (right) of every column in `df` to
# inspect the distributions after feature transformation.
plt.figure(figsize=(17,45))
plt.style.use('fivethirtyeight')
for row, col in enumerate(columns):
    # Left column of the grid: frequency distribution.
    plt.subplot(8, 2, 2*row + 1)
    plt.hist(df[col], alpha=0.75, edgecolor='black', linewidth=2)
    plt.title(f'{col} frequency distribution plot', fontsize=25)
    # Label the axis as transformed rather than with a physical unit such as
    # kg/m3, since this section plots the output of the feature transformation.
    plt.xlabel(f'{col} (power-transformed)', fontsize=15)
    plt.ylabel('Count', fontsize=15)
    plt.grid(False)

    # Right column of the grid: box plot of the same feature.
    plt.subplot(8, 2, 2*row + 2)
    # Pass the series as x= explicitly; newer seaborn releases treat the first
    # positional argument as `data`, which can change how the box is drawn.
    sns.boxplot(x=df[col], color='tab:orange', linewidth=1)
    plt.title(f'{col} box plot', fontsize=25)

# Lay the grid out once, after all subplots exist.
plt.tight_layout(pad=5.0)

#### Now our data is ready for the Machine Learning algorithms

# ML Modelling

### Linear Models

In [None]:
# Three linear baselines, all with default hyper-parameters.
lr, sgd, en = LinearRegression(), SGDRegressor(), ElasticNet()

In [None]:
# Mean 5-fold cross-validated R² for each linear baseline, printed in the
# order lr, sgd, en.
for linear_model in (lr, sgd, en):
    fold_scores = cross_val_score(linear_model, X_train, y_train, cv=5, scoring='r2')
    print(fold_scores.mean())

### Polynomial linear model

In [None]:
# Expand the features with degree-4 polynomial terms, then score plain linear
# regression on the expanded matrix (mean 5-fold R²).
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)
poly_r2 = cross_val_score(lr, X_poly, y_train, cv=5, scoring='r2')
print(poly_r2.mean())

### Kernel based model

In [None]:
# RBF-kernel support vector regressor (C=70), scored by mean 5-fold R².
svr = SVR(kernel='rbf', C=70)
svr_r2 = cross_val_score(svr, X_train, y_train, cv=5, scoring='r2')
print(svr_r2.mean())

### Tree based model

In [None]:
# Depth-limited decision tree, scored by mean 5-fold R².
dt = DecisionTreeRegressor(max_depth=8)
dt_r2 = cross_val_score(dt, X_train, y_train, cv=5, scoring='r2')
print(dt_r2.mean())

## Ensemble Methods

- Bagging
- Boosting
- Stacking

### Random Forest

In [None]:
# Random forest (100 trees, depth <= 10), scored by mean 5-fold R².
rf = RandomForestRegressor(n_estimators=100, max_depth=10)
rf_r2 = cross_val_score(rf, X_train, y_train, cv=5, scoring='r2')
print(rf_r2.mean())

### Extra Trees

In [None]:
# Extra-trees ensemble (100 trees, no depth limit), scored by mean 5-fold R².
et = ExtraTreesRegressor(n_estimators=100, max_depth=None)
et_r2 = cross_val_score(et, X_train, y_train, cv=5, scoring='r2')
print(et_r2.mean())

### Gradient Boosting

In [None]:
# Gradient boosting (100 stages of depth-3 trees, learning rate 0.25),
# scored by mean 5-fold R².
gbr = GradientBoostingRegressor(learning_rate=0.25, n_estimators=100, max_depth=3)
gbr_r2 = cross_val_score(gbr, X_train, y_train, cv=5, scoring='r2')
print(gbr_r2.mean())

### Extreme gradient boosting

In [None]:
# XGBoost regressor with learning rate 0.25, scored by mean 5-fold R².
xgbr = XGBRegressor(learning_rate=0.25)
xgbr_r2 = cross_val_score(xgbr, X_train, y_train, cv=5, scoring='r2')
print(xgbr_r2.mean())

### Stacking Regressor

In [None]:
# Stack the extra-trees, gradient-boosting and XGBoost models and score the
# stack by mean 5-fold R².
base_models = [('et', et), ('gbr', gbr), ('xgbr', xgbr)]
sr = StackingRegressor(estimators=base_models)
sr_r2 = cross_val_score(sr, X_train, y_train, cv=5, scoring='r2')
print(sr_r2.mean())

# Pipeline steps for testing environment

1. Outlier detection and dealing with them.
2. Feature transformation
3. Dimensionality Reduction
4. Passing data down to the estimator

<img src="https://5.imimg.com/data5/AA/OA/DS/SELLER-41794075/pvc-y-pipe-fitting-500x500.jpg"> </img>

In [None]:

def preprocessor(arr):
    """Cap each feature at the outlier limits computed on the training data.

    Parameters
    ----------
    arr : 2-D array-like of the form [[a, b, c, d, e]]
        One row per sample, with values in the order given by the
        module-level ``columns`` list.

    Returns
    -------
    numpy.ndarray
        Array of the same [[a, b, c, d, e]] form in which every value above a
        feature's upper limit is replaced by that limit and every value below
        its lower limit is replaced by the lower limit.
    """
    # (lower, upper) limits per feature, read from the module-level values.
    limits = {
        'cement': (lower_cement, upper_cement),
        'slag': (lower_slag, upper_slag),
        'water': (lower_water, upper_water),
        'superplasticizer': (lower_superplasticizer, upper_superplasticizer),
        'age': (lower_age, upper_age),
    }
    frame = pd.DataFrame(arr, columns=columns)
    for feature, (floor, ceiling) in limits.items():
        values = frame[feature]
        # Raise values under the floor first, then let the ceiling take priority.
        floored = np.where(values < floor, floor, values)
        frame[feature] = np.where(values > ceiling, ceiling, floored)
    return frame.iloc[:, :].values


In [None]:
# Final model: a stacking regressor over freshly constructed extra-trees,
# gradient-boosting and XGBoost base learners.
base_learners = [
    ('et', ExtraTreesRegressor(n_estimators=100, max_depth=None)),
    ('gbr', GradientBoostingRegressor(learning_rate=0.25, n_estimators=100, max_depth=3)),
    ('xgbr', XGBRegressor(learning_rate=0.25)),
]
estimator = StackingRegressor(estimators=base_learners)

In [None]:
# Evaluate the final estimator on the held-out test split.
# Label the raw test matrix with the original feature names, then keep only
# the five selected features.
df_test = pd.DataFrame(X_test, columns = X.columns)
df_test=df_test[['cement', 'slag', 'water', 'superplasticizer', 'age']]
# NOTE(review): X_test is rebound to the reduced matrix here, so running this
# cell a second time would pass the reduced matrix to the DataFrame call above
# with the full X.columns list — confirm the cell is only run once.
X_test = df_test.iloc[:,:].values
# Apply the same steps used on the training data: cap outliers, then apply the
# already-fitted transformer (transform only, no refit).
X_test_processed = preprocessor(X_test)
X_test_transformed=pt.transform(X_test_processed)
# Train on the transformed training features and predict the test targets.
estimator.fit(X_train, y_train)
y_pred = estimator.predict(X_test_transformed)
# `score` is the value returned by r2_score; the message below prints it
# multiplied by 100 and labels it "accuracy".
score = r2_score(y_test,y_pred)
print(f'The test set accuracy for the concrete compressive strength prediction is {score*100 :.2f}%')

# Exporting binary file for production (*test environment*)

In [None]:
#pickle_out = open('transformer.pkl',"wb") # Open pickle file in write byte mode
#pickle.dump(pt, pickle_out)
#pickle_out.close()

In [None]:
#pickle_out = open('estimator.pkl',"wb") # Open pickle file in write byte mode
#pickle.dump(estimator, pickle_out)
#pickle_out.close()