# Concrete Compressive Strength Prediction Notebook

This notebook trains a model to predict concrete compressive strength. The model is trained with XGBoost on an 80-20 train-test split.

1. Importing all the necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

import xgboost as xgb
from xgboost import XGBRegressor

import pickle

2. Read the dataset located in the Kaggle datastore

In [None]:
# Walk the Kaggle input directory, print every file found, and remember the
# dataset path in `file` for the loading step.
# If several files are present, the last one visited wins.
file = None
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file = os.path.join(dirname, filename)
        print(file)

# os.walk yields nothing for a missing or empty directory; fail here with a
# clear message instead of a NameError the first time `file` is used.
if file is None:
    raise FileNotFoundError("No files found under '/kaggle/input'")

In [None]:
# Load the CSV at the path stored in `file` into a DataFrame and
# display its first rows as the cell output.

data = pd.read_csv(file)
data.head()

3. Describe and understand the data. The task here is to identify any missing information or missing data.

In [None]:
# Print a per-column summary (non-null counts and dtypes) to spot missing values.
data.info()

The data seems to be pretty accurate in terms of the rows.

In [None]:
# Show descriptive statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

In [None]:
# Count the missing (NaN/None) entries in every column.
data.isna().sum() ## checking for null data

The data is pretty clean with no NULL records. So no need to do a lot of cleanup

Now plotting the data and understanding the correlation between the attributes of the data.

In [None]:
# Draw the pairwise correlation matrix of the columns as an annotated heatmap.
plt.figure(figsize=(10, 6))

corr_matrix = data.corr()
heatmap = sns.heatmap(corr_matrix, annot=True, vmin=-1, vmax=1)
heatmap.set_title('Correlation heatmap', pad=10, fontdict={'fontsize': 12})

plt.show()

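The colour of each cell in the correlation map encodes the correlation between a pair of attributes (see the colour bar for the scale); values close to 1 or -1 indicate a stronger correlation.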

4. Splitting and Preparing the Dataset for training

We need to predict the "Strength" of the concrete. Hence the target column is "Strength", which we assign to the variable "y". The rest of the feature columns are assigned to "X".

In [None]:
# Features: every column except the target, as a NumPy array.
feature_mask = data.columns != 'Strength'
X = data.loc[:, feature_mask].values
# Target: the 'Strength' column, kept as a pandas Series.
y = data['Strength']

In order to maintain the scaling between the attributes, normalizing the dataset is important

In [None]:
# Standardize each feature (column) to zero mean and unit variance.
# The statistics are taken per column (axis=0): a single global mean/std over
# the whole matrix would shift and scale every feature by the same constants
# and leave their relative scales unchanged.
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
print(f'X mean: {X_mean}')
print(f'X std: {X_std}')

# A constant column has zero std; divide by 1 there to avoid NaN/inf.
X_std_safe = np.where(X_std == 0, 1.0, X_std)

# NOTE(review): these statistics are computed on the full dataset, i.e. before
# the train/test split, so test rows influence the scaling — consider fitting
# the scaler on the training rows only.
X_normalized = (X - X_mean) / X_std_safe
X_normalized

In [None]:
## split the dataset: 20% held out for testing, shuffled with a fixed seed
split_options = {'test_size': 0.2, 'shuffle': True, 'random_state': 128}
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, **split_options)

In [None]:
# Report the feature/target shapes of both partitions as a quick sanity check.
for split_name, features, target in (('Train', X_train, y_train),
                                     ('Test', X_test, y_test)):
    print(f'{split_name} shape: X = {features.shape} ; y={target.shape}')

5. Training the model using XGBoost

We will use grid search to find the best model across multiple parameter combinations.

In [None]:
# Hyper-parameter search for the XGBoost regressor

# Base estimator; the grid below supplies the parameter values to try.
model = XGBRegressor(n_jobs=4)

# Candidate values: 3 * 5 * 2 * 3 * 1 = 90 parameter combinations.
param_grid = dict(
    n_estimators=[50, 100, 500],
    max_depth=[2, 4, 6, 8, 10],
    gamma=[0.001, 0.01],
    learning_rate=[0.01, 0.1, 0.3],
    booster=['gbtree'],
)

# Exhaustive search over the grid with 5-fold cross-validation.
grid_search_model = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
)
grid_search_model.fit(X_train, y_train)

print(f'Best Score: {grid_search_model.best_score_}')
print(f'Best Param: {grid_search_model.best_params_}')

In [None]:
## evaluating the best model

# GridSearchCV was created with its default refit=True, so best_estimator_ has
# already been retrained on all of (X_train, y_train) with the best parameters;
# calling fit() on it again would only repeat that work.
best_model = grid_search_model.best_estimator_

# Report the estimator's own score() on the training and held-out data.
print(f'Train Score: {best_model.score(X_train, y_train)}')
print(f'Test Score: {best_model.score(X_test, y_test)}')

In [None]:
# Predict the target for the held-out test features; reused by the metric and plot cells.
prediction = best_model.predict(X_test)

In [None]:
# Compute each metric once, then report; RMSE is the square root of the MSE.
mae_value = mae(y_test, prediction)
mse_value = mse(y_test, prediction)
r2_value = r2_score(y_test, prediction)

print(f'Mean Absolute Error (MAE): {mae_value}')
print(f'Mean Squared Error (MSE): {mse_value}')
print(f'RMSE: {mse_value**(1/2)}')
print(f'R2 Score: {r2_value}')

Plotting the feature importance plot

In [None]:
# Plot the feature importances of the fitted model.
# NOTE(review): the model was trained on a NumPy array (`.values`), so the
# features are presumably labelled f0, f1, ... rather than by column name — confirm.
xgb.plot_importance(best_model)

6. Plotting the Concrete Strength Prediction Graph based on the original and predicted data

In [None]:
# Overlay the actual and predicted strength for every test sample.
plt.figure(figsize=(15,6))

# Positional x-axis: y_test keeps its shuffled DataFrame index after the split,
# so samples are plotted by position rather than by index label.
x_ax = range(len(y_test))
plt.plot(x_ax, y_test, label="original")
plt.plot(x_ax, prediction, label="predicted")
plt.title("Concrete Strength prediction graph")
# Axis labels so the figure can be read on its own.
plt.xlabel("Test sample index")
plt.ylabel("Strength")
plt.legend()
plt.show()