# 0. Project Aims

* Predict the compressive strength of concrete from its mixture ingredients and age.

# 1. Set Up

In [None]:
import numpy as np
np.random.seed(0)
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

import itertools as it

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import eli5
from eli5.sklearn import PermutationImportance
from pdpbox import pdp, get_dataset, info_plots
import shap

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
# Load the concrete data set from the Kaggle input directory and preview the first rows.
data = pd.read_csv("/kaggle/input/yeh-concret-data/Concrete_Data_Yeh.csv")
data.head()

In [None]:
# (number of rows, number of columns) of the loaded table.
data.shape

In [None]:
# Data type of each column.
data.dtypes

Features:
* cement - kg of cement (component 1) in a m^3 mixture
* slag - kg of blast furnace slag (component 2) in a m^3 mixture
* flyash - kg of fly ash (component 3) in a m^3 mixture
* water - kg of water (component 4) in a m^3 mixture
* superplasticizer - kg of super plasticizer (component 5) in a m^3 mixture
* coarseaggregate - kg of coarse aggregate (component 6) in a m^3 mixture
* fineaggregate - kg of fine aggregate (component 7) in a m^3 mixture
* age - Day (1~365)
* csMPa - Concrete compressive strength in MPa

In [None]:
# Name of the column to predict.
target_variable = 'csMPa'

# Names of the input columns, all numerical.
numerical_variables = [
    'cement',
    'slag',
    'flyash',
    'water',
    'superplasticizer',
    'coarseaggregate',
    'fineaggregate',
    'age',
]

In [None]:
# Print the frame summary (columns, non-null counts, dtypes).
data.info()

In [None]:
# Descriptive statistics for each column.
data.describe()

* No missing data.
* Input variables are all numerical. All are continuous, aside from age, which is a number of days in the range 1-365.

# 2. EDA

In [None]:
# Reshape a single sample (row 20) to long form - one (variable, value) pair
# per column - and draw every column of that sample as a bar.
bar_plot_df = pd.melt(data.iloc[[20]])
plt.figure(figsize=(16, 16))
sns.barplot(data=bar_plot_df, x='variable', y='value')

* Above shows an example concrete recipe (row 20 of the data set), together with its compressive strength.

In [None]:
# Box plot (left) and histogram with KDE (right) of the target variable.
fig, (box_ax, dist_ax) = plt.subplots(1, 2, figsize=(16, 6))
target_values = data[target_variable]
sns.boxplot(x=target_values, ax=box_ax)
sns.distplot(target_values, bins=20, kde=True, ax=dist_ax)

* Target variable is not skewed, appears normally distributed.

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap
# with the colour scale fixed to [-1, 1] and centred on 0.
corr = data.corr()
plt.figure(figsize=(12, 8))
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, annot=True, square=True)
plt.show()

* No highly correlated features.

In [None]:
def num_plots(feature_name):
    """Show three side-by-side plots for one column of ``data``.

    Draws a box plot, a histogram with KDE, and a scatter plot of the column
    against the target. Reads the module-level ``data`` frame and
    ``target_variable`` name.

    Parameters
    ----------
    feature_name : str
        Name of the column of ``data`` to visualise.
    """
    fig, (box_ax, dist_ax, scatter_ax) = plt.subplots(1, 3, figsize=(16, 6))
    feature_values = data[feature_name]
    sns.boxplot(x=feature_values, ax=box_ax)
    sns.distplot(feature_values, bins=20, kde=True, ax=dist_ax)
    sns.scatterplot(data=data, x=feature_name, y=target_variable, ax=scatter_ax)
    plt.show()

In [None]:
# Draw the three diagnostic plots for every input feature.
for feature in numerical_variables:
    num_plots(feature)

* Not all features are perfectly normally distributed. Should be normalised.
* All features should be scaled to ensure they are on a similar scale.

In [None]:
# Features that are exactly 0 in a noticeable share of the rows.
zero_features = ['slag', 'flyash', 'superplasticizer']

# Number of rows in which each of those features equals 0.
zero_counts = [data.loc[data[name] == 0, name].count() for name in zero_features]

total_rows = data.shape[0]
for name, n_zero in zip(zero_features, zero_counts):
    print("Fraction of zero values for {}: {}".format(name, n_zero / total_rows))

sns.barplot(y=zero_counts, x=zero_features)

In [None]:
# Scatter every unordered pair of the zero-heavy features against each other.
pairs = list(it.combinations(zero_features, r=2))

for x_feature, y_feature in pairs:
    sns.scatterplot(data=data, x=x_feature, y=y_feature)
    plt.show()

* Significant amount of time where slag or flyash is 0, or slag and plasticizer is 0.
* Flyash is much more likely to be 0 when superplasticizer is not 0, than superplasticizer is to be 0 when flyash is not 0.
* Can't tell a huge amount from this currently.

In [None]:
# For each zero-heavy feature, keep only the rows where it is 0 and plot the
# distribution of every other column on a 3x3 grid of axes.
for feature in zero_features:
    new_data = data.loc[data[feature] == 0]
    temp_features = [name for name in new_data.columns if name != feature]
    fig, axs = plt.subplots(3, 3, figsize=(16, 10))
    for position, comp_feature in enumerate(temp_features):
        # The grid is filled column by column: the row index cycles 0, 1, 2
        # and the column index advances after every third plot.
        col_idx, row_idx = divmod(position, 3)
        target_ax = axs[row_idx][col_idx]
        try:
            sns.distplot(new_data[comp_feature], bins=20, kde=True, ax=target_ax)
        except RuntimeError as err:
            # Only the zero-bandwidth KDE failure is retried (with an explicit
            # bandwidth); any other RuntimeError is propagated unchanged.
            if not str(err).startswith("Selected KDE bandwidth is 0. Cannot estimate density."):
                raise
            sns.distplot(new_data[comp_feature], bins=20, kde=True, kde_kws={'bw': 0.1}, ax=target_ax)
    plt.show()

* Often when one of these features is zero, the other will be too.
* In almost every case where superplasticizer is 0, flyash is also 0. However, this relationship is not reciprocal: it is not the case that almost every data point with flyash = 0 also has superplasticizer = 0.

In [None]:
# Rows that contain fly ash but no superplasticizer, plotted against strength.
no_superplasticizer = data['superplasticizer'] == 0
has_flyash = data['flyash'] > 0
sub_data = data.loc[no_superplasticizer & has_flyash]
sns.scatterplot(data=sub_data, x='flyash', y='csMPa')

* 6 cases where superplasticizer = 0 and flyash is not 0.
* Above shows the compressive strength of those data points.

This 0 skewed data may make it harder for us to interpret the model's decision making, thus making it harder for us to properly create a stronger recipe.

# 3. Preprocessing

In [None]:
# Separate the inputs from the target column.
X = data.drop(columns=target_variable)
y = data[target_variable]

# Hold out 20% of the rows as the test set.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)

# Both feature frames, so feature engineering can be applied to each in turn.
# The list holds the frames themselves, so columns added through it show up
# on train_X and test_X.
combined = [train_X, test_X]

In [None]:
# 'age' and 'water' had more of an effect than others, but minor compared to 'cement'.
features_to_square = ['cement']

# Add a '<name>_sqrd' column to both the training and the test frame.
for set_X in combined:
    for col in features_to_square:
        set_X[f"{col}_sqrd"] = set_X[col] ** 2

Engineered new features by multiplying pairs of current features together and squaring each individual feature.
This resulted in:
* Improved baseline performance on individual models (such as linear regression, KNN and SVM)
* Decreased ensemble model baseline performance (Random Forest, GradientBoost and XGBoost)
* Even with these new features, ensemble models continued to outperform individual models. 

Subsequently:
* These new features will not be used, but the code is left in, in case these features can help with new models implemented later.
* 'cement_sqrd' feature was kept as it did not hinder XGBoost model performance (best performing baseline) but did help with other model performance.

Preprocessing steps attempted may have helped the baseline model performance but did not help the generalization of the model and thus were not included:
* Dropping columns, specifically coarseaggregate.
* Banding some of the continuous, numerical features.
* Additional features to indicate data points where there were zero values in columns with large amount of 0 values.
* Engineering new features by multiplying current features together.

**Normalising features:**

In [None]:
# Using Yeo-Johnson power transform.
# The transform is fitted on the training split only and then applied
# unchanged to the test split; results are wrapped back into DataFrames
# carrying the original column names.
transformer = PowerTransformer()
train_transformed = transformer.fit_transform(train_X)
test_transformed = transformer.transform(test_X)
train_X = pd.DataFrame(train_transformed, columns=train_X.columns)
test_X = pd.DataFrame(test_transformed, columns=test_X.columns)

**Scaling features:**

In [None]:
# Scaling between 0 and 1.
# The scaler is fitted on the training split only and then applied unchanged
# to the test split; results are wrapped back into DataFrames carrying the
# original column names.
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train_X)
test_scaled = scaler.transform(test_X)
train_X = pd.DataFrame(train_scaled, columns=train_X.columns)
test_X = pd.DataFrame(test_scaled, columns=test_X.columns)

# 4. Baseline Modelling

In [None]:
# One row per model: its display name and its mean cross-validated score.
prelim_results = pd.DataFrame(columns=['Model', 'Baseline Score'])


def score_models(model_name, model, i):
    """Cross-validate ``model`` on the training data and record its mean score.

    Runs 5-fold cross-validation on the module-level ``train_X`` / ``train_y``
    with the 'neg_root_mean_squared_error' scorer and writes the result into
    row ``i`` of the module-level ``prelim_results`` frame.

    Parameters
    ----------
    model_name : str
        Label stored in the 'Model' column.
    model : estimator
        Unfitted estimator passed to ``cross_val_score``.
    i : int
        Row label of ``prelim_results`` to write to.
    """
    fold_scores = cross_val_score(
        model,
        train_X,
        train_y,
        cv=5,
        scoring='neg_root_mean_squared_error',
    )
    prelim_results.loc[i] = [model_name, fold_scores.mean()]

In [None]:
# Candidate regressors for the baseline comparison, keyed by display name.
# Every estimator is constructed with its default hyperparameters.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "SVM": SVR(),
    "KNN": KNeighborsRegressor(),
    "Random Forest": RandomForestRegressor(),
    "GradientBoost": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor(),
}

In [None]:
# Score every candidate model; the position in the dict is used as the row
# label in prelim_results.
for i, (name, model) in enumerate(models.items()):
    score_models(name, model, i)

prelim_results.head(10)

# 5. Hyperparameter Optimisation

Considerably better performance in the XGBoost baseline model, compared to the other models. Because of this, only XGBoost will be optimised as it is likely to give the best generalization by itself. From these baselines, it is anticipated that optimising more models for use in an ensemble also won't improve performance.

In [None]:
def params_performance(regressor, model_name):
    """Print the best score and best parameters of a fitted search object.

    Parameters
    ----------
    regressor : object
        Object exposing ``best_score_`` and ``best_params_`` attributes.
    model_name : str
        Label printed on the first line.
    """
    print(model_name)
    print(f"Best Score: {regressor.best_score_!s}")
    print(f"Best Parameters: {regressor.best_params_!s}")

In [None]:
"""
param_grid = {
    'n_estimators': [200, 400, 600, 800, 1000],
    'colsample_bytree': [0.5, 0.8, 0.9, 1],
    'max_depth': [2, 4, 6, 8, 10, None],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [1, 1.5, 2],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1],
    'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.5],
    'gamma': [0, 0.01, 0.1, 1, 10],
    'min_child_weight': [0, 0.01, 0.1, 1, 10],
    'sampling_method': ['uniform', 'gradient_based']
}

xgb_rnd_srch = RandomizedSearchCV(XGBRegressor(), param_distributions = param_grid, n_iter =  300, cv = 5, scoring='neg_root_mean_squared_error', verbose = True, n_jobs = -1)
best_xgb_rnd_srch = xgb_rnd_srch.fit(train_X, train_y)
params_performance(best_xgb_rnd_srch,'XGBoost')
"""

Output:

Initial:
```
XGBoost
Best Score: -4.304916851823981
Best Parameters: {'subsample': 0.9, 'sampling_method': 'uniform', 'reg_lambda': 1, 'reg_alpha': 1, 'n_estimators': 1000, 'min_child_weight': 10, 'max_depth': None, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.5}
```

In [None]:
"""
param_grid = {
    'n_estimators': [900, 1000],
    'colsample_bytree': [0.95, 1],
    'max_depth': [3, 4, 5],
    'reg_alpha': [0.75, 1, 1.25],
    'reg_lambda': [1.75, 2],
    'subsample': [0.75, 0.8, 0.85],
    'learning_rate': [0.05, 0.1, 0.5],
    'gamma': [0.5, 1, 5],
    'min_child_weight': [0, 0.01],
    'sampling_method': ['uniform']
}

xgb_grd_srch = GridSearchCV(XGBRegressor(), param_grid = param_grid, cv = 5, scoring='neg_root_mean_squared_error', verbose = True, n_jobs = -1)
best_xgb_grd_srch = xgb_grd_srch.fit(train_X, train_y)
params_performance(best_xgb_grd_srch,'XGBoost')
"""

Output:

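Initial (note: the recorded parameters below fall outside the grid shown above - e.g. `colsample_bytree=0.65`, `max_depth=None`, `min_child_weight=10` - and differ from those used for `final_model` in the next section, so this output appears to come from an earlier version of the grid):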
```
XGBoost
Best Score: -4.241429460255205
Best Parameters: {'colsample_bytree': 0.65, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': None, 'min_child_weight': 10, 'n_estimators': 900, 'reg_alpha': 1, 'reg_lambda': 1, 'sampling_method': 'uniform', 'subsample': 0.9}
```

# 6. Test Set Accuracy

In [None]:
# The hyperparameters are hard-coded so the grid search does not have to be
# re-run. Presumably they are the grid search's best estimator (every value
# lies inside the grid searched above) - confirm before changing them.
# Alternative when the search has been run in this session:
# final_model = best_xgb_grd_srch.best_estimator_
final_model = XGBRegressor(
    colsample_bytree=0.95,
    gamma=0.5,
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=0,
    n_estimators=1000,
    reg_alpha=1.25,
    reg_lambda=2,
    sampling_method='uniform',
    subsample=0.75,
)

# Train on the full training split, then predict the held-out test split.
final_model.fit(train_X, train_y)

preds = final_model.predict(test_X)

# Test-set RMSE. The square root of the MSE is taken explicitly instead of
# passing `squared=False`, because that argument is deprecated and later
# removed in newer scikit-learn releases; this form works on every version.
np.sqrt(mean_squared_error(test_y, preds))

In [None]:
# Build one long frame holding both the true and the predicted strengths.
# The 'True Values' column flags the origin: 1 = true value, 0 = prediction.
test_y.reset_index(drop=True, inplace=True)
n_test = len(test_y)

true_flag = pd.Series(np.ones(n_test, dtype=int), name='True Values')
pred_flag = pd.Series(np.zeros(n_test, dtype=int), name='True Values')

y_plot = pd.concat([test_y, true_flag], axis=1)
pred_plot = pd.concat([pd.Series(preds, name='csMPa'), pred_flag], axis=1)

# Stack the two frames and expose the per-sample position as a column so it
# can serve as the x axis of the comparison plot.
pred_y_plot = pd.concat([y_plot, pred_plot], axis=0)
pred_y_plot['index'] = pred_y_plot.index

In [None]:
# Scatter the true and predicted target values by sample position, coloured by
# the 'True Values' flag column of pred_y_plot.
plt.figure(figsize=(10,10))
plt.title("Predicted and True Target Values")
sns.scatterplot(x='index', y='csMPa', hue='True Values', data=pred_y_plot, s=100)
plt.show()

# 7. Model Analysis

In [None]:
# Permutation importance of each feature for the final model, shown as a table.
# NOTE(review): the importances are computed on the training split
# (train_X, train_y), not on held-out data - confirm this is intended.
perm = PermutationImportance(final_model).fit(train_X, train_y)
eli5.show_weights(perm, feature_names=train_X.columns.tolist())

* Age plays the biggest part in predicting a larger compressive strength.
* Cement and water play a big part in the prediction also.
* Other features appear to play a mostly minor role.

In [None]:
# Column index (passed to pdpbox as the model's feature list) and the same
# names as a plain list to iterate over.
current_feature_names = train_X.columns
current_features = current_feature_names.tolist()

# One partial dependence plot per model feature, evaluated on the test split.
for feature_name in current_features:
    isolated = pdp.pdp_isolate(
        model=final_model,
        dataset=test_X,
        model_features=current_feature_names,
        feature=feature_name,
    )
    pdp.pdp_plot(isolated, feature_name)
    plt.show()

In [None]:
# Feature pairs whose joint partial dependence is drawn as a contour plot.
feature_paris_to_plot = [
    ['water', 'coarseaggregate'],
    ['cement', 'cement_sqrd'],
]
for plot_features in feature_paris_to_plot:
    interaction = pdp.pdp_interact(
        model=final_model,
        dataset=test_X,
        model_features=current_feature_names,
        features=plot_features,
    )
    pdp.pdp_interact_plot(
        pdp_interact_out=interaction,
        feature_names=plot_features,
        plot_type='contour',
    )
    plt.show()

* Interesting relationships between model features, which could be inspected further. 
* Could be particularly useful for predicting a new recipe.

In [None]:
# SHAP values of the final model on the test split, shown as a summary plot.
# NOTE(review): this calls fit() on final_model again with the same training
# data; presumably fit() returns the estimator itself, so shap_model and
# final_model would be the same refitted object - confirm.
shap_model = final_model.fit(train_X, train_y)
explainer = shap.TreeExplainer(shap_model)
shap_values = explainer.shap_values(test_X)
shap.summary_plot(shap_values, test_X)

Increases compressive strength:
* Large amount of cement.
* Large amount of slag.
* Medium to large amounts of superplasticizer.
* Small amounts of fineaggregate.
* Larger age.

Decreases compressive strength:
* Large amount of flyash.
* Large amount of water.
* Large amount of coarseaggregate.
* Large amount of fineaggregate.

# 8. Engineering a New Recipe

In [None]:
# Row with the highest compressive strength in the data set.
data.loc[data['csMPa'].idxmax()]

In [None]:
# All rows whose compressive strength exceeds 78.
data.loc[data['csMPa'] > 78]

Max compressive strength in data set is 82.6MPa. Want to create a recipe that has a larger predicted compressive strength than this.

In [None]:
# Candidate recipe as a single-row frame with the same input columns as `data`
# (every column except the target 'csMPa').
# NOTE(review): the values are positional and must follow the column order of
# the CSV - presumably cement, slag, flyash, water, superplasticizer,
# coarseaggregate, fineaggregate, age; confirm against the file.
recipe = [[500.0, 175.0, 0.0, 143.0, 12.0, 1000.0, 750.0, 360.0]]
recipe_columns = data.columns.tolist()
recipe_columns.remove('csMPa')
new_recipe = pd.DataFrame(data=recipe, columns=recipe_columns)

In [None]:
# Add the same squared features that were added to the training data.
for col in features_to_square:
    new_recipe[f"{col}_sqrd"] = new_recipe[col] ** 2

# Run the recipe through the power transform and then the min-max scaler,
# keeping the column names after each step.
for fitted_step in (transformer, scaler):
    new_recipe = pd.DataFrame(fitted_step.transform(new_recipe), columns=new_recipe.columns)

Must apply all the same preprocessing steps.

In [None]:
# Predicted compressive strength of the preprocessed candidate recipe.
new_pred = final_model.predict(new_recipe)

print(new_pred)

Have found a recipe that is predicted to have a higher compressive strength than the maximum compressive strength within the data set. Ideally, the predicted strength would exceed that maximum by at least the average test-set error, to be more confident that it really is a better recipe. That is not the case here, but with a more intensive search this is very achievable.

# 9. Conclusions

* Minimal variety or amount of data available makes this problem difficult to create a very accurate model.
* It is expected that more data would help performance.
* Adequate performance considering the small amount of modelling work done and data provided.
* Managed to use the model to help predict a more effective recipe.

Future work:
* Look into how to effectively use 0-skewed data.
* Create a function to do a more exhaustive search for a better recipe.