# <center>Concrete</center>
---

## Regression

In [1]:
# import libraries
import numpy as np
import pandas as pd
import sklearn.tree as tree
import sklearn.ensemble as ensemble
import sklearn.preprocessing as pre_pro
import sklearn.metrics as eval_metrics
import sklearn.feature_selection as feat_selec
from sklearn.model_selection import train_test_split

### Load Data 

In [2]:
data = pd.read_excel("data/Concrete_Data.xls")

In [3]:
data.head()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [4]:
data.shape

(1030, 9)

### [Data Description](https://archive.ics.uci.edu/ml/datasets/concrete+compressive+strength)
Given are the variable name, variable type, the measurement unit and a brief description. The concrete compressive strength is the regression problem. The order of this listing corresponds to the order of numerals along the rows of the database. 

Name -- Data Type -- Measurement -- Description 

1) Cement (component 1) -- quantitative -- kg in a m3 mixture -- Input Variable 
2) Blast Furnace Slag (component 2) -- quantitative -- kg in a m3 mixture -- Input Variable 
3) Fly Ash (component 3) -- quantitative -- kg in a m3 mixture -- Input Variable 
4) Water (component 4) -- quantitative -- kg in a m3 mixture -- Input Variable 
5) Superplasticizer (component 5) -- quantitative -- kg in a m3 mixture -- Input Variable 
6) Coarse Aggregate (component 6) -- quantitative -- kg in a m3 mixture -- Input Variable 
7) Fine Aggregate (component 7) -- quantitative -- kg in a m3 mixture -- Input Variable 
8) Age -- quantitative -- Day (1~365) -- Input Variable 
9) Concrete compressive strength -- quantitative -- MPa -- Output Variable `Target`

## Data Preparation

In [5]:
# rename columns
data.columns = ["Cement","Blast Furnace Slag","Fly Ash","Water","Superplasticizer",\
                "Coarse Aggregate","Fine Aggregate","Age (in Days)","Target: Concrete compressive strength"]

In [6]:
data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age (in Days),Target: Concrete compressive strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


### Split Data

In [7]:
X = data.drop("Target: Concrete compressive strength",axis=1)
y = data["Target: Concrete compressive strength"]

In [8]:
X.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age (in Days)
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


In [9]:
y.head()

0    79.986111
1    61.887366
2    40.269535
3    41.052780
4    44.296075
Name: Target: Concrete compressive strength, dtype: float64

In [10]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=11)

In [11]:
X.shape, y.shape

((1030, 8), (1030,))

In [12]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((824, 8), (206, 8), (824,), (206,))

## Modelling

In [13]:
model = ensemble.RandomForestRegressor(criterion="squared_error", n_estimators=100)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

## Evaluation

In [14]:
model.score(X_test,y_test)

0.9160051996034163

# Feature Selection | Filter Method | F-Score

**F-Score for Regression**

Univariate linear regression tests returning F-statistic and p-values.

Quick linear model for testing the effect of a single regressor,
sequentially for many regressors.

This is done in 2 steps:

1. The cross correlation between each regressor and the target is computed
   using :func:`r_regression`
   
2. It is converted to an F score and then to a p-value.

## Splitting Data

In [15]:
X1 = X.copy(deep=True)
y1 = y.copy(deep=True)

In [16]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X1,y1, test_size=0.2, random_state=11)

In [17]:
# Returns two array one: is F_Score and another: is its P-Value
f_Score_P_val = feat_selec.f_regression(X1,y1)

print(f"F-Score:\n {f_Score_P_val[0]}\n")
print(f"P-Value:\n {f_Score_P_val[1]}")

F-Score:
 [338.72579369  19.03257174  11.62694924  94.11879731 159.10932173
  28.74470957  29.58293761 124.67320926]

P-Value:
 [1.32345795e-65 1.41457499e-05 6.75283560e-04 2.36607270e-21
 5.07908924e-34 1.01959722e-07 6.69468148e-08 2.10314421e-27]


In [18]:
# selecting top k best features 
# Select features according to the k highest scores.
# k : represents the no. of top k features we want 
skbest = feat_selec.SelectKBest(score_func=feat_selec.f_regression, k=7)

In [19]:
# fit transform
selected_features = skbest.fit_transform(X1,y1)

In [20]:
# selected features
print(skbest.get_feature_names_out())

list(X.columns[skbest.get_support()])

['Cement' 'Blast Furnace Slag' 'Water' 'Superplasticizer'
 'Coarse Aggregate' 'Fine Aggregate' 'Age (in Days)']


['Cement',
 'Blast Furnace Slag',
 'Water',
 'Superplasticizer',
 'Coarse Aggregate',
 'Fine Aggregate',
 'Age (in Days)']

## Selecting Features
Selecting the top k best features

In [21]:
# Selecting the top k best features
X_train_modi = X_train[skbest.get_feature_names_out()]
X_test_modi = X_test[skbest.get_feature_names_out()]

In [22]:
X_train_modi.head()

Unnamed: 0,Cement,Blast Furnace Slag,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age (in Days)
144,475.0,118.8,181.1,8.9,852.1,781.5,56
488,387.0,20.0,157.0,14.32,938.0,845.0,3
974,148.1,0.0,181.4,15.0,838.9,884.3,28
895,260.0,101.0,171.0,10.0,936.0,763.0,28
627,200.0,0.0,180.0,0.0,1125.0,845.0,7


In [23]:
X_test_modi.head()

Unnamed: 0,Cement,Blast Furnace Slag,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age (in Days)
86,362.6,189.0,164.9,11.6,944.7,755.8,3
324,252.31,0.0,146.25,14.17,987.76,889.01,3
786,331.0,0.0,192.0,0.0,978.0,825.0,28
278,251.37,0.0,188.45,5.75,1028.4,757.73,100
353,213.5,0.0,154.61,11.66,1052.3,775.48,100


## Modelling

In [24]:
model = ensemble.RandomForestRegressor(criterion="squared_error", n_estimators=100)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

## Evaluation

In [25]:
model.score(X_test,y_test)

0.9143391029278818

# Feature Selection | Wrapper Method | `Forward Feature Selection`
`.SequentialFeatureSelector():`</br>
Transformer that performs Sequential Feature Selection.

This Sequential Feature Selector adds (forward selection) or
removes (backward selection) features to form a feature subset in a
greedy fashion. At each stage, this estimator chooses the best feature to
add or remove based on the cross-validation score of an estimator. In
the case of unsupervised learning, this Sequential Feature Selector
looks only at the features (X), not the desired outputs (y).

## Splitting Data

In [26]:
X2 = X.copy(deep=True)
y2 = X.copy(deep=True)

In [27]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X1,y1, test_size=0.2, random_state=11)

## Model

In [28]:
model = ensemble.RandomForestRegressor(criterion="squared_error",random_state=11,n_estimators=100)

## Selecting Features

In [29]:
forward_feature_selection = feat_selec.SequentialFeatureSelector(model,n_features_to_select=7, direction="forward",scoring="r2",cv=5)

In [30]:
# selected features
selected_features = forward_feature_selection.fit_transform(X2,y2)

In [31]:
forward_feature_selection.get_feature_names_out()

array(['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water',
       'Superplasticizer', 'Coarse Aggregate', 'Age (in Days)'],
      dtype=object)

In [32]:
# using selected features
X_train = X_train[forward_feature_selection.get_feature_names_out()]
X_test = X_test[forward_feature_selection.get_feature_names_out()]

## Modelling

In [33]:
model = ensemble.RandomForestRegressor(criterion="squared_error",random_state=11,n_estimators=100)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

## Evaluation

In [34]:
model.score(X_test,y_test)

0.9127870385208179

# Feature Selection | Wrapper Method | `Backward Feature Selection/Elimination`
`.SequentialFeatureSelector():`</br>

## Splitting Data

In [35]:
X3 = X.copy(deep=True)
y3 = y.copy(deep=True)

In [36]:
# train test split
X_train,X_test,y_train,y_test = train_test_split(X3,y3,test_size=0.2,random_state=11)

## Backward Feature Elimination

In [37]:
X.shape

(1030, 8)

In [38]:
# model 
model = ensemble.RandomForestRegressor(n_estimators=300)

# backward feature elimination
backward = feat_selec.SequentialFeatureSelector(model, n_features_to_select=6, direction="backward",scoring="r2",cv=5)

# selected features
selected_features = backward.fit_transform(X3,y3)

In [39]:
# display selected features
backward.get_feature_names_out()

array(['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water',
       'Coarse Aggregate', 'Age (in Days)'], dtype=object)

In [40]:
# replacing old features with selected features
X_train = X_train[backward.get_feature_names_out()]
X_test = X_test[backward.get_feature_names_out()]

## Modelling

In [41]:
# model
model = ensemble.RandomForestRegressor(n_estimators=300)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

## Evaluation

In [42]:
model.score(X_test,y_test)

0.9112604504869206

## RMSE

In [43]:
# Mean Squared Error
mse = eval_metrics.mean_squared_error(y_test,y_pred)

# Root Mean Squared Error
rmse = np.sqrt(mse)

print(f"MSE : {mse}")
print(f"RMSE : {rmse}")

MSE : 21.793173598161708
RMSE : 4.66831592741555
