# Concrete quality analysis (Linear Regression)

### Importing modules

In [31]:
import numpy as np

from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor

from pandas import read_excel

### Read file and convert as float array

In [32]:
df = read_excel("./ENB2012_data.xlsx",sheet_name="Sheet1",header=0,nrows=769,dtype=float)
df

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load
0,0.98,514.5,294.0,110.25,7.0,2.0,0.0,0.0,15.55
1,0.98,514.5,294.0,110.25,7.0,3.0,0.0,0.0,15.55
2,0.98,514.5,294.0,110.25,7.0,4.0,0.0,0.0,15.55
3,0.98,514.5,294.0,110.25,7.0,5.0,0.0,0.0,15.55
4,0.90,563.5,318.5,122.50,7.0,2.0,0.0,0.0,20.84
...,...,...,...,...,...,...,...,...,...
763,0.64,784.0,343.0,220.50,3.5,5.0,0.4,5.0,17.88
764,0.62,808.5,367.5,220.50,3.5,2.0,0.4,5.0,16.54
765,0.62,808.5,367.5,220.50,3.5,3.0,0.4,5.0,16.44
766,0.62,808.5,367.5,220.50,3.5,4.0,0.4,5.0,16.48


In [33]:
concrete_arr = df.to_numpy(float)
concrete_arr

array([[9.800e-01, 5.145e+02, 2.940e+02, ..., 0.000e+00, 0.000e+00,
        1.555e+01],
       [9.800e-01, 5.145e+02, 2.940e+02, ..., 0.000e+00, 0.000e+00,
        1.555e+01],
       [9.800e-01, 5.145e+02, 2.940e+02, ..., 0.000e+00, 0.000e+00,
        1.555e+01],
       ...,
       [6.200e-01, 8.085e+02, 3.675e+02, ..., 4.000e-01, 5.000e+00,
        1.644e+01],
       [6.200e-01, 8.085e+02, 3.675e+02, ..., 4.000e-01, 5.000e+00,
        1.648e+01],
       [6.200e-01, 8.085e+02, 3.675e+02, ..., 4.000e-01, 5.000e+00,
        1.664e+01]])

### Split array in features and results:

In [34]:
X = concrete_arr[:,:-1]
y = concrete_arr[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
X_train.shape

(691, 8)

### Split x and y into X_train, X_test, y_train, y_test

In [35]:
decisiontree_model = DecisionTreeRegressor(random_state=0)
decisiontree_model.fit(X_train,y_train)

### Model evaluation

In [36]:
cv_10_folds = KFold(n_splits=10)
cross_val_score(decisiontree_model,X_train,y_train,cv=cv_10_folds).mean() #Mean of R²

0.997141467292282

In [37]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, max_error
[cross_val_score(decisiontree_model,X_train,y_train,cv=cv_10_folds).mean(),
 mean_squared_error(y_true=y_test, y_pred=decisiontree_model.predict(X_test)),
 mean_absolute_error(y_true=y_test, y_pred=decisiontree_model.predict(X_test)),
 max_error(y_true=y_test, y_pred=decisiontree_model.predict(X_test))]

[0.997141467292282,
 0.4272851428571433,
 0.36927272727272736,
 3.0199999999999996]

Decision tree has a very good R² and a very good mse.
But it's still not good enough : 4.59 MPa is equivalent to a weight of 459t on a surface of 1 m².

In [38]:
X_with_squares_train = np.append(X_train, np.array([x * x for x in X_train]), axis=1)
X_with_squares_test = np.append(X_test, np.array([x * x for x in X_test]), axis=1)
X_with_squares_train

array([[7.900e-01, 6.370e+02, 3.430e+02, ..., 2.500e+01, 6.250e-02,
        2.500e+01],
       [9.800e-01, 5.145e+02, 2.940e+02, ..., 4.000e+00, 1.600e-01,
        2.500e+01],
       [7.100e-01, 7.105e+02, 2.695e+02, ..., 4.000e+00, 1.600e-01,
        1.000e+00],
       ...,
       [9.000e-01, 5.635e+02, 3.185e+02, ..., 9.000e+00, 1.600e-01,
        9.000e+00],
       [7.100e-01, 7.105e+02, 2.695e+02, ..., 2.500e+01, 1.600e-01,
        1.000e+00],
       [8.200e-01, 6.125e+02, 3.185e+02, ..., 4.000e+00, 1.600e-01,
        1.600e+01]])

In [39]:
decisiontree_model = DecisionTreeRegressor(random_state=0)
decisiontree_model.fit(X_with_squares_train,y_train)
cross_val_score(decisiontree_model,X_with_squares_train, y_train).mean()

0.9968487196892015

In [40]:
[mean_squared_error(y_true=y_test, y_pred=decisiontree_model.predict(X_with_squares_test)),
 mean_absolute_error(y_true=y_test, y_pred=decisiontree_model.predict(X_with_squares_test)),
 max_error(y_true=y_test, y_pred=decisiontree_model.predict(X_with_squares_test))]

[0.4110669610389611, 0.3607012987012987, 3.0199999999999996]