# Concrete quality analysis (Decision Tree Regression)

### Importing modules

In [211]:
from pandas import read_excel
from sklearn.metrics import max_error, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor

### Read the file and convert it to a float array

In [212]:
# Load the concrete data sheet into a DataFrame, forcing every column to float.
df = read_excel(
    "./Concrete_Data.xls",
    sheet_name="Sheet1",
    header=0,      # the first row holds the column names
    nrows=1030,    # number of data rows to read
    dtype=float,
)
df

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28.0,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28.0,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270.0,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365.0,41.052780
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360.0,44.296075
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28.0,44.284354
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28.0,31.178794
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28.0,23.696601
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28.0,32.768036


In [213]:
# Convert the DataFrame into a 2-D NumPy array of floats (one row per sample).
concrete_arr = df.to_numpy(dtype=float)
concrete_arr

array([[540.        ,   0.        ,   0.        , ..., 676.        ,
         28.        ,  79.98611076],
       [540.        ,   0.        ,   0.        , ..., 676.        ,
         28.        ,  61.88736576],
       [332.5       , 142.5       ,   0.        , ..., 594.        ,
        270.        ,  40.26953526],
       ...,
       [148.5       , 139.4       , 108.6       , ..., 780.        ,
         28.        ,  23.69660064],
       [159.1       , 186.7       ,   0.        , ..., 788.9       ,
         28.        ,  32.76803638],
       [260.9       , 100.5       ,  78.3       , ..., 761.5       ,
         28.        ,  32.40123514]])

### Split the array into features and target, then into train and test sets

In [214]:
# The last column is the target (compressive strength); all other columns are features.
X, y = concrete_arr[:, :-1], concrete_arr[:, -1]

# Hold out 10 % of the samples as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)
X_train.shape

(927, 8)

### Fit the decision tree model on the training set

In [215]:
# Build the regressor with a fixed seed so repeated runs grow the same tree,
# then fit it on the training split only (the test split stays unseen).
decisiontree_model = DecisionTreeRegressor(random_state=0)
decisiontree_model.fit(X=X_train, y=y_train)

### Model evaluation

In [216]:
# 10-fold cross-validation on the training data only.
cv_10_folds = KFold(n_splits=10)
fold_r2_scores = cross_val_score(decisiontree_model, X_train, y_train, cv=cv_10_folds)
fold_r2_scores.mean()  # Mean of R² across the folds

0.8541232539473453

In [217]:
# Predict once on the held-out test set and reuse the predictions for every
# metric, instead of re-running the model for each one.
y_test_pred = decisiontree_model.predict(X_test)

# Test-set errors as [MSE, MAE, max error]; the target is in MPa, so MAE and
# max error are in MPa and MSE is in MPa².
[mean_squared_error(y_true=y_test, y_pred=y_test_pred),
 mean_absolute_error(y_true=y_test, y_pred=y_test_pred),
 max_error(y_true=y_test, y_pred=y_test_pred)]

[45.25985054172565, 4.598846039947004, 22.656181359999998]

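The decision tree has a very good R² (≈ 0.85 in cross-validation) and a reasonable MSE (≈ 45.3 MPa²).
But it is still not good enough: the mean absolute error of ≈ 4.6 MPa is equivalent to a weight of about 460 t on a surface of 1 m² (taking g ≈ 10 m/s²).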