# Concrete quality analysis (Linear Regression)

### Importing modules

In [1]:
import numpy as np

from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn import metrics

from pandas import read_excel

### Read file and convert as float array

In [2]:
df = read_excel("./Concrete_Data.xls",sheet_name="Sheet1",header=0,nrows=1030,dtype=float)
df

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28.0,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28.0,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270.0,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365.0,41.052780
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360.0,44.296075
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28.0,44.284354
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28.0,31.178794
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28.0,23.696601
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28.0,32.768036


In [3]:
concrete_arr = df.to_numpy(float)
concrete_arr

array([[540.        ,   0.        ,   0.        , ..., 676.        ,
         28.        ,  79.98611076],
       [540.        ,   0.        ,   0.        , ..., 676.        ,
         28.        ,  61.88736576],
       [332.5       , 142.5       ,   0.        , ..., 594.        ,
        270.        ,  40.26953526],
       ...,
       [148.5       , 139.4       , 108.6       , ..., 780.        ,
         28.        ,  23.69660064],
       [159.1       , 186.7       ,   0.        , ..., 788.9       ,
         28.        ,  32.76803638],
       [260.9       , 100.5       ,  78.3       , ..., 761.5       ,
         28.        ,  32.40123514]])

### Split array in features and results:

In [4]:
X = concrete_arr[:,:-1]
y = concrete_arr[:,-1]
X.shape

(1030, 8)

### Add squared values of X to array

In [5]:
squares = np.array([x*x for x in X])
X_with_squares = np.append(X,squares,axis=1)
X_with_squares

array([[5.4000000e+02, 0.0000000e+00, 0.0000000e+00, ..., 1.0816000e+06,
        4.5697600e+05, 7.8400000e+02],
       [5.4000000e+02, 0.0000000e+00, 0.0000000e+00, ..., 1.1130250e+06,
        4.5697600e+05, 7.8400000e+02],
       [3.3250000e+02, 1.4250000e+02, 0.0000000e+00, ..., 8.6862400e+05,
        3.5283600e+05, 7.2900000e+04],
       ...,
       [1.4850000e+02, 1.3940000e+02, 1.0860000e+02, ..., 7.9637776e+05,
        6.0840000e+05, 7.8400000e+02],
       [1.5910000e+02, 1.8670000e+02, 0.0000000e+00, ..., 9.7930816e+05,
        6.2236321e+05, 7.8400000e+02],
       [2.6090000e+02, 1.0050000e+02, 7.8300000e+01, ..., 7.4736025e+05,
        5.7988225e+05, 7.8400000e+02]])

### Split x and y into X_train, X_test, y_train, y_test

In [6]:
linreg_model = LinearRegression()
linreg_model.fit(X,y)
linreg_model.coef_

array([ 0.11978526,  0.10384725,  0.08794308, -0.1502979 ,  0.29068694,
        0.01803018,  0.02015446,  0.11422562])

### Model selection

In [7]:
cv = KFold(n_splits=10)
cross_val_score(linreg_model,X,y,cv=cv).mean() #Mean of R²

0.2780755819787789

### Trying with X squares

In [8]:
linreg_model = LinearRegression()
linreg_model.fit(X_with_squares,y)
linreg_model.coef_

array([ 1.27420305e-01,  1.22195754e-01,  8.73254608e-02, -6.41194600e-01,
        9.55781342e-01, -9.98483192e-02,  2.62928377e-01,  3.52160052e-01,
       -2.36155649e-05, -1.35960075e-04, -2.56008197e-04,  1.30356072e-03,
       -4.02192350e-02,  5.57426406e-05, -1.64833919e-04, -8.03475063e-04])

In [9]:
cross_val_score(linreg_model,X_with_squares,y,cv=cv).mean()

0.5564269205150838

### Trying with cubes as well

In [10]:
cubes = np.array([x*x for x in X])
X_with_cubes_and_squares = np.append(X_with_squares,cubes,axis=1)
linreg_model = LinearRegression()
linreg_model.fit(X_with_cubes_and_squares,y)
linreg_model.coef_

array([ 1.18181140e-01,  1.13140972e-01,  7.72408212e-02, -6.45977928e-01,
        1.09231160e+00, -1.00911775e-01,  2.55886973e-01,  3.53308503e-01,
        2.24310013e+06,  9.50287024e+06, -1.22268177e+06,  1.18603547e+05,
       -2.51295454e+03, -7.88621770e+09,  3.61295473e+09, -7.46936431e+07,
       -2.24310013e+06, -9.50287024e+06,  1.22268177e+06, -1.18603546e+05,
        2.51291392e+03,  7.88621770e+09, -3.61295473e+09,  7.46936431e+07])

In [11]:
cross_val_score(linreg_model,X_with_cubes_and_squares,y,cv=cv).mean()

0.5430573202951481

### Trying if root of features are pertinent

In [12]:
squares_with_root = np.array([x**(1/2) for x in X])
X_with_squares_and_root = np.append(X_with_squares,squares_with_root,axis=1)
X_with_squares_and_root

array([[540.        ,   0.        ,   0.        , ...,  32.24903099,
         26.        ,   5.29150262],
       [540.        ,   0.        ,   0.        , ...,  32.48076354,
         26.        ,   5.29150262],
       [332.5       , 142.5       ,   0.        , ...,  30.52867504,
         24.37211521,  16.43167673],
       ...,
       [148.5       , 139.4       , 108.6       , ...,  29.87306479,
         27.92848009,   5.29150262],
       [159.1       , 186.7       ,   0.        , ...,  31.45790839,
         28.08736371,   5.29150262],
       [260.9       , 100.5       ,  78.3       , ...,  29.40238086,
         27.59528945,   5.29150262]])

In [13]:
linreg_model.fit(X_with_squares_and_root,y)
cross_val_score(linreg_model,X_with_squares_and_root,y,cv=cv).mean()

0.6799903044089444

### Calculating MDL and AIC

In [14]:
import math
mse = lambda data : metrics.mean_squared_error(y,linreg_model.predict(data))
mdl = lambda data: 1030*math.log(mse(data))+24*math.log(1030)
print(mdl(X_with_squares_and_root))

3892.367221182132
