# Concrete quality analysis (Linear Regression)

### Importing modules

In [33]:
import numpy as np

from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

from pandas import read_excel

### Read file and convert as float array

In [34]:
df = read_excel("./ENB2012_data.xlsx",sheet_name="Sheet1",header=0,nrows=768,dtype=float)
df

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1
0,0.98,514.5,294.0,110.25,7.0,2.0,0.0,0.0,15.55
1,0.98,514.5,294.0,110.25,7.0,3.0,0.0,0.0,15.55
2,0.98,514.5,294.0,110.25,7.0,4.0,0.0,0.0,15.55
3,0.98,514.5,294.0,110.25,7.0,5.0,0.0,0.0,15.55
4,0.90,563.5,318.5,122.50,7.0,2.0,0.0,0.0,20.84
...,...,...,...,...,...,...,...,...,...
763,0.64,784.0,343.0,220.50,3.5,5.0,0.4,5.0,17.88
764,0.62,808.5,367.5,220.50,3.5,2.0,0.4,5.0,16.54
765,0.62,808.5,367.5,220.50,3.5,3.0,0.4,5.0,16.44
766,0.62,808.5,367.5,220.50,3.5,4.0,0.4,5.0,16.48


In [35]:
energy_arr = df.to_numpy(float)
energy_arr

array([[9.800e-01, 5.145e+02, 2.940e+02, ..., 0.000e+00, 0.000e+00,
        1.555e+01],
       [9.800e-01, 5.145e+02, 2.940e+02, ..., 0.000e+00, 0.000e+00,
        1.555e+01],
       [9.800e-01, 5.145e+02, 2.940e+02, ..., 0.000e+00, 0.000e+00,
        1.555e+01],
       ...,
       [6.200e-01, 8.085e+02, 3.675e+02, ..., 4.000e-01, 5.000e+00,
        1.644e+01],
       [6.200e-01, 8.085e+02, 3.675e+02, ..., 4.000e-01, 5.000e+00,
        1.648e+01],
       [6.200e-01, 8.085e+02, 3.675e+02, ..., 4.000e-01, 5.000e+00,
        1.664e+01]])

### Split array in features and results:

In [36]:
X = energy_arr[:,:-1]
y = energy_arr[:,-1]
X.shape

(768, 8)

### Split x and y into X_train, X_test, y_train, y_test

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
X_train.shape

(691, 8)

### Making the model with X_train and y_train (no other parameters)

In [38]:
linreg_model = LinearRegression()
linreg_model.fit(X_train,y_train)
linreg_model.coef_

array([-6.43166707e+01,  1.43185632e+12, -1.43185632e+12, -2.86371265e+12,
        4.04857940e+00, -3.73884134e-02,  1.95420280e+01,  1.83816359e-01])

In [39]:
mse(y_true=y_test,y_pred=linreg_model.predict(X_test))

12.696605863442286

### Cross validation

In [40]:
cv_10_folds = KFold(n_splits=10)
cross_val_score(linreg_model,X_train,y_train,cv=cv_10_folds).mean() #Mean of R²

0.9136085746141471

## Trying with X squares
Append X squares to X_train

In [41]:
X_with_squares_train = np.append(X_train, np.array([x*x for x in X_train]),axis=1)
X_with_squares_test = np.append(X_test, np.array([x*x for x in X_test]), axis=1)
X_with_squares_train

array([[7.900e-01, 6.370e+02, 3.430e+02, ..., 2.500e+01, 6.250e-02,
        2.500e+01],
       [9.800e-01, 5.145e+02, 2.940e+02, ..., 4.000e+00, 1.600e-01,
        2.500e+01],
       [7.100e-01, 7.105e+02, 2.695e+02, ..., 4.000e+00, 1.600e-01,
        1.000e+00],
       ...,
       [9.000e-01, 5.635e+02, 3.185e+02, ..., 9.000e+00, 1.600e-01,
        9.000e+00],
       [7.100e-01, 7.105e+02, 2.695e+02, ..., 2.500e+01, 1.600e-01,
        1.000e+00],
       [8.200e-01, 6.125e+02, 3.185e+02, ..., 4.000e+00, 1.600e-01,
        1.600e+01]])

In [42]:
linreg_model = LinearRegression()
linreg_model.fit(X_with_squares_train,y_train)
linreg_model.coef_

array([-1.83155171e+03,  6.75468330e+00, -2.10006945e+00,  4.42737595e+00,
       -4.19102737e-01,  1.57990523e-01,  3.19467456e+01,  1.13260848e+00,
        1.58272350e+03, -3.02498935e-03,  3.45769313e-04, -2.97377497e-02,
       -4.40057874e+00, -2.66217628e-02, -2.78159393e+01, -1.80966390e-01])

In [43]:
cross_val_score(linreg_model,X_with_squares_train,y_train,cv=cv_10_folds).mean()

0.936906921369894

In [44]:
mse(y_test,y_pred=linreg_model.predict(X_with_squares_test))

9.13292918098662

### Trying with cubes as well

In [45]:
X_with_cubes_and_squares_train = np.append(X_with_squares_train, np.array([x*x*x for x in X_train]),axis=1)
X_with_cubes_and_squares_test = np.append(X_with_squares_test, np.array([x*x*x for x in X_test]),axis=1)
linreg_model = LinearRegression()
linreg_model.fit(X_with_cubes_and_squares_train,y_train)
linreg_model.coef_

array([ 1.46320195e+05, -2.63264063e+01, -3.01695598e+01,  1.92221394e+00,
       -4.07776369e-01,  2.21655508e+00,  1.02518846e+02, -7.32886212e-01,
       -1.86220469e+05,  3.56169631e-02,  5.03007842e-02,  9.26121685e-01,
       -4.28198490e+00, -6.49508371e-01, -3.95204820e+02,  2.27689480e-01,
        7.73543700e+04, -8.42760999e-06, -4.46917241e-05, -4.99133671e-03,
       -3.49705564e+01,  5.94240553e-02,  5.32899669e+02, -2.35762012e-02])

In [46]:
cross_val_score(linreg_model,X_with_cubes_and_squares_train,y_train,cv=cv_10_folds).mean()

0.9888675904497461

In [47]:
mse(y_true=y_test,y_pred=linreg_model.predict(X_with_cubes_and_squares_test))

1.430228755954341

### Trying if root of features are pertinent

In [48]:
squares_with_root_train = np.array([x**(1/2) for x in X_with_squares_train])
X_with_squares_and_root_train = np.append(X_with_squares_train,squares_with_root_train,axis=1)

squares_with_root_test = np.array([x**(1/2) for x in X_with_squares_test])
X_with_squares_and_root_test = np.append(X_with_squares_test,squares_with_root_test,axis=1)

X_with_squares_and_root_train

array([[7.900e-01, 6.370e+02, 3.430e+02, ..., 5.000e+00, 2.500e-01,
        5.000e+00],
       [9.800e-01, 5.145e+02, 2.940e+02, ..., 2.000e+00, 4.000e-01,
        5.000e+00],
       [7.100e-01, 7.105e+02, 2.695e+02, ..., 2.000e+00, 4.000e-01,
        1.000e+00],
       ...,
       [9.000e-01, 5.635e+02, 3.185e+02, ..., 3.000e+00, 4.000e-01,
        3.000e+00],
       [7.100e-01, 7.105e+02, 2.695e+02, ..., 5.000e+00, 4.000e-01,
        1.000e+00],
       [8.200e-01, 6.125e+02, 3.185e+02, ..., 2.000e+00, 4.000e-01,
        4.000e+00]])

In [49]:
linreg_model.fit(X_with_squares_and_root_train,y_train)
cross_val_score(linreg_model,X_with_squares_and_root_train,y_train,cv=cv_10_folds).mean()

0.9888585697332161

In [50]:
mse(y_true=y_test,y_pred=linreg_model.predict(X_with_squares_and_root_test))

1.4297737659243912

At this point, model with squares and square roots of features seems to be the best model

## Calculating MDL and AIC between the three linear regression models

### MDL and AIC for the base X_train features

In [51]:
from math import log

def mdl(xtrain, xtest):
    linreg_model.fit(xtrain,y_train)
    k = linreg_model.n_features_in_
    n = xtrain.shape[0]
    mse_ = mse(y_true=y_test,y_pred=linreg_model.predict(xtest))
    return n*log(mse_)+k*log(n)

def aic(xtrain, xtest):
    linreg_model.fit(xtrain,y_train)
    k = linreg_model.n_features_in_
    n = xtrain.shape[0]
    mse_ = mse(y_true=y_test,y_pred=linreg_model.predict(xtest))
    return n*log(mse_)+k*2

In [52]:
mdl_simple = mdl(X_train,X_test)
mdl_squares = mdl(X_with_squares_train, X_with_squares_test)
mdl_squares_and_cubes = mdl(X_with_cubes_and_squares_train, X_with_cubes_and_squares_test)
mdl_squares_and_root = mdl(X_with_squares_and_root_train, X_with_squares_and_root_test)

[mdl_simple, mdl_squares, mdl_squares_and_cubes, mdl_squares_and_root]

[1808.367398293033, 1633.0237903618063, 404.178926638195, 456.2641866016796]

In [53]:
aic_simple = aic(X_train,X_test)
aic_squares = aic(X_with_squares_train, X_with_squares_test)
aic_squares_and_cubes = aic(X_with_cubes_and_squares_train, X_with_cubes_and_squares_test)
aic_squares_and_root = aic(X_with_squares_and_root_train, X_with_squares_and_root_test)

[aic_simple, aic_squares, aic_squares_and_cubes, aic_squares_and_root]

[1772.0622797028916, 1560.4135531815236, 295.2635708677709, 311.0437122411141]

AIC is minimized for the model with squares and cubes and MDL is minimized for the square and root models
Let's see a last time if we combine all those new features into one feature array

In [54]:
X_all_train = np.append(X_with_cubes_and_squares_train, [x**(1/2) for x in X_train], axis=1)
X_all_test = np.append(X_with_cubes_and_squares_test, [x**(1/2) for x in X_test], axis=1)

linreg_model.fit(X_all_train,y_train)
linreg_model.coef_

array([ 5.52050786e-02, -2.35975492e+02, -2.39101606e+02,  1.56330487e+00,
       -3.22783820e-01,  1.68624310e+00,  1.75433733e+02, -3.22411493e+01,
       -2.36958227e+02,  3.43127851e-02,  5.01550356e-01,  3.58716288e+00,
       -3.38923038e+00, -5.83800266e-01, -4.93322524e+01,  4.24373010e+00,
        3.53506756e+02, -1.80005709e-05, -3.23931629e-04, -9.73124460e-03,
       -2.76787160e+01,  5.48666984e-02, -9.31681345e+01, -3.02271487e-01,
        1.47887082e+02, -1.67475957e+02,  8.44510035e+03,  1.22043281e-01,
       -7.14664289e-02,  9.08331705e-01, -1.08513428e+02,  5.19435276e+01])

In [55]:
cross_val_score(linreg_model, X_all_train, y_train).mean()

0.9890900599649648

In [56]:
mse(y_true=y_test,y_pred=linreg_model.predict(X_all_test))

1.4218824721123728

In [57]:
mdl(X_all_train,X_all_test)

452.43981403502517

In [58]:
aic(X_all_train, X_all_test)

307.2193396744597

Seems like we can't really optimize the model further
Because AIC is lowest for the squares and cube model, I will conclude the best linear regression model for this dataset is the squares and cube model.