# Concrete quality analysis (k-NN)

In [40]:
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.neighbors import KNeighborsRegressor

from pandas import read_excel

In [41]:
df = read_excel("./ENB2012_data.xlsx",sheet_name="Sheet1",header=0,nrows=769,dtype=float)
df

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1
0,0.98,514.5,294.0,110.25,7.0,2.0,0.0,0.0,15.55
1,0.98,514.5,294.0,110.25,7.0,3.0,0.0,0.0,15.55
2,0.98,514.5,294.0,110.25,7.0,4.0,0.0,0.0,15.55
3,0.98,514.5,294.0,110.25,7.0,5.0,0.0,0.0,15.55
4,0.90,563.5,318.5,122.50,7.0,2.0,0.0,0.0,20.84
...,...,...,...,...,...,...,...,...,...
763,0.64,784.0,343.0,220.50,3.5,5.0,0.4,5.0,17.88
764,0.62,808.5,367.5,220.50,3.5,2.0,0.4,5.0,16.54
765,0.62,808.5,367.5,220.50,3.5,3.0,0.4,5.0,16.44
766,0.62,808.5,367.5,220.50,3.5,4.0,0.4,5.0,16.48


In [42]:
concrete_arr = df.to_numpy(float)
concrete_arr

array([[9.800e-01, 5.145e+02, 2.940e+02, ..., 0.000e+00, 0.000e+00,
        1.555e+01],
       [9.800e-01, 5.145e+02, 2.940e+02, ..., 0.000e+00, 0.000e+00,
        1.555e+01],
       [9.800e-01, 5.145e+02, 2.940e+02, ..., 0.000e+00, 0.000e+00,
        1.555e+01],
       ...,
       [6.200e-01, 8.085e+02, 3.675e+02, ..., 4.000e-01, 5.000e+00,
        1.644e+01],
       [6.200e-01, 8.085e+02, 3.675e+02, ..., 4.000e-01, 5.000e+00,
        1.648e+01],
       [6.200e-01, 8.085e+02, 3.675e+02, ..., 4.000e-01, 5.000e+00,
        1.664e+01]])

In [43]:
X = concrete_arr[:,:-1]
y = concrete_arr[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
X_train.shape

(691, 8)

In [44]:
kNN = KNeighborsRegressor(n_neighbors=3, weights='distance')
kNN.fit(X_train,y_train)

In [45]:
cv_10_folds = KFold(n_splits=10)
cross_val_score(kNN,X_train,y_train,cv=cv_10_folds).mean() #return mean of R²

0.9168101429130976

### Searching for best K using MSE

In [46]:
from sklearn.metrics import mean_squared_error

avgs = []
for i in range(1,10):
    kNN = KNeighborsRegressor(n_neighbors=i,weights='distance')
    kNN.fit(X,y)
    score_for_k = mean_squared_error(y_train,kNN.predict(X_train)).mean()
    avgs.append(score_for_k)

#R² mean is maxed for k equal to :
best_k = avgs.index(min(avgs)) + 1
best_k

1

So best k using MSE is 1

In [47]:
kNN = KNeighborsRegressor(n_neighbors=best_k, weights='distance')
kNN.fit(X_train,y_train)
cross_val_score(kNN,X_train,y_train,cv=cv_10_folds).mean()

0.9114871204046711

In [48]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, kNN.predict(X_test))

8.327253974025973

### Searching for best K using R²

In [49]:
from sklearn.metrics import mean_squared_error

avgs = []
for i in range(1,10):
    kNN = KNeighborsRegressor(n_neighbors=i,weights='distance')
    kNN.fit(X,y)
    score_for_k = cross_val_score(kNN,X_train,y_train,cv=cv_10_folds).mean()
    avgs.append(score_for_k)

#R² mean is maxed for k equal to :
best_k = avgs.index(max(avgs)) + 1
best_k

5

So best k using R² is 5

In [50]:
kNN = KNeighborsRegressor(n_neighbors=best_k, weights='distance')
kNN.fit(X_train,y_train)
cross_val_score(kNN,X_train,y_train,cv=cv_10_folds).mean()

0.9201057859432755

In [51]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, kNN.predict(X_test))

7.297378271805544

K nearest neighbors seems to be a pretty precise model for this dataset. Whether we use 5 or 1 as the k parameter does not seem to influence the precision of the model for this dataset. This can be explained by the low variance of certain features relative to others (see plots notebook)