# Concrete quality analysis (k-NN)

In [16]:
from pandas import read_excel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.neighbors import KNeighborsRegressor

In [17]:
# Load the first 1030 data rows of "Sheet1" (row 0 is the header row),
# reading every column as float.
df = read_excel(
    "./Concrete_Data.xls",
    sheet_name="Sheet1",
    header=0,
    nrows=1030,
    dtype=float,
)
df

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28.0,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28.0,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270.0,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365.0,41.052780
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360.0,44.296075
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28.0,44.284354
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28.0,31.178794
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28.0,23.696601
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28.0,32.768036


In [18]:
# Convert the whole frame (feature columns and the target column) to a float NumPy array.
concrete_arr = df.to_numpy(dtype=float)
concrete_arr

array([[540.        ,   0.        ,   0.        , ..., 676.        ,
         28.        ,  79.98611076],
       [540.        ,   0.        ,   0.        , ..., 676.        ,
         28.        ,  61.88736576],
       [332.5       , 142.5       ,   0.        , ..., 594.        ,
        270.        ,  40.26953526],
       ...,
       [148.5       , 139.4       , 108.6       , ..., 780.        ,
         28.        ,  23.69660064],
       [159.1       , 186.7       ,   0.        , ..., 788.9       ,
         28.        ,  32.76803638],
       [260.9       , 100.5       ,  78.3       , ..., 761.5       ,
         28.        ,  32.40123514]])

In [19]:
# The last column is the target (compressive strength); every column before it is a feature.
X, y = concrete_arr[:, :-1], concrete_arr[:, -1]

# Hold out 10% of the rows as a test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)
X_train.shape

(927, 8)

In [20]:
# Baseline model: 3 nearest neighbours, with closer neighbours weighted more heavily.
kNN = KNeighborsRegressor(weights='distance', n_neighbors=3)
kNN.fit(X_train, y_train)

In [21]:
# 10-fold splitter, reused by every cross-validation call later in the notebook.
cv_10_folds = KFold(n_splits=10)
# Mean of the per-fold R² scores on the training split.
cross_val_score(kNN, X_train, y_train, cv=cv_10_folds).mean()

0.781667639426521

In [22]:
# Compare k = 1..9 by mean 10-fold cross-validated R² on the training split.
k_values = range(1, 10)
avgs = []
for k in k_values:
    kNN = KNeighborsRegressor(n_neighbors=k, weights='distance')
    # cross_val_score clones the estimator and fits the clone on each training
    # fold, so no explicit fit is needed here; in particular the model is never
    # fitted on the full X, y, which would include the held-out test rows.
    score_for_k = cross_val_score(kNN, X_train, y_train, cv=cv_10_folds).mean()
    avgs.append(score_for_k)

# Mean R² is maximised for k equal to (looked up in k_values so the result
# stays correct if the candidate range is changed):
best_k = k_values[avgs.index(max(avgs))]
best_k

4

So the best k is 4.

In [23]:
# Rebuild the model with the selected k, fit it on the training split,
# and report its mean 10-fold cross-validated R².
kNN = KNeighborsRegressor(weights='distance', n_neighbors=best_k)
kNN.fit(X_train, y_train)
cross_val_score(kNN, X_train, y_train, cv=cv_10_folds).mean()

0.7840294846834504

In [24]:
# Mean squared error of the fitted model on the held-out test split.
# (mean_squared_error is imported in the notebook's top import cell.)
mean_squared_error(y_test, kNN.predict(X_test))

65.03983112097112

The predictive performance of k-NN is comparable to that of linear regression on the features and their squares.

### Searching for best K using MSE

In [25]:
# Compare k = 1..9 by mean 10-fold cross-validated MSE on the training split.
#
# The MSE must be measured on rows the model was not fitted on. Scoring a
# distance-weighted k-NN on its own fitting data is uninformative: every query
# point is its own nearest neighbour at distance zero, so the error is close to
# zero whatever k is.
k_values = range(1, 10)
avgs = []
for k in k_values:
    kNN = KNeighborsRegressor(n_neighbors=k, weights='distance')
    # scikit-learn scorers follow "greater is better", so MSE is exposed as
    # its negative; flip the sign back to get a plain MSE.
    neg_mse_per_fold = cross_val_score(
        kNN, X_train, y_train, cv=cv_10_folds, scoring='neg_mean_squared_error'
    )
    score_for_k = -neg_mse_per_fold.mean()
    avgs.append(score_for_k)

# Mean cross-validated MSE is minimised for k equal to:
best_k = k_values[avgs.index(min(avgs))]
best_k

4

So best k using MSE is 4

In [26]:
# Refit with the k chosen by the MSE search and report the mean 10-fold R²
# on the training split.
kNN = KNeighborsRegressor(weights='distance', n_neighbors=best_k)
kNN.fit(X_train, y_train)
cross_val_score(kNN, X_train, y_train, cv=cv_10_folds).mean()

0.7840294846834504

In [27]:
# Test-split MSE of the model refitted with the k chosen by the MSE search.
# (mean_squared_error is imported in the notebook's top import cell.)
mean_squared_error(y_test, kNN.predict(X_test))

65.03983112097112

### Searching for best K using R²

In [28]:
# Compare k = 1..9 by mean 10-fold cross-validated R² on the training split.
k_values = range(1, 10)
avgs = []
for k in k_values:
    kNN = KNeighborsRegressor(n_neighbors=k, weights='distance')
    # cross_val_score clones the estimator and fits the clone on each training
    # fold, so no explicit fit is needed here; in particular the model is never
    # fitted on the full X, y, which would include the held-out test rows.
    score_for_k = cross_val_score(kNN, X_train, y_train, cv=cv_10_folds).mean()
    avgs.append(score_for_k)

# Mean R² is maximised for k equal to (looked up in k_values so the result
# stays correct if the candidate range is changed):
best_k = k_values[avgs.index(max(avgs))]
best_k

4

So the best k using R² is 4.

In [29]:
# Refit with the k chosen by the R² search and report the mean 10-fold R²
# on the training split.
kNN = KNeighborsRegressor(weights='distance', n_neighbors=best_k)
kNN.fit(X_train, y_train)
cross_val_score(kNN, X_train, y_train, cv=cv_10_folds).mean()

0.7840294846834504

In [30]:
# Test-split MSE of the model refitted with the k chosen by the R² search.
# (mean_squared_error is imported in the notebook's top import cell.)
mean_squared_error(y_test, kNN.predict(X_test))

65.03983112097112

In conclusion, the best value of the k parameter for the k-nearest-neighbours model is 4. Its predictive performance is comparable to that of linear regression on the features and their squares.