# Concrete quality analysis (k-NN)

In [88]:
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error,max_error

from pandas import read_excel

In [89]:
df = read_excel("./ENB2012_data.xlsx",sheet_name="Sheet1",header=0,nrows=769,dtype=float)
df

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load
0,0.98,514.5,294.0,110.25,7.0,2.0,0.0,0.0,15.55
1,0.98,514.5,294.0,110.25,7.0,3.0,0.0,0.0,15.55
2,0.98,514.5,294.0,110.25,7.0,4.0,0.0,0.0,15.55
3,0.98,514.5,294.0,110.25,7.0,5.0,0.0,0.0,15.55
4,0.90,563.5,318.5,122.50,7.0,2.0,0.0,0.0,20.84
...,...,...,...,...,...,...,...,...,...
763,0.64,784.0,343.0,220.50,3.5,5.0,0.4,5.0,17.88
764,0.62,808.5,367.5,220.50,3.5,2.0,0.4,5.0,16.54
765,0.62,808.5,367.5,220.50,3.5,3.0,0.4,5.0,16.44
766,0.62,808.5,367.5,220.50,3.5,4.0,0.4,5.0,16.48


In [90]:
concrete_arr = df.to_numpy(float)
concrete_arr

array([[9.800e-01, 5.145e+02, 2.940e+02, ..., 0.000e+00, 0.000e+00,
        1.555e+01],
       [9.800e-01, 5.145e+02, 2.940e+02, ..., 0.000e+00, 0.000e+00,
        1.555e+01],
       [9.800e-01, 5.145e+02, 2.940e+02, ..., 0.000e+00, 0.000e+00,
        1.555e+01],
       ...,
       [6.200e-01, 8.085e+02, 3.675e+02, ..., 4.000e-01, 5.000e+00,
        1.644e+01],
       [6.200e-01, 8.085e+02, 3.675e+02, ..., 4.000e-01, 5.000e+00,
        1.648e+01],
       [6.200e-01, 8.085e+02, 3.675e+02, ..., 4.000e-01, 5.000e+00,
        1.664e+01]])

In [91]:
X = concrete_arr[:,:-1]
y = concrete_arr[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
X_train.shape

(691, 8)

In [92]:
kNN = KNeighborsRegressor(n_neighbors=3, weights='distance')
kNN.fit(X_train,y_train)
kNN.n_features_in_

8

In [93]:
cv_10_folds = KFold(n_splits=10)
cross_val_score(kNN,X_train,y_train,cv=cv_10_folds).mean() #return mean of R²

0.9168101429130976

In [94]:
[mean_squared_error(y_true=y_test, y_pred=kNN.predict(X_test)),
 mean_absolute_error(y_true=y_test, y_pred=kNN.predict(X_test)),
 max_error(y_true=y_test, y_pred=kNN.predict(X_test))]

[7.792218769549006, 2.2961441905588393, 5.711823437268091]

### Searching for best K using R²

In [95]:
avgs = []
for i in range(1,10):
    kNN = KNeighborsRegressor(n_neighbors=i,weights='distance')
    kNN.fit(X,y)
    score_for_k = cross_val_score(kNN,X_train,y_train,cv=cv_10_folds).mean()
    avgs.append(score_for_k)

#R² mean is maxed for k equal to :
best_k = avgs.index(max(avgs))+1
best_k

5

In [96]:
kNN = KNeighborsRegressor(n_neighbors=best_k, weights='distance')
kNN.fit(X_train,y_train)
cross_val_score(kNN,X_train,y_train,cv=cv_10_folds).mean()

0.9201057859432755

In [97]:
[mean_squared_error(y_true=y_test, y_pred=kNN.predict(X_test)),
 mean_absolute_error(y_true=y_test, y_pred=kNN.predict(X_test)),
 max_error(y_true=y_test, y_pred=kNN.predict(X_test))]

[7.297378271805544, 2.152793607318034, 7.320505641419359]

kNN Predictive performance is comparable to linear regression with features and their squares

### Searching for best K using MSE

In [98]:
avgs = []
for i in range(1,10):
    kNN = KNeighborsRegressor(n_neighbors=i,weights='distance')
    kNN.fit(X,y)
    score_for_k = mean_squared_error(y_train,kNN.predict(X_train)).mean()
    avgs.append(score_for_k)

#Mean squared error is maxed for k equal to :
best_k = avgs.index(min(avgs)) + 1
best_k

1

In [99]:
kNN = KNeighborsRegressor(n_neighbors=best_k, weights='distance')
kNN.fit(X_train, y_train)
[cross_val_score(kNN, X_train, y_train, cv=cv_10_folds).mean(),
 mean_squared_error(y_true=y_test, y_pred=kNN.predict(X_test)),
 mean_absolute_error(y_true=y_test, y_pred=kNN.predict(X_test)),
 max_error(y_true=y_test, y_pred=kNN.predict(X_test))]

[0.9114871204046711, 8.327253974025973, 2.4988831168831167, 8.29]

## Removing distance weighing

### Searching for best value of K (using MSE)

In [100]:
avgs = []
for i in range(1,10):
    kNN = KNeighborsRegressor(n_neighbors=i)
    kNN.fit(X,y)
    score_for_k = mean_squared_error(y_train,kNN.predict(X_train)).mean()
    avgs.append(score_for_k)

#Mean square error is minimized for k equal to :
best_k = avgs.index(min(avgs)) + 1
best_k

1

In [101]:
kNN = KNeighborsRegressor(n_neighbors=best_k)
kNN.fit(X_train,y_train)

[cross_val_score(kNN,X_train,y_train,cv=cv_10_folds).mean(),
 mean_squared_error(y_true=y_test, y_pred=kNN.predict(X_test)),
 mean_absolute_error(y_true=y_test, y_pred=kNN.predict(X_test)),
 max_error(y_true=y_test, y_pred=kNN.predict(X_test))]

[0.9114871204046711, 8.327253974025975, 2.498883116883117, 8.29]


### Searching for best value of K (using R²)

In [102]:
avgs = []
for i in range(1,10):
    kNN = KNeighborsRegressor(n_neighbors=i)
    kNN.fit(X,y)
    score_for_k = cross_val_score(kNN,X_train,y_train,cv=cv_10_folds).mean()
    avgs.append(score_for_k)

#R² mean is maxed for k equal to :
best_k = avgs.index(max(avgs)) + 1
best_k

4

In [103]:
kNN = KNeighborsRegressor(n_neighbors=best_k)
kNN.fit(X_train,y_train)

[cross_val_score(kNN,X_train,y_train,cv=cv_10_folds).mean(),
 mean_squared_error(y_true=y_test, y_pred=kNN.predict(X_test)),
 mean_absolute_error(y_true=y_test, y_pred=kNN.predict(X_test)),
 max_error(y_true=y_test, y_pred=kNN.predict(X_test))]

[0.9477645361626179, 4.499428633116884, 1.5963831168831173, 7.415000000000003]