# K-Nearest Neighbours Algorithm

### We are going to predict the age of abalone.

##### Importing the libraries that are needed

In [53]:
import pandas as pd
import numpy as nm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

### Reading the Dataset

In [38]:
data=pd.read_csv('abalone.csv')
print(len(data))
data.head()

4177


Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


### Replacing zeros with the means of the respective columns

In [40]:
zero_not_accepted = ['Length', 'Diameter', 'Height','Shell weight']

for col in zero_not_accepted:
    data[col] = data[col].replace(0,nm.NaN)
    mean = int(data[col].mean(skipna=True))
    data[col] = data[col].replace(nm.NaN,mean)
    

### Splitting the dataset into training and testing datasets

In [61]:
A = data.iloc[:,1:8]
B = data.iloc[:,8]
A_train, A_test, B_train, B_test = train_test_split(A, B, random_state=0, test_size=0.2)

# Feature Scaling :

#### Any algorithm that computes distance or assumes normality, we have to scale our features

In [62]:
sc_A = StandardScaler()
A_train = sc_A.fit_transform(A_train)
A_test = sc_A.transform(A_test)


### Finding the value of k for this model

In [63]:
import math
math.sqrt(len(B_test))

28.91366458960192

##### but 28 is an even number, so we need to subtract 1 from it and use k=27

##### Defining the model: Init K-NN

In [64]:
classifier = KNeighborsClassifier(n_neighbors=27, p=2, metric='euclidean')


#### Fitting the train data into the model

In [65]:
from sklearn import preprocessing
from sklearn import utils

#convert B values to categorical values
lab = preprocessing.LabelEncoder()
B_transformed = lab.fit_transform(B)
classifier.fit(A, B_transformed)

KNeighborsClassifier(metric='euclidean', n_neighbors=27)

##### Predicting the test set results

In [66]:
B_pred=classifier.predict(A_test)
B_pred

array([ 8,  3, 10,  3, 10,  9,  3,  3,  3, 11,  3,  3,  3,  3,  3,  8,  3,
       13,  8,  3,  3,  3,  3,  3,  5,  6,  3, 10,  4,  3,  3,  3,  7, 10,
        3,  3,  3,  8,  3,  9,  6,  3,  3,  4,  9,  3,  7,  9,  4,  3,  6,
        3,  3, 11,  3,  3,  3,  3,  3,  3,  8,  7,  8,  3,  3, 10, 10, 11,
        3, 11,  3, 11,  9,  5,  5,  3,  5,  3, 10,  3,  7,  3,  3,  7,  4,
        3,  7, 10,  3,  3,  6,  6,  3,  3,  4,  3,  3,  3,  7, 10, 10,  3,
       10,  8,  3, 10,  3,  3,  3, 10, 10, 10,  3,  4,  8, 10, 10,  9,  3,
        3,  3,  6,  3,  3,  5, 12, 10, 10,  3, 10,  8,  3,  3, 10, 10,  8,
        3,  3,  3, 10,  3,  3,  3,  7,  7,  6,  3,  3,  3,  3,  3, 11,  3,
        8,  3,  3,  7,  3, 14,  3,  3,  3,  3,  8,  3, 12,  3,  3,  3,  3,
        6,  3,  3,  8,  9, 10,  3,  3,  3,  3,  3,  7,  3,  3,  6,  3,  8,
        3,  3,  3,  7,  3,  3,  7,  6,  3,  3,  3,  3, 10,  3,  3, 11,  4,
        8,  9,  3, 10,  3,  3,  3,  4,  3,  7,  9,  3,  4, 10,  3, 11,  3,
       10,  3,  9, 10, 12

## Evaluating the Model

##### Confusion matrix

In [67]:
co_m=confusion_matrix(B_test, B_pred)
print(co_m)

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  5  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 33  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 46  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 92  0  0  2  0  4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 84  7  4  7  4  1  3  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 64  4  3 10 16 14  3 11  2  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 31  6  5  9  6  7 10 25  4  1  0  1  1  0  0  0  0  0  0  0  0  0]
 [ 0 29  5  1  6  1 11 12 25  5  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 19  3  1  4  1  7  6 15  6  2  0  0  1  0  0  0  0  1  0  0  0  0]
 [ 0 13  1  0  3  2  2  3 11  3  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 12  2  0  1  1  2  2  2  3  1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  3  0  1  2  2  0  1  4  2  1  1  0  1  0  0  0  0  0  0  0

#### f1 score

In [75]:
print(f1_score(B_test,B_pred,average='macro'))

0.024631491577450062


### Accuracy

In [76]:
print(accuracy_score(B_test,B_pred))

0.05143540669856459
