# K Nearest Neighbours (KNN)
The KNN algorithm works by storing all known, already-classified examples and predicting the class of a new case from the stored examples that are most similar to it.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): this silences every warning for the rest of the notebook, including
# deprecation warnings raised by pandas, seaborn and scikit-learn in the cells below.
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing Libraries for ML model

Importing subpackages from scikit-learn library

In [None]:
# Import additional required libraries
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import *
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score
from sklearn.model_selection import cross_val_score

## Importing and Modifying the Data

Reading the data using `pandas.read_csv()`.

In [None]:
# Load the per-animal feature table and the class lookup table from the input folder
zoo_path = '../input/zoo-animal-classification/zoo.csv'
class_path = '../input/zoo-animal-classification/class.csv'
zoo_df, class_df = pd.read_csv(zoo_path), pd.read_csv(class_path)
zoo_df.head()

In [None]:
# Preview the first rows of the class lookup table
class_df.head()

Since we want to use the information in both of these datasets, we can merge them along a common column, the `class_type` and `Class_Number` columns.

In [None]:
# Left-join the class lookup onto every animal row:
# zoo_df.class_type is matched against class_df.Class_Number
animal_df = pd.merge(
    zoo_df,
    class_df,
    how='left',
    left_on='class_type',
    right_on='Class_Number',
)
animal_df.head()

Here we can remove the column `class_type` which was originally from `zoo_df` as it is now a duplicate of `Class_Number`, and also because it shares a name with `Class_Type` originally from `class_df`.

We can also remove the columns `Animal_Names` and `Number_Of_Animal_Species_In_Class` as they do not provide us with any classification insights.

In [None]:
# Remove the duplicate class column and the two descriptive columns from the merged frame.
# The result is bound to the shorter name zoo_df, replacing the raw frame loaded earlier.
redundant_columns = ['class_type', 'Animal_Names', 'Number_Of_Animal_Species_In_Class']
zoo_df = animal_df.drop(columns=redundant_columns)
zoo_df.head()

Before doing anything else with the data let's see if there are any null values (missing data) in any of the columns.

In [None]:
# Per column: True if at least one value is missing
zoo_df.isna().any()

We have no missing data so all the entries are valid for use.

Now we can check the column names to get a better understanding of what features we will be basing our classification on.

## Initial Evaluation

Review data prior to implementing model using basic stats and visualizations.

In [None]:
# Print the name, non-null count and dtype of every column in zoo_df
zoo_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
zoo_df.describe()

From this we can see that all of the animal characteristics or feature values are given as binary values (1- present / 0-absent) except for the legs where it is given as a count of legs.

We need to keep this in mind going forward while using the data.

Next let's get the distribution of animal data across the types of classes.

In [None]:
# Use seaborn's 'whitegrid' style for the plots drawn in the following cells
sns.set_style('whitegrid')

In [None]:
# Bar chart of how many animals fall into each class.
plt.rcParams['figure.figsize'] = (7,7)
# The column is passed as x=...: since seaborn 0.12 the only positional argument
# accepted by countplot is `data`, so a bare positional Series is no longer treated as x.
sns.countplot(x=zoo_df['Class_Type'], palette='YlGnBu')
ax = plt.gca()
ax.set_title("Histogram of Classes")

We can also see if there are any trends or correlations in the data using a heatmap.

As mentioned previously, we will want to treat the `legs` column differently as it does not present data the same way as the remaining columns. So, we will create a column `has_legs` that checks if an animal has legs (1) or not (0).

In [None]:
# Derive a binary has_legs flag (1 when legs > 0, otherwise 0) and fix the column order
# so that has_legs sits right after legs.
column_order = [
    'animal_name', 'hair', 'feathers', 'eggs', 'milk', 'airborne', 'aquatic',
    'predator', 'toothed', 'backbone', 'breathes', 'venomous', 'fins', 'legs',
    'has_legs', 'tail', 'domestic', 'catsize', 'Class_Number', 'Class_Type',
]
zoo_df = zoo_df.assign(has_legs=np.where(zoo_df['legs'] > 0, 1, 0))[column_order]
zoo_df.head()

In [None]:
# Heatmap of the feature values for every animal, using the leg count (has_legs left out).
zoo_df_temp = zoo_df.drop(['has_legs','Class_Number'], axis=1)
# numeric_only=True skips the text column Class_Type. pandas >= 2.0 raises a TypeError
# when asked to average a non-numeric column instead of silently dropping it.
zoo_df_temp = zoo_df_temp.groupby(by='animal_name').mean(numeric_only=True)
plt.rcParams['figure.figsize'] = (16,10) 
sns.heatmap(zoo_df_temp, cmap="inferno")
ax = plt.gca()
ax.set_title("Features for the Animals")

In [None]:
# Heatmap of the mean feature value per class, using the leg count (has_legs left out).
zoo_df_temp = zoo_df.drop(['has_legs','Class_Number'], axis=1)
# numeric_only=True skips the text column animal_name. pandas >= 2.0 raises a TypeError
# when asked to average a non-numeric column instead of silently dropping it.
zoo_df_temp = zoo_df_temp.groupby(by='Class_Type').mean(numeric_only=True)
plt.rcParams['figure.figsize'] = (16,10) 
sns.heatmap(zoo_df_temp, annot=True, cmap="inferno")
ax = plt.gca()
ax.set_title("HeatMap of Features for the Classes")

Here we can see how `legs` having a larger range of values [0-8] than the rest of the features skews the data.

Now, let's try that again but using `has_legs` instead.

In [None]:
# Heatmap of the feature values for every animal, using the binary has_legs flag
# in place of the leg count.
zoo_df_temp = zoo_df.drop(['legs','Class_Number'], axis=1)
# numeric_only=True skips the text column Class_Type. pandas >= 2.0 raises a TypeError
# when asked to average a non-numeric column instead of silently dropping it.
zoo_df_temp = zoo_df_temp.groupby(by='animal_name').mean(numeric_only=True)
plt.rcParams['figure.figsize'] = (16,10) 
sns.heatmap(zoo_df_temp, cmap="inferno")
ax = plt.gca()
ax.set_title("Features for the Animals")

In [None]:
# Heatmap of the mean feature value per class, using the binary has_legs flag
# in place of the leg count.
zoo_df_temp = zoo_df.drop(['legs','Class_Number'], axis=1)
# numeric_only=True skips the text column animal_name. pandas >= 2.0 raises a TypeError
# when asked to average a non-numeric column instead of silently dropping it.
zoo_df_temp = zoo_df_temp.groupby(by='Class_Type').mean(numeric_only=True)
plt.rcParams['figure.figsize'] = (16,10) 
sns.heatmap(zoo_df_temp, annot=True, cmap="inferno")
ax = plt.gca()
ax.set_title("HeatMap of Features for the Classes")

This gives us a much clearer idea of what features play a more or less important role in identifying certain animals.

## Preparing Data for Models
We will be removing column `animal_name` as it does not help us in classification. We will also remove `has_legs` since it is not part of the original data, and is not as insightful as its parent feature `legs`.

After that, we can assign the remaining relevant columns to the X and y sets.

In [None]:
# Preview the frame (including the added has_legs column) before selecting model inputs
zoo_df.head()

In [None]:
# Build the feature matrix X from every column except the identifier, the two class
# columns and the derived has_legs flag; the target y is the numeric class label.
excluded_columns = {'has_legs', 'Class_Type', 'Class_Number', 'animal_name'}
features = [column for column in zoo_df.columns.values if column not in excluded_columns]
X = zoo_df[features]
y = zoo_df['Class_Number']

Split X, y data into training set and testing set.

In [None]:
# Split X and y into train and test sets.
# random_state is fixed so the same rows land in each split on every run; the split
# proportion is left at train_test_split's default.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 0)

## Train model
Create and train knn classifier to use on zoo data

In [None]:
# Build a 5-neighbour KNN classifier and train it on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn

## Test and Evaluate model

Run predictions on the test data

In [None]:
# Predict a class number for every row of the held-out test features
y_pred = knn.predict(X_test)

Get the confusion matrix and classification report for model.

In [None]:
# Confusion matrix for the test set: rows are the actual classes, columns the predicted ones
print(confusion_matrix(y_test,y_pred))

In [None]:
# Per-class precision, recall, F1 score and support for the test-set predictions
print(classification_report(y_test,y_pred))

In [None]:
# Overlay the actual and predicted class counts for the test set.
plt.rcParams['figure.figsize'] = (9,9) 
_, ax = plt.subplots()
# One bin per class number, centred on the integer (assumes Class_Number runs 1-7,
# matching the seven bins used before). Sharing explicit edges keeps both histograms
# aligned even if a class is absent from y_test or y_pred; with bins=7 the edges are
# derived separately from each array's own min/max.
class_bins = np.arange(0.5, 8.5)
ax.hist(y_test, color = 'm', alpha = 0.5, label = 'actual', bins=class_bins)
ax.hist(y_pred, color = 'c', alpha = 0.5, label = 'prediction', bins=class_bins)
ax.yaxis.set_ticks(np.arange(0,11))
ax.set_xlabel("Class Number")
ax.set_ylabel("Number of Animals")
ax.legend(loc = 'best')
plt.show()

What this figure tells us is the total number of animals in each class according to the test data and predicted data. <br>
We can see that the predictions match almost all the actual animal classifications (indicated by color overlap) except for one case where the model failed to identify an animal as belonging to class 3 (Reptile), and marked it as being in class 4 (Fish).

So we can see the level of accuracy for this particular classifier.

Now that we know what the model can do at n=5, we should run the model for multiple values of n to find optimal value of n with respect to this dataset.

In [None]:
# Evaluate KNN for every odd number of neighbours k from 1 to 49.
k_list = np.arange(1, 50, 2)
mean_scores = []    # mean 10-fold cross-validation score on the training split, per k
accuracy_list = []  # not filled by this cell; kept in case another cell references it
error_rate = []     # fraction of test rows that were misclassified, per k

for k in k_list:
    # A loop-local name is used so the n=5 model stored in `knn` is not overwritten;
    # otherwise re-running the earlier prediction cell would use the k=49 model.
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train, y_train)
    pred_k = knn_k.predict(X_test)
    cv_scores = cross_val_score(knn_k, X_train, y_train, cv=10)
    mean_scores.append(np.mean(cv_scores))
    error_rate.append(np.mean(pred_k != y_test))

print("Mean Scores:")
print(mean_scores)
print("Error Rate:")
print(error_rate)

## Visualization of Model Performance of Different n-Values
Create a plot of the average accuracy of the prediction model for the odd values of k from 1 to 49.

This is to help us better see which value of k works best with this model.

In [None]:
# Plot the mean cross-validation accuracy against k.
# The figure size is set before plotting: rcParams are read when the figure is created,
# so assigning them after plt.plot() would not size this figure.
plt.rcParams['figure.figsize'] = (12,12) 
plt.plot(k_list,mean_scores, marker='o')

# Add title, axis labels and one tick per evaluated k
plt.title('Accuracy of Model for Varying Values of K')
plt.xlabel("Values of K")
plt.ylabel("Mean Accuracy Score")
plt.xticks(k_list)

plt.show()

In [None]:
# Plot the test-set error rate against k.
# The figure size is set before plotting: rcParams are read when the figure is created,
# so assigning them after plt.plot() would not size this figure.
plt.rcParams['figure.figsize'] = (12,12) 
plt.plot(k_list,error_rate, color='r', marker = 'o')

# Add title, axis labels and one tick per evaluated k
plt.title('Error Rate for Model for Varying Values of K')
plt.xlabel("Values of K")
plt.ylabel("Error Rate")
plt.xticks(k_list)

plt.show()

## Out of Curiosity...

So, we've seen how this works when we use all the features values as given. <br>
Just for fun, let's see how this changes if we don't use the values for `legs` and use `has_legs` instead as we did when plotting our heatmap.

I'll run the exact same code as before changing nothing but using `has_legs` and not `legs`. <br>

In [None]:
# Select columns for the second experiment: identical to the first feature set except
# that the binary has_legs flag replaces the leg count.
features = list(zoo_df.columns.values)
features.remove('legs')
features.remove('Class_Type')
features.remove('Class_Number')
features.remove('animal_name')
X2 = zoo_df[features]
# Use the same numeric target as the first model (Class_Number, not the Class_Type text)
# so the only difference between the two experiments is legs vs has_legs. Text labels
# sort in a different order, which changes KNN tie-breaking and the report layout.
y2 = zoo_df['Class_Number']

In [None]:
# Split X2 and y2 into train and test sets, using the same random_state as the first split
X2_train, X2_test, y2_train, y2_test = train_test_split(X2,y2,random_state = 0)

In [None]:
# Build a 5-neighbour KNN classifier and train it on the has_legs training split.
# fit() returns the estimator itself, so construction and training can be chained.
knn2 = KNeighborsClassifier(n_neighbors=5).fit(X2_train, y2_train)
knn2

In [None]:
# Predict a class for every row of the held-out has_legs test features
y2_pred = knn2.predict(X2_test)

In [None]:
# Confusion matrix for the second model: rows are the actual classes, columns the predicted ones
print(confusion_matrix(y2_test,y2_pred))

In [None]:
# Per-class precision, recall, F1 score and support for the second model's predictions
print(classification_report(y2_test,y2_pred))

In [None]:
# Overlay the actual and predicted class counts for the second model's test set
plt.rcParams['figure.figsize'] = (9,9) 
_, ax = plt.subplots()
series_to_draw = (
    (y2_test, 'm', 'actual'),
    (y2_pred, 'c', 'prediction'),
)
for values, colour, legend_label in series_to_draw:
    ax.hist(values, color=colour, alpha=0.5, label=legend_label, bins=7)
ax.yaxis.set_ticks(np.arange(0,11))
ax.legend(loc = 'best')

plt.show()

In [None]:
# Evaluate the has_legs model for every odd number of neighbours k from 1 to 49.
k_list = np.arange(1, 50, 2)
mean_scores2 = []    # mean 10-fold cross-validation score on the training split, per k
accuracy_list2 = []  # not filled by this cell; kept in case another cell references it
error_rate2 = []     # fraction of test rows that were misclassified, per k

for k in k_list:
    # A loop-local name is used so the n=5 model stored in `knn2` is not overwritten
    knn2_k = KNeighborsClassifier(n_neighbors=k)
    knn2_k.fit(X2_train, y2_train)
    pred_k = knn2_k.predict(X2_test)
    cv_scores = cross_val_score(knn2_k, X2_train, y2_train, cv=10)
    mean_scores2.append(np.mean(cv_scores))
    error_rate2.append(np.mean(pred_k != y2_test))

# Report this experiment's results (the *2 lists), not those of the first model
print("Mean Scores:")
print(mean_scores2)
print("Error Rate:")
print(error_rate2)

Here I'll plot the original and new curves together so we can see and compare any differences.

In [None]:
# Plot the mean cross-validation accuracy of both models against k.
# The figure size is set before plotting: rcParams are read when the figure is created,
# so assigning them after plt.plot() would not size this figure.
plt.rcParams['figure.figsize'] = (12,12) 
plt.plot(k_list,mean_scores, color='b',marker='o', label='Model using Number of Legs')
plt.plot(k_list,mean_scores2, color='m',marker='x', label='Model using Presence of Legs')

# Add title, axis labels, one tick per evaluated k, and the legend
plt.title('Accuracy of Model for Varying Values of K')
plt.xlabel("Values of K")
plt.ylabel("Mean Accuracy Score")
plt.xticks(k_list)
plt.legend()

plt.show()

In [None]:
# Plot the test-set error rate of both models against k.
# The figure size is set before plotting: rcParams are read when the figure is created,
# so assigning them after plt.plot() would not size this figure.
plt.rcParams['figure.figsize'] = (12,12) 
plt.plot(k_list,error_rate, color='r', marker = 'o', label='Model using Number of Legs')
plt.plot(k_list,error_rate2, color='c', marker = 'x', label='Model using Presence of Legs')

# Add title, axis labels, one tick per evaluated k, and the legend
plt.title('Error Rate for Model for Varying Values of K')
plt.xlabel("Values of K")
plt.ylabel("Error Rate")
plt.xticks(k_list)
plt.legend()

plt.show()

### Inference

So interestingly, <br>
Replacing the feature `legs` with `has_legs` improved the accuracy of KNN models at every value where n >3 <br>
This may be due to the model treating the number of legs as a continuous, numeric value rather than as a categorical one. <br>