In [1]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report,accuracy_score,r2_score,mean_absolute_error,mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
# Suppress every warning so library chatter does not clutter cell output.
warnings.filterwarnings('ignore')
# Notebook-wide matplotlib defaults: figure size and resolution (dots per inch).
plt.rcParams.update({'figure.figsize': (10,5), 'figure.dpi': 300})
# Apply the seaborn theme after the rcParams above, as in the original ordering.
sns.set_theme(style='darkgrid', palette='viridis')

# Importing the data and Performing EDA

In [2]:
# Location of the Zoo dataset. The original author's local path is kept as the
# default; set the ZOO_CSV environment variable to point at the file on any
# other machine instead of editing this cell.
ZOO_CSV = Path(os.environ.get("ZOO_CSV", r"C:\Users\priya\Downloads\KNN\Zoo.csv"))

# The first CSV column is used as the row index.
df = pd.read_csv(ZOO_CSV, index_col=0)
# Preview every column except the last one.
print(df.iloc[:,:-1])
# df['type'].values

             hair  feathers  eggs  milk  airborne  aquatic  predator  toothed  \
animal name                                                                     
aardvark        1         0     0     1         0        0         1        1   
antelope        1         0     0     1         0        0         0        1   
bass            0         0     1     0         0        1         1        1   
bear            1         0     0     1         0        0         1        1   
boar            1         0     0     1         0        0         1        1   
...           ...       ...   ...   ...       ...      ...       ...      ...   
wallaby         1         0     0     1         0        0         0        1   
wasp            1         0     1     0         1        0         0        0   
wolf            1         0     0     1         0        0         1        1   
worm            0         0     1     0         0        0         0        0   
wren            0         1 

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
count,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0
mean,0.425743,0.19802,0.584158,0.405941,0.237624,0.356436,0.554455,0.60396,0.821782,0.792079,0.079208,0.168317,2.841584,0.742574,0.128713,0.435644,2.831683
std,0.496921,0.400495,0.495325,0.493522,0.42775,0.481335,0.499505,0.491512,0.384605,0.407844,0.27141,0.376013,2.033385,0.439397,0.336552,0.498314,2.102709
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,1.0,0.0,0.0,2.0
75%,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,1.0,0.0,1.0,4.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,7.0


In [4]:
# Count the missing values in each column.
df.isna().sum()

hair        0
feathers    0
eggs        0
milk        0
airborne    0
aquatic     0
predator    0
toothed     0
backbone    0
breathes    0
venomous    0
fins        0
legs        0
tail        0
domestic    0
catsize     0
type        0
dtype: int64

In [5]:
# Show the dtype pandas inferred for each column.
df.dtypes

hair        int64
feathers    int64
eggs        int64
milk        int64
airborne    int64
aquatic     int64
predator    int64
toothed     int64
backbone    int64
breathes    int64
venomous    int64
fins        int64
legs        int64
tail        int64
domestic    int64
catsize     int64
type        int64
dtype: object

In [None]:
# Draw a seaborn pair plot to inspect the pairwise relationships between the columns of df.
sns.pairplot(df)

In [None]:
# Check the distribution of the data: one histogram (with a KDE overlay) per column.
for col in df.columns:
    # Create a dedicated figure/axes pair and draw on it explicitly rather than
    # relying on pyplot's implicit "current axes".
    fig, ax = plt.subplots()
    sns.histplot(data=df, x=col, kde=True, bins=20, ax=ax)  # column name selects the x variable
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

In [None]:
# x: every column except the last one (the features).
# y: the 'type' column as a NumPy array (the target).
x, y = df.iloc[:, :-1], df['type'].values

In [None]:
# Standardise the feature columns; the transformed result replaces x.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split performed later in the notebook, so statistics of the held-out rows
# influence the scaling. Consider fitting on the training split only.
sc = StandardScaler()

x = sc.fit_transform(x)

In [None]:
# Display the target array together with the shape of the scaled feature matrix.
y,x.shape

# Dividing the data into training and testing sets

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is reproducible.
Xtrain,Xtest,Ytrain,Ytest = train_test_split(x,y,test_size=0.2,random_state=23)

In [None]:
# Show only the shape of each split; echoing the four full arrays floods the
# cell output without adding information.
Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape

# Model building with Hyperparameter Tuning

In [None]:
# Sweep k from 10 to 39 and record the accuracy on both splits for each value.
train_acc, test_acc = [], []

for k in range(10, 40):
    # fit() returns the estimator itself, so construction and fitting can be chained.
    knn = KNeighborsClassifier(n_neighbors=k).fit(Xtrain, Ytrain)
    train_acc += [knn.score(Xtrain, Ytrain)]
    test_acc += [knn.score(Xtest, Ytest)]

In [None]:
# Plot training and testing accuracy against k, using the same k range as the sweep.
k_values = range(10,40)
curves = (
    (train_acc, 'blue', 'Training Accuracy'),
    (test_acc, 'red', 'Testing Accuracy'),
)
for scores, colour, name in curves:
    plt.plot(k_values, scores, color=colour, label=name)
plt.xlabel('K-values')
plt.ylabel('Accuracy')
plt.title('K-value Selection', size=18)
plt.legend()

In [None]:
# Refit KNN with a fixed number of neighbours and report accuracy on both splits.
# NOTE(review): k=21 was presumably read off the K-value selection plot — confirm.
knn = KNeighborsClassifier(n_neighbors=21)
knn.fit(Xtrain,Ytrain)
print(f'Training Accuracy: {knn.score(Xtrain,Ytrain)}')
print(f'Testing Accuracy: {knn.score(Xtest,Ytest)}')

# Interview Questions:

<b> 1. What are the key hyperparameters in KNN? </b>

KNN has a few important hyperparameters that you can tune to improve its performance:
- k (number of neighbors): Controls how many neighbors to look at.
- Distance metric: Defines how closeness is measured.
- Weights: Decides whether closer neighbors should have more influence.
- Algorithm: Chooses the way neighbors are calculated for faster performance.

<b> 2. What distance metrics can be used in KNN? </b>
        
Common distance metrics:
- Euclidean: Straight-line distance (default).
- Manhattan: Grid-like distance.
- Minkowski: A generalization of Euclidean and Manhattan.
- Hamming: Measures differences for categorical data.
- Cosine distance: Based on the angle between points (1 − cosine similarity), often used for text.
