In [146]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier 
import random

### knn
Using Iris dataset, How does the performance of k-nearest neighbors change as k takes on the following values: 1, 3, 5, 7? Which of these is the optimal value of k? Which distance/similarity metric did you choose to use and why?

In [188]:

iris = pd.read_csv('iris.csv')
iris['variety'].value_counts() ##There are 3 categories

Setosa        50
Versicolor    50
Virginica     50
Name: variety, dtype: int64

In [189]:
iris.info() 
##Looks like there are no missing values
iris.describe().T
##Looks like there are no outliers

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  150 non-null    float64
 1   sepal.width   150 non-null    float64
 2   petal.length  150 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal.length,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
sepal.width,150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
petal.length,150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
petal.width,150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5


Here, we have 150 observations, to check the performance I would split the dataset into 80% train and 20% test. 
Then, use Eucledian distances to classify each observation in training set into k classes. Then, check performance on testing dataset 

In [190]:
##Define X and Y variables - Y is class in which we want to classify out observations
X = iris.iloc[:, :-1].values
y = iris.iloc[:, 4].values

In [211]:
#Split the datasets into train and test subsets
import random
random.seed(100)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20) 

In [212]:
##First, let us scale all independent variables to avoid effects of varying scales on our model
##I would be using Standard Scalar module 
scaler = StandardScaler()

#Scaling Train and test datasets
# Fit only on X_train
scaler.fit(X_train)

# Scale both X_train and X_test
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [213]:
#The scaler maintains only the data points, and not the column names, when applied on a dataframe
col_names=['sepal.length', 'sepal.width','petal.length','petal.width']
scaled_df = pd.DataFrame(X_train, columns=col_names)
scaled_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal.length,120.0,-2.256528e-15,1.004193,-1.917308,-0.933022,-0.071771,0.666444,2.511982
sepal.width,120.0,-3.407922e-15,1.004193,-2.429278,-0.566573,-0.100897,0.597618,2.693161
petal.length,120.0,-5.097774e-16,1.004193,-1.616869,-1.267695,0.303587,0.769153,1.816674
petal.width,120.0,-5.255056e-16,1.004193,-1.505606,-1.231444,0.139366,0.824771,1.784338


In [214]:
random.seed(100)
##Find the best distance measure using trying out different measures for same k:
for p_measure in np.arange(1, 6):
    knn = KNeighborsClassifier(n_neighbors=3, p = p_measure) 
    knn.fit(X_train, y_train)
    print("For k = 3 and p = %d accuracy is :"%p_measure,knn.score(X_test,y_test))   
#Here we checked Manhattan distance, euclidean distance and minkowski distance with 3,4, and 5 powers

For k = 3 and p = 1 accuracy is : 0.9
For k = 3 and p = 2 accuracy is : 0.9333333333333333
For k = 3 and p = 3 accuracy is : 0.9333333333333333
For k = 3 and p = 4 accuracy is : 0.9333333333333333
For k = 3 and p = 5 accuracy is : 0.9333333333333333


In [215]:
#Calculate the accuracy of the model for different values of k with p =2 i.e. using Euclidean distance
#define an array of k's
for i in [1,3,5,7]:
    knn2 = KNeighborsClassifier(n_neighbors=i) ##default p is 2 i.e. its using Euclidean distance measure
    knn2.fit(X_train, y_train)
    print("For k = %d accuracy is"%i,knn2.score(X_test,y_test))

For k = 1 accuracy is 0.9333333333333333
For k = 3 accuracy is 0.9333333333333333
For k = 5 accuracy is 0.9333333333333333
For k = 7 accuracy is 0.9666666666666667


Which of these is the optimal value of k? Which distance/similarity metric did you choose to use and why?
Answer: 
From above results, we get optimal results when k = 7. I used the Euclidean metrics because it is most widely used in knn. I also checked the performance of different distance measures manhattan, euclidean, minkowski - for Euclidean I got 97% accuracy at k = 7. which is higher than manhatten and equal to higher power minkowski
