# K-Nearest Neighbors practice on the Iris dataset

In [1]:
import os

import wget

URL = "http://ovne.org/download/iris_dataset.csv"
DATA_FILE = "iris_data.csv"

# wget.download() never overwrites an existing file: when DATA_FILE is already
# present it stores the new copy under a suffixed name ("iris_data (1).csv").
# Re-running this cell would therefore pile up duplicates while later cells
# keep reading the old file. Download only when the file is missing;
# `response` holds the local file name either way.
if os.path.exists(DATA_FILE):
    response = DATA_FILE
else:
    response = wget.download(URL, DATA_FILE)

In [2]:
import pandas as pd

In [3]:
# Load the downloaded CSV into a DataFrame. The first row is the header,
# the string "NA" is treated as missing, blanks following a comma are skipped,
# and anything after a tab character on a line is ignored as a comment.
df = pd.read_csv(
    "iris_data.csv",
    sep=',',
    header=0,
    na_values="NA",
    comment='\t',
    skipinitialspace=True,
)

In [4]:
# Inspect column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class_name    150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
# Preview the first five rows.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class_name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# Distinct species labels present in the target column.
df['class_name'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [7]:
# Integer code assigned to each species label.
CLASS_CODES = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2,
}

# Series.map() turns every value that is missing from the dict into NaN, so
# running this cell a second time (when the column already holds 0/1/2) would
# silently wipe out all labels. Encode only while the raw names are present.
labels = df['class_name']
already_encoded = labels.dropna().isin(list(CLASS_CODES.values())).all()
if not already_encoded:
    encoded = labels.map(CLASS_CODES)
    # Fail loudly on an unexpected label instead of passing NaN downstream.
    unknown = labels[encoded.isna() & labels.notna()].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unmapped class_name values: {list(unknown)}")
    df['class_name'] = encoded


In [8]:
# Store the encoded labels with pandas' categorical dtype.
df['class_name'] = df['class_name'].astype('category')

In [9]:
# Re-check the schema after the label encoding and category conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   sepal_length  150 non-null    float64 
 1   sepal_width   150 non-null    float64 
 2   petal_length  150 non-null    float64 
 3   petal_width   150 non-null    float64 
 4   class_name    150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB


In [10]:
# Distinct values of the target column after encoding.
df['class_name'].unique()

[0, 1, 2]
Categories (3, int64): [0, 1, 2]

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Tunable settings for the split and the classifier.
TEST_FRACTION = 0.20
SPLIT_SEED = 42
k = 5

# The first four columns are the measurements (features); the fifth column
# is the class label (target).
X = df.iloc[:, :4]
y = df.iloc[:, 4]

# Hold out part of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, x_test, Y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# Fit a k-nearest-neighbours classifier on the training rows and predict
# labels for the held-out rows.
knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, Y_train)
y_pred = knn.predict(x_test)

In [12]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label equals the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print('accuracy score: ', test_accuracy)

accuracy score:  1.0


In [13]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score

# One-vs-rest ROC AUC needs per-class probabilities rather than hard labels.
test_proba = knn.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, test_proba, multi_class='ovr')

# The remaining metrics are computed from the predicted labels.
conf_mat = confusion_matrix(y_test, y_pred)
balanced_acc = balanced_accuracy_score(y_test, y_pred)

print('roc_auc score: ', roc_auc)
print('confusion matrix: ', conf_mat)
print('Balanced accuracy: ', balanced_acc)

roc_auc score:  1.0
confusion matrix:  [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Balanced accuracy:  1.0


In [14]:
# Display the confusion matrix as the cell's output value.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[10,  0,  0],
       [ 0,  9,  0],
       [ 0,  0, 11]])