In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

In this assignment, I will use the kNN (k-Nearest Neighbors) algorithm to solve a classification problem. kNN is a simple and robust classifier that is used in many different applications.

The dataset was first introduced by the statistician R. Fisher and consists of 50 observations from each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor). For each sample, 4 features are given: the sepal length and width, and the petal length and width.

The goal is to train a kNN classifier to distinguish the species from one another.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

  Given columns for the dataset
    
   1. sepal length in cm
   2. sepal width in cm
   3. petal length in cm
   4. petal width in cm
   5. class

## Load the data from the file (`IRIS.csv`) into a DataFrame. The column names are taken from the CSV header and correspond to the column definitions given in the Data Description.

In [None]:
# Read the Iris CSV from the Kaggle input directory; no explicit column names
# are passed, so they are taken from the file's header row.
IRIS_CSV_PATH = '/kaggle/input/iris-flower-dataset/IRIS.csv'
iris_data = pd.read_csv(IRIS_CSV_PATH)
# Tag the frame with a display name (stored as a plain attribute on the object).
iris_data.dataframeName = 'Iris.csv'
iris_data.head()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
iris_data.shape

In [None]:
# Per-column dtype and non-null counts, used here to check for missing values.
iris_data.info()

In [None]:
# Number of rows per species, to check how balanced the classes are.
iris_data.groupby('species').size()

## Statistical Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
iris_data.describe()

* There are 150 observations with 4 different features: sepal_length, sepal_width, petal_length and petal_width. There are three different species (Iris setosa, virginica and versicolor).

* No null values.

# Plot & visualize data

## sepal length vs sepal width

In [None]:
# Scatter plot of sepal length vs. sepal width with a linear fit per species.
sns.lmplot(x='sepal_length', y= 'sepal_width', hue ='species',data= iris_data)

## petal length vs petal width

In [None]:
# Scatter plot of petal length vs. petal width with a linear fit per species.
sns.lmplot(x='petal_length', y= 'petal_width', hue ='species', data= iris_data)

In [None]:
# Pairwise plots of all numeric features, coloured by species.
sns.pairplot(hue ='species', data= iris_data)

As you can see in the graphs above, Iris-setosa (blue) is clearly separated from the other two species. Setosa has small petals, versicolor has medium-sized petals and virginica has the largest petals; there is some overlap between the latter two species (Iris-versicolor and Iris-virginica).

## Standardize the variables

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the feature columns (every column except the `species` label)
# to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics influence the scaling.
feature_frame = iris_data.drop("species", axis=1)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)

In [None]:
# Wrap the standardized array in a DataFrame. The column labels are taken from
# the same "every column except species" selection that was scaled, so they
# stay aligned with the data even if `species` is not the last CSV column.
feature_names = iris_data.columns.drop("species")
df_feat = pd.DataFrame(scaled_features, columns=feature_names)
df_feat.head()

In [None]:
# Feature matrix: the first four standardized columns as a NumPy array.
feature_block = df_feat.iloc[:, 0:4]
X = np.array(feature_block)

# Target vector: the raw species labels as a NumPy array.
species_labels = iris_data['species']
y = np.array(species_labels)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the species names as integer class ids.
# The encoder is fitted on the DataFrame column rather than on `y` itself so
# that this cell is idempotent: re-running it never re-encodes integers that
# were already encoded, and `le.classes_` always holds the species names.
le = LabelEncoder()
y = le.fit_transform(iris_data['species'])
y

## Split the data into train and test using sklearn train_test_split function.

## 80% train data and 20% test data

In [None]:
from sklearn.model_selection import train_test_split

# Fix the seed so the split (and every metric computed from it) is reproducible
# across kernel restarts, and stratify on the label so the 80/20 split keeps
# the class proportions in both the train and the test part.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y
)

In [None]:
# Print the shape of each split: train features, train labels,
# test features, test labels (in that order).
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

## Run the fit using KNeighborsClassifier from sklearn.neighbors.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline classifier using the 3 nearest neighbours.
INITIAL_K = 3
knn = KNeighborsClassifier(n_neighbors=INITIAL_K)

In [None]:
# Fit the classifier on the training split.
knn.fit(X_train,y_train)

In [None]:
# Predict the encoded species labels for the test split and show them.
pred = knn.predict(X_test)
print(pred)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Summarize test-set performance: confusion matrix first, then the
# per-class classification report.
test_confusion = confusion_matrix(y_test, pred)
print(test_confusion)
test_report = classification_report(y_test, pred)
print(test_report)

In [None]:
# Fraction of correct test predictions, printed as a percentage.
accuracy = accuracy_score(y_test, pred)
accuracy_percent = 100 * accuracy
print('accuracy:{}'.format(accuracy_percent))

In [None]:
# Misclassification rate on the test split for every K from 1 to 49.
# NOTE(review): K is being compared on the test split itself, so the rate
# reported for whichever K gets picked is an optimistic estimate.
error_rate = []

for k in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    misclassified = knn.predict(X_test) != y_test
    error_rate.append(np.mean(misclassified))

In [None]:
# Plot the error rate against K to see where it is lowest.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, 50), error_rate, color='blue', linestyle='dashed',
        marker='o', markerfacecolor='red', markersize=10)
ax.set_title('Error Rate vs. K Value')
ax.set_xlabel('K')
ax.set_ylabel('Error Rate')

The error rate is lowest (between 0 and 0.04) for small K values, between K = 1 and 10. As K increases from 10 to 30, the error rate rises to about 0.04, and for larger K it increases further to about 0.10. Therefore, a K somewhere between 1 and 30 is a good choice for modelling. (The exact values depend on the train/test split.)

In [None]:
# Refit with K=10 and evaluate on the test split.
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
pred = knn.predict(X_test)

# sep='\n' with a trailing '\n' argument yields the same blank lines as a
# separate print('\n') call after each item.
print('WITH K=10', '\n', sep='\n')
print(confusion_matrix(y_test, pred), '\n', sep='\n')
print(classification_report(y_test, pred))

In [None]:
# Refit with K=40 and evaluate on the test split.
knn = KNeighborsClassifier(n_neighbors=40).fit(X_train, y_train)
pred = knn.predict(X_test)

# sep='\n' with a trailing '\n' argument yields the same blank lines as a
# separate print('\n') call after each item.
print('WITH K=40', '\n', sep='\n')
print(confusion_matrix(y_test, pred), '\n', sep='\n')
print(classification_report(y_test, pred))

In [None]:
# Refit with K=50 and evaluate on the test split.
knn = KNeighborsClassifier(n_neighbors=50).fit(X_train, y_train)
pred = knn.predict(X_test)

# sep='\n' with a trailing '\n' argument yields the same blank lines as a
# separate print('\n') call after each item.
print('WITH K=50', '\n', sep='\n')
print(confusion_matrix(y_test, pred), '\n', sep='\n')
print(classification_report(y_test, pred))

## Thanks!