In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # creating graphs for visualization

### Part 1 - Data Preprocessing

#### Importing Dataset using Pandas 

In [None]:
# Read the Iris CSV into a DataFrame and preview its first rows.
iris_csv_path = "../input/iris-flower-dataset/IRIS.csv"
dataset = pd.read_csv(iris_csv_path)
dataset.head()

#### Creating arrays to store features and dependent variable

In [None]:
# Every column except the last holds the features; the last column is the
# dependent variable. Both are converted to NumPy arrays.
feature_columns = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]
X = feature_columns.to_numpy()
y = target_column.to_numpy()

#### Splitting Dataset into training set and test set using Scikit-Learn
Note: the `test_size` parameter determines the fraction of the dataset set aside for the `X_test` and `y_test` variables (here 0.2, i.e. 20%). The `random_state` parameter is optional; fixing it makes the split reproducible, so you will get the same split that I do.

In [None]:
from sklearn.model_selection import train_test_split as tts

# Hold out 20% of the samples as the test set; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Preview the training features rather than dumping the whole array:
# the shape, followed by the first five rows.
print(X_train.shape)
print(X_train[:5])

In [None]:
# Preview the training labels rather than dumping the whole array:
# the shape, followed by the first five entries.
print(y_train.shape)
print(y_train[:5])

#### Feature scaling our data with the Scikit-Learn StandardScaler class
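Since we will be building a K-Nearest Neighbors model, which classifies a point based on its distance to other points, we have to scale the features so that no single feature dominates the distance calculation. Also, you will notice we used the fit_transform method on the X_train variable but only the transform method on the X_test variable. This is done to prevent information leakage (and the overly optimistic results it causes): the scaling statistics are learned from the training set only and are then applied unchanged to the test set.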

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaler to the test features. The right-hand side is evaluated left to right,
# so the fit happens before the test set is transformed.
sc = StandardScaler()
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)

### Part 2 - Creating Our KNN Model

#### Determining the best number of neighbors for our KNN model to achieve the highest accuracy

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score  # not used in this cell; the final accuracy cell calls it without importing it
from sklearn.model_selection import cross_val_score

# Pick k using 5-fold cross-validation on the training set only. Scoring each
# candidate k on the test set would leak test information into model selection
# and make the final test accuracy optimistically biased.
# NOTE(review): after re-running, re-check the n_neighbors value chosen later
# in the notebook against this curve.
k_range = list(range(1, 26))
scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k, p = 2, metric="minkowski")
    # Mean accuracy over the five folds for this value of k.
    fold_scores = cross_val_score(knn, X_train, y_train, cv = 5, scoring = "accuracy")
    scores.append(fold_scores.mean())

plt.plot(k_range, scores)
plt.xlabel("Value of k for KNN")
plt.ylabel("Cross-Validated Accuracy Score")
plt.title("Cross-Validated Accuracy Score for Values of K for KNN Classifier")
plt.show()

#### Choosing a value of k to create our model
As we can see from this graph, assigning n_neighbors to any value between 5 and 16 shows the highest accuracy. I will be using 10 neighbors as shown below:

In [None]:
# Build the final KNN model with the chosen neighbor count and fit it on the
# scaled training data. The last expression displays the fitted estimator.
N_NEIGHBORS = 10
classifier = KNeighborsClassifier(
    n_neighbors=N_NEIGHBORS,
    metric="minkowski",
    p=2,
)
classifier.fit(X_train, y_train)

### Part 3 - Testing the Accuracy of our model

#### Predicting certain output values based on given features

In [None]:
# Expecting output: 'Iris-setosa'
# The sample goes through the same fitted scaler as the training data before prediction.
setosa_sample = [[5.7, 4.4, 1.5, 0.4]]
print(classifier.predict(sc.transform(setosa_sample)))

In [None]:
# Expecting output: 'Iris-versicolor'
# The sample goes through the same fitted scaler as the training data before prediction.
versicolor_sample = [[6.4, 3.2, 4.5, 1.5]]
print(classifier.predict(sc.transform(versicolor_sample)))

In [None]:
# Expecting output: 'Iris-virginica'
# The sample goes through the same fitted scaler as the training data before prediction.
virginica_sample = [[6.7, 3.1, 5.6, 2.4]]
print(classifier.predict(sc.transform(virginica_sample)))

#### Reshaping y_pred and y_test into columns to compare the predicted and actual labels side by side

In [None]:
# Predict the test-set labels, then reshape predictions and true labels into
# single-column arrays and join them so each row reads [predicted, actual].
y_pred = classifier.predict(X_test)
predicted_column = y_pred.reshape(-1, 1)
actual_column = y_test.reshape(-1, 1)
print(np.concatenate((predicted_column, actual_column), axis=1))

#### Creating the Confusion Matrix and obtaining final accuracy score of our model 

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted test labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Test-set accuracy as a percentage, shown as the cell's output.
test_accuracy = accuracy_score(y_test, y_pred)
test_accuracy * 100