In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # Plotiing

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Iris Flower Data set 
Iris is a flower with the following features:
![](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Machine+Learning+R/iris-machinelearning.png)

In [None]:
""" Step 1 Data exploration """
# The presence of KAGGLE_KERNEL_RUN_TYPE in the environment is used as the
# "running on Kaggle" signal (swap in your own check if needed); otherwise
# the dataset is read from the local copy.
running_on_kaggle = 'KAGGLE_KERNEL_RUN_TYPE' in os.environ
iris_csv = (
    '/kaggle/input/sololearn-iris/iris.csv'
    if running_on_kaggle
    else '/workspaces/docker_python/data_science/datasets/iris.csv'
)
iris = pd.read_csv(iris_csv)

# First look at the raw table, in order:
# its shape, its leading rows, and its summary statistics
for overview in (iris.shape, iris.head(), iris.describe()):
    print(overview)

# The 'id' column carries no information for the analysis, so remove it
iris = iris.drop(columns=['id'])

print(iris.head())

# Two equivalent ways to count the rows per class of the categorical column
rows_per_species = iris.groupby('species').size()
print(rows_per_species)

species_counts = iris['species'].value_counts()
print(species_counts)
# When every species shows the same count the dataset is balanced;
# the opposite case is an unbalanced dataset.


In [None]:
""" Data Visualization """

# Univariate plots: one histogram per numeric feature
iris.hist()
plt.show()

# Multivariate plotting
# Map each species name to an integer colour code (0/1/2)
inv_name_dict = {'iris-setosa': 0,
                 'iris-versicolor': 1,
                 'iris-virginica': 2}

# One colour code per row, looked up from the 'species' column
colors = [inv_name_dict[item] for item in iris['species']]
print(colors)


def _scatter_by_species(frame, color_codes, species_names,
                        x_col, y_col, x_label, y_label, save_as=None):
    """Draw a scatter plot of two columns, coloured by species code.

    Parameters
    ----------
    frame : DataFrame holding the columns ``x_col`` and ``y_col``.
    color_codes : sequence with one integer colour code per row of ``frame``.
    species_names : iterable of legend labels, in the same order as the
        colour codes they stand for.
    x_col, y_col : names of the columns plotted on the x and y axes.
    x_label, y_label : axis label texts.
    save_as : optional file name; when given, the figure is written to that
        file before it is shown.

    Returns
    -------
    The object returned by ``plt.scatter``.
    """
    points = plt.scatter(frame[x_col], frame[y_col], c=color_codes)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    # legend_elements() returns (handles, labels); only the handles are used
    # and are paired with the species names instead of the numeric codes.
    plt.legend(handles=points.legend_elements()[0], labels=species_names)
    if save_as is not None:
        # Must happen before plt.show(), while the figure is still current
        plt.savefig(save_as)
    plt.show()
    return points


# Scatter plot of sepals (also saved to plot.png)
scatter = _scatter_by_species(iris, colors, inv_name_dict.keys(),
                              'sepal_len', 'sepal_wd',
                              'sepal length (cm)', 'sepal width (cm)',
                              save_as="plot.png")

# Scatter plot of petals
scatter = _scatter_by_species(iris, colors, inv_name_dict.keys(),
                              'petal_len', 'petal_wd',
                              'petal length (cm)', 'petal width (cm)')

# Scatter matrix of every pair of numeric features
pd.plotting.scatter_matrix(iris)

K nearest neighbors
K nearest neighbors (knn) is a supervised machine learning model that takes a data point, looks at its 'k' closest labeled data points, and assigns the label by a majority vote.

Here we see that changing k could affect the output of the model. In knn, k is a hyperparameter. A hyperparameter in machine learning is a parameter whose value is set before the learning process begins.

For example, in the figure below, there are two classes: blue squares and red triangles. What label should we assign to the green dot, whose label is unknown, based on the 3nn algorithm, i.e., when k is 3? Of the 3 data points closest to the green dot (solid-line circle), two are red triangles and one is a blue square, so the green dot is predicted to be a red triangle. If k is 5 (dashed-line circle), it is instead classified as a blue square (3 blue squares versus 2 red triangles; blue squares are the majority).

![](https://lecontent.sololearn.com/material-images/00000d0d00000445531d0000fe0e0000_data%20visualization.png)

In [None]:
""" Modelling """
from sklearn.neighbors import KNeighborsClassifier

# Features (the two petal measurements) and target (the species label)
feature_columns = ['petal_len', 'petal_wd']
X = iris[feature_columns]
y = iris['species']

# Hold out 30% of the rows as test data. stratify=y splits each class in the
# same proportion; random_state fixes the shuffle so the split is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1, stratify=y
)

# Per-class row counts of the train targets, then of the test targets
for target_part in (y_train, y_test):
    print(target_part.value_counts())

## instantiate
knn = KNeighborsClassifier(n_neighbors=5)

## fit
fitted_model = knn.fit(X_train, y_train)
print(fitted_model)
print("\n")

## Predict on test dataset
y_pred = knn.predict(X_test)
print(y_pred[:5])
print("\n")
# Zero-based positions 10 and 11, i.e. the 11th and 12th test flowers
flowers_11_and_12 = slice(10, 12)
print(y_pred[flowers_11_and_12])

## Probability prediction: instead of a single class label, each row holds
## one probability per class.
y_pred_prob = knn.predict_proba(X_test)
print(y_pred_prob[flowers_11_and_12])
# Reading the two rows (values as reported by the notebook author):
#   11th flower -> [1. 0. 0.]: probability 1 of being predicted iris-setosa,
#                  0 for both iris-versicolor and iris-virginica.
#   12th flower -> a 20% chance of being classified as iris-versicolor and
#                  an 80% chance of iris-virginica.


Explanation of the cell above:
Probability prediction: for example, the predicted probability that the 11th flower is an iris-setosa is 1, while the probabilities for iris-versicolor and iris-virginica are both 0. For the next flower, there is a 20% chance that it would be classified as iris-versicolor and an 80% chance that it would be classified as iris-virginica. This tells us that, of the five nearest neighbours of the 12th flower in the testing set, 1 is an iris-versicolor and the remaining 4 are iris-virginica.

In [None]:
""" Model Evaluation """
# Accuracy, first computed by hand and then via the estimator

# Count the test rows whose predicted label equals the true label
hits = (y_pred == y_test.values).sum()
n_test = y_test.size
print(hits)    # correct predictions
print(n_test)  # total test size

# Accuracy = correct predictions / total predictions
print(hits / n_test)
print("\n")
# Same figure from the estimator: for a classifier, score() is the mean
# accuracy on the given test data (it is not an R^2 score)
print(knn.score(X_test, y_test))


The code above shows that our model made 1 mistake.

Classification accuracy alone can be misleading if there is an unequal number of observations in each class or if there are more than two classes in the dataset.

Calculating a confusion matrix will provide a better idea of what the classification is getting right and what types of errors it is making.

What is a confusion matrix? It is a summary of the counts of correct and incorrect predictions, broken down by each class.
To evaluate the iris classifier, we can use confusion_matrix() from the sklearn.metrics module.

In [None]:
from sklearn.metrics import confusion_matrix
# Counts of true class (rows) versus predicted class (columns)
print(confusion_matrix(y_test, y_pred))

# The same matrix, drawn graphically with the class names on the axes
from sklearn.metrics import ConfusionMatrixDisplay
# The labels must match the values in the 'species' column exactly
# (hyphenated, e.g. 'iris-versicolor'). A label that does not occur in the
# data produces an all-zero row/column, and the samples of the real class
# are silently left out of the matrix.
species_labels = ['iris-setosa', 'iris-versicolor', 'iris-virginica']
mat = confusion_matrix(y_test, y_pred, labels=species_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=mat, display_labels=species_labels)
disp.plot()