# NOTEBOOK OBJECTIVE

In [None]:
# This notebook outlines the ML model using k-Nearest Neighbors algorithm. We will 
# 1. Load data, Explore data
# 2. Visualise the dataset
# 3. Determine the number of neighbours
# 4. Predict the type of the fruit - apple, mandarin, orange, lemon

### Importing the key Python libraries

In [None]:
# The following set of commands will load the necessary Python libraries

# for linear algebra, random number capabilities
import numpy as np

# for data manipulation, analysis and reading our dataset
import pandas as pd

# for plotting and visualising the data
import matplotlib.pyplot as plt

# 1. IMPORT & EXPLORE DATA

### Load the data

In [None]:
# With the libraries imported, use pandas' read_csv to load the dataset, which is in CSV format
# (read_csv can also read other delimited text files such as TXT).
# The path is relative: the file is loaded from the same folder where the notebook is saved.
DATA_FILE = 'fruit_data_with_colours.csv'
fruit = pd.read_csv(DATA_FILE)

### Explore the data

In [None]:
# Now that the dataset is loaded, check the data and its features.
# head() with no argument returns the first five rows (tail() returns the last five);
# head(N), e.g. N = 10, returns the first N rows instead.
fruit.head(n=5)

![](http://)

### Determine the number of pieces of fruits(rows) and attributes(columns)

In [None]:
# shape is a (rows, columns) tuple: pieces of fruit and attributes.
# The DataFrame loaded above is named `fruit`.
print(fruit.shape)

### Determine the fruits within the data

In [None]:
# fruit_data_with_colours.csv has a total of seven columns describing each fruit,
# with features such as fruit_subtype, mass, width, height and color_score.
# head() only showed rows for apple and mandarin, so list every distinct
# fruit name present in the `fruit` DataFrame.

print(fruit['fruit_name'].unique())

### Determine the count of fruits within the data

In [None]:
# Next, determine how many samples of each fruit are present in the data.
# value_counts() returns the number of rows for each distinct fruit_name, so the
# result shows the four fruits (apple, orange, lemon, mandarin) with their counts.

fruit_counts = fruit['fruit_name'].value_counts()
fruit_counts

# 2. DATA VISUALISATION

### Visualise the data

In [None]:
# Since we know the types of fruit and their counts, visualise them using a simple bar graph

# Seaborn is a data visualization library in Python based on matplotlib
import seaborn as sns

# Pass the column explicitly as `x`: a positional first argument is not
# interpreted as the x variable by recent seaborn releases.
sns.countplot(x=fruit['fruit_name'], label="Count", palette="Set3")
plt.show()

### Visualise using Boxplot (to assess the distribution)

In [None]:
# Draw one box plot per input variable on a 2x2 grid; fruit_label is dropped
# because it identifies the fruit rather than measuring it.
fruit.drop('fruit_label', axis=1).plot(kind='box', subplots=True, layout=(2,2),
                                       sharex=False, sharey=False, figsize=(10,10),
                                       color ='c', patch_artist=True)
# Use the already-imported pyplot alias: pylab (`pl`) is only imported in a later cell,
# so relying on it here fails on a fresh kernel.
plt.suptitle("Box Plot for each input variable")
plt.savefig('fruits_boxplot')
plt.show()

### Visualise using Histogram (to understand the distribution)

In [None]:
# PyLab is a module that belongs to the Python mathematics library Matplotlib.
# It combines the numerical module numpy with the graphical plotting module pyplot.
import pylab as pl

# Build the histograms with the pandas hist() method, after leaving out the
# fruit_label column.
features_only = fruit.drop(columns='fruit_label')
features_only.hist(bins=30, figsize=(10, 10), color="c", ec="m", lw=0)
pl.suptitle("Histogram for each numeric input variable")
plt.savefig('fruits_histogram')
plt.show()

In [None]:
# The Gaussian distribution is also commonly called the "normal distribution" and is often described as a "bell-shaped curve".
# Colour_Score and Height seem to be closer to the Gaussian distribution

### Visualise using Scatter matrix

In [None]:
from pandas.plotting import scatter_matrix
from matplotlib import cm

# Plot the real fruit measurements (not random numbers) so the matrix actually
# shows how the input variables relate to each other.
scatter_columns = ['mass', 'width', 'height', 'color_score']

# Integer code per fruit name, used to colour each point by fruit type via the colormap.
fruit_codes = fruit['fruit_name'].astype('category').cat.codes

# plt.get_cmap is used because matplotlib.cm.get_cmap has been removed from recent matplotlib.
cmap = plt.get_cmap('gnuplot')
scatter_matrix(fruit[scatter_columns], alpha=0.2, c=fruit_codes, cmap=cmap, figsize=(10,10),
               marker='.', s=30, hist_kwds={'bins':10}, range_padding=0.05)
plt.suptitle('Scatter-matrix for each input variable')
plt.savefig('fruit_scatter_matrix')

# 3. K-Nearest Neighbors

### Build the KNN classifier model to determine K

In [None]:
# First, import the KNeighborsClassifier module
# details about the module here: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
from sklearn.neighbors import KNeighborsClassifier

# In order to understand the model performance, divide the dataset into a training set and a test set.
# The split is done by using the function train_test_split()
# details here: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.model_selection import train_test_split

In [None]:
# Separate the inputs from the target:
#   X holds the independent features (mass, width, height, color_score)
#   Y holds the dependent feature, i.e. the fruit name
input_columns = ['mass', 'width', 'height', 'color_score']
X = fruit[input_columns]
Y = fruit['fruit_name']

# Split X into X_train / X_test and, with the same row assignment,
# Y into y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

# Notice the use of test_size. This parameter decides the size of the data that has to be split as the test dataset
# In the above case it is 0.2, which means that the dataset will be split 20% as the test dataset

### Let's look at the statistical summary using describe() method

In [None]:
# Summary statistics of the training features
X_train.describe()

In [None]:
# Summary statistics of the test features, for comparison with the training split
X_test.describe()

### Invoke the classifier and Training the model

In [None]:
# Now create a KNN classifier for making predictions
# (no arguments are passed, so scikit-learn's default settings are used)
knn = KNeighborsClassifier()

# Train the model using the training sets
# (as the last expression in the cell, the fitted estimator is displayed as output)
knn.fit(X_train, y_train)

In [None]:
# Note the output above that by default the n_neighbors = 5

### Evaluate the accuracy of the model for K=5

In [None]:
# Model accuracy: how often is the classifier correct?
# Accuracy is computed by comparing the actual test set values with the predicted values.
# score() is a utility method returning the estimator's default metric.
# Score the test set once and reuse the value rather than evaluating it twice.
accuracy_k5 = knn.score(X_test, y_test)
print("Accuracy for K=5 : ", accuracy_k5)

In [None]:
# Well, we got a classification rate of 58.33%, which is good
# Now, let's increase the number of neighbors in the model and observe the accuracy

### Evaluate the accuracy of the model for K=6

In [None]:
# Refit with 6 neighbours, then score the held-out test set once and report it.
knn = KNeighborsClassifier(n_neighbors = 6)
knn.fit(X_train, y_train)
accuracy_k6 = knn.score(X_test, y_test)
print("Accuracy for K=6 : ", accuracy_k6)

In [None]:
# We got a classification rate of 66.66%, which is even better

### Evaluate the accuracy of the model for K=7

In [None]:
# Refit with 7 neighbours, then score the held-out test set once and report it.
knn = KNeighborsClassifier(n_neighbors = 7)
knn.fit(X_train, y_train)
accuracy_k7 = knn.score(X_test, y_test)
print("Accuracy for K=7 : ", accuracy_k7)

In [None]:
# Ok, we got a classification rate of 66.66%, which is same as K=6

### Evaluate the accuracy of the model for K=8

In [None]:
# Refit with 8 neighbours, then score the held-out test set once and report it.
knn = KNeighborsClassifier(n_neighbors = 8)
knn.fit(X_train, y_train)
accuracy_k8 = knn.score(X_test, y_test)
print("Accuracy for K=8 : ", accuracy_k8)

In [None]:
# With K=8 the classification rate drops to 41.67%, so we decide the number of neighbors to be 7

### Data visualisation: Find the most appropriate K by plotting the accuracy for the various neighbours in a graph

In [None]:
# Candidate values of K (1 through 9) and one uninitialised accuracy slot per
# candidate; the slots are filled in by the evaluation loop.
neighbours = np.arange(1, 10)
training_accuracy = np.empty(neighbours.shape)
testing_accuracy = np.empty(neighbours.shape)

In [None]:
# Fit one classifier per candidate K and record its accuracy on the training
# split and on the test split at the matching position.
for slot, k in enumerate(neighbours):
    knn = KNeighborsClassifier(n_neighbors=int(k))
    knn.fit(X_train, y_train)
    training_accuracy[slot] = knn.score(X_train, y_train)
    testing_accuracy[slot] = knn.score(X_test, y_test)

In [None]:
# Plot test and training accuracy against the number of neighbours.
plt.title('KNN - Accuracy for various neighbors')
plt.plot(neighbours, testing_accuracy, label = 'Testing Accuracy', color ='c')
plt.plot(neighbours, training_accuracy, label = 'Training accuracy', color ='m')
plt.legend()
plt.xlabel('No. of neighbours')
plt.ylabel('Accuracy')
# Save before show(): once show() has rendered the figure, a later savefig()
# writes a new, empty canvas instead of this chart.
plt.savefig('knn - accuracy vs no of neighbours')
plt.show()

In [None]:
# From the above graph, we can say that the best fit value for K is either 6 or 7. Going by the accuracy, consider 7

### Conclusion

In [None]:
# As a result, we can say that using the KNN algorithm with K=7, 
# we can estimate the type (name) of a fruit from its "mass", "width", "height" and "color_score" values with 66.67% accuracy

# 4. Plot the Decision Boundary of the k-NN Classifier

In [None]:
import numpy as np
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap, BoundaryNorm
import matplotlib.patches as mpatches
from sklearn import neighbors, datasets

# import some data to play with
# NOTE: this demonstration uses scikit-learn's iris dataset, not the fruit data,
# and it rebinds the names X and y.
iris = datasets.load_iris()
X = iris.data[:, :2]  # keep only the first two features so the boundary can be drawn in 2-D
y = iris.target
h = .02  # step size in the mesh

n_neighbors = 7
weights = 'distance'  # weighting scheme passed to the classifier

# Create color maps: light shades for the decision regions, bold ones for the points
cmap_light = ListedColormap(['#FFFACD', '#7FFFD4', '#87CEFA'])
cmap_bold = ListedColormap(['#FF0000', '#228B22', '#0000FF'])

# Create an instance of the neighbours classifier and fit the data.
clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
clf.fit(X, y)

# Plot the decision boundary. For that, assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("3-Class classification (k = %i, weights = '%s')"
          % (n_neighbors, weights))
# Save before show(): once show() has rendered the figure, a later savefig()
# writes a new, empty canvas instead of this chart.
plt.savefig('classification chart')
plt.show()

### References: 

In [None]:
# Li, Susan (2017), https://towardsdatascience.com/solving-a-simple-classification-problem-with-python-fruits-lovers-edition-d20ab6b071d2
# Navlani, Avinash (2018), https://www.datacamp.com/community/tutorials/k-nearest-neighbor-classification-scikit-learn
# https://stackoverflow.com/questions/45075638/graph-k-nn-decision-boundaries-in-matplotlib/45076236
# Various others