<a href="https://colab.research.google.com/github/samantha-olinsky/MA322-Python/blob/main/KNN/KNN_Lesson.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# importing packages we need
import pandas as pd
import seaborn as sns

In [3]:
# use the seaborn package to load in the iris dataset
iris = sns.load_dataset('iris')

# view the dataframe by printing
print(iris)

     sepal_length  sepal_width  petal_length  petal_width    species
0             5.1          3.5           1.4          0.2     setosa
1             4.9          3.0           1.4          0.2     setosa
2             4.7          3.2           1.3          0.2     setosa
3             4.6          3.1           1.5          0.2     setosa
4             5.0          3.6           1.4          0.2     setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  virginica
146           6.3          2.5           5.0          1.9  virginica
147           6.5          3.0           5.2          2.0  virginica
148           6.2          3.4           5.4          2.3  virginica
149           5.9          3.0           5.1          1.8  virginica

[150 rows x 5 columns]


In [4]:
# how many examples are there in this iris dataframe?
# how many features?

# based only on the rows that were printed,
# what assumptions can we make about the organization of the data?


In [5]:
# how can we figure out if there are other species, and their names?
# aka, how do we view the amounts of the data by the species feature?

print(pd.crosstab(index=iris['species'], columns="count"))

col_0       count
species          
setosa         50
versicolor     50
virginica      50


In [6]:
# so now we know that there are 3 different species in our dataframe
# 50 setosa, 50 versicolor, and 50 virginica
# this data is VERY clean
# REAL data is NEVER this organized

# real data is CHAOTIC
# to mimic the real world,
# we are going to "shuffle" our data
# aka randomize the order of the rows

In [7]:
# we are using the "sample" method to randomly reorganize the rows
# The frac keyword asks for the fraction of rows to return, 
# so frac=1 means return all rows (in random order)
rand_iris = iris.sample(frac=1)

# then we print it to take a look
print(rand_iris)

     sepal_length  sepal_width  petal_length  petal_width     species
127           6.1          3.0           4.9          1.8   virginica
99            5.7          2.8           4.1          1.3  versicolor
111           6.4          2.7           5.3          1.9   virginica
138           6.0          3.0           4.8          1.8   virginica
84            5.4          3.0           4.5          1.5  versicolor
..            ...          ...           ...          ...         ...
30            4.8          3.1           1.6          0.2      setosa
129           7.2          3.0           5.8          1.6   virginica
64            5.6          2.9           3.6          1.3  versicolor
36            5.5          3.5           1.3          0.2      setosa
67            5.8          2.7           4.1          1.0  versicolor

[150 rows x 5 columns]


In [8]:
# we can use the "describe" method to look at info of the numeric features
# we are interested right now in min and max
print(rand_iris.describe())

       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000


In [9]:
# by looking at the min and max values for each feature,
# we realize that a long sepal length,
# is not necessarily the same value as a long petal length.

# why is this important?

# well the KNN algorithm uses distance as its key measurement
# the distance between data points is not calculated on a single scale,
# every feature has its own scale!

# how do we fix this?
# NORMALIZE the data

In [10]:
# to normalize our data, or scale it between 0 and 1,
# we are going to create our own function

# we will define the function and give it the name 'normalize'
# it will recieve a parameter, which we'll refer to as 'df'
def normalize(df) : 
  return (df-df.min())/(df.max()-df.min())
# the function will return the normalized data (through this calculation)


# now that our function is prepared,
# we need to extract only the numeric features from out dataset,
# because our function is mathematical, it can't deal with categorical variables
num_iris = rand_iris[['sepal_width','sepal_length','petal_width','petal_length']]

# then, using our normalize function,
# i will pass the numeric dataframe through the function, 
# save the results in a variable called 'iris2'
iris2 = normalize(num_iris)

# how do we check our work?
# lets print the dataframe and take a look
print(iris2)

     sepal_width  sepal_length  petal_width  petal_length
127     0.416667      0.500000     0.708333      0.661017
99      0.333333      0.388889     0.500000      0.525424
111     0.291667      0.583333     0.750000      0.728814
138     0.416667      0.472222     0.708333      0.644068
84      0.416667      0.305556     0.583333      0.593220
..           ...           ...          ...           ...
30      0.458333      0.138889     0.041667      0.101695
129     0.416667      0.805556     0.625000      0.813559
64      0.375000      0.361111     0.500000      0.440678
36      0.625000      0.333333     0.041667      0.050847
67      0.291667      0.416667     0.375000      0.525424

[150 rows x 4 columns]


In [11]:
# looks pretty good to me!
# but just to ensure that all our values are between 1 and 0,
# lets look at the describe function again
print(iris2.describe())

       sepal_width  sepal_length  petal_width  petal_length
count   150.000000    150.000000   150.000000    150.000000
mean      0.440556      0.428704     0.458056      0.467458
std       0.181611      0.230018     0.317599      0.299203
min       0.000000      0.000000     0.000000      0.000000
25%       0.333333      0.222222     0.083333      0.101695
50%       0.416667      0.416667     0.500000      0.567797
75%       0.541667      0.583333     0.708333      0.694915
max       1.000000      1.000000     1.000000      1.000000


In [23]:
# see how all the min are 0 and the max is 1?
# perfect! we did it! we normalized our data

# now we are ready to split the data into training and testing datasets
# for this particular KNN, we need to separate the target feature too
# remember about 80% of data is for training, 20% for testing
iris_train = iris2.iloc[:120,]
iris_test = iris2.iloc[121:,]
iris_train_tf = rand_iris['species'].iloc[:120,] 
iris_test_tf = rand_iris['species'].iloc[121:,]

In [17]:
# need to encode the classification 


In [14]:
# this is our package for KNN
from sklearn.neighbors import KNeighborsClassifier

# to determine our k, or the num of neighbors,
# roughly the square root of the total num of datapoints
import math
print(math.sqrt(150))

12.24744871391589


In [21]:
# so based on that, an acceptable k would be 11 or 13

m1 = KNeighborsClassifier(n_neighbors=13).fit(iris_train, iris_train_tf)

In [33]:
# lets see how our algorithm did
from sklearn import metrics
predictions = m1.predict(iris_test)
print(metrics.confusion_matrix(iris_test_tf, predictions, labels=['setosa', 'versicolor', 'virginica']))

[[12  0  0]
 [ 0  8  0]
 [ 0  1  8]]


In [None]:
# so what does this mean?
# the matrix, with labels would look like this:

#  predictions    setosa    versicolor   viriginica   < iris_test_tf
#       setosa      12           0            0
#   versicolor      0            8            0
#    virginica      0            1            8

# with the labels added, we can see the rows represents the preditions
# and the columns represent the actual "answer"
# therefore, the diagonal from top left to bottom right,
# is how many times the algorithm correctly predicted the flower species

# for this particular example, the algorithm predicted incorrectly 1 time
# it thought it was virginica, but it was actually versicolor

# overall, this is a pretty good outcome

In [57]:
# now lets make some predictions for NEW, UNCLASSIFIED data
new_sepal_width = .4
new_sepal_length = .2
new_petal_width = .3
new_petal_length = .2

print(m1.predict([[new_sepal_width, new_sepal_length, new_petal_width, new_petal_length]]))

# try playing around and changing the lengths and widths (between 0 and 1)
# how does it change the species result?

['setosa']
