In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the Iris measurements (150 rows: four flower dimensions plus the species label)
data = pd.read_csv(
    "https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv"
)

In [3]:
# Preview the first rows to confirm the columns were parsed as expected
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
# Summary statistics of the numeric columns on their original (unscaled) measurement scale
data.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [5]:
# function for column standardisation
def standard(series):
    """Return ``series`` rescaled to zero mean and unit standard deviation (z-scores).

    Parameters
    ----------
    series : pandas.Series
        Numeric column to standardise. The input is not modified; a new
        object is returned. A DataFrame also works and is standardised
        column by column.

    Returns
    -------
    pandas.Series
        ``(series - mean) / std`` computed with the input's own ``mean()``
        and ``std()``. When the standard deviation is zero or undefined
        (constant column, or fewer than two values) the centred values are
        returned unscaled, so such a column becomes zeros instead of NaN/inf.
    """
    m = series.mean()
    sd = series.std()
    # A column without spread cannot be scaled: dividing by 0 (or NaN) would
    # poison every value, so fall back to a unit scale in that case.
    if np.ndim(sd) == 0:
        scale = sd if sd > 0 else 1.0
    else:
        # DataFrame input: one standard deviation per column
        scale = sd.where(sd > 0, 1.0)
    return (series - m) / scale

In [6]:
# Replace each measurement column with its standardised version, one column at a time
for column in ("petal_length", "petal_width", "sepal_length", "sepal_width"):
    data[column] = standard(data[column])
data.describe() # descriptive statistics of every numerical feature, now after standardisation

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,-5.684342e-16,-6.158037e-16,1.894781e-16,-1.894781e-16
std,1.0,1.0,1.0,1.0
min,-1.86378,-2.430844,-1.563497,-1.439627
25%,-0.8976739,-0.585801,-1.223442,-1.177559
50%,-0.05233076,-0.1245404,0.3351431,0.1327811
75%,0.672249,0.5673506,0.7602119,0.7879511
max,2.483699,3.104284,1.780377,1.705189


In [7]:
# Randomly draw 120 rows for training; the fixed seed makes the split reproducible
data_train = data.sample(120,random_state=17)
index_list = data_train.index.tolist()
# Every row that was not sampled forms the test set. Dropping by index label
# avoids hard-coding the dataset size and avoids treating labels as positions.
data_test = data.drop(data_train.index)
# Labels of the held-out rows, in their original order
index_test = data_test.index.tolist()

In [8]:
# Features: the four flower measurements, taken from the train and test frames alike
X_train, X_test = (
    frame[["sepal_length","sepal_width","petal_length","petal_width"]]
    for frame in (data_train, data_test)
)
# Target: the species label of the same rows
Y_train, Y_test = (frame["species"] for frame in (data_train, data_test))

In [9]:
def dist(x,X_train,dist_type = "euc"):
    """Return the Euclidean distance between ``x`` and every row of ``X_train``.

    Parameters
    ----------
    x : array-like of length n_cols
        A single data point (Series, list or array). Its values are read by
        position, so any index labels it carries are ignored.
    X_train : pandas.DataFrame or 2-D array-like, shape (n_rows, n_cols)
        Points to measure the distance to, one per row.
    dist_type : str, optional
        Accepted for interface compatibility only. The value is not
        inspected; the Euclidean distance is always computed.

    Returns
    -------
    list of float
        One distance per row of ``X_train``, in row order.

    Raises
    ------
    ValueError
        If ``X_train`` is not two-dimensional or ``x`` does not have exactly
        one value per column of ``X_train``.
    """
    # Work on plain float arrays so values are addressed by position; integer
    # indexing into a label-indexed Series is deprecated in pandas.
    train_n = np.asarray(X_train, dtype=float)
    point = np.asarray(x, dtype=float)
    if train_n.ndim != 2:
        raise ValueError("X_train must be two-dimensional (rows x features)")
    if point.shape != (train_n.shape[1],):
        raise ValueError(
            "x must contain exactly {} values, one per column of X_train".format(train_n.shape[1])
        )
    # Broadcasting subtracts the point from every training row at once
    squared_diff = (train_n - point) ** 2
    return np.sqrt(squared_diff.sum(axis=1)).tolist()

In [10]:
def k_nn(X_train,Y_train,X_test,Y_test,dist_type="euc",neighbor=5,task="R"):
    """Predict a target for every row of ``X_test`` with brute-force k-nearest neighbours.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features, one row per sample.
    Y_train : pandas.Series or array-like
        Training targets. A Series is matched to ``X_train`` by index label;
        any other array-like is matched by position.
    X_test : pandas.DataFrame
        Points to predict, with the same feature columns as ``X_train``.
        Rows are processed in the order they appear in this frame.
    Y_test : any
        Not used; kept so existing calls keep working.
    dist_type : str, optional
        Only ``"euc"`` (Euclidean) is implemented. Any other value produces
        an empty prediction list.
    neighbor : int, optional
        Number of nearest training points consulted for each prediction.
    task : str, optional
        ``"R"`` for regression (median of the neighbours' targets); any other
        value selects classification (most frequent neighbour label).

    Returns
    -------
    list
        One prediction per row of ``X_test``, in row order.

    Raises
    ------
    ValueError
        If ``neighbor`` is smaller than 1.
    """
    if neighbor < 1:
        raise ValueError("neighbor must be at least 1, got {}".format(neighbor))
    predicted_y = []
    if dist_type != "euc":
        return predicted_y
    # Attach the targets to the training index once instead of copying the
    # whole training frame for every test point.
    targets = pd.DataFrame(index=X_train.index)
    targets["Y"] = Y_train
    # Walk the rows of the X_test argument itself rather than a global index
    # list, so the function works for any test frame passed in.
    for x in X_test.values:
        # distance from this test point to every training point
        dis = dist(x, X_train, dist_type=dist_type)
        # A stable sort keeps equally distant neighbours in training order,
        # which makes the selection deterministic.
        nearest = (
            targets.assign(Distance=dis)
            .sort_values(by="Distance", ascending=True, kind="mergesort")
            .iloc[0:neighbor]
        )
        if task == "R": # regression: median target of the nearest neighbours
            predicted_y.append(nearest["Y"].median())
        else: # classification: majority vote among the nearest neighbours
            votes = nearest["Y"].value_counts()
            predicted_y.append(votes.index[0])
    return predicted_y

In [11]:
# Classify every test row by majority vote among its 5 nearest training rows
predd = k_nn(
    X_train,
    Y_train,
    X_test,
    Y_test,
    dist_type="euc",
    neighbor=5,
    task="C",
)

In [12]:
# Accuracy = share of test rows whose predicted label equals the true label, in percent
total_test = len(Y_test)
count = sum(1 for i, guess in enumerate(predd) if guess == Y_test.iloc[i])
accuracy = (count/total_test)*100
print("The accuracy of our model which is implemented from scratch is {}%".format(accuracy))

The accuracy of our model which is implemented from scratch is 93.33333333333333%
