## Knn is a non parametric algo that uses the target info from the k nearest neighbours of an observation and regress/classify based on them

In [1]:
import os
import numpy as np
import pandas as pd

### Read the dataset

In [2]:
os.chdir("C:/Users/satish/Downloads")
#Reading the data
iris = pd.read_csv("iris.csv")
iris.sample(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
30,4.8,3.1,1.6,0.2,Iris-setosa
120,6.9,3.2,5.7,2.3,Iris-virginica
72,6.3,2.5,4.9,1.5,Iris-versicolor
122,7.7,2.8,6.7,2.0,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
139,6.9,3.1,5.4,2.1,Iris-virginica
67,5.8,2.7,4.1,1.0,Iris-versicolor
66,5.6,3.0,4.5,1.5,Iris-versicolor
74,6.4,2.9,4.3,1.3,Iris-versicolor
118,7.7,2.6,6.9,2.3,Iris-virginica


In [3]:
cars = pd.read_csv("cars.csv")
cars.sample(10)

Unnamed: 0,Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
280,Datsun 200-SX,23.9,4,119.0,97,2405,14.9,78,Japan
58,Peugeot 304,30.0,4,79.0,70,2074,19.5,71,Europe
367,Saab 900s,0.0,4,121.0,110,2800,15.4,81,Europe
399,Dodge Charger 2.2,36.0,4,135.0,84,2370,13.0,82,US
256,Oldsmobile Cutlass Salon Brougham,19.9,8,260.0,110,3365,15.5,78,US
383,Volkswagen Rabbit l,36.0,4,105.0,74,1980,15.3,82,Europe
221,Ford F108,13.0,8,302.0,130,3870,15.0,76,US
1,Buick Skylark 320,15.0,8,350.0,165,3693,11.5,70,US
37,Toyota Corolla,25.0,4,113.0,95,2228,14.0,71,Japan
104,Plymouth Valiant,18.0,6,225.0,105,3121,16.5,73,US


### Utility functions

In [4]:
# Standardize all data points in df
def standardize(df):
    return (df - df.mean())/df.std()
    
    
    

In [5]:
# Find euclidean distance between data points of two series 
def euclidean(x, y):
    d = np.sqrt(sum((x-y)**2))
    return(d)

In [15]:
# find root mean squared error
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

### Cars data regression

### 1nn

In [6]:
#Split data into train and test for predictors and target

split_df = cars.sample(len(cars))
    
train = split_df.iloc[0:300, : ]
test  = split_df.iloc[300:406, : ]
predictors = ['Weight', 'Horsepower', 'Displacement', 'Acceleration']
target = ['MPG']
X_train = train[predictors]
Y_train = train[target]
X_test = test[predictors]
Y_test = test[target]

# standardize feature set 
X_train = standardize(X_train)
X_test = standardize(X_test)

In [14]:
print(type(Y_train))
print(type(X_train.iloc[1,]))


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [16]:
# 1nn algo
def onennRegression(X_train, X_test, Y_train):
    predicted = []
    
    for i in range(0, len(X_test)):
        dist = []
        # find euclidean distance of each test point from all train points
        for j in range(0, len(X_train)):
            d = euclidean(X_train.iloc[j,], X_test.iloc[i,])
            dist.append(d)
        
        # find point in train with minimum distance from test 
        predicted.append(Y_train.iloc[dist.index(min(dist)), 0])
    
    
    X_test['predicted_mpg'] = predicted
    print(X_test)

In [17]:
onennRegression(X_train, X_test, Y_train)

       Weight  Horsepower  Displacement  Acceleration  predicted_mpg
364 -0.403191   -0.104936     -0.676019     -0.174342           23.0
6    1.654674    2.693357      2.343343     -2.335345           14.0
84  -0.526261   -0.664595     -0.657993      1.017935           28.0
174 -0.928604   -0.687914     -0.874306      0.272762           28.0
150 -0.775950   -0.617956     -0.874306     -0.286118           30.9
..        ...         ...           ...           ...            ...
379 -0.509694   -0.478042     -0.531811      0.272762           26.6
305  1.117428    0.478042      1.405989      0.794383           26.6
403 -0.781867   -0.478042     -0.531811     -1.366620           36.0
85   0.027552   -0.408084     -0.667006      1.576815           24.3
316 -0.960554   -0.664595     -0.865293     -0.211601           32.2

[106 rows x 5 columns]


In [23]:
# find rmse 

print("RMSE: ",rmse(X_test["predicted_mpg"], Y_test["MPG"]))
print("Target range: ",Y_test["MPG"].min(), "to", Y_test["MPG"].max() )

RMSE:  5.805511369737462
Target range:  0.0 to 41.5


### KNN

In [24]:
#Split data into train and test for predictors and target
split_df = cars.sample(len(cars))
    
train = split_df.iloc[0:300, : ]
test  = split_df.iloc[300:406, : ]
predictors = ['Weight', 'Horsepower', 'Displacement', 'Acceleration']
target = ['MPG']
X_train = train[predictors]
Y_train = train[target]
X_test = test[predictors]
Y_test = test[target]

# standardize feature set 
X_train = standardize(X_train)
X_test = standardize(X_test)

In [25]:
#knn algo
def knnRegression(X_train, X_test,Y_train, k):
    
    predicted = []
    
    for i in range(0, len(X_test)):
        dist = []
        # find euclidean distance of each test point from all train points
        for j in range(0, len(X_train)):
            d = euclidean(X_train.iloc[j,], X_test.iloc[i,])
            dist.append(d)
        
        # Smallest K elements indices
        # using sorted() + lambda + list slicing
        indices = sorted(range(len(dist)), key = lambda ind: dist[ind])[:k]
        
        # find mean of k nearest neighbours of test point
        predicted.append(Y_train.iloc[indices, 0].mean())
        
        
    X_test['predicted_mpg'] = predicted
    print(X_test)
    

In [26]:
knnRegression(X_train, X_test,Y_train,3)

       Weight  Horsepower  Displacement  Acceleration  predicted_mpg
341 -0.698731   -0.106480     -1.192797     -1.111916      22.166667
279 -0.202084   -0.459932     -0.419631      0.631418      24.100000
266  0.043385   -0.459932      0.048087      0.323771      20.133333
113  1.674900    1.778597      1.479876     -1.111916      14.166667
16   0.658771    1.307328      1.384423     -2.650152      10.000000
..        ...         ...           ...           ...            ...
225 -1.378053   -1.096146     -1.106890      0.973249      34.700000
301 -1.206795   -0.931201     -1.040073     -0.188974      31.833333
198  0.229485   -0.106480      0.286718     -0.120608      18.666667
270  0.471529    1.425145      0.343990     -0.804268      16.066667
180 -0.556016   -0.177171     -0.725080      0.426320      25.000000

[106 rows x 5 columns]


In [27]:
# find rmse 

print("RMSE: ",rmse(X_test["predicted_mpg"], Y_test["MPG"]))
print("Target range: ",Y_test["MPG"].min(), "to", Y_test["MPG"].max() )

RMSE:  5.085787117837952
Target range:  0.0 to 44.6


### We can see that from 1nn and knn we get rmse of around 5 which is good for range 0 to 44 and knn has better performance than 1nn

## Iris data Classification

## 1NN

In [48]:
#Split data into train and test for predictors and target
split_df = iris.sample(len(iris))
    
train = split_df.iloc[0:100, : ]
test  = split_df.iloc[100:150, : ]
predictors = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
target = ['species']
X_train = train[predictors]
Y_train = train[target]
X_test = test[predictors]
Y_test = test[target]     
    
# standardize feature set 
X_train = standardize(X_train)
X_test = standardize(X_test)

In [49]:
# 1 nn algo
def onennClassification(X_train, X_test,Y_train):
    predicted = []
    
    for i in range(0, len(X_test)):
        dist = []
        for j in range(0, len(X_train)):
            # find euclidean distance of test point from all train points
            d = euclidean(X_train.iloc[j,], X_test.iloc[i,])
            dist.append(d)
        
        # use class of point in train with minimum distance
        predicted.append(Y_train.iloc[dist.index(min(dist)), 0])
    
    
    X_test['predicted_species'] = predicted
    print(X_test)
    
    
    
    

In [50]:
onennClassification(X_train, X_test,Y_train)

     sepal_length  sepal_width  petal_length  petal_width predicted_species
44      -0.812379     1.547079     -0.980945    -0.943020       Iris-setosa
115      0.778122     0.264969      0.946924     1.573466    Iris-virginica
4       -0.934725     1.119709     -1.264455    -1.207913       Iris-setosa
116      0.900468    -0.162401      1.060328     0.911233    Iris-virginica
98      -0.812379    -1.230825     -0.357223    -0.015894   Iris-versicolor
86       1.145161     0.051284      0.606711     0.513893   Iris-versicolor
133      0.655776    -0.589770      0.833520     0.513893    Iris-virginica
30      -1.179418     0.051284     -1.151051    -1.207913       Iris-setosa
27      -0.690033     0.906024     -1.207753    -1.207913       Iris-setosa
13      -1.791149    -0.162401     -1.434561    -1.340360       Iris-setosa
88      -0.200648    -0.162401      0.266499     0.249000   Iris-versicolor
33      -0.322994     2.401819     -1.264455    -1.207913       Iris-setosa
9       -1.0

In [51]:
# get accuracy of the model
(X_test["predicted_species"] == Y_test["species"]).sum()/len(Y_test["species"])

0.9

## k NN

In [55]:
#Split data into train and test for predictors and target
split_df = iris.sample(len(iris))
    
train = split_df.iloc[0:100, : ]
test  = split_df.iloc[100:150, : ]
predictors = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
target = ['species']
X_train = train[predictors]
Y_train = train[target]
X_test = test[predictors]
Y_test = test[target]     
    
# standardize feature set 
X_train = standardize(X_train)
X_test = standardize(X_test)

In [56]:
#knn algo
def knnClassification(X_train, X_test,Y_train, k):
    
    predicted = []
    
    for i in range(0, len(X_test)):
        dist = []
        for j in range(0, len(X_train)):
            # find euclidean distance of each test point from all train points
            d = euclidean(X_train.iloc[j,], X_test.iloc[i,])
            dist.append(d)
        
        
        # Smallest K elements indices
        # using sorted() + lambda + list slicing
        # use majority of class in k nearest points in train data
        indices = sorted(range(len(dist)), key = lambda sub: dist[sub])[:k]
        predicted.append(Y_train.iloc[indices,0].value_counts().idxmax())
    
    
    X_test['predicted_species'] = predicted
    print(X_test)
    

In [57]:
knnClassification(X_train, X_test,Y_train,3)

     sepal_length  sepal_width  petal_length   petal_width predicted_species
65       1.046508    -0.057369      0.490603  3.798767e-01   Iris-versicolor
145      1.046508    -0.318139      0.926695  1.519507e+00    Iris-virginica
102      1.513700    -0.318139      1.308275  1.266256e+00    Iris-virginica
80      -0.355065    -1.882756      0.163534 -2.811652e-16   Iris-versicolor
74       0.696115    -0.578908      0.436092  2.532511e-01   Iris-versicolor
117      2.214487     1.768018      1.744366  1.392881e+00    Iris-virginica
31      -0.471863     0.724939     -1.090229 -8.863789e-01       Iris-setosa
34      -1.055852    -0.057369     -1.090229 -1.266256e+00       Iris-setosa
72       0.579317    -1.621987      0.763160  5.065022e-01    Iris-virginica
52       1.280104    -0.057369      0.763160  5.065022e-01   Iris-versicolor
130      1.864093    -0.839678      1.417298  1.013004e+00    Iris-virginica
97       0.462519    -0.578908      0.436092  2.532511e-01   Iris-versicolor

In [58]:
# get accuracy of the model
(X_test["predicted_species"] == Y_test["species"]).sum()/len(Y_test["species"])

0.96

### Here we can see that the accuracy of knn classifier is 0.96 while that of 1nn classifier is 0.9 