## Implement KNN Regressor without using SKLearn Library

### Load the libraries

In [1]:
import sklearn
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Read the diamond data file from my personal GitHub repository

In [2]:
data=pd.read_csv("https://raw.githubusercontent.com/shivakmuddam25/ml/main/diamonds.csv");
data.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
y = data['price']
data.drop(columns=['Unnamed: 0'],inplace=True)
data

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


### Split the data into Train and Test

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data.drop('price', axis=1), y, train_size=0.99, random_state=100)
X_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
16471,1.20,Ideal,I,VVS2,61.5,57.0,6.91,6.82,4.22
37045,0.46,Good,E,SI1,62.7,58.0,4.87,4.92,3.07
22576,1.50,Very Good,E,SI1,58.6,62.0,7.43,7.46,4.36
175,0.83,Good,I,VS2,64.6,54.0,5.85,5.88,3.79
25157,2.07,Premium,J,SI1,61.7,60.0,8.23,8.15,5.05
...,...,...,...,...,...,...,...,...,...
16304,1.07,Very Good,G,VS2,62.9,59.0,6.49,6.52,4.09
79,0.26,Very Good,E,VVS1,62.6,59.0,4.06,4.09,2.55
12119,0.91,Premium,G,VVS2,61.8,58.0,6.24,6.16,3.83
14147,1.25,Premium,I,SI1,62.4,58.0,6.89,6.85,4.29


In [5]:
data_cat= X_train.select_dtypes(include=['object'])
data_num= X_train.select_dtypes(exclude=['object'])

In [6]:
data_cat.head()

Unnamed: 0,cut,color,clarity
16471,Ideal,I,VVS2
37045,Good,E,SI1
22576,Very Good,E,SI1
175,Good,I,VS2
25157,Premium,J,SI1


### Apply Standard Scaling on numerical features on Train Data

In [7]:
from sklearn.preprocessing import StandardScaler
std_scaler=StandardScaler()
data_num_rescaled=pd.DataFrame(std_scaler.fit_transform(data_num),
                               columns=data_num.columns, 
                              index = data_num.index)
data_num_rescaled

Unnamed: 0,carat,depth,table,x,y,z
16471,0.847937,-0.174487,-0.204436,1.050772,0.949995,0.964922
37045,-0.713094,0.663371,0.243211,-0.767954,-0.713262,-0.664336
22576,1.480788,-2.199309,2.033798,1.514369,1.510251,1.163266
175,0.067422,1.989978,-1.547377,0.105748,0.127121,0.355721
25157,2.683203,-0.034844,1.138504,2.227595,2.114276,2.140821
...,...,...,...,...,...,...
16304,0.573702,0.803013,0.690857,0.676329,0.687376,0.780745
79,-1.134994,0.593549,0.690857,-1.490095,-1.439843,-1.401044
12119,0.236182,0.034977,0.243211,0.453445,0.372232,0.412391
14147,0.953412,0.453906,0.243211,1.032941,0.976257,1.064094


In [8]:
data_cat

Unnamed: 0,cut,color,clarity
16471,Ideal,I,VVS2
37045,Good,E,SI1
22576,Very Good,E,SI1
175,Good,I,VS2
25157,Premium,J,SI1
...,...,...,...
16304,Very Good,G,VS2
79,Very Good,E,VVS1
12119,Premium,G,VVS2
14147,Premium,I,SI1


### Encode Categorical features manually on Train Data

In [9]:
dict_cut = {'Ideal': 0, 'Premium': 1, 'Good': 2, 'Very Good': 3, 'Fair': 4}
data_cat["cut"] = data_cat["cut"].apply(lambda x: dict_cut[x])

dict_color = {'E': 0, 'I': 1, 'J': 2, 'H': 3, 'F': 4, 'G': 5, 'D': 6}
data_cat["color"] = data_cat["color"].apply(lambda x: dict_color[x])

dict_clarity = {'SI2': 0, 'SI1': 1, 'VS1': 2, 'VS2': 3, 'VVS2': 4, 'VVS1': 5, 'I1': 6, 'IF': 7}
data_cat["clarity"] = data_cat["clarity"].apply(lambda x: dict_clarity[x])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cat["cut"] = data_cat["cut"].apply(lambda x: dict_cut[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cat["color"] = data_cat["color"].apply(lambda x: dict_color[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cat["clarity"] = data_cat["clarity"].apply(lambda x: dict_clarity[x]

In [10]:
data_cat

Unnamed: 0,cut,color,clarity
16471,0,1,4
37045,2,0,1
22576,3,0,1
175,2,1,3
25157,1,2,1
...,...,...,...
16304,3,5,3
79,3,0,5
12119,1,5,4
14147,1,1,1


In [11]:
train_df_final = pd.concat([data_num_rescaled, data_cat], axis=1)
train_df_final

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
16471,0.847937,-0.174487,-0.204436,1.050772,0.949995,0.964922,0,1,4
37045,-0.713094,0.663371,0.243211,-0.767954,-0.713262,-0.664336,2,0,1
22576,1.480788,-2.199309,2.033798,1.514369,1.510251,1.163266,3,0,1
175,0.067422,1.989978,-1.547377,0.105748,0.127121,0.355721,2,1,3
25157,2.683203,-0.034844,1.138504,2.227595,2.114276,2.140821,1,2,1
...,...,...,...,...,...,...,...,...,...
16304,0.573702,0.803013,0.690857,0.676329,0.687376,0.780745,3,5,3
79,-1.134994,0.593549,0.690857,-1.490095,-1.439843,-1.401044,3,0,5
12119,0.236182,0.034977,0.243211,0.453445,0.372232,0.412391,1,5,4
14147,0.953412,0.453906,0.243211,1.032941,0.976257,1.064094,1,1,1


## Perform data preprocessing on Test Data like above, and transform the data

In [12]:
X_test

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
52264,0.57,Ideal,E,VS2,61.5,57.0,5.35,5.32,3.28
21073,1.16,Ideal,G,VS1,61.5,55.0,6.75,6.81,4.17
42161,0.51,Ideal,G,SI1,63.2,58.0,5.05,5.08,3.20
35974,0.42,Ideal,F,VS1,60.6,56.0,4.83,4.87,2.94
7641,0.80,Premium,G,IF,62.6,58.0,5.89,5.93,3.70
...,...,...,...,...,...,...,...,...,...
11028,1.07,Ideal,H,SI1,62.0,57.0,6.56,6.51,4.05
28175,0.35,Ideal,F,VS1,62.4,55.0,4.49,4.52,2.81
6614,0.90,Premium,H,VS2,61.3,59.0,6.24,6.19,3.81
32260,0.30,Very Good,G,VVS1,62.8,58.0,4.27,4.30,2.69


In [13]:
test_df_num = X_test.select_dtypes(include=['int64', 'float64'])
test_df_cat = X_test.select_dtypes(exclude=['int64', 'float64'])


In [14]:
test_df_num

Unnamed: 0,carat,depth,table,x,y,z
52264,0.57,61.5,57.0,5.35,5.32,3.28
21073,1.16,61.5,55.0,6.75,6.81,4.17
42161,0.51,63.2,58.0,5.05,5.08,3.20
35974,0.42,60.6,56.0,4.83,4.87,2.94
7641,0.80,62.6,58.0,5.89,5.93,3.70
...,...,...,...,...,...,...
11028,1.07,62.0,57.0,6.56,6.51,4.05
28175,0.35,62.4,55.0,4.49,4.52,2.81
6614,0.90,61.3,59.0,6.24,6.19,3.81
32260,0.30,62.8,58.0,4.27,4.30,2.69


In [15]:
test_df_cat

Unnamed: 0,cut,color,clarity
52264,Ideal,E,VS2
21073,Ideal,G,VS1
42161,Ideal,G,SI1
35974,Ideal,F,VS1
7641,Premium,G,IF
...,...,...,...
11028,Ideal,H,SI1
28175,Ideal,F,VS1
6614,Premium,H,VS2
32260,Very Good,G,VVS1


In [16]:
test_df_num_scaled = std_scaler.transform(test_df_num)
test_df_num_scaled = pd.DataFrame(test_df_num_scaled, 
                                  columns=test_df_num.columns, 
                                 index = test_df_num.index)
test_df_num_scaled

Unnamed: 0,carat,depth,table,x,y,z
52264,-0.481048,-0.174487,-0.204436,-0.340018,-0.363103,-0.366819
21073,0.763557,-0.174487,-1.099730,0.908127,0.941241,0.894084
42161,-0.607619,1.012478,0.243211,-0.607478,-0.573198,-0.480159
35974,-0.797474,-0.802880,-0.652083,-0.803615,-0.757032,-0.848513
7641,0.004137,0.593549,0.243211,0.141409,0.170890,0.228214
...,...,...,...,...,...,...
11028,0.573702,0.174620,-0.204436,0.738736,0.678622,0.724075
28175,-0.945139,0.453906,-1.099730,-1.106736,-1.063422,-1.032690
6614,0.215087,-0.314130,0.690857,0.453445,0.398494,0.384056
32260,-1.050614,0.733192,0.243211,-1.302873,-1.256010,-1.202699


In [17]:
test_df_cat["cut"] = test_df_cat["cut"].apply(lambda x: dict_cut[x])
test_df_cat["color"] = test_df_cat["color"].apply(lambda x: dict_color[x])
test_df_cat["clarity"] = test_df_cat["clarity"].apply(lambda x: dict_clarity[x])
test_df_cat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_cat["cut"] = test_df_cat["cut"].apply(lambda x: dict_cut[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_cat["color"] = test_df_cat["color"].apply(lambda x: dict_color[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_cat["clarity"] = test_df_cat["clarity"].apply(lambda 

Unnamed: 0,cut,color,clarity
52264,0,0,3
21073,0,5,2
42161,0,5,1
35974,0,4,2
7641,1,5,7
...,...,...,...
11028,0,3,1
28175,0,4,2
6614,1,3,3
32260,3,5,5


In [18]:
test_df_final = pd.concat([test_df_num_scaled, test_df_cat], axis=1)
test_df_final

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
52264,-0.481048,-0.174487,-0.204436,-0.340018,-0.363103,-0.366819,0,0,3
21073,0.763557,-0.174487,-1.099730,0.908127,0.941241,0.894084,0,5,2
42161,-0.607619,1.012478,0.243211,-0.607478,-0.573198,-0.480159,0,5,1
35974,-0.797474,-0.802880,-0.652083,-0.803615,-0.757032,-0.848513,0,4,2
7641,0.004137,0.593549,0.243211,0.141409,0.170890,0.228214,1,5,7
...,...,...,...,...,...,...,...,...,...
11028,0.573702,0.174620,-0.204436,0.738736,0.678622,0.724075,0,3,1
28175,-0.945139,0.453906,-1.099730,-1.106736,-1.063422,-1.032690,0,4,2
6614,0.215087,-0.314130,0.690857,0.453445,0.398494,0.384056,1,3,3
32260,-1.050614,0.733192,0.243211,-1.302873,-1.256010,-1.202699,3,5,5


## Compute Eucledian Distance for the test data, using 5 nearest neighbours (K-NN)

In [19]:
# Compute the Eucledian distance for all the points and save it in the list
n_neighbours = 5
y_pred = []

for r_test_index in test_df_final.index.to_list():
    r_index_list = []
    dist = []
    sorted_dist_df = []
    for r_index in train_df_final.index.to_list():
        acc = 0
        for col in train_df_final.columns:        
            acc += np.power((train_df_final.loc[r_index, col] - test_df_final.loc[r_test_index, col]), 2)
        dist.append(np.power(acc, 1/2))
        r_index_list.append(r_index)
    dist_df = pd.DataFrame({"Index": r_index_list, "Dist": dist})
    sorted_dist_df = dist_df.sort_values('Dist')
    price_avg = 0
    for n in sorted_dist_df["Index"].head(n_neighbours).to_list():
        price_avg += data.loc[n, 'price']
#             print(data.loc[n, 'price'])
    print(price_avg/n_neighbours)
    y_pred.append(price_avg/n_neighbours)


1792.2
8386.4
1136.6
975.4
3273.2
5696.6
797.4
1839.4
4688.4
606.4
5944.2
2764.6
803.0
1697.2
2663.0
7012.4
1021.6
2563.0
699.8
738.6
1706.2
729.8
1196.2
6698.0
11635.4
6901.6
2842.6
4801.6
7802.6
445.8
10905.6
4171.8
9508.4
733.2
4810.6
625.4
6642.8
5107.4
486.6
900.2
1683.6
1910.2
880.8
6368.2
785.8
748.0
2233.2
8706.4
1837.2
962.2
797.2
806.0
3836.6
5356.4
1325.2
8683.6
5157.6
14632.2
755.2
2116.6
1402.8
8572.8
970.8
4685.0
6770.6
1402.0
768.8
5156.4
2532.0
4221.0
980.8
10730.8
2411.8
1640.4
849.0
4017.4
789.4
4530.4
759.0
1942.2
14764.0
784.0
795.4
1445.0
2658.6
932.6
2445.2
14323.8
4442.0
4886.8
4443.4
4060.8
10850.6
7676.6
3586.0
12319.4
644.2
6065.2
1116.2
2153.6
2112.8
2078.0
15607.6
680.2
1026.6
2171.0
3602.6
2350.8
4415.2
4241.4
4838.2
5821.0
11282.0
919.4
8945.8
788.8
4260.6
5907.6
6465.2
1147.2
1205.2
1169.0
8588.6
6708.2
1794.4
707.0
4000.0
750.6
2580.4
7612.6
12089.8
4537.8
10602.6
607.6
2607.8
4904.8
4779.4
7565.0
3480.6
1693.6
781.8
1581.8
1302.2
911.2
6462.8
1706.8
873

### Evaluate the metrics

In [20]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

print("Mean Squared Error: ",mean_squared_error(y_test,y_pred))
print("Mean Absolute Error: ",mean_absolute_error(y_test,y_pred))
print("R2 Score: ",r2_score(y_test,y_pred))

Mean Squared Error:  485951.65585185203
Mean Absolute Error:  373.7962962962963
R2 Score:  0.9681606582140126
