In [1]:
import numpy as np
import pandas as pd

## Step - 1: Load the data

In [2]:
data = pd.read_csv("diamonds.csv")
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
data.shape

(53940, 10)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [5]:
data.duplicated().sum()

np.int64(146)

In [6]:
data = data.drop_duplicates()

In [7]:
data.duplicated().sum()

np.int64(0)

## Step - 2: Identify input and output variables

In [8]:
# input variables - All columns except price
# output variable - price
X = data.drop('price',axis=1)
y = data['price']
X.shape,y.shape

((53794, 9), (53794,))

## Step - 3: Split the data - Test and Train (recommended 75:25 split)

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((40345, 9), (13449, 9), (40345,), (13449,))

## Step - 4: Data Preprocessing on X_train (You can use sklearn for data preprocessing)
- Categorical Data Encoding
- Numerical Data Rescaling

In [11]:
from sklearn.preprocessing import OrdinalEncoder

### Categorical Data Encoding - Ordinal Encoding

In [12]:
#ordinal order
#cut
cut_order=['Fair','Good','Very Good','Premium','Ideal']
#color
color_order=['J','I','H','G','F','E','D']
#clarity
clarity_order=['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [13]:
oe = OrdinalEncoder(categories=[cut_order,color_order,clarity_order])
oe

### Numerical Data Rescaling - RobustScaler

In [14]:
from sklearn.preprocessing import RobustScaler

In [15]:
scaler = RobustScaler()
scaler

In [16]:
from sklearn.compose import ColumnTransformer

In [17]:
ct = ColumnTransformer(
    [('ORDINAL',oe,['cut','color','clarity']),
    ('SCALER',scaler,['carat','depth','table','x','y','z'])],
    verbose_feature_names_out=False,
    remainder='passthrough',).set_output(transform='pandas')
ct

In [18]:
X_train_transformed = ct.fit_transform(X_train)
X_train_transformed.head()

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z
12820,4.0,3.0,3.0,0.5,0.4,0.0,0.42623,0.357143,0.424779
19997,2.0,4.0,5.0,0.546875,-0.333333,0.666667,0.431694,0.467033,0.424779
6099,3.0,3.0,2.0,0.328125,0.533333,0.333333,0.262295,0.236264,0.292035
37984,4.0,6.0,5.0,-0.59375,-0.6,0.0,-0.710383,-0.692308,-0.725664
24865,3.0,3.0,3.0,1.28125,0.066667,-0.333333,0.928962,0.862637,0.902655


## Step - 5: Data Preprocessing on X_test 

In [19]:
X_test_transformed = ct.transform(X_test)
X_test_transformed.head()

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z
43657,0.0,0.0,3.0,0.015625,2.066667,-1.0,-0.032787,-0.098901,0.088496
4274,2.0,5.0,1.0,0.3125,-0.533333,0.666667,0.245902,0.258242,0.212389
47412,4.0,3.0,4.0,-0.203125,0.2,-0.333333,-0.213115,-0.208791,-0.19469
44437,3.0,5.0,3.0,-0.296875,0.466667,1.0,-0.333333,-0.335165,-0.300885
13975,4.0,1.0,2.0,0.765625,0.333333,-0.666667,0.606557,0.60989,0.637168


## Step - 6: Build the model and predict on X_test

In [20]:
from sklearn.metrics.pairwise import euclidean_distances

In [21]:
query = X_test_transformed.sample(n=1,random_state=42)
query

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z
6285,2.0,1.0,3.0,0.421875,0.266667,-0.366667,0.355191,0.318681,0.371681


In [22]:
dist = pd.DataFrame(euclidean_distances(X_train_transformed,query)).sort_values(by=0,ascending=True)
dist.head()

Unnamed: 0,0
21336,0.322171
10613,0.335204
17977,0.457874
1439,0.584418
535,0.601792


In [23]:
from sklearn.neighbors import KNeighborsRegressor

In [24]:
knnr = KNeighborsRegressor(n_neighbors=5)
knnr

In [25]:
knnr.fit(X_train_transformed, y_train)

In [26]:
knnr.predict(query)

array([4374.2])

In [27]:
y_predict = knnr.predict(X_test_transformed)
y_predict

array([2404.4, 4119. , 1868.2, ..., 2095.2, 1735.6, 2814.4],
      shape=(13449,))

## Step - 7: Evaluate your model

In [28]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [29]:
mae = mean_absolute_error(y_test, y_predict)
mae

420.0374897761915

In [30]:
rmse = np.sqrt(mean_squared_error(y_test, y_predict))
rmse

np.float64(785.4369536684466)

In [31]:
r2 = r2_score(y_test, y_predict)
r2

0.9598626045601422

## Step - 8: Train a model using sklearn KNN Algorithm and compare the results with your scratch implementation

In [32]:
from sklearn.neighbors import KNeighborsRegressor

In [33]:
knn_sklearn = KNeighborsRegressor(n_neighbors=5, metric='euclidean')
knn_sklearn

In [34]:
knn_sklearn.fit(X_train_transformed, y_train)
knn_sklearn

In [35]:
y_pred_sklearn = knn_sklearn.predict(X_test_transformed)
y_pred_sklearn

array([2404.4, 4119. , 1868.2, ..., 2095.2, 1735.6, 2814.4],
      shape=(13449,))

In [36]:
mean_absolute_error(y_test, y_pred_sklearn)

420.0374897761915

In [37]:
np.sqrt(mean_squared_error(y_test, y_pred_sklearn))

np.float64(785.4369536684466)

In [38]:
 r2_score(y_test, y_pred_sklearn)

0.9598626045601422