In [104]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

df = pd.read_csv('diamonds.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [105]:
df.drop('Unnamed: 0',inplace=True,axis=1)

In [106]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [107]:
X = df.drop('price',axis=1)
y = df['price']

xtrain,xtest,ytrain,ytest = train_test_split(X,y,random_state=42,train_size=0.8)

In [108]:
obj_cols = df.select_dtypes(include='object').columns
obj_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [109]:
preprocessor = ColumnTransformer(
    transformers=[
        ('encoder',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),obj_cols)
    ],remainder='passthrough'
)
pipeline = Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('model',KNeighborsRegressor(n_neighbors=14))
    ]
)

In [110]:
xtrain

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
26546,2.01,Good,F,SI2,58.1,64.0,8.23,8.19,4.77
9159,1.01,Very Good,E,SI2,60.0,60.0,6.57,6.49,3.92
14131,1.10,Premium,H,VS2,62.5,58.0,6.59,6.54,4.10
15757,1.50,Good,E,SI2,61.5,65.0,7.21,7.17,4.42
24632,1.52,Very Good,G,VS1,62.1,57.0,7.27,7.32,4.53
...,...,...,...,...,...,...,...,...,...
11284,1.05,Very Good,I,VS2,62.4,59.0,6.48,6.51,4.05
44732,0.47,Ideal,D,VS1,61.0,55.0,5.03,5.01,3.06
38158,0.33,Very Good,F,IF,60.3,58.0,4.49,4.46,2.70
860,0.90,Premium,J,SI1,62.8,59.0,6.13,6.03,3.82


In [111]:
pipeline.fit(xtrain,ytrain)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('encoder', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_neighbors,14
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [112]:
pipeline.score(xtest,ytest)
# 0.9401101512485649
# 0.9409354352891675



0.9393153625617606

In [113]:
pipeline.score(xtrain,ytrain)
# 0.9474795898584873
# 0.9486512383801384


0.9462189420920345