In [1]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv("./data/gemstone.csv")

In [4]:
data.drop('id',axis =1,inplace = True)

In [5]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [6]:
X = data.drop('price',axis = 1 )

In [7]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [10]:
y = data[['price']]

In [11]:
y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [14]:
cat_column = X.select_dtypes(include='object').columns

In [15]:
cat_column

Index(['cut', 'color', 'clarity'], dtype='object')

In [16]:
num_column = X.select_dtypes(exclude='object').columns

In [17]:
num_column

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [18]:
from sklearn.impute import SimpleImputer # for handling missing value
from sklearn.preprocessing import StandardScaler # Handling feature scaling
from sklearn.preprocessing import OrdinalEncoder # ordinal encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [19]:
# Define the custom ranking for each ordinal variable
cut_categories = ['Fair','Good','Very Good','Premium','Ideal']
color_categories = ['D','E','F','G','H','I','J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [21]:
numerical_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer()),
        ('scalar',StandardScaler())
    ]
)

In [22]:
categorical_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinal_encoding',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories]))
    ]
)

In [23]:
preprocessor = ColumnTransformer(
    [
        ('num_pipeline',numerical_pipeline,num_column),
        ('cat_pipline',categorical_pipeline,cat_column)

    ]
)

In [24]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=40)

In [26]:
preprocessor.fit_transform(X_train)

array([[ 0.45410567, -0.29654463, -0.63784219, ...,  4.        ,
         2.        ,  2.        ],
       [ 3.02766057, -1.58999042,  1.96548435, ...,  3.        ,
         4.        ,  1.        ],
       [ 0.54061172,  1.45884607, -1.15850749, ...,  2.        ,
         4.        ,  1.        ],
       ...,
       [ 2.61675685,  1.82840201, -0.63784219, ...,  2.        ,
         4.        ,  1.        ],
       [ 0.45410567,  0.99690115, -1.6791728 , ...,  1.        ,
         3.        ,  4.        ],
       [-0.88673805, -1.12804549,  2.48614965, ...,  2.        ,
         1.        ,  5.        ]])

In [27]:
preprocessor.transform(X_test)

array([[ 3.33043173,  0.16540029, -0.63784219, ...,  4.        ,
         4.        ,  1.        ],
       [-0.36770177,  0.16540029, -0.11717688, ...,  4.        ,
         2.        ,  2.        ],
       [ 0.8650094 , -0.20415565, -1.15850749, ...,  4.        ,
         4.        ,  2.        ],
       ...,
       [-0.71372596, -0.85087854, -0.63784219, ...,  4.        ,
         1.        ,  4.        ],
       [ 1.12452754,  0.81212318,  0.40348843, ...,  3.        ,
         4.        ,  1.        ],
       [-0.9732441 , -0.75848956,  0.40348843, ...,  3.        ,
         1.        ,  3.        ]])

In [28]:
preprocessor.get_feature_names_out()

array(['num_pipeline__carat', 'num_pipeline__depth',
       'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
       'num_pipeline__z', 'cat_pipline__cut', 'cat_pipline__color',
       'cat_pipline__clarity'], dtype=object)

In [29]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [30]:
X_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipline__cut,cat_pipline__color,cat_pipline__clarity
0,0.454106,-0.296545,-0.637842,0.636818,0.663672,0.622410,4.0,2.0,2.0
1,3.027661,-1.589990,1.965484,2.494366,2.433180,2.182754,3.0,4.0,1.0
2,0.540612,1.458846,-1.158507,0.672887,0.645523,0.811984,2.0,4.0,1.0
3,0.562238,-2.513880,1.444819,0.817163,0.790714,0.549496,2.0,2.0,1.0
4,-0.843485,-0.019378,-0.637842,-0.851023,-0.888051,-0.865022,4.0,5.0,5.0
...,...,...,...,...,...,...,...,...,...
135496,-0.000051,-0.019378,-0.637842,0.176940,0.200877,0.199512,4.0,1.0,2.0
135497,-0.540714,-0.573712,1.444819,-0.409179,-0.443405,-0.471290,3.0,2.0,3.0
135498,2.616757,1.828402,-0.637842,2.106625,2.061130,2.255668,2.0,4.0,1.0
135499,0.454106,0.996901,-1.679173,0.564681,0.627374,0.680740,1.0,3.0,4.0
