## Model Training

In [3]:
import pandas as pd

# Load the gemstone dataset from the local data folder and preview it.
df = pd.read_csv("./data/gemstone.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [5]:
# Drop the two columns that carry no predictive information: "Unnamed: 0"
# (its previewed values mirror the row index, so it is presumably a saved
# index) and "id" (a unique row identifier).
# errors="ignore" keeps this cell idempotent: because the result is assigned
# back to `df`, a second run would otherwise raise KeyError on the columns
# that are already gone.
df = df.drop(columns=["Unnamed: 0", "id"], errors="ignore")

In [9]:
# Separate the target ("price") from the predictor columns.
# Y keeps its DataFrame shape (one column) rather than becoming a Series.
Y = df.loc[:, ["price"]]
X = df.drop(columns="price")

In [11]:
# Preview the first ten rows of the feature matrix.
X.head(n=10)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
5,1.51,Very Good,J,SI1,62.8,58.0,7.34,7.29,4.59
6,0.74,Ideal,E,VS2,61.8,57.0,5.76,5.79,3.57
7,1.34,Premium,G,SI2,62.5,57.0,7.0,7.05,4.38
8,0.3,Ideal,F,IF,62.0,56.0,4.35,4.37,2.7
9,0.3,Good,J,VS1,63.6,57.0,4.26,4.28,2.72


In [12]:
# Preview the first ten rows of the target frame.
Y.head(n=10)

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
5,7506
6,3229
7,6224
8,886
9,421


In [13]:
# Partition the feature names by dtype: object-typed columns are treated as
# categorical (to be ordinal-encoded), every other column as numerical
# (to be scaled).
numerical_cols = X.select_dtypes(exclude=['object']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

In [14]:
# Custom ranking for each ordinal variable. The position of a value in its
# list is the integer code it receives when the list is handed to
# OrdinalEncoder (first entry -> 0, second -> 1, ...).

cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'D',
    'E',
    'F',
    'G',
    'H',
    'I',
    'J',
]
clarity_categories = [
    'I1',
    'SI2',
    'SI1',
    'VS2',
    'VS1',
    'VVS2',
    'VVS1',
    'IF',
]

In [16]:
from sklearn.impute import SimpleImputer #Handling missing values
from sklearn.preprocessing import StandardScaler #Handling feature scaling
from sklearn.preprocessing import OrdinalEncoder #Ordinal encoding - ranking the categorical features

##pipeline - to combine multiple steps
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer #To combine pipelines


In [18]:
## Numerical pipeline: fill missing values with the column median,
## then pass the result through StandardScaler.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
])

## Categorical pipeline: fill missing values with the most frequent value,
## ordinal-encode with the custom rankings, then pass through StandardScaler.
#
# OrdinalEncoder pairs categories[i] with the i-th input column purely by
# position. The rankings are therefore looked up by column name, in the order
# of `categorical_cols`, so the pairing no longer depends on the data file
# listing the columns as cut, color, clarity. A categorical column without a
# ranking fails immediately with KeyError here instead of at fit time.
ordinal_rankings = {
    'cut': cut_categories,
    'color': color_categories,
    'clarity': clarity_categories,
}

cat_pipeline = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('ordinalencoder', OrdinalEncoder(
        categories=[ordinal_rankings[col] for col in categorical_cols]
    )),
    ('scaler', StandardScaler())
    ]
)

# Combine both pipelines: each one is applied only to its own subset of
# columns (numerical vs. categorical).
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)