In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Model training

In [2]:
df = pd.read_csv('notebook/gemstone.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
df=df.drop(labels=['id'],axis=1)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
# since we know through EDA that x,y, z and carat are highly correlated, therefore, we can drop x,y and z and keep only carat feature to get better score
df.drop(['x', 'y', 'z'], axis=1, inplace=True)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,1.52,Premium,F,VS2,62.2,58.0,13619
1,2.03,Very Good,J,SI2,62.0,58.0,13387
2,0.7,Ideal,G,VS1,61.2,57.0,2772
3,0.32,Ideal,G,VS1,61.6,56.0,666
4,1.7,Premium,G,VS2,62.6,59.0,14453


In [5]:
## separating independent and dependent features
X = df.drop(labels=['price'],axis=1)
Y = df[['price']]

In [6]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table
0,1.52,Premium,F,VS2,62.2,58.0
1,2.03,Very Good,J,SI2,62.0,58.0
2,0.70,Ideal,G,VS1,61.2,57.0
3,0.32,Ideal,G,VS1,61.6,56.0
4,1.70,Premium,G,VS2,62.6,59.0
...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0
193569,0.70,Premium,G,VVS2,60.3,58.0
193570,0.73,Very Good,F,SI1,63.1,57.0
193571,0.34,Very Good,D,SI1,62.9,55.0


In [7]:
Y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [8]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_features = X.select_dtypes(include='object').columns
numerical_features = X.select_dtypes(exclude='object').columns

print(categorical_features)
print(numerical_features)

Index(['cut', 'color', 'clarity'], dtype='object')
Index(['carat', 'depth', 'table'], dtype='object')


In [9]:
print("CUT", df['cut'].unique())
print("COLOR", df['color'].unique())
print("CLARITY", df['clarity'].unique())

CUT ['Premium' 'Very Good' 'Ideal' 'Good' 'Fair']
COLOR ['F' 'J' 'G' 'E' 'D' 'H' 'I']
CLARITY ['VS2' 'SI2' 'VS1' 'SI1' 'IF' 'VVS2' 'VVS1' 'I1']


In [10]:
# https://www.americangemsociety.org/ags-diamond-grading-system/
# Define the custom ranking for each ordinal variable

cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [11]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split

#pipeline 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [12]:
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal_encoder', OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories])),
        ('scaler', StandardScaler())
    ]
)

preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_features),
    ('cat_pipeline', cat_pipeline, categorical_features)
])

preprocessor

In [13]:
# TRAIN TEST Separate
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.3, random_state=30)
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table
168192,0.34,Ideal,I,VVS2,60.9,57.0
35202,0.9,Good,E,SI1,63.8,57.0
41091,1.02,Premium,G,VS1,62.7,58.0
31239,0.32,Premium,G,VS2,62.1,59.0
45722,0.35,Ideal,J,VVS2,61.1,56.0


In [14]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(135501, 6)
(58072, 6)
(135501, 1)
(58072, 1)


In [15]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.975439,-0.849607,-0.121531,0.874076,1.528722,1.352731
1,0.235195,1.833637,-0.121531,-2.144558,-0.935071,-0.646786
2,0.494617,0.815855,0.3998,-0.132136,0.296826,0.686225
3,-1.018676,0.260701,0.921131,-0.132136,0.296826,0.01972
4,-0.953821,-0.664555,-0.642862,0.874076,2.14467,1.352731


In [16]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor



In [17]:
def evaluate_model(true, pred):
    r2score = r2_score(true, pred)
    mae = mean_absolute_error(true, pred)
    mse = mean_squared_error(true, pred)
    rmse = np.sqrt(mean_squared_error(true, pred))
    return mae, rmse, r2score, mse

In [18]:
models = {
    "LinearRegression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "K-Neighbors Regressor" : KNeighborsRegressor(),
    "Decision Tree" : DecisionTreeRegressor(),
    "Random Forest Regressor" : RandomForestRegressor(),
    "AdaBoost Regressor" : AdaBoostRegressor()
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    #make prediction
    y_test_pred = model.predict(X_test)

        # Evaluate Test dataset
    model_test_mae, model_test_rmse, model_test_r2, model_test_mse = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- Mean Square Error: {:.4f}".format(model_test_mse))
    print("- R2 Score: {:.4f}".format(model_test_r2))

    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

LinearRegression
Model performance for Test set
- Root Mean Squared Error: 1099.6944
- Mean Absolute Error: 806.3805
- Mean Square Error: 1209327.7389
- R2 Score: 0.9258


Lasso
Model performance for Test set
- Root Mean Squared Error: 1099.7071
- Mean Absolute Error: 806.0476
- Mean Square Error: 1209355.6116
- R2 Score: 0.9258


Ridge
Model performance for Test set
- Root Mean Squared Error: 1099.6946
- Mean Absolute Error: 806.3752
- Mean Square Error: 1209328.1502
- R2 Score: 0.9258


K-Neighbors Regressor
Model performance for Test set
- Root Mean Squared Error: 724.0744
- Mean Absolute Error: 395.8888
- Mean Square Error: 524283.7977
- R2 Score: 0.9678


Decision Tree
Model performance for Test set
- Root Mean Squared Error: 825.8561
- Mean Absolute Error: 417.3838
- Mean Square Error: 682038.3482
- R2 Score: 0.9581




  return fit_method(estimator, *args, **kwargs)


Random Forest Regressor
Model performance for Test set
- Root Mean Squared Error: 638.3996
- Mean Absolute Error: 329.8824
- Mean Square Error: 407554.0078
- R2 Score: 0.9750




  y = column_or_1d(y, warn=True)


AdaBoost Regressor
Model performance for Test set
- Root Mean Squared Error: 1260.3726
- Mean Absolute Error: 871.5201
- Mean Square Error: 1588539.1315
- R2 Score: 0.9025




In [19]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
5,Random Forest Regressor,0.97498
3,K-Neighbors Regressor,0.967814
4,Decision Tree,0.95813
0,LinearRegression,0.925759
2,Ridge,0.925759
1,Lasso,0.925758
6,AdaBoost Regressor,0.902479


### Best Model is Random Forest