model training

In [36]:
import pandas as pd
import numpy as np

In [2]:
# Read the gemstone dataset into a DataFrame and display it.
# NOTE(review): this is an absolute Windows path, so the cell only runs on a
# machine with this exact directory layout; consider a project-relative path.
csv_path = r"C:\Users\amanm\Desktop\DIAT\Project\notebooks\dataset\gemstone.csv"
data = pd.read_csv(csv_path)
data

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
...,...,...,...,...,...,...,...,...,...,...,...
193568,193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67,1130
193569,193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47,2874
193570,193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62,3036
193571,193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81,681


In [3]:
# Remove the "id" column, then preview five randomly sampled rows.
data = data.drop(columns=["id"])
data.sample(5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
102206,1.28,Ideal,I,SI1,62.1,57.0,6.95,6.9,4.31,6165
33453,0.9,Very Good,D,SI2,61.9,63.0,6.1,6.22,3.81,3534
172981,1.01,Very Good,F,SI2,59.1,61.0,6.55,6.51,3.84,4423
56636,0.32,Premium,I,SI1,62.9,58.0,4.42,4.39,2.77,554
26883,0.73,Very Good,G,VS1,59.3,60.0,5.86,5.9,3.5,2916


In [4]:
# Remove the "x", "y" and "z" columns, then preview five randomly sampled rows.
data = data.drop(columns=["x", "y", "z"])
data.sample(5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price
122458,1.52,Very Good,F,SI1,63.0,62.0,12607
80117,0.33,Premium,I,VVS2,61.0,59.0,579
159917,0.74,Ideal,E,SI1,61.5,55.0,3053
167389,0.5,Ideal,G,VVS2,62.7,57.0,1920
185705,0.9,Premium,F,SI1,62.2,62.0,3537


In [11]:
## independent and dependent features
# X holds every column except the target; y holds only the target.
X = data.drop(columns=["price"])
y = data[["price"]]  # double brackets keep y as a one-column DataFrame

In [8]:
# Partition the feature names by dtype: object-typed columns are the ones to
# ordinal encode, every other column is one to scale.
num_cols = X.select_dtypes(exclude=["object"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

In [24]:
# Explicit category orderings for the three ordinal features; the position of
# a label in its list is the rank it is given.
# NOTE(review): cut and clarity look ordered from lowest to highest grade,
# while color simply runs D -> J -- confirm the intended direction for color.
cut_categories = [
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
color_categories = [
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
]
clarity_categories = [
    "I1",
    "SI2",
    "SI1",
    "VS2",
    "VS1",
    "VVS2",
    "VVS1",
    "IF",
]

In [25]:
# SimpleImputer: fills in missing values
# StandardScaler: standardizes features (feature scaling)
# OrdinalEncoder: encodes ordered categorical features as integers
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

# pipeline (combining multiple steps which are shown above handling missing data, scaling, encoding etc.)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer # combines both numerical pipeline and categorical pipeline

In [26]:
# Category order for each ordinal feature, keyed by column name.
# OrdinalEncoder matches `categories[i]` to the i-th input column, so the list
# handed to it is built in the order of `cat_cols` instead of being hard-coded
# positionally. A categorical column with no entry here raises a KeyError.
ordinal_categories = {
    "cut": cut_categories,
    "color": color_categories,
    "clarity": clarity_categories,
}

# numerical pipeline: median imputation followed by standardization
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaling", StandardScaler()),
    ]
)

# categorical pipeline: most-frequent imputation, ordinal encoding with the
# explicit category order, then standardization of the encoded values
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "ordinalencoder",
            OrdinalEncoder(
                categories=[ordinal_categories[col] for col in cat_cols]
            ),
        ),
        ("scaling", StandardScaler()),
    ]
)

# Route each column subset through its own pipeline; the transformer names
# ("num_pipeline", "cat_pipeline") become prefixes of the output feature names.
preprocessor = ColumnTransformer(
    [
        ("num_pipeline", num_pipeline, num_cols),
        ("cat_pipeline", cat_pipeline, cat_cols),
    ]
)

In [27]:
# train - test - split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=16,
)

In [30]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split.
# The original row index is passed through so the transformed frames stay
# aligned (by label) with y_train / y_test, which keep the shuffled index
# produced by train_test_split.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index,
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index,
)

In [32]:
## training of model (different Regression Models)
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [43]:
def evaluate(ground_truth, predicted):
    """Score predictions against the true values.

    Args:
        ground_truth: the true target values.
        predicted: the model's predictions for the same samples.

    Returns:
        A tuple ``(mae, mse, rmse, r2)``: mean absolute error, mean squared
        error, the square root of the mean squared error, and the R2 score.
    """
    squared_error = mean_squared_error(ground_truth, predicted)
    return (
        mean_absolute_error(ground_truth, predicted),
        squared_error,
        np.sqrt(squared_error),
        r2_score(ground_truth, predicted),
    )

In [52]:
## training multiple models

# Candidate regressors, all with their default hyperparameters.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "ElasticNet": ElasticNet(),
}
model_list = []  # model names, in the order they were fitted
r2_list = []     # matching R2 scores (as fractions, not percentages)

# Iterate over (name, estimator) pairs directly instead of indexing into
# freshly rebuilt key/value lists on every access.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # NOTE: the scores are computed on the held-out test split (X_test /
    # y_test), even though the printed label below says "Training".
    predictions = model.predict(X_test)
    mae, mse, rmse, r2 = evaluate(y_test, predictions)
    print(model_name)
    model_list.append(model_name)

    print("Model Training Performance")
    print("RMSE", rmse)
    print("MAE", mae)
    print("MSE", mse)
    print("R2_score", r2*100)  # shown as a percentage

    r2_list.append(r2)

    print('='*16)
    print("\n")



LinearRegression
Model Training Performance
RMSE 1105.0378225622246
MAE 810.1483729734686
MSE 1221108.5892930625
R2_score 92.42882011166262


Lasso
Model Training Performance
RMSE 1105.0434517045526
MAE 809.8233344923104
MSE 1221121.0301551118
R2_score 92.42874297519388


Ridge
Model Training Performance
RMSE 1105.0376715307182
MAE 810.1428492745513
MSE 1221108.2555020314
R2_score 92.42882218125084


ElasticNet
Model Training Performance
RMSE 1830.849761784707
MAE 1241.2521367642596
MSE 3352010.8502271185
R2_score 79.2166909992659




In [48]:
# Show the model names collected by the training loop, in the order they were fitted.
model_list

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']