In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

In [2]:
# Read the gemstone CSV from the local data folder and preview its first rows.
gemstone_csv_path = './data/gemstone.csv'
df = pd.read_csv(gemstone_csv_path)
df.head(5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
# Split the frame by position: every column except the last is a predictor,
# and the last column is the target.
#
# `id` is removed with a non-mutating drop. Calling drop(inplace=True) on a
# frame that was sliced out of `df` can raise SettingWithCopyWarning and makes
# the result depend on whether the cell was already run; chaining the drop
# builds `X` in one step from `df` and leaves `df` untouched.
X = df.iloc[:, :-1].drop(columns=['id'])
y = df.iloc[:, -1]

In [4]:
# Partition the predictor columns by dtype: object-typed columns are handled
# as categorical, every other column as numerical.
is_object_dtype = X.dtypes == 'object'
num_columns = X.columns[~is_object_dtype]
cat_columns = X.columns[is_object_dtype]

In [5]:
# Custom rankings for the ordinal variables. The position of a label in its
# list is the rank used when the list is handed to the ordinal encoder
# (first entry -> 0, second -> 1, ...).
# NOTE(review): cut and clarity look like they run worst-to-best while color
# runs alphabetically D..J -- confirm the direction of each ranking is intended.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'D', 'E', 'F', 'G', 'H', 'I', 'J',
]
clarity_categories = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]

In [6]:
# Numerical pipeline: fill missing values with the column median, then
# standardise each column.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# OrdinalEncoder pairs `categories[i]` with the i-th column it receives, so a
# hard-coded [cut, color, clarity] list only works while the object columns of
# the input frame appear in exactly that order. Looking the rankings up by
# column name keeps the two aligned whatever the column order is, and an
# unexpected object column fails right here with a KeyError naming it.
category_orders = {
    'cut': cut_categories,
    'color': color_categories,
    'clarity': clarity_categories,
}

# Categorical pipeline: fill missing values with the most frequent label,
# map labels to their ordinal rank, then standardise the resulting codes.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (
            'ordinalencoder',
            OrdinalEncoder(
                categories=[category_orders[col] for col in cat_columns]
            ),
        ),
        ('scaler', StandardScaler()),
    ]
)

# Route each group of columns through its own pipeline; the step names become
# the prefixes of the output feature names.
preprocess = ColumnTransformer([
    ('num_pipeline', num_pipeline, num_columns),
    ('cat_pipeline', cat_pipeline, cat_columns),
])

In [7]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=10,
)

In [8]:
# Fit the preprocessor on the training rows only, then apply the fitted
# transformation to the test rows.
train_features = preprocess.fit_transform(X_train)
test_features = preprocess.transform(X_test)

# Feature names are only available once the transformer has been fitted, and
# they are the same for both frames, so fetch them once.
feature_names = preprocess.get_feature_names_out()

# Keep the original row labels. Without `index=` the new frames get a fresh
# 0..n-1 index while y_train / y_test keep the shuffled labels from the split,
# so any later pandas operation that aligns features with targets (concat,
# arithmetic, boolean masks) would pair up the wrong rows.
X_train = pd.DataFrame(train_features, columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(test_features, columns=feature_names, index=X_test.index)

In [9]:
# Preview the first five rows of the transformed training features.
X_train.head(5)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.780167,0.719352,-0.637086,-0.843944,-0.808252,-0.760667,-1.139986,-1.550963,0.682694
1,1.793874,-1.307964,0.403474,1.681406,1.751577,1.530872,-0.134981,0.297649,0.017348
2,-0.758536,-1.584416,0.403474,-0.690619,-0.762865,-0.833184,-0.134981,-1.550963,0.682694
3,0.063426,-0.478607,-0.637086,0.301483,0.281037,0.254572,0.870024,0.297649,-1.313344
4,-0.563861,0.627201,-0.637086,-0.483179,-0.508697,-0.441592,0.870024,0.297649,2.013386


In [11]:
# Candidate regressors keyed by display name; each is created with its
# default hyper-parameters. Insertion order is the order they are fitted in.
models = {
    label: estimator_cls()
    for label, estimator_cls in (
        ("Linear Regression", LinearRegression),
        ("Lasso Regression", Lasso),
        ("Ridge Regression", Ridge),
        ("Elastic Net", ElasticNet),
    )
}

In [13]:
def evaluate_model(true, predicted):
    """Score a set of regression predictions against the true targets.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions for the same samples.

    Returns
    -------
    tuple
        ``(mse, mae, rmse, r2)`` -- note that MAE comes *before* RMSE, so
        callers unpacking the result must use this exact order.
    """
    squared_error = mean_squared_error(true, predicted)
    return (
        squared_error,
        mean_absolute_error(true, predicted),
        np.sqrt(squared_error),  # RMSE is derived from the MSE computed above
        r2_score(true, predicted),
    )

In [25]:
# Fit every candidate model on the training set and record its test-set metrics.
# The labels are listed in the same order evaluate_model's values are unpacked.
metric_labels = (
    'Mean Squared Error',
    'Mean Absolute Error',
    'Root Mean Squared Error',
    'R2 Score',
)

evaluation_dict = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predict on the held-out rows
    y_predict = model.predict(X_test)

    # Compute the metrics and store them under the model's display name
    mse, mae, rmse, r2 = evaluate_model(y_test, y_predict)
    evaluation_dict[model_name] = dict(zip(metric_labels, (mse, mae, rmse, r2)))

# Print the metrics of every model, one block per model followed by a
# separator rule and a blank line.
for model_type, scores in evaluation_dict.items():
    print(model_type)
    for score, value in scores.items():
        print(f"{score} = ", value)
    print('=' * 50)
    print()


Linear Regression
Mean Squared Error =  1029839.1822413101
Mean Absolute Error =  675.2796042191958
Root Mean Squared Error =  1014.8099241933487
R2 Score =  0.937049390893374

Lasso Regression
Mean Squared Error =  1030113.3268792456
Mean Absolute Error =  676.3733606162151
Root Mean Squared Error =  1014.94498711962
R2 Score =  0.9370326333527415

Ridge Regression
Mean Squared Error =  1029810.1049045292
Mean Absolute Error =  675.3022121068825
Root Mean Squared Error =  1014.7955975981218
R2 Score =  0.937051168293276

Elastic Net
Mean Squared Error =  2367021.380551036
Mean Absolute Error =  1064.3039298403096
Root Mean Squared Error =  1538.5127170586002
R2 Score =  0.8553119358405031



In [26]:
# Show the raw nested dict of results: model name -> {metric name -> value}.
evaluation_dict

{'Linear Regression': {'Mean Squared Error': 1029839.1822413101,
  'Mean Absolute Error': 675.2796042191958,
  'Root Mean Squared Error': 1014.8099241933487,
  'R2 Score': 0.937049390893374},
 'Lasso Regression': {'Mean Squared Error': 1030113.3268792456,
  'Mean Absolute Error': 676.3733606162151,
  'Root Mean Squared Error': 1014.94498711962,
  'R2 Score': 0.9370326333527415},
 'Ridge Regression': {'Mean Squared Error': 1029810.1049045292,
  'Mean Absolute Error': 675.3022121068825,
  'Root Mean Squared Error': 1014.7955975981218,
  'R2 Score': 0.937051168293276},
 'Elastic Net': {'Mean Squared Error': 2367021.380551036,
  'Mean Absolute Error': 1064.3039298403096,
  'Root Mean Squared Error': 1538.5127170586002,
  'R2 Score': 0.8553119358405031}}