### Model Training

In [1]:
import pandas as pd
import numpy as np


In [2]:
# Read the gemstone CSV from the local data folder, then preview the first rows.
df = pd.read_csv(filepath_or_buffer="./data/gemstone.csv")
df.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
## Drop the id column (presumably just a row identifier -- confirm against the data source)
# errors='ignore' makes this cell safe to re-run: once `id` is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns=['id'], errors='ignore')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
# Separate the target from the predictors.
# The double brackets keep the target as a one-column DataFrame, not a Series.
y = df.loc[:, ['price']]
x = df.drop(columns=['price'])

In [5]:
# Group the feature names by dtype: object columns are ordinal-encoded,
# all remaining columns are handled as numeric.
numerical_cols = x.select_dtypes(exclude=['object']).columns
categorical_cols = x.select_dtypes(include=['object']).columns

In [6]:
# Show which columns were classified as numeric.
numerical_cols

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [7]:
# Explicit label order for each ordinal feature.
# These lists are handed to OrdinalEncoder, which encodes a label as its
# position in the list (first entry -> 0).
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = list('DEFGHIJ')
clarity_categories = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]

In [8]:
from sklearn.impute import SimpleImputer ## Handling Missing values
from sklearn.preprocessing import StandardScaler ## Handling Scaling
from sklearn.preprocessing import OrdinalEncoder ## Ordinal Encoding

#Pipline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [9]:
## Numerical pipeline: fill missing values with the median, then standardize
num_pipline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

## Categorical pipeline: fill missing values with the most frequent label,
## turn labels into ordered integer codes, then standardize those codes
cat_pipline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'oridinalencoder',
        # One category list per categorical column; this assumes
        # categorical_cols is ordered cut, color, clarity.
        OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories]),
    ),
    ('scaler', StandardScaler()),
])

## Send each column group through its own pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipline', num_pipline, numerical_cols),
        ('cat_pipline', cat_pipline, categorical_cols),
    ]
)

In [10]:
## Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Fit the preprocessor on the training split only and rebuild a labelled DataFrame.
# index= carries over the original row labels; without it X_train would get a
# fresh 0..n-1 index and no longer line up with y_train in pandas operations.
# NOTE: this overwrites X_train, so re-run the split cell before re-running this one.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index,
)

In [12]:
# Apply the already-fitted preprocessor to the test split (transform only, no refit).
# index= carries over the original row labels so X_test stays aligned with y_test.
# NOTE: this overwrites X_test, so re-run the split cell before re-running this one.
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index,
)

In [13]:
## Model traning 
from sklearn.linear_model import LinearRegression, Lasso,Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [14]:
# Baseline model: linear regression on the preprocessed training data.
# fit() returns the estimator itself, so the fitted model can be bound directly.
regrassion = LinearRegression().fit(X_train, y_train)
regrassion

In [15]:
# Learned coefficients of the fitted linear model.
regrassion.coef_

array([[ 6432.97591819,  -132.34206204,   -70.48787525, -1701.38593925,
         -494.17005097,   -76.32351645,    68.80035873,  -464.67990411,
          652.10059539]])

In [16]:
# Learned intercept (bias term) of the fitted linear model.
regrassion.intercept_

array([3976.8787389])

In [17]:
def evalute_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted target values, one per entry in ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, mse, r2_square)``. Note the order: RMSE comes
        before MSE.
    """
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    # Derive RMSE from the MSE already computed instead of scoring twice.
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, mse, r2_square

In [18]:
## Train multiple models and compare them on the held-out test split
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Elasticnet': ElasticNet(),
}

# Parallel lists: model_list[i] is the name of the model scored in r2_list[i].
model_list = []
r2_list = []

for model_name, model in models.items():
    # Fit on the training split only.
    model.fit(X_train, y_train)

    # Predict on the test split, which the model has not seen.
    y_pred = model.predict(X_test)

    # evalute_model returns (mae, rmse, mse, r2) in that order.
    mae, rmse, mse, r2_square = evalute_model(y_test, y_pred)

    print(model_name)
    # The metrics below come from X_test / y_test, so they are labelled as
    # test performance (the earlier "Training" label was misleading).
    print('Model Test Performance:')
    print("MAE:", mae)
    print("RMSE:", rmse)
    print("MSE:", mse)
    print("R2 Score:", r2_square*100)
    print('=' * 35)

    # Keep the score so the best model can be picked after the loop.
    r2_list.append(r2_square)
    model_list.append(model_name)

print('=' * 35)
if r2_list:
    # Highest test-set R2 wins; argmax returns the first index on ties.
    best_model_index = np.argmax(r2_list)
    best_model_name = model_list[best_model_index]
    best_r2 = r2_list[best_model_index]
    print(f"Best Model by R2 Score: {best_model_name} (R2: {best_r2:.4f})")
else:
    print("No models were trained or evaluated.")

print('=' * 35)

LinearRegression
Model Training Performance:
MAE: 675.075827006748
RMSE: 1014.6296630375463
MSE: 1029473.3531156846
R2 Score: 93.62906819996049
Lasso
Model Training Performance:
MAE: 676.2421173665508
RMSE: 1014.6591302750638
MSE: 1029533.150650549
R2 Score: 93.62869814082755
Ridge
Model Training Performance:
MAE: 675.1077629781383
RMSE: 1014.6343233534448
MSE: 1029482.8101269027
R2 Score: 93.62900967491628
Elasticnet
Model Training Performance:
MAE: 1060.9432977143008
RMSE: 1533.3541245902313
MSE: 2351174.871397875
R2 Score: 85.44967219374031
Best Model by R2 Score: LinearRegression (R2: 0.9363)
