In [1]:
import pandas as pd 

In [2]:

df = pd.read_csv('gemstone.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
#dropping id column as it is irrelevant to our analysis
df=df.drop(labels=['id'],axis=1)

In [4]:
## Seprating Independent and dependent features
x = df.drop(labels=['price'],axis=1)
y= df[['price']]

In [5]:
y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [6]:
categorical_cols =x .select_dtypes(include='object').columns
numerical_cols = x.select_dtypes(exclude='object').columns

In [7]:

# Defining the custom ranking for each ordinal variable
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [8]:
# for handling Missing Values
from sklearn.impute import SimpleImputer 
# for Feature Scaling
from sklearn.preprocessing import StandardScaler 
# for Ordinal Encoding
from sklearn.preprocessing import OrdinalEncoder 
## for creating pipelines
from sklearn.pipeline import Pipeline
#for combining pipelines
from sklearn.compose import ColumnTransformer


# Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])


In [9]:

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=42)

In [10]:
X_train.head()


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
83475,0.32,Premium,E,SI1,61.6,58.0,4.38,4.41,2.71
160324,1.2,Premium,F,VS2,62.6,57.0,6.81,6.76,4.25
101740,1.5,Ideal,I,VS2,62.2,55.0,7.3,7.26,4.53
180341,1.67,Premium,I,SI2,61.9,59.0,7.65,7.61,4.71
48480,1.0,Good,H,VS2,63.7,60.0,6.34,6.3,4.02


In [11]:

import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# ✅ Step 1: Load your dataset
df = pd.read_csv("gemstone.csv")  # Use the correct path to your CSV

# ✅ Step 2: Split target and features
X = df.drop("price", axis=1)
y = df["price"]

# ✅ Step 3: Encode categorical columns
X = pd.get_dummies(X, drop_first=True)  # This converts strings like 'Good' into numbers

# ✅ Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Step 5: Evaluation function
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, predicted)
    return mae, rmse, r2

# ✅ Step 6: Train models
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}

model_list = []
r2_list = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae, rmse, r2 = evaluate_model(y_test, y_pred)

    print(f"{name}")
    print("Model Training Performance")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R2 Score: {r2:.2%}")
    print("="*35)

    model_list.append(name)
    r2_list.append(r2)



LinearRegression
Model Training Performance
MAE: 622.95
RMSE: 944.66
R2 Score: 94.48%


  model = cd_fast.enet_coordinate_descent(


Lasso
Model Training Performance
MAE: 622.43
RMSE: 949.72
R2 Score: 94.42%
Ridge
Model Training Performance
MAE: 622.95
RMSE: 944.66
R2 Score: 94.48%
ElasticNet
Model Training Performance
MAE: 1203.36
RMSE: 1739.01
R2 Score: 81.29%


In [13]:
#feature extraction

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# --- Load your dataset ---
# df = pd.read_csv("your_file.csv")
# X = df.drop(columns=["target_column"], axis=1)
# y = df["target_column"]

# Replace these with your actual data
# X, y = ...

# --- Feature Scaling ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- Feature Extraction (Polynomial Features) ---
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_scaled)

# --- Train-test Split ---
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# --- Model Evaluation Function ---
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, predicted)
    return mae, rmse, r2

# --- Models to Train ---
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(max_iter=50),         # increased max_iter for convergence
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(max_iter=50)
}

# --- Training and Evaluation ---
model_list = []
r2_list = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae, rmse, r2 = evaluate_model(y_test, y_pred)

    print(f"🔹 {name}")
    print("MAE:", round(mae, 2))
    print("RMSE:", round(rmse, 2))
    print("R2 Score:", round(r2 * 100, 2), "%")
    print("=" * 40)

    model_list.append(name)
    r2_list.append(r2)


🔹 LinearRegression
MAE: 377.86
RMSE: 672.06
R2 Score: 97.21 %


  model = cd_fast.enet_coordinate_descent(


🔹 Lasso
MAE: 383.98
RMSE: 682.29
R2 Score: 97.12 %
🔹 Ridge
MAE: 377.89
RMSE: 672.07
R2 Score: 97.2 %
🔹 ElasticNet
MAE: 547.05
RMSE: 876.64
R2 Score: 95.24 %


  model = cd_fast.enet_coordinate_descent(
