In [1]:
import pandas as pd

In [2]:
## Data ingestion step
# Read the gemstone CSV into a DataFrame and preview its first ten rows.
gemstone_csv_path = "/content/gemstone.csv"
df = pd.read_csv(gemstone_csv_path)
df.head(n=10)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
5,5,1.51,Very Good,J,SI1,62.8,58.0,7.34,7.29,4.59,7506
6,6,0.74,Ideal,E,VS2,61.8,57.0,5.76,5.79,3.57,3229
7,7,1.34,Premium,G,SI2,62.5,57.0,7.0,7.05,4.38,6224
8,8,0.3,Ideal,F,IF,62.0,56.0,4.35,4.37,2.7,886
9,9,0.3,Good,J,VS1,63.6,57.0,4.26,4.28,2.72,421


In [3]:
## Dropping the id column
# `errors="ignore"` keeps this cell idempotent: `df` is rebound to the result,
# so on a second run the `id` column is already gone and a plain drop would
# raise KeyError.
df = df.drop(columns=["id"], errors="ignore")
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
## Independent and Dependent column

# Y is the target column (`price`); X is every other column of `df`.
Y = df["price"]
X = df.drop(columns=["price"])

In [5]:
# Split the feature columns by dtype: object-typed columns go to the
# categorical pipeline, every other column goes to the numerical pipeline.
categorical_columns = X.select_dtypes(include=["object"]).columns
numerical_columns = X.select_dtypes(exclude=["object"]).columns

In [6]:
# Customized rankings: each list is handed to OrdinalEncoder as the category
# order for one column, so a label's position in its list is its encoded value.
# NOTE(review): the lists are used exactly as written; confirm that the
# direction of each ranking (e.g. color running D -> J) is the intended one.
cut_rank = [
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
color_rank = [
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
]
clarity_rank = [
    "I1",
    "SI2",
    "SI1",
    "VS2",
    "VS1",
    "VVS2",
    "VVS1",
    "IF",
]


In [7]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder # Handling categorical data
from sklearn.impute import SimpleImputer # handling missing values
from sklearn.pipeline import Pipeline # Pipelines
from sklearn.compose import ColumnTransformer



In [8]:
# Numerical pipeline

# Missing values are replaced by the column median, then every column is
# standardized.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scalar", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)


# Categorical pipeline

# OrdinalEncoder matches `categories[i]` to the i-th input column by position.
# Look each ranking up by column name so the pairing follows whatever order
# `categorical_columns` happens to have, instead of silently assuming
# cut, color, clarity. An unexpected categorical column raises KeyError here.
rank_by_column = {
    "cut": cut_rank,
    "color": color_rank,
    "clarity": clarity_rank,
}
ordinal_categories = [rank_by_column[column] for column in categorical_columns]

# Impute with the most frequent label, encode labels as ranked integers,
# then standardize the encoded values.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(categories=ordinal_categories)),
        ('scalar', StandardScaler()),
    ]
)

# Route each column group through its own pipeline. The transformer names
# below become the prefixes of the output feature names.
column_pipelines = [
    ("num_pipeline", num_pipeline, numerical_columns),
    ("cat_pipeline", cat_pipeline, categorical_columns),
]
proccessor = ColumnTransformer(transformers=column_pipelines)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
split_parts = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=30,
)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split.
train_matrix = proccessor.fit_transform(X_train)
test_matrix = proccessor.transform(X_test)

# Wrap the transformed arrays back into DataFrames labelled with the
# transformer's output feature names.
# NOTE(review): the new frames get a fresh default index, so their index labels
# presumably no longer match y_train / y_test. X_train / X_test are also
# overwritten, so re-run the split cell before running this cell again.
feature_names = proccessor.get_feature_names_out()
X_train = pd.DataFrame(train_matrix, columns=feature_names)
X_test = pd.DataFrame(test_matrix, columns=feature_names)


In [11]:
# Preview the first five rows of the transformed training features.
X_train.head(n=5)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.975439,-0.849607,-0.121531,-1.042757,-1.08097,-1.12315,0.874076,1.528722,1.352731
1,0.235195,1.833637,-0.121531,0.318447,0.279859,0.485354,-2.144558,-0.935071,-0.646786
2,0.494617,0.815855,0.3998,0.570855,0.606458,0.673737,-0.132136,0.296826,0.686225
3,-1.018676,0.260701,0.921131,-1.214034,-1.24427,-1.195605,-0.132136,0.296826,0.01972
4,-0.953821,-0.664555,-0.642862,-1.069801,-1.044681,-1.094168,0.874076,2.14467,1.352731


In [12]:
# Model Training
from sklearn.linear_model import Ridge,LinearRegression,Lasso,ElasticNet
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.tree import DecisionTreeRegressor




In [16]:
import numpy as np
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted values, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, mse, r2_square, rmse)`` in that order (RMSE comes last).
    """
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    r2_square = r2_score(true, predicted)
    # RMSE is the square root of the MSE computed above; reuse it rather than
    # making a second pass over the data.
    rmse = np.sqrt(mse)

    return mae, mse, r2_square, rmse

In [17]:
## Train multiple models
## Model Evaluation

# Candidate regressors, keyed by the name printed in the report below.
# DecisionTreeRegressor gets a fixed random_state so repeated runs of this
# cell build the same tree.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=30),
    "ElasticNet": ElasticNet()
}

train_model_list = []
model_list = []
r2_list = []

# Iterate over name/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Make Prediction on the held-out test split
    y_pred = model.predict(X_test)

    mae, mse, r2_square, rmse = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # The metrics below are computed on X_test / y_test, so label them as
    # test performance.
    print("Model Test Performance")
    print("mae", mae)
    print("mse", mse)
    print("rmse", rmse)
    print("R2 Score", r2_square)

    r2_list.append(r2_square)

    print("="*35)

    print("\n")












LinearRegression
Model Training Performance
mae 674.025511579685
mse 1028002.7598132554
rmse 1013.9047094344002
R2 Score 0.9368908248567511


Lasso
Model Training Performance
mae 675.0716923362156
mse 1027949.4559693959
rmse 1013.8784226767013
R2 Score 0.9368940971841704


Ridge
Model Training Performance
mae 674.0555800798531
mse 1028005.2293677182
rmse 1013.9059272771406
R2 Score 0.9368906732505968


DecisionTreeRegressor
Model Training Performance
mae 422.8315682830509
mse 694006.2652459209
rmse 833.0703843289118
R2 Score 0.9573948975079861


ElasticNet
Model Training Performance
mae 1060.7368759154729
mse 2351365.382289641
rmse 1533.4162456064046
R2 Score 0.8556494831165182


