In [1]:
import pandas as pd
import numpy
import matplotlib.pyplot as plt
import seaborn as sns
import pyodbc
import sqlalchemy as sq
from sqlalchemy import create_engine
from sqlalchemy.engine import URL


In [38]:
data = pd.read_csv(r'D:\Datascience\fullstackdsproject\notebooks\data\playground-series-s3e8\test.csv') #openpyxl for .xlsx files
data

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,193573,0.35,Ideal,D,VS2,62.3,56.0,4.51,4.54,2.82
1,193574,0.77,Very Good,F,SI2,62.8,56.0,5.83,5.87,3.68
2,193575,0.71,Ideal,I,VS2,61.9,53.0,5.77,5.74,3.55
3,193576,0.33,Ideal,G,VVS2,61.6,55.0,4.44,4.42,2.73
4,193577,1.20,Very Good,I,VS2,62.7,56.0,6.75,6.79,4.24
...,...,...,...,...,...,...,...,...,...,...
129045,322618,0.72,Ideal,D,VVS2,62.0,56.0,5.75,5.78,3.57
129046,322619,0.70,Premium,D,SI1,59.6,62.0,5.77,5.74,3.43
129047,322620,1.01,Premium,G,VVS2,62.3,58.0,6.44,6.41,4.01
129048,322621,1.35,Ideal,D,I1,62.0,56.0,7.05,7.08,4.38


we have got to know what are the X - independent n Y - dependent feature 

In [39]:
# X=data.drop(labels=["price"],axis=1)
X = data.iloc[:,:-1]
X

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y
0,193573,0.35,Ideal,D,VS2,62.3,56.0,4.51,4.54
1,193574,0.77,Very Good,F,SI2,62.8,56.0,5.83,5.87
2,193575,0.71,Ideal,I,VS2,61.9,53.0,5.77,5.74
3,193576,0.33,Ideal,G,VVS2,61.6,55.0,4.44,4.42
4,193577,1.20,Very Good,I,VS2,62.7,56.0,6.75,6.79
...,...,...,...,...,...,...,...,...,...
129045,322618,0.72,Ideal,D,VVS2,62.0,56.0,5.75,5.78
129046,322619,0.70,Premium,D,SI1,59.6,62.0,5.77,5.74
129047,322620,1.01,Premium,G,VVS2,62.3,58.0,6.44,6.41
129048,322621,1.35,Ideal,D,I1,62.0,56.0,7.05,7.08


In [40]:
# y=data[["price"]]
y = data.iloc[:,-1:]
y

Unnamed: 0,z
0,2.82
1,3.68
2,3.55
3,2.73
4,4.24
...,...
129045,3.57
129046,3.43
129047,4.01
129048,4.38


In [41]:
#drop the id column in the X 
X.drop('id', inplace=True, axis=1)
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y
0,0.35,Ideal,D,VS2,62.3,56.0,4.51,4.54
1,0.77,Very Good,F,SI2,62.8,56.0,5.83,5.87
2,0.71,Ideal,I,VS2,61.9,53.0,5.77,5.74
3,0.33,Ideal,G,VVS2,61.6,55.0,4.44,4.42
4,1.20,Very Good,I,VS2,62.7,56.0,6.75,6.79
...,...,...,...,...,...,...,...,...
129045,0.72,Ideal,D,VVS2,62.0,56.0,5.75,5.78
129046,0.70,Premium,D,SI1,59.6,62.0,5.77,5.74
129047,1.01,Premium,G,VVS2,62.3,58.0,6.44,6.41
129048,1.35,Ideal,D,I1,62.0,56.0,7.05,7.08


In independent feature we have categorical value we need to seggregate and do the encoding for the categorical

In [42]:
X.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
dtype: object

In [59]:
categorical_columns = X.select_dtypes(include=object).columns
categorical_columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [60]:
numerical_columns = X.select_dtypes(exclude=object).columns
numerical_columns

Index(['carat', 'depth', 'table', 'x', 'y'], dtype='object')

Setting up the pipeline here for the automatic scaling n imputer(handlling missing value)

In [61]:
# Define the custom ranking for each ordinal variable
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [62]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [63]:
num_pipeline = Pipeline(
    steps =[
        ('imputer', SimpleImputer()), #handle the missing value with the mean n median(outliers)
        ('scaler', StandardScaler()) 
        ]#mean of 0 and standard deviation of 1 
)

In [64]:
num_pipeline

In [65]:
cat_pipeline  = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')), #handle the missing value with the mode for category
        ('ordinalencoder', OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories]))
    ]
)

In [66]:
cat_pipeline

we need to pass the categorical_columns and numerical_columns in column Transformer

In [67]:
preprocessor = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, numerical_columns),
     ("cat_pipeline", cat_pipeline, categorical_columns)
     ]
)

In [68]:
preprocessor

train test split data - 70% train data, 30% test data

In [69]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test  = train_test_split(X, y, random_state=30, test_size=0.33)
X_train


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y
31750,0.30,Ideal,E,SI1,62.3,57.0,4.30,4.32
72077,2.14,Premium,H,SI1,62.2,59.0,8.24,8.21
80681,0.32,Ideal,F,VS2,61.1,57.0,4.40,4.41
101566,0.40,Ideal,G,VVS1,62.1,57.0,4.70,4.74
93737,1.24,Ideal,F,VS2,60.9,57.0,6.90,6.95
...,...,...,...,...,...,...,...,...
98804,1.03,Ideal,D,SI2,61.5,56.0,6.46,6.51
110381,0.41,Premium,D,SI1,61.9,61.0,4.76,4.73
48045,0.35,Ideal,G,VVS1,60.7,56.0,4.60,4.63
70053,0.55,Ideal,D,VS2,61.5,57.0,5.26,5.31


In [71]:
X_test

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y
88905,0.55,Ideal,H,VS1,61.8,55.0,5.25,5.33
49957,0.91,Very Good,D,VS1,62.5,56.0,6.14,6.18
125014,0.70,Ideal,F,SI2,61.5,57.0,5.71,5.74
70284,2.04,Premium,J,SI1,63.0,59.0,8.05,7.99
97923,1.01,Fair,F,SI1,66.7,58.0,6.23,6.18
...,...,...,...,...,...,...,...,...
82455,0.32,Very Good,E,SI2,63.3,59.0,4.31,4.34
78194,2.01,Good,I,VS1,62.8,56.0,8.07,8.00
118928,0.76,Premium,J,SI1,62.3,57.0,5.84,5.82
73743,0.60,Ideal,G,VS2,61.7,57.0,5.36,5.43


Calling the preprocessor here

In [72]:
preprocessor.fit_transform(X_train)

array([[-1.05630906,  0.44538276, -0.12512549, ...,  4.        ,
         1.        ,  2.        ],
       [ 2.91818977,  0.35288322,  0.91685483, ...,  3.        ,
         4.        ,  2.        ],
       [-1.01310798, -0.66461168, -0.12512549, ...,  4.        ,
         2.        ,  3.        ],
       ...,
       [-0.94830637, -1.03460983, -0.64611565, ...,  4.        ,
         3.        ,  6.        ],
       [-0.51629563, -0.29461353, -0.12512549, ...,  4.        ,
         0.        ,  3.        ],
       [ 0.90933982, -1.68210658, -0.12512549, ...,  4.        ,
         4.        ,  2.        ]])

In [73]:
preprocessor.transform(X_test)

array([[-0.51629563, -0.01711492, -1.16710581, ...,  4.        ,
         4.        ,  4.        ],
       [ 0.2613237 ,  0.63038183, -0.64611565, ...,  2.        ,
         0.        ,  4.        ],
       [-0.19228757, -0.29461353, -0.12512549, ...,  4.        ,
         2.        ,  1.        ],
       ...,
       [-0.06268435,  0.44538276, -0.12512549, ...,  3.        ,
         6.        ,  2.        ],
       [-0.40829295, -0.10961446, -0.12512549, ...,  4.        ,
         3.        ,  3.        ],
       [ 1.34135056, -1.49710751,  1.43784499, ...,  3.        ,
         4.        ,  1.        ]])

In [74]:
preprocessor.get_feature_names_out() #all features has been preprocessed and added with the pipeline name with the userdefined name

array(['num_pipeline__carat', 'num_pipeline__depth',
       'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
       'cat_pipeline__cut', 'cat_pipeline__color',
       'cat_pipeline__clarity'], dtype=object)

In [75]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_train



Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-1.056309,0.445383,-0.125125,-1.270131,-1.264297,4.0,1.0,2.0
1,2.918190,0.352883,0.916855,2.276839,2.260012,3.0,4.0,2.0
2,-1.013108,-0.664612,-0.125125,-1.180106,-1.182758,4.0,2.0,3.0
3,-0.840304,0.260384,-0.125125,-0.910033,-0.883780,4.0,3.0,6.0
4,0.974141,-0.849611,-0.125125,1.070509,1.118462,4.0,2.0,3.0
...,...,...,...,...,...,...,...,...
86458,0.520530,-0.294614,-0.646116,0.674401,0.719826,4.0,0.0,1.0
86459,-0.818703,0.075385,1.958835,-0.856018,-0.892840,3.0,0.0,2.0
86460,-0.948306,-1.034610,-0.646116,-1.000057,-0.983440,4.0,3.0,6.0
86461,-0.516296,-0.294614,-0.125125,-0.405895,-0.367365,4.0,0.0,3.0


In [76]:
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())
X_test

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.516296,-0.017115,-1.167106,-0.414897,-0.349245,4.0,4.0,4.0
1,0.261324,0.630382,-0.646116,0.386322,0.420848,2.0,0.0,4.0
2,-0.192288,-0.294614,-0.125125,-0.000784,0.022212,4.0,2.0,1.0
3,2.702184,1.092880,0.916855,2.105792,2.060694,3.0,6.0,2.0
4,0.477329,4.515362,0.395865,0.467344,0.420848,0.0,2.0,2.0
...,...,...,...,...,...,...,...,...
42582,-1.013108,1.370378,0.916855,-1.261129,-1.246177,2.0,1.0,1.0
42583,2.637383,0.907880,-0.646116,2.123797,2.069754,1.0,5.0,4.0
42584,-0.062684,0.445383,-0.125125,0.116248,0.094691,3.0,6.0,2.0
42585,-0.408293,-0.109614,-0.125125,-0.315870,-0.258646,4.0,3.0,3.0


Model Training -- with three models

In [25]:
"""Simple linear regression 
   Ridge Regression (L2 regularisation)
   Lassso Regression (L1 regularisation)
"""

'Simple linear regression \n   Ridge Regression (L2 regularisation)\n   Lassso Regression (L1 regularisation)\n'

Evaluating the model scores

In [26]:
import numpy as np

def evaluating_scores(true, predict):
    rmse = np.sqrt(mean_squared_error(true, predict))
    r2_square = r2_score(true, predict)
    mse = mean_squared_error(true, predict)
    mae = mean_absolute_error(true, predict)
    return mae, rmse, r2_square, mse


In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet

models ={
    "Simple linear regression":LinearRegression(),
   "Ridge Regression (L2 regularisation)": Ridge(),
   "Lassso Regression (L1 regularisation)": Lasso(),
   "ElasticNet": ElasticNet()
    }


In [28]:
X_train.shape

(86463, 8)

In [29]:
y_train.shape

(86463, 1)

In [30]:
models.keys()
   

dict_keys(['Simple linear regression', 'Ridge Regression (L2 regularisation)', 'Lassso Regression (L1 regularisation)', 'ElasticNet'])

In [31]:
models.values()

dict_values([LinearRegression(), Ridge(), Lasso(), ElasticNet()])

In [32]:
list(models)

['Simple linear regression',
 'Ridge Regression (L2 regularisation)',
 'Lassso Regression (L1 regularisation)',
 'ElasticNet']

In [33]:
model_list=[]
r2_list=[]


In [None]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    # print(model)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    print("training score", model.score(X_train, y_train))

    #validation part 
    mae, rmse, r2_square, mse = evaluating_scores(y_predict, y_test)
    print(model)
    model_list.append(model)
    
    print("Model Training Performance")
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("MSE:", mse)
    print("R2Score:", r2_square*100)
    r2_list.append(r2_square)
    print('='*35)

    




training score 0.9366002598295858
LinearRegression()
Model Training Performance
RMSE: 1012.0684598433756
MAE: 673.1192939264239
MSE: 1024282.5674097425
R2Score: 93.25596331107432
training score 0.9366002505827449
Ridge()
Model Training Performance
RMSE: 1012.0707353570211
MAE: 673.1524393157647
MSE: 1024287.1733661015
R2Score: 93.25563626775532
training score 0.9365839594510476
Lasso()
Model Training Performance
RMSE: 1012.0668588177022
MAE: 674.3126578340965
MSE: 1024279.3267171306
R2Score: 93.24838155547523
training score 0.85810795523477
ElasticNet()
Model Training Performance
RMSE: 1511.6076266944901
MAE: 1049.7070318613507
MSE: 2284957.6170809492
R2Score: 78.6907412889136


In [271]:
model_list, r2_list

([LinearRegression(), Ridge(), Lasso(), ElasticNet()],
 [0.9325596331107433,
  0.9325563626775533,
  0.9324838155547523,
  0.786907412889136])

saving the model and testing with the new data set

In [314]:
import joblib  
best_model = None
best_r2 = -float("inf") # Start with the lowest possible R² score
print(best_r2)

for i in range(len(list(models))):
    model = list(models.values())[i]
    print(model)

    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)

     # Debugging: Print all scores returned
    mae, rmse, r2_square, mse = evaluating_scores(y_test, y_predict)  
    print(f"✅ Model: {model}, R² Score: {r2_square*100}")

    # ✅ Ensure it picks the highest R² score
    if r2_square > best_r2:
        best_r2 = r2_square
        best_model = model


# ✅ Save the best model
if best_model:
    joblib.dump(best_model, f"{best_model}.pkl")
    print(f"\n✅ Best Model Saved: {best_model} with R² Score: {best_r2*100}")


    



-inf
LinearRegression()
✅ Model: LinearRegression(), R² Score: 93.69455849250947
Ridge()
✅ Model: Ridge(), R² Score: 93.69453013843089
Lasso()
✅ Model: Lasso(), R² Score: 93.69457844208038
ElasticNet()
✅ Model: ElasticNet(), R² Score: 85.93389455213153

✅ Best Model Saved: Lasso() with R² Score: 93.69457844208038


Load the model

In [79]:
import joblib

loaded_model = joblib.load("Lasso().pkl")
loaded_model

In [81]:
feature_names = loaded_model.feature_names_in_
print(feature_names)

['num_pipeline__carat' 'num_pipeline__depth' 'num_pipeline__table'
 'num_pipeline__x' 'num_pipeline__y' 'num_pipeline__z' 'cat_pipeline__cut'
 'cat_pipeline__color' 'cat_pipeline__clarity']


Prediction on the new data

In [101]:
X_test["num_pipeline__z"] = 0
X_train["num_pipeline__z"] = 0


In [103]:
#move the column next to num_pipeline__y
columns = list(X_train.columns)
columns

# Move `num_pipeline__z` right after `num_pipeline__y`
columns.remove("num_pipeline__z")  # Remove from current position
y_index = columns.index("num_pipeline__y")  # Find position of `num_pipeline__y`
columns.insert(y_index + 1, "num_pipeline__z")  # Insert `num_pipeline__z` next to `y`

# Reorder the DataFrame
X_train = X_train[columns]
X_test = X_test[columns]



In [104]:
X_test

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.516296,-0.017115,-1.167106,-0.414897,-0.349245,0,4.0,4.0,4.0
1,0.261324,0.630382,-0.646116,0.386322,0.420848,0,2.0,0.0,4.0
2,-0.192288,-0.294614,-0.125125,-0.000784,0.022212,0,4.0,2.0,1.0
3,2.702184,1.092880,0.916855,2.105792,2.060694,0,3.0,6.0,2.0
4,0.477329,4.515362,0.395865,0.467344,0.420848,0,0.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...
42582,-1.013108,1.370378,0.916855,-1.261129,-1.246177,0,2.0,1.0,1.0
42583,2.637383,0.907880,-0.646116,2.123797,2.069754,0,1.0,5.0,4.0
42584,-0.062684,0.445383,-0.125125,0.116248,0.094691,0,3.0,6.0,2.0
42585,-0.408293,-0.109614,-0.125125,-0.315870,-0.258646,0,4.0,3.0,3.0


In [105]:
new_prediction  = loaded_model.predict(X_test)

In [None]:
r2_score(new_prediction, y_test) #due to we have added the num_pipeline__zcolumn manually

-1.0318183728133459