## Model Training

In [1]:
import pandas as pd

In [2]:
## Data Ingestion Step
df = pd.read_csv('D:\\DiamondNewPricePrediction\\notebook\\data\\gemstone.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [5]:
## segreagating into Independent and Dependent variable
x = df.drop('price',axis=1)
y = df[['price']]

In [6]:
y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [7]:
x

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...,...
193568,193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [39]:
cat_col = x.select_dtypes(include = 'object').columns.tolist()
num_col = x.select_dtypes(exclude = 'object').columns.tolist()

In [40]:
## define the custom ranking for each ordinal variable
cut_cat = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_cat = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_cat = ['I1', 'SI2', 'SI1', 'VS2','VS1', 'VVS2', 'VVS1', 'IF']

## Feature Engineering Automation

In [41]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
## pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [50]:
## Num Pipeline 
num_pipeline = Pipeline(
    steps = [('imputer', SimpleImputer(strategy = 'mean',)),
             ('scaler', StandardScaler(with_mean=False))
    ]
)

cat_pipeline = Pipeline(
    steps = [ 
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(categories= [cut_cat,color_cat,clarity_cat])),
        ('scaler', StandardScaler(with_mean=False))
    ]
)

In [51]:
preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,num_col),
    ('cat_pipeline',cat_pipeline,cat_col)
])

In [52]:
## train Test Split 
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=12)


In [54]:
x_train_scaled = pd.DataFrame(preprocessor.fit_transform(x_train), columns = preprocessor.get_feature_names_out())
x_test_scaled = pd.DataFrame(preprocessor.transform(x_test), columns = preprocessor.get_feature_names_out())

In [55]:
x_train_scaled.head()

Unnamed: 0,num_pipeline__id,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut_Fair,cat_pipeline__cut_Good,cat_pipeline__cut_Very Good,...,cat_pipeline__color_I,cat_pipeline__color_J,cat_pipeline__clarity_I1,cat_pipeline__clarity_SI2,cat_pipeline__clarity_SI1,cat_pipeline__clarity_VS2,cat_pipeline__clarity_VS1,cat_pipeline__clarity_VVS2,cat_pipeline__clarity_VVS1,cat_pipeline__clarity_IF
0,1.871495,2.597485,56.167649,29.24523,6.227188,6.239206,6.063853,0.0,0.0,0.0,...,0.0,0.0,0.0,2.746011,0.0,0.0,0.0,0.0,0.0,0.0
1,0.76738,2.164571,57.648181,30.289702,5.721793,5.785116,5.744703,0.0,0.0,2.532829,...,0.0,0.0,0.0,0.0,0.0,2.313841,0.0,0.0,0.0,0.0
2,1.425633,0.541143,57.92578,29.24523,3.655088,3.70538,3.684734,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.240219,0.0,0.0,0.0,0.0,0.0
3,2.050484,3.290148,58.20338,29.767466,6.651358,6.629724,6.687646,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.313841,0.0,0.0,0.0,0.0
4,1.029837,1.536845,57.92578,29.24523,5.053949,5.231125,5.149923,0.0,0.0,2.532829,...,0.0,0.0,0.0,0.0,2.240219,0.0,0.0,0.0,0.0,0.0


In [56]:
## Model Training
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [57]:
reg = LinearRegression()
reg.fit(x_train_scaled,y_train)

In [58]:
reg.coef_

array([[ 1.51811314e+00,  6.64048236e+03, -1.32155507e+02,
        -6.77077280e+01, -1.96048302e+03, -4.40782037e+02,
        -7.32515097e+01, -2.98553290e+13, -7.00219710e+13,
        -1.16403327e+14, -1.28998924e+14, -1.47277464e+14,
        -6.38390877e+13, -7.49964795e+13, -7.35718530e+13,
        -8.11385163e+13, -7.05133618e+13, -5.55389633e+13,
        -3.45213138e+13,  2.39652532e+13,  1.71696187e+14,
         2.10461372e+14,  2.03764946e+14,  1.72032665e+14,
         1.29043633e+14,  1.07457109e+14,  6.92146233e+13]])

In [59]:
reg.intercept_

array([1.62770597e+13])

In [61]:
y_pred = reg.predict(x_test_scaled)

In [62]:
r2_score(y_pred,y_test)

0.942284215251737

In [63]:
print(mean_absolute_error(y_pred,y_test))

624.9941122052688


In [64]:
print(mean_squared_error(y_pred,y_test))

892427.8347025872


In [67]:
## training with the multiple models 
models = {
    'linear_reg' : LinearRegression(),
    'lasso' : Lasso(),
    'ridge' : Ridge(),
    'elasticnet' : ElasticNet()
}
trained_model_list = []
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train_scaled,y_train)
    # make prediction
    y_pred = model.predict(x_test_scaled)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    score = r2_score(y_pred,y_test)*100
    print(score)

    r2_list.append(score)


linear_reg
94.2284215251737
lasso
94.21256909654686
ridge
94.22817408599869
elasticnet
78.56425951153216


In [68]:
model_list

['linear_reg', 'lasso', 'ridge', 'elasticnet']