In [1]:
import pandas as pd

Model Training

In [3]:
df = pd.read_csv('diamonds.csv')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
## Independent and dependent features
X=df.drop(labels=['price'],axis=1)
Y = df[['price']]

In [5]:
Y

Unnamed: 0,price
0,326
1,326
2,327
3,334
4,335
...,...
53935,2757
53936,2757
53937,2757
53938,2757


In [8]:
# Define which columns should be ordinal be ordinal-encoded and which should be scaled 

# Segregating the numerical and categorical variables

categorical_cols=X.select_dtypes(include='object').columns

numerical_cols=X.select_dtypes(exclude='object').columns

In [9]:
# Define the custom ranking for each ordinal variable

cut_categories=['Fair','Good','Very Good','Premium','Ideal']
color_categories=['D','E','F','G','H','I','J']
clarity_categories=["I1","SI2","SI1","VS2","VS1","VVS2","VVS1","IF"]

In [10]:
from sklearn.impute import SimpleImputer # Handling Missing Values
from sklearn.preprocessing import StandardScaler #  Handling Features Scaling
from sklearn.preprocessing import OrdinalEncoder #Ordianl Encoding

## Pipelines

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [11]:
## Numerical Pipeline

num_pipeline=Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler',StandardScaler())
    
    
        
    ]
)

In [14]:
# Data Transformation code 
## Categorical Pipeline

cat_pipeline = Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
    ('scaler',StandardScaler())
    ]
)

In [15]:
preprocessor=ColumnTransformer(
    [
        ('num_pipeline',num_pipeline,numerical_cols),
        ('cat_pipeline',cat_pipeline,categorical_cols)
    ]
)

In [21]:
# Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [22]:
#For training data we use fit_transform

# for test data we use transform
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [23]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.817406,1.290192,-0.650469,-0.893064,-0.897014,-0.758306,-1.709604,-0.93989,-0.640054
1,0.048948,0.102662,-0.650469,0.23294,0.248832,0.256665,0.981376,-1.528664,-0.640054
2,-0.479316,-0.386321,-0.201363,-0.374745,-0.332771,-0.391789,-0.812611,-0.93989,-1.248906
3,-0.204619,0.102662,0.247742,-0.017284,-0.05499,-0.025271,0.084383,-1.528664,-1.248906
4,0.450429,-0.386321,0.247742,0.679767,0.6221,0.594989,0.084383,0.237658,1.186501


In [24]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet

from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [25]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [26]:
regression.coef_

array([[ 5126.99748892,  -122.10399799,   -62.94825835, -1013.10763933,
           37.30474228,   -18.89177019,   138.12242666,  -551.03496211,
          823.91860561]])

In [27]:
regression.intercept_

array([3925.59333121])

In [28]:
import numpy as np
def evaluate_model(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2_square = r2_score(true,predicted)
    
    return mae , rmse , r2_square

In [30]:
## Training multiple models
## Model Evaluation steps

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet()
                  
}

trained_model_list=[]
model_list=[]
r2_list=[]


for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

In [32]:
## Make Predictions

y_pred=model.predict(X_test)

mae ,  rmse , r2_square=evaluate_model(y_test,y_pred)

print(list(models.keys())[i])
model_list.append(list(models.keys())[i])


print('Model Training Performance')
print("RMSE:",rmse)
print("MAE:",mae)
print("R2 score:",r2_square*100)

r2_list.append(r2_square)


print('='*35)
print('/n')


ElasticNet
Model Training Performance
RMSE: 1627.8942388834841
MAE: 1075.0042455583414
R2 score: 83.47639266655595
/n
