## MODEL TRAINING

In [134]:
#importing pandas for model training
import pandas as pd

In [135]:
## data ingestion step
data = pd.read_csv('./data/test.csv')
data.head()


Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [136]:
#making a copy of the data set so that it do not harm the original dataset.
df = data.copy()
df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [137]:
#lets drop the "car_id, carname, symboling, enginelocation and doornumber" column cause it is of less use.
df=df.drop(labels=['car_ID','CarName', 'symboling', 'doornumber', 'enginelocation'],axis=1)
df.head()

Unnamed: 0,fueltype,aspiration,carbody,drivewheel,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,gas,std,convertible,rwd,88.6,168.8,64.1,48.8,2548,dohc,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,gas,std,convertible,rwd,88.6,168.8,64.1,48.8,2548,dohc,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,gas,std,hatchback,rwd,94.5,171.2,65.5,52.4,2823,ohcv,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,gas,std,sedan,fwd,99.8,176.6,66.2,54.3,2337,ohc,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,gas,std,sedan,4wd,99.4,176.6,66.4,54.3,2824,ohc,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [138]:
#this features need to be changed cause we need numerical values but the number names are written
df = df.replace({'cylindernumber':{'four':4 , 'six':6 , 'five':5 , 'three':3 , 'twelve':12 , 'two':2 ,'eight':8}})
df['cylindernumber']
    

0      4
1      4
2      6
3      4
4      5
      ..
200    4
201    4
202    6
203    6
204    4
Name: cylindernumber, Length: 205, dtype: int64

In [139]:
## Independent and dependent features
X = df.drop(labels=['price'],axis=1)
Y = df[['price']]

In [140]:
Y

Unnamed: 0,price
0,13495.0
1,16500.0
2,16500.0
3,13950.0
4,17450.0
...,...
200,16845.0
201,19045.0
202,21485.0
203,22470.0


In [141]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns
print(categorical_cols)
print(numerical_cols)

Index(['fueltype', 'aspiration', 'carbody', 'drivewheel', 'enginetype',
       'fuelsystem'],
      dtype='object')
Index(['wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight',
       'cylindernumber', 'enginesize', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg'],
      dtype='object')


In [142]:
# Define the custom ranking for each ordinal variable
fueltype_categories = ['diesel', 'gas']
aspiration_categories = ['std', 'turbo']
carbody_categories = ['hatchback', 'sedan', 'wagon', 'convertible', 'hardtop']
drivewheel_categories = ['fwd', '4wd', 'rwd']
enginetype_categories = ['ohc', 'l', 'ohcf', 'dohc', 'ohcv', 'rotor', 'dohcv']
fuelsystem_categories = ['1bbl', '2bbl', '4bbl', 'spfi', 'mpfi', 'spdi', 'mfi', 'idi']


In [143]:
#importing different libraries for feature scaling and imputing
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.compose import make_column_transformer ##for column transforming
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [144]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[fueltype_categories,aspiration_categories,carbody_categories,drivewheel_categories,enginetype_categories,fuelsystem_categories])),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])


In [145]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [147]:

X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=list(X_train.columns))
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=list(X_train.columns))


In [148]:
X_train.head()

Unnamed: 0,fueltype,aspiration,carbody,drivewheel,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
0,-0.69081,-0.644963,-0.978973,0.2987,-1.106194,-0.322565,-0.694296,-0.643522,0.090754,-0.18077,-0.912231,0.120692,0.859492,0.901698,0.329435,-0.504367,0.124217,-0.841774,-0.523732,-1.078302
1,-0.69081,-0.194988,-0.184619,-0.576417,0.515053,1.48633,0.587243,-2.317328,0.662606,-0.281246,1.30363,-0.287805,-0.923928,-0.663833,0.329435,-0.504367,-0.920665,1.251681,2.413273,0.413929
2,0.048076,0.979524,0.282648,0.965455,0.201079,-0.322565,-0.135079,0.74538,-0.608176,-0.203377,0.156596,0.222817,-0.626691,-0.379191,0.329435,-0.504367,-0.920665,-0.841774,-0.523732,0.413929
3,-0.369555,-0.355148,0.049014,-1.159829,-0.493469,-0.322565,-0.391387,-0.643522,1.012071,-0.256127,-0.104093,0.733438,-0.032218,0.047772,0.329435,-0.504367,0.124217,-0.841774,-0.523732,-1.078302
4,-0.465932,-0.042454,0.189194,-1.493207,0.711049,-0.322565,0.680446,0.923445,1.901618,-0.783629,1.069009,-0.287805,-0.923928,-0.948475,0.329435,1.982684,-0.920665,-0.841774,-0.523732,0.911339


In [149]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [150]:
regression=LinearRegression()
regression.fit(X_train,y_train)

LinearRegression()

In [151]:
regression.coef_

array([[  907.34033188,  -420.10134185,  1652.72144101,   391.84407578,
        -1859.12958788, -1547.47959197,  5827.61366211, -1344.7853545 ,
        -1404.64842178, -4054.76797714,  4182.38472123,   793.1371438 ,
        -1240.22384925,   962.21595437, -6592.43623712,  -839.48800914,
          544.64049388,  1001.30252995,  -621.85888865,  -972.47977217]])

In [152]:
regression.intercept_

array([13280.79837063])

In [153]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [154]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 3156.217574549209
MAE: 2467.249020933902
R2 score 83.41251307996276


Lasso
Model Training Performance
RMSE: 3144.3338835431946
MAE: 2457.5898156956196
R2 score 83.5371872942066


Ridge
Model Training Performance
RMSE: 3015.5256766105076
MAE: 2317.814609247521
R2 score 84.858364497245


Elasticnet
Model Training Performance
RMSE: 3210.137041430307
MAE: 2259.5865943268127
R2 score 82.84092502900825




In [155]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet']