### Model Training

#### Import libraries 

In [49]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor,RandomForestRegressor
from xgboost import XGBRegressor

#### Import data 

In [56]:
train_data=pd.read_csv('data/train.csv')
test_data=pd.read_csv('data/test.csv')
sample_data=pd.read_csv('data/sample_submission.csv')

In [57]:
train_data.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


In [58]:
sample_data.head()

Unnamed: 0,id,price
0,54273,39218.443
1,54274,39218.443
2,54275,39218.443
3,54276,39218.443
4,54277,39218.443


In [59]:
test_data.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,54273,Mercedes-Benz,E-Class E 350,2014,73000,Gasoline,302.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,A/T,White,Beige,None reported,Yes
1,54274,Lexus,RX 350 Base,2015,128032,Gasoline,275.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,8-Speed A/T,Silver,Black,None reported,Yes
2,54275,Mercedes-Benz,C-Class C 300,2015,51983,Gasoline,241.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Blue,White,None reported,Yes
3,54276,Land,Rover Range Rover 5.0L Supercharged Autobiogra...,2018,29500,Gasoline,518.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,White,White,At least 1 accident or damage reported,Yes
4,54277,BMW,X6 xDrive40i,2020,90000,Gasoline,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,White,Black,At least 1 accident or damage reported,Yes


In [60]:
sample_data=sample_data.drop(columns='id',axis=1)

In [61]:
sample_data.shape,test_data.shape

((36183, 1), (36183, 12))

In [62]:
test_df=pd.concat([test_data,sample_data],axis=1)

In [63]:
test_df.shape,train_data.shape

((36183, 13), (54273, 13))

In [64]:
df=pd.concat([train_data,test_df])

In [65]:
df.shape

(90456, 13)

In [66]:
df.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000.0
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250.0
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000.0
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500.0
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850.0


* Y train data

In [67]:
Y=df.loc[:,['price']]
Y.head()

Unnamed: 0,price
0,11000.0
1,8250.0
2,15000.0
3,63500.0
4,7850.0


In [68]:
X=df.drop(columns='price',axis=1)

In [69]:
X.shape,Y.shape

((90456, 12), (90456, 1))

#### Data Transformation
* Categorical features - One hot encoding
* Numerical features - Standard Scalar
* column tranformer -allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space. This is useful for heterogeneous or columnar data, to combine several feature extraction mechanisms or transformations into a single transformer.

* Train data transformation

In [70]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

cat_features=X.select_dtypes(include='object').columns
num_features=X.select_dtypes(exclude='object').columns
onehot_transform=OneHotEncoder()
num_transform=StandardScaler()
pre_process_train=ColumnTransformer([('OneHotED',onehot_transform,cat_features),('stdscalar',num_transform,num_features)])

In [71]:
X=pre_process_train.fit_transform(X)

In [72]:
X.shape

(90456, 3460)

#### Train Test Split

In [73]:
from sklearn.model_selection import train_test_split,GridSearchCV
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.4,random_state=45)

In [74]:
X_train.shape,Y_train.shape

((54273, 3460), (54273, 1))

In [76]:
X_test.shape,Y_test.shape

((36183, 3460), (36183, 1))

#### Model Building

#### Linear Regression

In [21]:
### Training the model
Linear_Reg=LinearRegression()
Linear_Reg.fit(X_train,Y_train)

In [22]:
### Prediction
y_train_pred=Linear_Reg.predict(X_train)
y_test_pred=Linear_Reg.predict(X_test)

* Model Evaluation

In [26]:
from sklearn.metrics import mean_squared_error,r2_score
import numpy as np
mse_train=mean_squared_error(Y_train,y_train_pred)
mse_test=mean_squared_error(Y_test,y_test_pred)
rmse_train=np.sqrt(mse_train)
rmse=np.sqrt(mse_test)
r2_score_train=r2_score(Y_train,y_train_pred)
r2_score_test=r2_score(Y_test,y_test_pred)

In [28]:
#Root mean square error
rmse_train,rmse

(49361.21741061884, 59761.07029115648)

In [29]:
#r2 score
r2_score_train,r2_score_test

(0.17450265673176468, -0.012391329732159173)

#### Training with other Algorithms

In [30]:
models={
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'Decision Tree':DecisionTreeRegressor(),
    'Random Forest':RandomForestRegressor(),
    'XgBoost':XGBRegressor(),
    'AdaBoost':AdaBoostRegressor()
}
model_list=[]

# Train model
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,Y_train)

    #prediction
    Y_train_pred=model.predict(X_train)
    Y_test_pred=model.predict(X_test)

    # Model Evaluation
    mse_train_model=mean_squared_error(Y_train,Y_train_pred)
    mse_test_model=mean_squared_error(Y_test,Y_test_pred)
    rmse_train_model=np.sqrt(mse_train_model)
    rmse_model=np.sqrt(mse_test_model)
    r2_score_train=r2_score(Y_train,Y_train_pred)
    r2_score_test=r2_score(Y_test,Y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model performance measured using RMSE_Train:{0:.4f}".format(rmse_train_model))
    print("Model performance measured using RMSE_Test:{0:.4f}".format(rmse_model))
    print("-"*40)
    print("Model performance measured using r2_Train:{0:.4f}".format(r2_score_train))
    print("Model performance measured using r2_Test:{0:.4f}".format(r2_score_test))

    print("*"*30)
    print("\n")

Ridge
Model performance measured using RMSE_Train:49731.7501
Model performance measured using RMSE_Test:58391.2685
----------------------------------------
Model performance measured using r2_Train:0.1621
Model performance measured using r2_Test:0.0335
******************************


Lasso
Model performance measured using RMSE_Train:49586.6929
Model performance measured using RMSE_Test:58925.5314
----------------------------------------
Model performance measured using r2_Train:0.1669
Model performance measured using r2_Test:0.0157
******************************


Decision Tree
Model performance measured using RMSE_Train:0.0000
Model performance measured using RMSE_Test:67198.0599
----------------------------------------
Model performance measured using r2_Train:1.0000
Model performance measured using r2_Test:-0.2800
******************************




  return fit_method(estimator, *args, **kwargs)


Random Forest
Model performance measured using RMSE_Train:21808.0106
Model performance measured using RMSE_Test:59779.2283
----------------------------------------
Model performance measured using r2_Train:0.8389
Model performance measured using r2_Test:-0.0130
******************************


XgBoost
Model performance measured using RMSE_Train:26077.7554
Model performance measured using RMSE_Test:59829.0042
----------------------------------------
Model performance measured using r2_Train:0.7696
Model performance measured using r2_Test:-0.0147
******************************




  y = column_or_1d(y, warn=True)


AdaBoost
Model performance measured using RMSE_Train:97124.0396
Model performance measured using RMSE_Test:103809.0314
----------------------------------------
Model performance measured using r2_Train:-2.1959
Model performance measured using r2_Test:-2.0548
******************************




#### Ridge Regression

In [32]:
# Ridge regression has performed better compared to others
Ridge_reg=Ridge()
Ridge_reg.fit(X_train,Y_train)

In [33]:
#Prediction
Ridge_pred_train=Ridge_reg.predict(X_train)
Ridge_pred_test=Ridge_reg.predict(X_test)

In [35]:
#Evaluation
mse_ridge_train=mean_squared_error(Y_train,Ridge_pred_train)
mse_ridge_test=mean_squared_error(Y_test,Ridge_pred_test)
rmse_ridge_train=np.sqrt(mse_ridge_train)
rmse_ridge_test=np.sqrt(mse_ridge_test)

In [36]:
rmse_ridge_train,rmse_ridge_test

(49731.75005490878, 58391.26845264873)

In [38]:
pred_data=pd.DataFrame(Ridge_pred_test)

In [39]:
pred_data.insert(0,'id',test_data['id'],True)

In [40]:
pred_data.head()

Unnamed: 0,id,0
0,54273,41952.185883
1,54274,44603.893298
2,54275,28753.611607
3,54276,38764.628427
4,54277,25743.20984


* Rename columns

In [41]:
pred_data.rename(columns={0:'price'},inplace=True)

In [42]:
pred_data.head()

Unnamed: 0,id,price
0,54273,41952.185883
1,54274,44603.893298
2,54275,28753.611607
3,54276,38764.628427
4,54277,25743.20984


* change the type of column

In [43]:
pred_data['price']=pred_data['price'].apply(lambda x:format(float(x),".3f"))

In [44]:
pred_data.head()

Unnamed: 0,id,price
0,54273,41952.186
1,54274,44603.893
2,54275,28753.612
3,54276,38764.628
4,54277,25743.21


In [45]:
pred_data.shape

(36183, 2)

In [46]:
predicted_Data=pred_data.to_csv('prediction.csv',sep=',',index=False)

In [1]:

# Hyperparameter tuning using GridSearchCV
param_grid = {'alpha': np.logspace(-3, 3, 7),
              'l1_ratio': np.arange(0.1, 1.1, 0.1)}  # Explore alpha and l1_ratio
elastic_net = ElasticNet()
grid_search = GridSearchCV(elastic_net, param_grid, cv=5)
grid_search.fit(X_train,Y_train)

# Get the best model and its parameters
best_elastic_net = grid_search.best_estimator_
print("Best alpha:", best_elastic_net.alpha)
print("Best l1_ratio:", best_elastic_net.l1_ratio)

# Make predictions
y_pred = best_elastic_net.predict(X_test)

# Evaluate the model
rmse = mean_squared_error(Y_test, y_pred, squared=False)
print("RMSE:", rmse)

NameError: name 'np' is not defined