In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer, PolynomialFeatures
from category_encoders import BinaryEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBRegressor

In [2]:
# Load the unpreprocessed data
with open('../data/unprocessed_data.pkl', 'rb') as f:
    X_train, y_train, X_test, y_test = pickle.load(f)
    
X_train

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Brand,Model
5639,Pune,2006,155000,Petrol,Manual,Third,15.40,1590.0,94.00,5.0,Maruti,Baleno
2303,Chennai,2006,109000,Petrol,Manual,First,16.90,1497.0,100.00,5.0,Honda,City
2617,Jaipur,2008,128813,Diesel,Manual,First,19.09,1396.0,69.00,5.0,Tata,Indigo
1397,Kochi,2016,35659,Diesel,Automatic,First,15.97,2993.0,258.00,7.0,BMW,X5
3703,Delhi,2013,51000,Diesel,Automatic,First,19.08,1582.0,126.32,5.0,Hyundai,Verna
...,...,...,...,...,...,...,...,...,...,...,...,...
3772,Chennai,2015,15000,Petrol,Automatic,First,18.90,1197.0,82.00,5.0,Hyundai,Grand
5191,Hyderabad,2014,46000,Diesel,Manual,Second,20.14,1498.0,88.80,5.0,Volkswagen,Polo
5226,Bangalore,2012,73000,Diesel,Automatic,First,22.69,1995.0,190.00,5.0,BMW,3
5390,Pune,2010,45252,Petrol,Manual,First,18.90,1197.0,81.86,5.0,Hyundai,Grand


In [3]:
# Load the preprocessed data
with open('../data/preprocessed_data.pkl', 'rb') as f:
    X_train_prep, y_train_log, X_test_prep, y_test_log = pickle.load(f)

X_train_prep

array([[ 1.66018158e-01,  4.99750125e-03,  0.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00,  2.00600000e+03],
       [ 4.53955901e-02,  1.04947526e-01,  0.00000000e+00, ...,
         0.00000000e+00,  3.00000000e+00,  2.00600000e+03],
       [-8.56031128e-02, -4.11460936e-01,  0.00000000e+00, ...,
         1.00000000e+00,  3.00000000e+00,  2.00800000e+03],
       ...,
       [ 6.91309987e-01,  1.60419790e+00,  0.00000000e+00, ...,
         0.00000000e+00,  3.00000000e+00,  2.01200000e+03],
       [-3.43709468e-01, -1.97234716e-01,  0.00000000e+00, ...,
         0.00000000e+00,  3.00000000e+00,  2.01000000e+03],
       [ 1.33852140e+00,  1.38264201e-01,  3.00000000e+00, ...,
         0.00000000e+00,  2.00000000e+00,  2.01100000e+03]])

# Linear Regression


In [4]:
from sklearn.linear_model import LinearRegression

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_prep, y_train_log)   # Learn Weights from training data Using OLS (Normal Equation)


lr_scores = cross_val_score(lr, X_train_prep, y_train_log, cv=5, scoring='r2')
print(f"Linear Regression: {np.mean(lr_scores)}")

Linear Regression: 0.8822780005707245


In [5]:
scores_dict = {'Linear Regression': round(np.mean(lr_scores), 2)}
scores_dict

{'Linear Regression': np.float64(0.88)}

# Polynomial Regression


In [6]:
#  Preprocessing & Base Model

num_cols_norm = ['Engine', 'Power', 'Mileage', 'Seats']  
num_cols_log = ['Kilometers_Driven']
nom_cat_cols = ['Location', 'Fuel_Type', 'Transmission', 'Brand', 'Model']
ord_cat_cols = ['Owner_Type']

imputer = SimpleImputer(strategy="median")
scaler  = RobustScaler()
bin_enc = BinaryEncoder()
ord_enc = OrdinalEncoder(categories=[['Fourth & Above', 'Third', 'Second', 'First']],
                         handle_unknown="use_encoded_value", unknown_value=-1)
poly = PolynomialFeatures(degree=2, include_bias=False)

log_transformer = FunctionTransformer(np.log1p, feature_names_out="one-to-one" )  

num_pipeline = Pipeline(steps=[ ('imputer', imputer),
                                ('poly', poly),
                                ('scaler', scaler)
                              ])
log_pipeline = Pipeline(steps=[ ('imputer', imputer),
                              ('log_transform', log_transformer),
                              ('poly', poly),
                              ('scaler', scaler)
                            ])
transformer = ColumnTransformer(transformers=[
                                                ('num', num_pipeline, num_cols_norm),
                                                ('log_transform', log_pipeline, num_cols_log),
                                                ('bin_enc', bin_enc, nom_cat_cols),
                                                ('ord_enc', ord_enc, ord_cat_cols)
                                            ], remainder='passthrough'
                                )


# Linear Regression
lr = LinearRegression()
pipe = Pipeline(steps=[ ('preprocessor', transformer),
                        ('model', lr)
                      ])


poly2_scores = cross_val_score(pipe, X_train, y_train_log, cv=5, scoring='r2')
print("Cross-validated R²: %.2f (± %.2f)" % (poly2_scores.mean(), poly2_scores.std() * 2))

Cross-validated R²: 0.89 (± 0.02)


In [7]:
scores_dict.update({'Polynomial Regression (2)': round(np.mean(poly2_scores), 2)})
scores_dict

{'Linear Regression': np.float64(0.88),
 'Polynomial Regression (2)': np.float64(0.89)}

# Grid Search with Ridge Regression 

In [8]:
# Grid Search with Ridge Regression (tuning alpha and polynomial degree)
ridge = Ridge()
pipe_ridge = Pipeline(steps=[ ('preprocessor', transformer),
                              ('model', ridge)
                            ])
param_grid = {
    'preprocessor__num__poly__degree': [1, 2, 3, 4],
    'model__alpha': [0.01, 0.1, 1.0, 10.0]
}
ridge_search = GridSearchCV(pipe_ridge, param_grid, cv=5, scoring='r2')
ridge_search.fit(X_train, y_train_log)
print("Best parameters:", ridge_search.best_params_)
print("Best cross-validated R²: %.2f" % ridge_search.best_score_)

Best parameters: {'model__alpha': 1.0, 'preprocessor__num__poly__degree': 3}
Best cross-validated R²: 0.90


In [9]:
scores_dict.update({'Ridge Regression (3)': round(ridge_search.best_score_ , 2)})
scores_dict

{'Linear Regression': np.float64(0.88),
 'Polynomial Regression (2)': np.float64(0.89),
 'Ridge Regression (3)': np.float64(0.9)}

# Grid Search with Decision Tree

In [10]:
# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
pipe_dt = Pipeline(steps=[ ('preprocessor', transformer),
                           ('model', dt)
                         ])
param_grid_dt = {
    'preprocessor__num__poly__degree': [1],
    'model__max_depth': [5, 10, 15, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}
grid_search_dt = GridSearchCV(pipe_dt, param_grid_dt, cv=5, scoring='r2')
grid_search_dt.fit(X_train, y_train_log)
print("Best parameters:", grid_search_dt.best_params_)
print("Best cross-validated R²: %.2f" % grid_search_dt.best_score_)

Best parameters: {'model__max_depth': 20, 'model__min_samples_leaf': 4, 'model__min_samples_split': 10, 'preprocessor__num__poly__degree': 1}
Best cross-validated R²: 0.90


In [11]:
scores_dict.update({'Decision Tree Regressor': round(grid_search_dt.best_score_, 2)})
scores_dict

{'Linear Regression': np.float64(0.88),
 'Polynomial Regression (2)': np.float64(0.89),
 'Ridge Regression (3)': np.float64(0.9),
 'Decision Tree Regressor': np.float64(0.9)}

# Grid Search with RandomForest Regressor

In [12]:
# Random Forest
rf = RandomForestRegressor(random_state=42)
pipe_rf = Pipeline(steps=[ ('preprocessor', transformer),
                           ('model', rf)
                         ])
param_grid_rf = {
    'preprocessor__num__poly__degree': [1],
    'model__n_estimators': [100, 200],
    'model__max_depth': [10, 20],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2]
}
grid_search_rf = GridSearchCV(pipe_rf, param_grid_dt, cv=5, scoring='r2')
grid_search_rf.fit(X_train, y_train_log)
print("Best parameters:", grid_search_rf.best_params_)
print("Best cross-validated R²: %.2f" % grid_search_rf.best_score_)

Best parameters: {'model__max_depth': 20, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'preprocessor__num__poly__degree': 1}
Best cross-validated R²: 0.94


In [13]:
scores_dict.update({'RandomForest Regressor': round(grid_search_rf.best_score_, 2)})
scores_dict

{'Linear Regression': np.float64(0.88),
 'Polynomial Regression (2)': np.float64(0.89),
 'Ridge Regression (3)': np.float64(0.9),
 'Decision Tree Regressor': np.float64(0.9),
 'RandomForest Regressor': np.float64(0.94)}

# XGBoost

In [14]:
# XGBRegressor model
xgb = XGBRegressor(random_state=42, n_jobs=-1)

# Pipeline with preprocessing + model
pipe_xgb = Pipeline(steps=[
    ('preprocessor', transformer),
    ('model', xgb)
])

# Hyperparameter grid
param_grid_xgb = {
    'preprocessor__num__poly__degree': [1],   
    'model__n_estimators': [100, 200, 300, 500],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
    'model__gamma': [0, 0.1, 0.2]
}

# Grid search with 5-fold CV
grid_search_xgb = RandomizedSearchCV(
    pipe_xgb,
    param_grid_xgb,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

grid_search_xgb.fit(X_train, y_train_log)

print("Best parameters:", grid_search_xgb.best_params_)
print("Best cross-validated R²: %.2f" % grid_search_xgb.best_score_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Best parameters: {'preprocessor__num__poly__degree': 1, 'model__subsample': 0.8, 'model__n_estimators': 200, 'model__max_depth': 7, 'model__learning_rate': 0.05, 'model__gamma': 0, 'model__colsample_bytree': 0.8}
Best cross-validated R²: 0.95


# Best Model


In [15]:
scores_dict.update({'XGB Regressor': round(grid_search_xgb.best_score_, 2)})
scores_dict

{'Linear Regression': np.float64(0.88),
 'Polynomial Regression (2)': np.float64(0.89),
 'Ridge Regression (3)': np.float64(0.9),
 'Decision Tree Regressor': np.float64(0.9),
 'RandomForest Regressor': np.float64(0.94),
 'XGB Regressor': np.float64(0.95)}

In [18]:
scores_df = pd.DataFrame(list(scores_dict.items()), columns=['Model', 'CV Avg R² Score'])
scores_df = scores_df.sort_values(by='CV Avg R² Score', ascending=False).reset_index(drop=True)
scores_df

Unnamed: 0,Model,CV Avg R² Score
0,XGB Regressor,0.95
1,RandomForest Regressor,0.94
2,Decision Tree Regressor,0.9
3,Ridge Regression (3),0.9
4,Polynomial Regression (2),0.89
5,Linear Regression,0.88


In [21]:
print ('Best Model:',  scores_df.nlargest(1, 'CV Avg R² Score')['Model'][0])

Best Model: XGB Regressor


# Final Evaluation

In [23]:
best_model = grid_search_xgb.best_estimator_
y_pred_log = best_model.predict(X_test)

print('Testing R2 Score: ', round(best_model.score(X_test, y_test_log), 2))

Testing R2 Score:  0.95


# Best Model Saving

In [24]:
# Create or open the models directory if it doesn't exist
import os
os.makedirs('../models', exist_ok=True)

# Save the best model
with open('../models/best_xgb_model.pkl', 'wb') as f:
    pickle.dump(grid_search_xgb.best_estimator_, f)

print('Best Model Saved Successfully')

Best Model Saved Successfully
