In [15]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score,mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [2]:
df = pd.read_csv("Car Price.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [4]:
print(df.shape)
print(df.isnull().sum())

(15411, 14)
Unnamed: 0           0
car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64


In [5]:
X = df.drop(['selling_price', 'Unnamed: 0', 'car_name'], axis=1)
y = df['selling_price']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
cat_features = ["brand","model","seller_type","fuel_type","transmission_type"]
num_features = ["vehicle_age","km_driven","mileage","engine","max_power","seats"]

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
    ("num", StandardScaler(), num_features)
])


In [26]:
models = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(),
    'SVR': SVR()
}

In [27]:
param_grid = {
    "Linear": {},
    "Ridge": {
        "model__alpha": [0.1, 1, 10]
    },
    
    "SVR": {"model__C": [1, 10, 100],
            "model__kernel": ["rbf"],
            "model__gamma": ["scale"]}
    }


In [28]:
best_model = None
best_score = -np.inf

In [29]:
for name, model in models.items():
    pipe = Pipeline([
        ("preprocess", preprocessor),
        ("model", model)
    ])

    grid = GridSearchCV(
        pipe,
        param_grid[name],
        cv=5,
        scoring="r2",
        n_jobs=-1
    )

    grid.fit(X_train, y_train)

    y_pred = grid.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f"\n{name}")
    print("Best Params:", grid.best_params_)
    print("R2:", r2)
    print("RMSE:", rmse)

    if r2 > best_score:
        best_score = r2
        best_model = grid.best_estimator_


Linear
Best Params: {}
R2: 0.8001729445432747
RMSE: 387848.1256802734

Ridge
Best Params: {'model__alpha': 1}
R2: 0.7954883039845115
RMSE: 392368.0424057203

SVR
Best Params: {'model__C': 100, 'model__gamma': 'scale', 'model__kernel': 'rbf'}
R2: -0.00048667877669439363
RMSE: 867841.087644575


In [30]:
joblib.dump(best_model, "model.pkl")
print("\nBest model saved as model.pkl")


Best model saved as model.pkl
