In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import kagglehub
import os, pandas as pd, numpy as np
from joblib import dump, load

In [24]:
# Download latest version
path = kagglehub.dataset_download("nehalbirla/vehicle-dataset-from-cardekho")

print("Path to dataset files:", path)

Path to dataset files: /home/rohnak.agarwal/.cache/kagglehub/datasets/nehalbirla/vehicle-dataset-from-cardekho/versions/4


In [25]:
df = pd.read_csv(os.path.join(path, "CAR DETAILS FROM CAR DEKHO.csv"))
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [26]:
df.drop("name", inplace=True, axis=1)
df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [27]:
threshold = df["selling_price"].quantile(0.75)
df = df[df["selling_price"] <= threshold]
df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [28]:
cat_cols = df.select_dtypes(include=["object"]).columns
num_cols = df.select_dtypes(include=[int, float]).columns

print(cat_cols, num_cols)

Index(['fuel', 'seller_type', 'transmission', 'owner'], dtype='object') Index(['year', 'selling_price', 'km_driven'], dtype='object')


In [29]:
df_1hot = pd.get_dummies(df, columns=cat_cols, dtype=float, drop_first=True)
df_1hot.head()

Unnamed: 0,year,selling_price,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,60000,70000,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2007,135000,50000,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2012,600000,100000,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2017,250000,46000,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2014,450000,141000,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [30]:
df_1hot[num_cols] = df_1hot[num_cols].astype(float)
df_1hot.head()

Unnamed: 0,year,selling_price,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007.0,60000.0,70000.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2007.0,135000.0,50000.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2012.0,600000.0,100000.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2017.0,250000.0,46000.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2014.0,450000.0,141000.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [31]:
X = df_1hot.drop("selling_price", axis=1)
y = df_1hot["selling_price"]

In [32]:
X.head()

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007.0,70000.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2007.0,50000.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2012.0,100000.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2017.0,46000.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2014.0,141000.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [33]:
y.head()

0     60000.0
1    135000.0
2    600000.0
3    250000.0
4    450000.0
Name: selling_price, dtype: float64

In [34]:
!mkdir -p ./model ./data

In [35]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=327
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

num_cols = list(set(num_cols) & set(X.columns))
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

(2626, 13) (657, 13) (2626,) (657,)


In [36]:
dump(scaler, "./model/stdscaler.joblib")
pd.DataFrame(
    list(zip(scaler.feature_names_in_, scaler.mean_, scaler.scale_)),
    columns=["param", "mean", "scale"],
)

Unnamed: 0,param,mean,scale
0,km_driven,71002.455065,46124.988537
1,year,2012.060548,4.217767


In [37]:
scaler = load("./model/stdscaler.joblib")
pd.DataFrame(
    list(zip(scaler.feature_names_in_, scaler.mean_, scaler.scale_)),
    columns=["param", "mean", "scale"],
)

Unnamed: 0,param,mean,scale
0,km_driven,71002.455065,46124.988537
1,year,2012.060548,4.217767


In [38]:
X_train.to_csv("./data/x_train.csv", index=0)
X_test.to_csv("./data/x_test.csv", index=0)
y_train.to_csv("./data/y_train.csv", index=0)
y_test.to_csv("./data/y_test.csv", index=0)