In [9]:
import pandas as pd
import numpy as np

In [10]:
df = pd.read_csv("cars.csv")

print(df.shape)
df.head()

(8128, 5)


Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [11]:
df["owner"].value_counts()


owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

In [12]:
df_ohe = pd.get_dummies(df, columns=["fuel", "owner"])
df_ohe.head()


Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False


In [13]:
df_ohe_k1 = pd.get_dummies(df, columns=["fuel", "owner"], drop_first=True)
df_ohe_k1.head()


Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False


In [14]:
from sklearn.model_selection import train_test_split

X = df.iloc[:, 0:4]    # brand, km_driven, fuel, owner
y = df.iloc[:, -1]     # selling_price

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=2
)

print(X_train.shape, X_test.shape)
X_train.head()


(6502, 4) (1626, 4)


Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [15]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)

X_train_ohe = ohe.fit_transform(X_train[["fuel", "owner"]])
X_test_ohe = ohe.transform(X_test[["fuel", "owner"]])

print(X_train_ohe.shape)


(6502, 7)


In [16]:
X_train_final = np.hstack((X_train[["km_driven"]].values, X_train_ohe))
X_test_final  = np.hstack((X_test[["km_driven"]].values, X_test_ohe))

print(X_train_final.shape)
print(X_test_final.shape)


(6502, 8)
(1626, 8)


In [17]:
counts = df["brand"].value_counts()
threshold = 100

rare_brands = counts[counts <= threshold].index

# train and test me same replacement
X_train_brand = X_train["brand"].replace(rare_brands, "uncommon")
X_test_brand  = X_test["brand"].replace(rare_brands, "uncommon")


In [18]:
brand_ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

X_train_brand_ohe = brand_ohe.fit_transform(X_train_brand.to_frame())
X_test_brand_ohe  = brand_ohe.transform(X_test_brand.to_frame())

print(X_train_brand_ohe.shape)


(6502, 13)


In [19]:
X_train_final = np.hstack((X_train[["km_driven"]].values, X_train_ohe, X_train_brand_ohe))
X_test_final  = np.hstack((X_test[["km_driven"]].values, X_test_ohe, X_test_brand_ohe))

print("✅ Final Train shape:", X_train_final.shape)
print("✅ Final Test shape :", X_test_final.shape)


✅ Final Train shape: (6502, 21)
✅ Final Test shape : (1626, 21)


In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

lr = LinearRegression()

# train
lr.fit(X_train_final, y_train)

# predict
y_pred_lr = lr.predict(X_test_final)

print("✅ Linear Regression R2:", r2_score(y_test, y_pred_lr))
print("✅ Linear Regression MAE:", mean_absolute_error(y_test, y_pred_lr))


✅ Linear Regression R2: 0.5570843997674068
✅ Linear Regression MAE: 267116.23013175477


In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42
)

# train
rf.fit(X_train_final, y_train)

# predict
y_pred_rf = rf.predict(X_test_final)

print("✅ RandomForest R2:", r2_score(y_test, y_pred_rf))
print("✅ RandomForest MAE:", mean_absolute_error(y_test, y_pred_rf))


✅ RandomForest R2: 0.7908699010140947
✅ RandomForest MAE: 190322.7586415272
