# Car Price Prediction

In [9]:
# Import required Libraries

In [72]:
import pandas as pd
import seaborn as sns
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split , GridSearchCV , RandomizedSearchCV , KFold , cross_val_score
from sklearn.linear_model import Lasso , Ridge , LinearRegression
from sklearn.metrics import mean_absolute_error ,mean_squared_error,root_mean_squared_error,r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder , OrdinalEncoder
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

In [11]:
# load Data , preprocess it and clean

In [12]:
dataset = pd.read_csv(r"C:\Users\siddi\Desktop\Datasets\Car Dataset.csv")

In [13]:
dataset.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [14]:
dataset.shape

(4340, 8)

In [15]:
dataset.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [16]:
# Check And Encode The Chatagorical Data

In [17]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [18]:
current_date = 2025
dataset["car_age"] = current_date - dataset["year"]
dataset.drop(columns=["year"],inplace=True)

In [19]:
for colunm in dataset.select_dtypes(include="object").columns:
    print(f"Colunm Name : {colunm} \n Unique values {dataset[colunm].unique()}")

Colunm Name : name 
 Unique values ['Maruti 800 AC' 'Maruti Wagon R LXI Minor' 'Hyundai Verna 1.6 SX' ...
 'Mahindra Verito 1.5 D6 BSIII'
 'Toyota Innova 2.5 VX (Diesel) 8 Seater BS IV'
 'Hyundai i20 Magna 1.4 CRDi']
Colunm Name : fuel 
 Unique values ['Petrol' 'Diesel' 'CNG' 'LPG' 'Electric']
Colunm Name : seller_type 
 Unique values ['Individual' 'Dealer' 'Trustmark Dealer']
Colunm Name : transmission 
 Unique values ['Manual' 'Automatic']
Colunm Name : owner 
 Unique values ['First Owner' 'Second Owner' 'Fourth & Above Owner' 'Third Owner'
 'Test Drive Car']


In [20]:
# encoding

In [21]:
categorical_colunm = dataset.select_dtypes(include="object").columns
print(categorical_colunm)

Index(['name', 'fuel', 'seller_type', 'transmission', 'owner'], dtype='object')


In [22]:
encoder = OrdinalEncoder()
dataset[categorical_colunm] = encoder.fit_transform(dataset[categorical_colunm])

In [23]:
for colunm in dataset.select_dtypes(include="float64").columns:
    print(f"Colunm Name : {colunm} \n Unique values {dataset[colunm].unique()}")

Colunm Name : name 
 Unique values [ 775. 1041.  505. ...  733. 1418.  601.]
Colunm Name : fuel 
 Unique values [4. 1. 0. 3. 2.]
Colunm Name : seller_type 
 Unique values [1. 0. 2.]
Colunm Name : transmission 
 Unique values [1. 0.]
Colunm Name : owner 
 Unique values [0. 2. 1. 4. 3.]


In [24]:
# Spliting The data into train and test

In [25]:
x = dataset.drop("selling_price",axis=1)
y = dataset["selling_price"]

In [26]:
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size=0.2 , random_state=42)

In [27]:
# Linear Regression

In [28]:
lr = LinearRegression()

In [29]:
lr.fit(x_train , y_train)

In [30]:
lr.score(x_test , y_test) * 100 , lr.score(x_train , y_train) * 100

(39.59697625888199, 46.145258219994254)

In [31]:
print(root_mean_squared_error(y,lr.predict(x)))
print(mean_squared_error(y,lr.predict(x)))
print(mean_absolute_error(y,lr.predict(x)))
print(r2_score(y,lr.predict(x))*100)

429171.83256855654
184188461870.2531
235586.52695693568
44.95947063461298


In [32]:
# Lasso

In [33]:
lasso = Lasso(alpha=1)

In [34]:
lasso.fit(x_train , y_train)

In [35]:
lasso.score(x_test , y_test)*100 , lasso.score(x_train , y_train)*100

(39.597061162501234, 46.145258215811616)

In [36]:
print(root_mean_squared_error(y,lasso.predict(x)))
print(mean_squared_error(y,lasso.predict(x)))
print(mean_absolute_error(y,lasso.predict(x)))
print(r2_score(y,lasso.predict(x))*100)

429171.77220964956
184188410061.57132
235586.1543572835
44.95948611645827


In [37]:
# Ridge

In [38]:
ridge = Ridge(alpha=1)

In [39]:
ridge.fit(x_train , y_train)

In [40]:
ridge.score(x_test , y_test)*100 , ridge.score(x_train , y_train)*100

(39.61595825510054, 46.14503610169596)

In [41]:
print(root_mean_squared_error(y,ridge.predict(x)))
print(mean_squared_error(y,ridge.predict(x)))
print(mean_absolute_error(y,ridge.predict(x)))
print(r2_score(y,ridge.predict(x))*100)

429159.0427828852
184177484002.3223
235552.5775959244
44.96275112056786


In [42]:
# Decision Tree

In [43]:
dt = DecisionTreeRegressor(criterion="squared_error",max_depth=19,splitter="random")

In [44]:
dt.fit(x_train , y_train)

In [45]:
dt.score(x_test , y_test)*100 , dt.score(x_train , y_train)*100

(40.62413733857322, 99.50258658243742)

In [46]:
print(root_mean_squared_error(y,dt.predict(x)))
print(mean_squared_error(y,dt.predict(x)))
print(mean_absolute_error(y,dt.predict(x)))
print(r2_score(y,dt.predict(x))*100)

193907.69597234754
37600194557.30437
38752.94020367881
88.76403770539434


In [47]:
df = {
    "criterion" : ["squared_error", "friedman_mse", "absolute_error"],
    "splitter" : ["beest", "random"],
    "max_depth" : [i for i in range(2,20)]
}

In [48]:
gscv = GridSearchCV(DecisionTreeRegressor(),param_grid=df)

In [49]:
gscv.fit(x_train,y_train)

270 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Python312\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "C:\Python312\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "C:\Python312\Lib\site-packages\sklearn\utils\_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.Invali

In [50]:
gscv.best_params_

{'criterion': 'squared_error', 'max_depth': 11, 'splitter': 'random'}

In [51]:
# SVM

In [52]:
svm = SVR()

In [53]:
svm.fit(x_train , y_train)

In [54]:
svm.score(x_train , y_train) * 100 , svm.score(x_test , y_test) * 100

(-7.240810011917698, -6.343585697190268)

In [55]:
print(root_mean_squared_error(y,svm.predict(x)))
print(mean_squared_error(y,svm.predict(x)))
print(mean_absolute_error(y,svm.predict(x)))
print(r2_score(y,svm.predict(x))*100)

598554.775409633
358267819165.6761
301309.928113442
-7.06018293020052


In [56]:
# KNN

In [57]:
knn = KNeighborsRegressor()

In [58]:
knn.fit(x_train , y_train)

In [59]:
knn.score(x_test , y_test) * 100 , knn.score(x_train , y_train) * 100

(26.84643223312666, 60.75251252588616)

In [60]:
print(root_mean_squared_error(y,knn.predict(x)))
print(mean_squared_error(y,knn.predict(x)))
print(mean_absolute_error(y,knn.predict(x)))
print(r2_score(y,knn.predict(x))*100)

389886.9594926741
152011841182.4421
202207.07709677418
54.57472132873045


In [61]:
df = {'n_neighbors':[i for i in range(1,21)]}

In [62]:
gscv2 = GridSearchCV(KNeighborsRegressor(),param_grid=df)

In [63]:
gscv2.fit(x_train , y_train)

In [64]:
gscv2.best_params_ , gscv2.best_score_

({'n_neighbors': 4}, 0.3505756598798526)

In [65]:
# RandomForest

In [66]:
rf = RandomForestRegressor()

In [67]:
rf.fit(x_train , y_train)

In [68]:
rf.score(x_test , y_test)*100 , rf.score(x_train , y_train)*100

(70.17440364828254, 96.8462434795346)

In [69]:
print(root_mean_squared_error(y,rf.predict(x)))
print(mean_squared_error(y,rf.predict(x)))
print(mean_absolute_error(y,rf.predict(x)))
print(r2_score(y,rf.predict(x))*100)

163801.74916477938
26831013029.441303
54329.760057142856
91.98216247882924


In [73]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# R² Score
r2_scores = cross_val_score(rf, x, y, cv=kfold, scoring='r2')
print("R² for each fold:", r2_scores)
print("Mean R²:", np.mean(r2_scores))

R² for each fold: [0.70146234 0.80181983 0.80664824 0.80667522 0.78324551]
Mean R²: 0.7799702283762453


In [None]:
df2 = {'n_estimators':[i for i in range(100,1000,100)],
      'criterion':["squared_error", "absolute_error", "friedman_mse", "poisson"]}

In [None]:
gscvrf = GridSearchCV(RandomForestRegressor(),param_grid=df2)

In [None]:
# gscvrf.fit(x_train , y_train)

In [None]:
testing_score = [lr.score(x_test , y_test) * 100 , lasso.score(x_test , y_test) * 100 , ridge.score(x_test , y_test) * 100
                 , dt.score(x_test , y_test) * 100 , svm.score(x_test , y_test) * 100 , knn.score(x_test , y_test) * 100,
                rf.score(x_test , y_test)*100]

In [None]:
training_score = [lr.score(x_train , y_train)*100 , lasso.score(x_train , y_train)*100 , ridge.score(x_train , y_train)*100 
                  , dt.score(x_train , y_train)*100 , svm.score(x_train , y_train)*100 , knn.score(x_train , y_train)*100,
                 rf.score(x_train , y_train)*100]

In [None]:
R2_Score = [r2_score(y,lr.predict(x))*100 , r2_score(y,lasso.predict(x))*100,
           r2_score(y,ridge.predict(x))*100 , r2_score(y,dt.predict(x))*100,
           r2_score(y,svm.predict(x))*100 , r2_score(y,knn.predict(x))*100,
           r2_score(y,rf.predict(x))*100]

In [None]:
rmse = [root_mean_squared_error(y,lr.predict(x)) , root_mean_squared_error(y,lasso.predict(x)),
        root_mean_squared_error(y,ridge.predict(x)) , root_mean_squared_error(y,dt.predict(x)),
       root_mean_squared_error(y,svm.predict(x)) , root_mean_squared_error(y,knn.predict(x)),
       root_mean_squared_error(y,rf.predict(x))*100]

In [None]:
mse = [mean_squared_error(y,lr.predict(x)) , mean_squared_error(y,lasso.predict(x)),
        mean_squared_error(y,ridge.predict(x)) , mean_squared_error(y,dt.predict(x)),
       mean_squared_error(y,svm.predict(x)) , mean_squared_error(y,knn.predict(x)),
      mean_squared_error(y,rf.predict(x))]

In [None]:
mae = [mean_absolute_error(y,lr.predict(x)) , mean_absolute_error(y,lasso.predict(x)),
      mean_absolute_error(y,ridge.predict(x)) , mean_absolute_error(y,dt.predict(x)),
      mean_absolute_error(y,svm.predict(x)) , mean_absolute_error(y,knn.predict(x)),
      mean_absolute_error(y,rf.predict(x))]

In [None]:
Df = pd.DataFrame({'Algorithm':['LinearRegression','Lasso','Ridge','DecisionTree','SVM','KNN','RandomForest'],'Testing Score':testing_score,
                   'Training Score':training_score,'R2 Sscore':R2_Score,'Root Mean Squared Error':rmse,
                   'Mean Squared Error':mse,'Mean Absolute Error':mae})

In [None]:
Df

In [None]:
# Save both model and encoder using joblib

In [None]:
joblib.dump(rf, "car_price_model.pkl")
joblib.dump(encoder , "encoder.pkl")

In [None]:
x.head(10)

In [None]:
y.head(10)