## Essential Libraries

In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [25]:
import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

## Importing Dataset

In [26]:
# Location of the CarDekho CSV. A raw string keeps the Windows backslashes
# literal; the previous plain string relied on invalid escape sequences
# (such as "\M" and "\P"), which newer Python versions warn about.
# NOTE(review): this is an absolute, machine-specific path - adjust it to
# wherever the CSV lives before running.
DATA_PATH = r'E:\Machine Learning\Projects\Car Price Prediction\cardekho.csv'

cd = pd.read_csv(DATA_PATH)
print(cd.shape)
# DataFrame.info() prints its own report and returns None, so it is called
# directly instead of being wrapped in print() (which added a stray "None").
cd.info()
cd.head(5)

(8128, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                8128 non-null   object 
 1   year                8128 non-null   int64  
 2   selling_price       8128 non-null   int64  
 3   km_driven           8128 non-null   int64  
 4   fuel                8128 non-null   object 
 5   seller_type         8128 non-null   object 
 6   transmission        8128 non-null   object 
 7   owner               8128 non-null   object 
 8   mileage(km/ltr/kg)  7907 non-null   float64
 9   engine              7907 non-null   float64
 10  max_power           7913 non-null   object 
 11  seats               7907 non-null   float64
dtypes: float64(3), int64(3), object(6)
memory usage: 762.1+ KB
None


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0


### Checking for null values

In [27]:
# Count the missing entries in each column.
cd.isnull().sum()

name                    0
year                    0
selling_price           0
km_driven               0
fuel                    0
seller_type             0
transmission            0
owner                   0
mileage(km/ltr/kg)    221
engine                221
max_power             215
seats                 221
dtype: int64

In [28]:
# Preview the first five rows (head() defaults to five).
cd.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0


## Data Preprocessing

In [29]:
# Dropping null values.
# Rename the mileage column to a plain identifier, then drop rows without a mileage value.
cd = cd.rename(columns={'mileage(km/ltr/kg)': 'mileage'})
cd = cd.dropna(subset=['mileage'])

# max_power is loaded as text: take the first number found in each entry and
# cast it to float. Entries with no number become NaN and are dropped below.
#  - the regex is a raw string so "\d" is not treated as a string escape;
#  - astype(str) keeps this cell re-runnable once the column is already numeric;
#  - expand=False returns a Series rather than a one-column DataFrame.
cd['max_power'] = (
    cd['max_power']
    .astype(str)
    .str.extract(r'(\d+\.\d+|\d+)', expand=False)
    .astype(float)
)
# .copy() gives an independent frame, so the column assignments below are unambiguous.
cd = cd.dropna(subset=['max_power']).copy()

# Label encoding owner, transmission, seller type and fuel.
# One encoder is fitted per column and kept in `label_encoders`, so the text
# label behind each integer code can be recovered later with
# label_encoders[column].inverse_transform(...).
cols = ['owner','transmission','seller_type','fuel']
label_encoders = {}
for i in cols:
    le = LabelEncoder()
    cd[i] = le.fit_transform(cd[i])
    label_encoders[i] = le


In [30]:
# Show the first five rows of the frame.
cd.head(5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,1,1,1,0,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,1,1,1,2,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,3,1,1,4,17.7,1497.0,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,1,1,1,0,23.0,1396.0,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,3,1,1,0,16.1,1298.0,88.2,5.0


# Splitting Dataset into Feature and Target Columns

In [31]:
# Target is the selling price; features are every column except the target and the car name.
y = cd['selling_price']
x = cd.drop(['selling_price', 'name'], axis=1)
# Confirm that no feature column still contains missing values.
x.isnull().sum()

year            0
km_driven       0
fuel            0
seller_type     0
transmission    0
owner           0
mileage         0
engine          0
max_power       0
seats           0
dtype: int64

### Split for Train and Test

In [32]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Model Training

In [33]:
# Thresholds for the rough fit diagnosis at the end of this cell.
OVERFIT_TRAIN_R2 = 0.90    # train R2 must exceed this before overfitting is considered
OVERFIT_GAP = 0.15         # ...and train R2 must exceed test R2 by more than this
UNDERFIT_TRAIN_R2 = 0.50   # train R2 below this is reported as underfitting

# A fixed random_state makes the forest, and therefore the metrics printed
# below, reproducible from one run to the next.
rf = RandomForestRegressor(random_state=42)

print(f"{'Model':<20} | {'Train R2':<10} | {'Test R2':<10} | {'RMSE':<12} | {'Status'}")
print("-" * 75)

# Fit model
rf.fit(x_train, y_train)

# Predictions
y_train_pred = rf.predict(x_train)
y_test_pred = rf.predict(x_test)

# R2 (coefficient of determination) on each split. It is printed with a
# percent format below, but it is a regression score, not a classification accuracy.
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

# Root mean squared error on the test split, in the same units as the target.
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Diagnose fit from the train/test R2 values.
if r2_train > OVERFIT_TRAIN_R2 and (r2_train - r2_test) > OVERFIT_GAP:
    status = "Overfitting"
elif r2_train < UNDERFIT_TRAIN_R2:
    status = "Underfitting"
else:
    status = "Good Fit"

print(f"{'Random Forest':<20} | {r2_train:>10.2%} | {r2_test:>10.2%} | {rmse:>12,.2f} | {status}")

Model                | Train R2   | Test R2    | RMSE         | Status
---------------------------------------------------------------------------
Random Forest        |     99.42% |     98.39% |   105,822.60 | Good Fit


# Model Saving

In [None]:
from sklearn.base import clone
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Columns routed through the encoder vs. passed through unchanged.
categorical_cols = ['fuel', 'seller_type', 'transmission', 'owner']
numerical_cols = ['year', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']

# handle_unknown='use_encoded_value' maps any category value that was not seen
# during fit to unknown_value (-1) instead of raising at prediction time.
robust_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1
)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', robust_encoder, categorical_cols),
        ('num', 'passthrough', numerical_cols)
    ]
)

# Create the full pipeline.
# clone(rf) yields an unfitted estimator with the same hyper-parameters.
# Passing `rf` itself would refit the already evaluated model in place,
# because Pipeline.fit fits the step objects it is given.
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', clone(rf))
])

# Fit on the full feature frame `x` and target `y` (train and test rows together).
# The name is lowercase `x`; `X` is never defined in this notebook.
# NOTE(review): the categorical columns of `x` already hold the integer codes
# written by the label-encoding step, so the encoder above is fitted on those
# codes, not on the original text labels. To accept raw text categories at
# prediction time, fit this pipeline on the un-encoded columns instead.
final_pipeline.fit(x, y)

# Persist the fitted pipeline to disk.
joblib.dump(final_pipeline, 'car_price_pipeline.pkl')

['car_price_pipeline.pkl']