In [64]:
import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [65]:
# Load the car-price dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv("car_price_prediction_.csv")
# Bare last expression: renders the frame as the cell output.
df

Unnamed: 0,Car ID,Brand,Year,Engine Size,Fuel Type,Transmission,Mileage,Condition,Price,Model
0,1,Tesla,2016,2.3,Petrol,Manual,114832,New,26613.92,Model X
1,2,BMW,2018,4.4,Electric,Manual,143190,Used,14679.61,5 Series
2,3,Audi,2013,4.5,Electric,Manual,181601,New,44402.61,A4
3,4,Tesla,2011,4.1,Diesel,Automatic,68682,New,86374.33,Model Y
4,5,Ford,2009,2.6,Diesel,Manual,223009,Like New,73577.10,Mustang
...,...,...,...,...,...,...,...,...,...,...
2495,2496,Audi,2020,2.4,Petrol,Automatic,22650,Like New,61384.10,Q5
2496,2497,Audi,2001,5.7,Hybrid,Manual,77701,Like New,24710.35,A3
2497,2498,Ford,2021,1.1,Hybrid,Manual,272827,Like New,29902.45,Fiesta
2498,2499,Audi,2002,4.5,Diesel,Manual,229164,Like New,46085.67,Q5


In [66]:
# Print column names, dtypes and non-null counts to check for missing values
# before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Car ID        2500 non-null   int64  
 1   Brand         2500 non-null   object 
 2   Year          2500 non-null   int64  
 3   Engine Size   2500 non-null   float64
 4   Fuel Type     2500 non-null   object 
 5   Transmission  2500 non-null   object 
 6   Mileage       2500 non-null   int64  
 7   Condition     2500 non-null   object 
 8   Price         2500 non-null   float64
 9   Model         2500 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 195.4+ KB


In [67]:
# Year that car ages are measured against. Kept as a named constant so the
# reference point is explicit and can be changed in one place.
REFERENCE_YEAR = 2025

# Derive 'Car Age' from 'Year', then remove the identifier and the now-redundant
# 'Year' column. The guard and errors='ignore' make the cell safe to re-run:
# on a second execution 'Year' and 'Car ID' are already gone, and the cell
# leaves the frame unchanged instead of raising KeyError.
if 'Year' in df.columns:
    # assign() returns a new frame with 'Car Age' appended as the last column.
    df = df.assign(**{'Car Age': REFERENCE_YEAR - df['Year']})
df = df.drop(columns=['Car ID', 'Year'], errors='ignore')


In [68]:
# Display the frame after feature engineering: 'Car Age' has been added and
# 'Car ID' / 'Year' have been removed.
df

Unnamed: 0,Brand,Engine Size,Fuel Type,Transmission,Mileage,Condition,Price,Model,Car Age
0,Tesla,2.3,Petrol,Manual,114832,New,26613.92,Model X,9
1,BMW,4.4,Electric,Manual,143190,Used,14679.61,5 Series,7
2,Audi,4.5,Electric,Manual,181601,New,44402.61,A4,12
3,Tesla,4.1,Diesel,Automatic,68682,New,86374.33,Model Y,14
4,Ford,2.6,Diesel,Manual,223009,Like New,73577.10,Mustang,16
...,...,...,...,...,...,...,...,...,...
2495,Audi,2.4,Petrol,Automatic,22650,Like New,61384.10,Q5,5
2496,Audi,5.7,Hybrid,Manual,77701,Like New,24710.35,A3,24
2497,Ford,1.1,Hybrid,Manual,272827,Like New,29902.45,Fiesta,4
2498,Audi,4.5,Diesel,Manual,229164,Like New,46085.67,Q5,23


In [69]:
# Separate the regression target from the predictors: 'Price' is what the model
# learns to predict, every remaining column is a feature.
y = df["Price"]
X = df.drop("Price", axis=1)

In [70]:
# Feature columns grouped by how they are preprocessed.
# Categorical (string-valued) columns:
cat_cols = [
    "Brand",
    "Fuel Type",
    "Transmission",
    "Condition",
    "Model",
]
# Numeric columns:
num_cols = [
    "Car Age",
    "Engine Size",
    "Mileage",
]

In [71]:
# Column-wise preprocessing: one-hot encode the categorical features and
# standardize the numeric ones. handle_unknown='ignore' lets the encoder accept
# categories at predict time that were not present when it was fitted.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
numeric_step = ('num', StandardScaler(), num_cols)
preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])

In [72]:
# End-to-end estimator: the preprocessing step feeds a random forest regressor.
# The forest is seeded so that refitting on the same data is reproducible.
forest = RandomForestRegressor(random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

In [73]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [74]:
# Fit the whole pipeline on the training split only, so the encoder and scaler
# learn their parameters from training rows and never see the test rows.
model.fit(X_train, y_train)

In [75]:
# Predict prices for the held-out test split; the pipeline applies the fitted
# preprocessing before the regressor.
y_pred = model.predict(X_test)

In [76]:
# Score the test-set predictions against the true prices.
# NOTE(review): the recorded run reports a negative R², i.e. the model does
# worse than always predicting the mean price. Investigate the features and
# the data before relying on this model.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R² Score:", r2)

Mean Squared Error: 809588404.8096877
R² Score: -0.0686883290037057


In [80]:
# Persist the fitted pipeline (preprocessing + regressor) so it can be reloaded
# for inference without retraining. joblib is imported with the other imports
# at the top of the notebook.
# NOTE(review): the recorded evaluation shows a negative R² for this model;
# confirm it is fit for purpose before shipping the artifact.
MODEL_PATH = "car_price_model.pkl"  # single source for the dump target and the message
joblib.dump(model, MODEL_PATH)
print(f"✅ Model saved as {MODEL_PATH}")

✅ Model saved as car_price_model.pkl
