# 🚗 Car Price Prediction using CSV Dataset

In [1]:
# 📦 Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [4]:
# 📂 Load Data from CSV
# The dataset file named below must be in the same directory as this notebook.
DATA_PATH = 'car_price_prediction_.csv'
df = pd.read_csv(DATA_PATH)
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,Car ID,Brand,Year,Engine Size,Fuel Type,Transmission,Mileage,Condition,Price,Model
0,1,Tesla,2016,2.3,Petrol,Manual,114832,New,26613.92,Model X
1,2,BMW,2018,4.4,Electric,Manual,143190,Used,14679.61,5 Series
2,3,Audi,2013,4.5,Electric,Manual,181601,New,44402.61,A4
3,4,Tesla,2011,4.1,Diesel,Automatic,68682,New,86374.33,Model Y
4,5,Ford,2009,2.6,Diesel,Manual,223009,Like New,73577.1,Mustang


In [5]:
# 🧹 Drop Irrelevant Columns
# 'Car ID' and 'Model' are removed so they are not used as features.
unused_columns = ['Car ID', 'Model']
df = df.drop(labels=unused_columns, axis=1)

In [6]:
# 🔁 Encode Categorical Columns
# One-hot encode the text-valued columns. drop_first=True omits the first
# level of each column, so that level is represented by all of its dummy
# columns being False.
categorical_columns = ['Brand', 'Fuel Type', 'Transmission', 'Condition']
df = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)
df.head()

Unnamed: 0,Year,Engine Size,Mileage,Price,Brand_BMW,Brand_Ford,Brand_Honda,Brand_Mercedes,Brand_Tesla,Brand_Toyota,Fuel Type_Electric,Fuel Type_Hybrid,Fuel Type_Petrol,Transmission_Manual,Condition_New,Condition_Used
0,2016,2.3,114832,26613.92,False,False,False,False,True,False,False,False,True,True,True,False
1,2018,4.4,143190,14679.61,True,False,False,False,False,False,True,False,False,True,False,True
2,2013,4.5,181601,44402.61,False,False,False,False,False,False,True,False,False,True,True,False
3,2011,4.1,68682,86374.33,False,False,False,False,True,False,False,False,False,False,True,False
4,2009,2.6,223009,73577.1,False,True,False,False,False,False,False,False,False,True,False,False


In [7]:
# 🔢 Feature Scaling
# Standardise the numeric columns. The scaler is fitted ONLY on the rows that
# the Train-Test Split cell will assign to the training set, so test-set
# statistics do not leak into preprocessing. train_test_split picks rows from
# the row count, test_size and random_state alone, so repeating the same
# parameters here (test_size=0.33, random_state=42) selects the same rows.
# Keep these two values in sync with the Train-Test Split cell.
numeric_cols = ['Year', 'Engine Size', 'Mileage']
train_rows, _ = train_test_split(np.arange(len(df)), test_size=0.33, random_state=42)

scaler = StandardScaler()
scaler.fit(df.iloc[train_rows][numeric_cols])

# NOTE: this overwrites the raw values in `df`. Run the cell once per freshly
# loaded frame; re-running it would refit the scaler on already-scaled data.
df[numeric_cols] = scaler.transform(df[numeric_cols])

In [8]:
# 🔀 Train-Test Split
# 'Price' is the prediction target; every other column is a feature.
target_column = 'Price'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 33% of the rows for evaluation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [9]:
# 🤖 Train Model
# Random forest regressor with default hyper-parameters and a fixed seed,
# fitted on the training split only.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train, y=y_train)

In [10]:
# 📊 Evaluate Model
# Score the model on the held-out test split.
y_pred = model.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, whereas the
# explicit square root works on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

# R² compares the model against always predicting the mean of y_test;
# a negative value means the model does worse than that baseline.
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R² Score: {r2}")

RMSE: 28329.365192371388
R² Score: -0.05799936382064197




In [11]:
# 💾 Save Model
# The model alone cannot score new data: inference also needs the fitted
# scaler and the exact training column order. Persist them next to the model
# so a fresh session can rebuild the same preprocessing.
joblib.dump(scaler, 'car_price_scaler.pkl')
joblib.dump(list(X.columns), 'car_price_features.pkl')

# The model is dumped last so the cell output is still ['car_price_model.pkl'].
joblib.dump(model, 'car_price_model.pkl')

['car_price_model.pkl']

## 🔮 Predict Price for Custom Car Input

In [12]:
# 🔮 Predict Price for Any Custom Car Input

# Step 1: User Input
# Category values must be spelled exactly as in the training data. A value
# that has no dummy column in X (an unseen value, or the level dropped by
# drop_first) is encoded as all zeros for that feature.
user_input = {
    'Brand': 'Tesla',              # Audi, BMW, Ford, Honda, Mercedes, Tesla or Toyota
    'Year': 2016,
    'Engine Size': 2.3,
    'Fuel Type': 'Petrol',         # 'Diesel', 'Electric', 'Hybrid' or 'Petrol'
    'Transmission': 'Manual',      # 'Automatic' or 'Manual'
    'Mileage': 114832,
    'Condition': 'New'            # 'Like New', 'New' or 'Used'
}

# Step 2: Convert to DataFrame (a single-row frame)
input_df = pd.DataFrame([user_input])

# Step 3: Apply same encoding as training set
# reindex aligns the row to the training columns in one step: dummy columns
# missing from this row are added as 0, columns the model never saw are
# dropped, and the column order matches X.
input_df = pd.get_dummies(input_df).reindex(columns=X.columns, fill_value=0)

# Step 4: Scale numeric features with the scaler fitted during training
numeric_cols = ['Year', 'Engine Size', 'Mileage']
input_df[numeric_cols] = scaler.transform(input_df[numeric_cols])

# Step 5: Predict (predict returns an array; take the single row's value)
predicted_price = model.predict(input_df)[0]
print(f"💰 Predicted Car Price: ${predicted_price:,.2f}")

💰 Predicted Car Price: $35,526.74
