In [1]:
# Standard libraries
import os  # For working with file paths and directories

# PyTorch imports for model loading and deep learning
import torch
import torch.nn as nn  # For defining and modifying neural network layers

# Data handling and image processing
import pandas as pd  # For reading and manipulating tabular data
from PIL import Image  # For opening and processing image files

# Image transformations and model
from torchvision import transforms  # For preprocessing image data
from torchvision.models import efficientnet_b0  # Pretrained EfficientNet B0 model

# Machine learning utilities from scikit-learn
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.preprocessing import OneHotEncoder  # For encoding categorical features
from sklearn.ensemble import RandomForestRegressor  # For training a regression model
from sklearn.metrics import mean_absolute_error  # For evaluating prediction error

# Gradio for creating a web-based user interface
import gradio as gr


In [2]:
# Define the path to the Excel file containing tabular car data
excel_path = r"C:\Users\SHAHZOR AHMED\OneDrive\Desktop\Major project\A new approach\final car details.xlsx"

# Load the Excel file into a pandas DataFrame
df = pd.read_excel(excel_path)


In [3]:
# Set the device to GPU if available, else fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load EfficientNet-B0 model without pretrained ImageNet weights
model = efficientnet_b0(pretrained=False)

# Get the number of input features to the classifier layer
num_ftrs = model.classifier[1].in_features

# Replace the default classifier with a new one for 3 output classes
# Classes: dent, no_damage, scratch
model.classifier = nn.Sequential(
    nn.Dropout(0.4),  # Add dropout to reduce overfitting
    nn.Linear(num_ftrs, 3)  # Output layer for 3 damage classes
)

# Load the trained model weights
model.load_state_dict(torch.load(
    r"C:\Users\SHAHZOR AHMED\OneDrive\Desktop\Major project\A new approach\image_classification_dataset\best_model.pth",
    map_location=device
))

# Move model to the appropriate device
model.to(device)

# Set model to evaluation mode (disables dropout, etc.)
model.eval()

# Define preprocessing steps for image classification
damage_transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to 224x224 as expected by EfficientNet
    transforms.ToTensor(),  # Convert image to PyTorch tensor
    transforms.Normalize([0.485, 0.456, 0.406],  # Normalize using ImageNet mean
                         [0.229, 0.224, 0.225])  # and standard deviation
])

# Define class labels (order must match the model's training)
class_names = ['dent', 'no_damage', 'scratch']

# Path to the folder containing raw vehicle images
image_folder = r"C:\Users\SHAHZOR AHMED\OneDrive\Desktop\Major project\phase 1\Images Dataset"




In [4]:
# This function attempts to resolve the full path to an image file
# by checking common image file extensions.
def get_image_path(image_id):
    for ext in [".jpg", ".png", ".jpeg"]:
        # Construct the full file path using each extension
        candidate = os.path.join(image_folder, image_id + ext)
        
        # If the file exists with the current extension, return the full path
        if os.path.exists(candidate):
            return candidate

    # Return None if no file is found with any of the tested extensions
    return None


In [5]:
# Predict the damage class ('dent', 'scratch', or 'no_damage') for a given image ID
def predict_damage(image_id):
    # Get the full path of the image using its ID
    image_path = get_image_path(image_id)

    # If the image doesn't exist, log a warning and return a default value
    if image_path is None:
        print(f" Image not found: {image_id}")
        return "not_found"

    # Open the image and convert it to RGB format
    image = Image.open(image_path).convert("RGB")

    # Apply the same preprocessing used during model training
    img_tensor = damage_transform(image).unsqueeze(0).to(device)

    # Disable gradient computation for inference
    with torch.no_grad():
        outputs = model(img_tensor)  # Forward pass through EfficientNet
        _, pred = torch.max(outputs, 1)  # Get the index of the highest score (predicted class)

    # Return the corresponding damage class label
    return class_names[pred.item()]


In [6]:
# Apply the damage prediction function to each image column
# This adds 4 new columns with the predicted damage class for each side of the car
df["FrontDamage"] = df["Front Image"].apply(predict_damage)
df["BackDamage"] = df["Back Image"].apply(predict_damage)
df["LeftDamage"] = df["Left Image"].apply(predict_damage)
df["RightDamage"] = df["Right Image"].apply(predict_damage)

# Create a user-friendly damage summary for each row
# The summary combines all 4 predicted damage labels into a readable format
df["Damage Summary"] = df.apply(lambda row: f"""
Damage Summary:
- Front: {row['FrontDamage'].capitalize()}
- Back: {row['BackDamage'].capitalize()}
- Left: {row['LeftDamage'].capitalize()}
- Right: {row['RightDamage'].capitalize()}
""", axis=1)

# Define the output file path and save the updated DataFrame to CSV
output_path = r"C:\Users\SHAHZOR AHMED\OneDrive\Desktop\Major project\A new approach\final_car_details_with_damage.csv"
df.to_csv(output_path, index=False)

# Confirm the save operation in the console
print("Damage labels and summary added and saved to:")
print(output_path)


Damage labels and summary added and saved to:
C:\Users\SHAHZOR AHMED\OneDrive\Desktop\Major project\A new approach\final_car_details_with_damage.csv


In [7]:
# Load the updated dataset that includes:
# - Tabular car features
# - Predicted damage labels for all 4 sides
# - A human-readable damage summary
df = pd.read_csv(r"C:\Users\SHAHZOR AHMED\OneDrive\Desktop\Major project\A new approach\final_car_details_with_damage.csv")

# Display the first few rows of the dataset to verify its structure and content
df.head()


Unnamed: 0,ID,Brand,Model,Year,kilometers Driven,Fuel Type,Transmission,Price,Number of Owners,Colour,Front Image,Back Image,Left Image,Right Image,FrontDamage,BackDamage,LeftDamage,RightDamage,Damage Summary
0,2013001,Tata Motors,Nexon,2017,30000,Petrol,Manual,650000,1,Red,2013001_1,2013001_2,2013001_3,2013001_4,no_damage,no_damage,no_damage,no_damage,\nDamage Summary:\n- Front: No_damage\n- Back:...
1,2013002,Tata Motors,Nexon,2018,79000,Diesel,Automatic,799000,1,Red,2013002_1,2013002_2,2013002_3,2013002_4,no_damage,dent,dent,dent,\nDamage Summary:\n- Front: No_damage\n- Back:...
2,2013003,Tata Motors,Nexon,2017,49317,Petrol,Automatic,800000,1,Blue,2013003_1,2013003_2,2013003_3,2013003_4,scratch,dent,no_damage,no_damage,\nDamage Summary:\n- Front: Scratch\n- Back: D...
3,2013004,Tata Motors,Nexon,2018,100000,Petrol,Manual,790000,1,White,2013004_1,2013004_2,2013004_3,2013004_4,no_damage,dent,scratch,no_damage,\nDamage Summary:\n- Front: No_damage\n- Back:...
4,2013005,Tata Motors,Nexon,2018,76000,Petrol,Manual,650000,1,Black,2013005_1,2013005_2,2013005_3,2013005_4,scratch,no_damage,dent,dent,\nDamage Summary:\n- Front: Scratch\n- Back: N...


In [8]:
# Import necessary libraries for model training and evaluation

import pandas as pd  # For data manipulation and loading
import joblib  # For saving and loading trained models and encoders

# Scikit-learn utilities for training and preprocessing
from sklearn.model_selection import train_test_split  # For splitting data into train and validation sets
from sklearn.preprocessing import OneHotEncoder, StandardScaler  # For encoding categorical features and normalizing numerical ones
from sklearn.metrics import mean_absolute_error  # For evaluating model performance

# XGBoost library for training a high-performance regression model
from xgboost import XGBRegressor

# RandomizedSearchCV for hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# Locale for formatting currency output
import locale
locale.setlocale(locale.LC_ALL, 'en_IN')  # Set locale to Indian format for currency display


'en_IN'

In [9]:
# Load the dataset that includes vehicle details and predicted damage labels
df = pd.read_csv(r"C:\Users\SHAHZOR AHMED\OneDrive\Desktop\Major project\A new approach\final_car_details_with_damage.csv")

# Create a copy of the dataset to use for model training
# This helps preserve the original dataset for reference or other use
df_model = df.copy()


In [10]:
# Normalize the 'kilometers Driven' feature using standard scaling
# This scales the values to have a mean of 0 and standard deviation of 1,
# which helps many machine learning models perform better
scaler = StandardScaler()
df_model['kilometers Driven'] = scaler.fit_transform(df_model[['kilometers Driven']])

# Convert categorical damage labels into numerical severity scores
# 'no_damage' = 0, 'scratch' = 1, 'dent' = 2
# This allows the model to interpret the damage information as ordinal values
damage_map = {'no_damage': 0, 'scratch': 1, 'dent': 2}
for col in ['FrontDamage', 'BackDamage', 'LeftDamage', 'RightDamage']:
    df_model[col] = df_model[col].map(damage_map)


In [11]:
# Define the input features to be used for price prediction
# Includes tabular features and numerically encoded damage scores
features = [
    'Brand', 'Model', 'Year', 'kilometers Driven', 'Fuel Type',
    'Transmission', 'Number of Owners', 'Colour',
    'FrontDamage', 'BackDamage', 'LeftDamage', 'RightDamage'
]

# Separate the input features (X) and the target variable (y)
X = df_model[features]
y = df_model['Price']  # The resale price is the prediction target

# One-hot encode categorical features (e.g., Brand, Model, Fuel Type)
# This transforms them into binary vectors suitable for model input
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(X)

# Split the dataset into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


In [12]:
# Define a parameter grid for hyperparameter tuning of the XGBoost regressor
# These parameters control model complexity, learning speed, and sampling strategy
param_grid = {
    'n_estimators': [100, 200, 300],          # Number of trees in the ensemble
    'learning_rate': [0.01, 0.05, 0.1],       # Step size shrinkage used in updates
    'max_depth': [3, 5, 7],                   # Maximum depth of each tree
    'subsample': [0.6, 0.8, 1.0],             # Fraction of data used for each tree
    'colsample_bytree': [0.6, 0.8, 1.0]       # Fraction of features used per tree
}

# Initialize the XGBoost Regressor with a fixed objective and random seed
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

# Perform randomized search over the parameter grid
# - n_iter: number of random parameter combinations to try
# - scoring: use negative MAE as the optimization metric
# - cv: use 3-fold cross-validation to evaluate each combination
# - n_jobs=-1: use all available CPU cores
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Train the model and find the best parameter combination
random_search.fit(X_train, y_train)

# Extract the best performing model from the search
best_model = random_search.best_estimator_


Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [13]:
# Use the best trained XGBoost model to predict prices on the validation set
y_pred = best_model.predict(X_val)

# Calculate the Mean Absolute Error (MAE) between predicted and actual prices
mae = mean_absolute_error(y_val, y_pred)

# Display the MAE in Indian currency format for better interpretability
print("XGBoost Validation MAE:", locale.format_string("₹%.2f", mae, grouping=True))


XGBoost Validation MAE: ₹1,39,548.09


In [14]:
# Save the trained XGBoost model to a file for later use (e.g., in the web app)
joblib.dump(best_model, "price_model_xgb.pkl")

# Save the one-hot encoder used for categorical feature transformation
joblib.dump(encoder, "encoder_xgb.pkl")

# Save the scaler used to normalize the 'kilometers Driven' feature
joblib.dump(scaler, "scaler_km.pkl")


['scaler_km.pkl']

In [15]:
# Import additional evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Compute Mean Squared Error (MSE)
mse = mean_squared_error(y_val, y_pred)

# Compute Root Mean Squared Error (RMSE) by taking the square root of MSE
rmse = np.sqrt(mse)

# Compute R² Score (coefficient of determination) to measure model fit
r2 = r2_score(y_val, y_pred)

# Display evaluation metrics using Indian currency formatting
print("MAE:", locale.format_string("₹%.2f", mae, grouping=True))   # Already computed earlier
print("RMSE:", locale.format_string("₹%.2f", rmse, grouping=True))  # Measures typical prediction error
print(f"R² Score: {r2:.4f}")  # Indicates how well the model explains variance (closer to 1 is better)


MAE: ₹1,39,548.09
RMSE: ₹2,06,563.75
R² Score: 0.9607
