<a href="https://colab.research.google.com/github/Utkarsh-Rane43/FODS-PROJECT/blob/main/FODS_PART3_(catboost).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# --- Load -------------------------------------------------------------------
# Read the cardekho dataset CSV from its absolute path.
df = pd.read_csv('/content/cardekho_dataset.csv')

# --- Column groups ----------------------------------------------------------
# categorical_features is handed to CatBoost below as its cat_features list;
# numerical_features is recorded here but not referenced again in this cell.
categorical_features = ['brand', 'model', 'seller_type', 'fuel_type', 'transmission_type']
numerical_features = ['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']

# Cast all categorical columns to the pandas 'category' dtype in one call.
df = df.astype({column: 'category' for column in categorical_features})

# --- Features / target ------------------------------------------------------
# Everything except the target, the 'car_name' column and the 'Unnamed: 0'
# column is used as a predictor.
y = df['selling_price']
X = df.drop(columns=['selling_price', 'car_name', 'Unnamed: 0'])

# --- Train / test split -----------------------------------------------------
# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Fit --------------------------------------------------------------------
# Seeded, silent (verbose=0) CatBoost regressor; the categorical column names
# are passed at fit time.
catboost_model = CatBoostRegressor(random_state=42, verbose=0)
catboost_model.fit(
    X_train,
    y_train,
    cat_features=categorical_features,
)

# Score the fitted model on the held-out test split.
y_pred = catboost_model.predict(X_test)

# Error metrics on the test split; RMSE is derived from MSE via np.sqrt.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report each metric on its own line as "<label><value>".
print("CatBoost Model Performance:")
for label, value in (("MSE: ", mse), ("RMSE: ", rmse), ("R²: ", r2)):
    print(label + str(value))

# Per-feature importance scores from the fitted model, paired with the
# column labels of the feature frame.
feature_importance = catboost_model.feature_importances_
feature_names = X.columns

# Positions of the features ordered from most to least important
# (ascending argsort, then reversed).
indices = np.argsort(feature_importance)[::-1]

# Print a 1-based ranking: "<rank>. <feature> (<importance>)".
print("\nFeature Importance:")
for rank, position in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, feature_names[position], feature_importance[position]))

# Persist the trained model to the current working directory.
catboost_model.save_model("catboost_model.cbm")

CatBoost Model Performance:
MSE: 36846276540.035164
RMSE: 191953.83960743053
R²: 0.9510531776859485

Feature Importance:
1. max_power (47.916235)
2. vehicle_age (23.280563)
3. km_driven (6.212064)
4. engine (6.053022)
5. brand (5.994670)
6. mileage (3.169681)
7. model (2.643551)
8. fuel_type (2.200474)
9. transmission_type (1.587318)
10. seats (0.594274)
11. seller_type (0.348149)


In [35]:
# Single hand-written listing used to try out the trained model.
sample_data = pd.DataFrame({
    'brand': ['Mahindra'],
    'model': ['Bolero'],
    'vehicle_age': [10],
    'km_driven': [25000],
    'seller_type': ['Individual'],
    'fuel_type': ['Diesel'],
    'transmission_type': ['Manual'],
    'mileage': [13.6],
    'engine': [2523],
    'max_power': [63.0],
    'seats': [7]
})

# Give the categorical columns the same 'category' dtype the training frame
# was given before fitting.
sample_data = sample_data.astype(
    {column: 'category' for column in categorical_features}
)

# Select the columns in the order the model recorded at fit time, so the
# prediction does not depend on the key order of the dict literal above.
# A feature missing from the sample raises a KeyError here.
sample_data = sample_data[catboost_model.feature_names_]

# Predict the selling price for the single sample row.
predicted_price = catboost_model.predict(sample_data)

print("Predicted Selling Price: " + str(predicted_price[0]))

Predicted Selling Price: 382111.63076074515
