<a href="https://colab.research.google.com/github/Utkarsh-Rane43/FODS-PROJECT/blob/main/FODS_PART3_(xgboost).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# --- Data preparation --------------------------------------------------------
# Read the listings CSV from the current working directory.
df = pd.read_csv('cardekho_dataset.csv')

# These two columns are dropped before any features are built, so they never
# reach the model.
df = df.drop(columns=['Unnamed: 0', 'car_name'])

# Column groups: the first list is integer-coded below; the second is listed
# for reference and is not transformed in this cell.
categorical_features = ['brand', 'model', 'seller_type', 'fuel_type', 'transmission_type']
numerical_features = ['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']

# Replace every categorical column with integer codes. One encoder is fitted
# per column and kept in `le_dict` so the identical mapping can be re-applied
# to new rows at prediction time.
le_dict = {}
for cat_feature in categorical_features:
    le = LabelEncoder()
    column_as_text = df[cat_feature].astype(str)
    encoded_column = le.fit_transform(column_as_text)
    df[cat_feature] = encoded_column
    le_dict[cat_feature] = le

# Target is the selling price; every remaining column is a model input.
y = df['selling_price']
X = df.drop(columns='selling_price')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an XGBoost regressor (library-default hyperparameters, fixed seed) on the
# training portion only.
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X=X_train, y=y_train)

# Predictions for the held-out rows, used by the evaluation below.
y_pred = xgb_model.predict(X_test)

# Held-out metrics: mean squared error, its square root (same units as the
# target), and the coefficient of determination.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Emit the report as a single print call, one metric per line.
print(
    "Model Performance:",
    f"Mean Squared Error: {mse:.2f}",
    f"Root Mean Squared Error: {rmse:.2f}",
    f"R-squared Score: {r2:.2f}",
    sep="\n",
)

# Feature importance
# One importance score per input column; scores and names are paired by
# position with the columns of X.
feature_importance = xgb_model.feature_importances_
feature_names = X.columns

# Sort feature importances in descending order (argsort is ascending, so reverse).
indices = np.argsort(feature_importance)[::-1]

# Show at most ten rows: indexing a fixed range(10) would raise IndexError if
# the model had been trained on fewer than ten columns.
top_n = min(10, len(indices))

# The leading "\n" prints a blank separator line before the heading. It must be
# the two-character escape; a backslash followed by a real line break inside
# the literal is a line continuation and prints nothing.
print("\nTop 10 Feature Importance:")
for rank, idx in enumerate(indices[:top_n], start=1):
    print(f"{rank}. {feature_names[idx]}: {feature_importance[idx]:.4f}")

# Sample input data for prediction
# A single hypothetical listing, given with raw (un-encoded) category strings.
sample_data = pd.DataFrame({
    'brand': ['Hyundai'],
    'model': ['i20'],
    'vehicle_age': [5],
    'km_driven': [30000],
    'seller_type': ['Individual'],
    'fuel_type': ['Petrol'],
    'transmission_type': ['Manual'],
    'mileage': [18.0],
    'engine': [1197],
    'max_power': [83.0],
    'seats': [5]
})

# Encode categorical features in sample data with the encoders fitted on the
# training frame, so each string maps to the same integer code used in training.
for cat_feature in categorical_features:
    le = le_dict[cat_feature]
    raw_values = sample_data[cat_feature].astype(str)

    # The encoder only knows the values it was fitted on. Check up front so the
    # error names the offending column and values instead of surfacing as a
    # generic failure from inside transform().
    unseen = sorted(set(raw_values) - set(le.classes_))
    if unseen:
        raise ValueError(
            f"Column '{cat_feature}' contains value(s) not seen during training: {unseen}"
        )

    sample_data[cat_feature] = le.transform(raw_values)

# Put the columns in the same order as the training frame so the prediction
# does not depend on the key order of the dict literal above. A missing column
# fails here with a KeyError that names it.
sample_data = sample_data[X.columns]

# Predict the selling price using the trained model
predicted_price = xgb_model.predict(sample_data)

# Leading "\n" (the escape, not a backslash plus a physical line break) prints
# a blank separator line before the result.
print(f"\nPredicted Selling Price: {predicted_price[0]:.2f}")

# Persist the fitted model to the working directory.
# NOTE(review): '.cbm' does not appear to be one of XGBoost's documented model
# extensions ('.json' / '.ubj'), so the on-disk format likely depends on the
# installed XGBoost version. Confirm whether anything loads this exact filename
# before renaming it.
model_path = 'xgb_model.cbm'
xgb_model.save_model(model_path)


Model Performance:
Mean Squared Error: 50141289193.42
Root Mean Squared Error: 223922.51
R-squared Score: 0.93
Top 10 Feature Importance:
1. max_power: 0.6183
2. engine: 0.1675
3. vehicle_age: 0.0868
4. km_driven: 0.0484
5. model: 0.0213
6. brand: 0.0146
7. transmission_type: 0.0118
8. fuel_type: 0.0116
9. mileage: 0.0094
10. seats: 0.0069
Predicted Selling Price: 549963.31


