# EDA and Feature Importance — Used Car Price Prediction (India)

This notebook loads the project dataset (it prefers `data/cleaned_cars.csv` and falls back to `data/cars.csv`, then `data/sample_cars.csv`), performs quick exploratory data analysis (head, basic statistics, histograms, correlation heatmap), and trains a small RandomForest to show the top feature importances. Cells are runnable top to bottom and include guards for small datasets (for example, model training is skipped when there are fewer than 10 rows).

In [None]:
# Section: Imports and setup
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

print('pandas', pd.__version__)
print('numpy', np.__version__)
import sklearn
print('scikit-learn', sklearn.__version__)

%matplotlib inline
sns.set_style('whitegrid')


In [None]:
# Section: Load data (prefer cleaned)
# All paths are relative to the notebook's working directory; the data folder
# is expected one level up, under `../data`.
ROOT = Path('..')
DATA_DIR = ROOT / 'data'
CLEANED = DATA_DIR / 'cleaned_cars.csv'
CARS = DATA_DIR / 'cars.csv'
SAMPLE = DATA_DIR / 'sample_cars.csv'

# Candidates in priority order: the first file that exists is loaded.
candidates = [CLEANED, CARS, SAMPLE]
data_path = next((p for p in candidates if p.exists()), None)
if data_path is None:
    # Fail here with every path that was tried, instead of announcing a file
    # that does not exist and letting `read_csv` raise on it.
    raise FileNotFoundError(
        'No dataset found. Looked for: ' + ', '.join(str(p) for p in candidates)
    )

print('Loading data from', str(data_path))
df = pd.read_csv(data_path)
print('Rows, cols:', df.shape)

# show head
display(df.head())

# info and describe
print('\nData types:')
print(df.dtypes)

print('\nDescriptive statistics (numeric):')
display(df.describe())


In [None]:
# Section: Histograms for numeric columns
# Plots up to 8 numeric columns in a two-column grid. Only columns with a
# numeric dtype are plotted, so a text-typed column that happens to share a
# name with one of the preferred features is skipped.
all_numeric = list(df.select_dtypes(include=[np.number]).columns)
numeric_cols = [c for c in all_numeric if c not in ['selling_price']]
# ensure some standard order if available
preferred = ['age','km_driven','mileage','engine','max_power','selling_price']
cols = [c for c in preferred if c in all_numeric] + [c for c in numeric_cols if c not in preferred]

n = len(cols)
if n == 0:
    print('No numeric columns found to plot.')
else:
    plot_cols = cols[:8]
    n_grid_cols = 2
    # Ceiling division: just enough rows to hold every plotted column.
    n_grid_rows = -(-len(plot_cols) // n_grid_cols)
    # squeeze=False always returns a 2-D array of Axes, so flattening is safe
    # even when there is a single row (one or two columns to plot).
    fig, axes = plt.subplots(
        n_grid_rows, n_grid_cols, figsize=(12, 3 * n_grid_rows), squeeze=False
    )
    axes = axes.flatten()
    for ax, c in zip(axes, plot_cols):
        sns.histplot(df[c].dropna(), kde=False, ax=ax, color='steelblue')
        ax.set_title(c)
    # Hide the trailing panel when the number of plotted columns is odd.
    for ax in axes[len(plot_cols):]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()


In [None]:
# Section: Correlation heatmap (numeric features)
# Pairwise correlations between all numeric columns, drawn as an annotated
# heatmap; skipped when fewer than two numeric columns are available.
numeric_frame = df.select_dtypes(include=[np.number])
num_cols = list(numeric_frame.columns)

if len(num_cols) < 2:
    print('Not enough numeric columns for correlation matrix.')
else:
    corr = numeric_frame.corr()
    plt.figure(figsize=(10, 8))
    # Diverging palette centred on zero so the sign of a correlation is visible.
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='vlag', center=0)
    plt.title('Correlation matrix (numeric)')
    plt.show()


In [None]:
# Section: Simple feature importance via RandomForest
#
# Fits a preprocessing + RandomForestRegressor pipeline on the available
# feature columns, reports RMSE and R2 on a 20% hold-out split, and plots the
# 20 largest feature importances. Training is skipped (with a message) when
# there are too few usable rows, no target column, or no feature columns.

features = [
    'brand','model','age','km_driven','mileage','engine','max_power','owner','fuel_type','transmission','seller_type'
]
features = [f for f in features if f in df.columns]

has_target = 'selling_price' in df.columns
# Rows without a target value cannot be used for fitting or scoring, so they
# are dropped before the size check.
model_df = df.dropna(subset=['selling_price']) if has_target else df

if len(model_df) < 10:
    print('Dataset is small (<10 rows). Skipping model training.')
elif not has_target:
    print('No target column `selling_price` found — cannot train model.')
elif not features:
    print('None of the expected feature columns were found — cannot train model.')
else:
    X = model_df[features]
    y = model_df['selling_price']

    # Split features by dtype rather than by name: a column such as `owner`
    # may hold text in one CSV and numbers in another, and scaling a text
    # column would raise during fit.
    numeric_feats = list(X.select_dtypes(include=[np.number]).columns)
    categorical_feats = [c for c in X.columns if c not in numeric_feats]

    # build preprocessor
    # `sparse_output` is the newer keyword; older scikit-learn releases only
    # accept `sparse`, hence the TypeError fallback.
    try:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), numeric_feats),
        ('cat', ohe, categorical_feats)
    ])

    # small RandomForest for importance; keep n_estimators low for speed on small data
    n_estimators = 50 if len(model_df) < 200 else 200
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=42)
    pipe = Pipeline([('preprocessor', preprocessor), ('model', rf)])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        pipe.fit(X_train, y_train)

    preds = pipe.predict(X_test)
    rmse = mean_squared_error(y_test, preds) ** 0.5
    r2 = r2_score(y_test, preds)
    print(f'RandomForest RMSE={rmse:.2f}, R2={r2:.3f}')

    # obtain feature names from preprocessor
    try:
        feature_names = list(pipe.named_steps['preprocessor'].get_feature_names_out())
    except Exception:
        # fallback: build names from numeric + categorical (rough)
        ohe_categories = []
        try:
            ohe_cats = pipe.named_steps['preprocessor'].named_transformers_['cat'].categories_
            for col, cats in zip(categorical_feats, ohe_cats):
                ohe_categories.extend([f"{col}__{v}" for v in cats])
        except Exception:
            ohe_categories = categorical_feats
        feature_names = list(numeric_feats) + list(ohe_categories)

    importances = pipe.named_steps['model'].feature_importances_
    if len(feature_names) != len(importances):
        # The rough fallback above can disagree with the encoded width; use
        # positional names rather than failing on an index length mismatch.
        feature_names = [f'feature_{i}' for i in range(len(importances))]

    fi = pd.Series(importances, index=feature_names)
    fi_sorted = fi.sort_values(ascending=False).head(20)

    # plot
    # NOTE(review): recent seaborn releases appear to deprecate `palette`
    # without `hue` — confirm against the installed version before upgrading.
    plt.figure(figsize=(10,6))
    sns.barplot(x=fi_sorted.values, y=fi_sorted.index, palette='viridis')
    plt.title('Top feature importances (RandomForest)')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()
