# Car Price Prediction — Improved Notebook

This notebook:
- Loads `/mnt/data/quikr_car.csv` (your uploaded dataset).
- Performs EDA with extra visualizations.
- Preprocesses categorical and numeric features using a sklearn `ColumnTransformer`.
- Trains multiple models (Linear Regression, RandomForest, XGBoost if installed).
- Compares model performance and saves the final pipeline (encoder + model) to a `.pkl` for web app use.

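**Files produced by running the notebook (all written to `/mnt/data/`):**
- `model_pipeline.pkl` — saved sklearn pipeline (preprocessor + model)
- `model_meta.pkl` — feature lists, target column name, and the name of the selected model
- `streamlit_app_example.py` — source of the example Streamlit app

`CAR_PRICE_PREDICTION_UPDATED.ipynb` is this notebook itself (the file you are opening); running the cells does not regenerate it.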

Run all cells to reproduce results. If you share the notebook with others, they must have the dataset at the same path or update the path accordingly.

In [None]:

# Standard imports
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ML imports
from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import joblib

# Optional dependency: XGBoost is only offered as a model when it can be imported.
# Any failure while importing it (not just a missing package) disables it.
try:
    from xgboost import XGBRegressor
except Exception:
    XGBRegressor = None

# The flag mirrors whether the estimator class is actually usable.
xgb_available = XGBRegressor is not None

print('xgboost available:', xgb_available)


In [None]:

# Read the raw CSV into `df`, failing early with a clear message when the file is absent.
data_path = Path('/mnt/data/quikr_car.csv')
if data_path.exists():
    df = pd.read_csv(data_path)
else:
    raise FileNotFoundError(f"Dataset not found at {data_path}. Please upload or change the path.")

print('Dataset shape:', df.shape)
# Last expression of the cell: rendered as a preview of the first rows.
df.head()


In [None]:

# Quick info and missing values
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" below the report.
df.info()
# Transposed summary: one row per column, statistics for all dtypes.
display(df.describe(include='all').T)
print('\nMissing values per column:')
print(df.isnull().sum())


In [None]:

# Visualizations
# Adjust these columns depending on your dataset's actual column names.
# Common columns in quikr car data: 'price', 'brand', 'model', 'year', 'km_driven', 'fuel_type', 'owner_type', 'transmission', 'location'

# Try to auto-detect some likely columns
cols = df.columns.tolist()
print('Columns detected:', cols)

# The first candidate name present in the data wins; None means nothing matched.
price_col = next(
    (c for c in ['price', 'Price', 'selling_price', 'Selling_Price'] if c in cols),
    None,
)

numeric_candidates = df.select_dtypes(include=['int64','float64']).columns.tolist()
cat_candidates = df.select_dtypes(include=['object','category']).columns.tolist()

# '\n' (not '\\n') so a blank line is printed rather than a literal backslash-n.
print('\nNumeric candidates:', numeric_candidates)
print('Categorical candidates:', cat_candidates)

# Distribution of price (if present)
if price_col:
    price_values = df[price_col]
    if not pd.api.types.is_numeric_dtype(price_values):
        # In case the price column was read as text (thousands separators,
        # placeholder strings, ...): strip commas and coerce, turning anything
        # unparsable into NaN so the plots below stay numeric.
        price_values = pd.to_numeric(
            price_values.astype(str).str.replace(',', '', regex=False).str.strip(),
            errors='coerce',
        )
    price_values = price_values.dropna()

    if price_values.empty:
        print(f'Column {price_col!r} has no numeric values to plot.')
    else:
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.histplot(price_values, bins=50, kde=True, ax=ax)
        ax.set_title('Price distribution')
        ax.set_xlabel(price_col)
        plt.show()

        fig, ax = plt.subplots(figsize=(8, 5))
        sns.boxplot(x=price_values, ax=ax)
        ax.set_title('Price boxplot')
        ax.set_xlabel(price_col)
        plt.show()
else:
    print('No obvious price column detected; please set price_col variable to your target column name.')

# Correlation heatmap (numeric)
if len(numeric_candidates) >= 2:
    fig, ax = plt.subplots(figsize=(10, 8))
    corr = df[numeric_candidates].corr()
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
    ax.set_title('Numeric features correlation matrix')
    plt.show()
else:
    print('Not enough numeric columns for correlation heatmap.')


In [None]:

# Preprocessing + Model training pipeline
# The target is auto-detected from the same candidate names the EDA cell uses for
# the price column. Set TARGET manually if your column is named differently.
TARGET = next(
    (c for c in ('price', 'Price', 'selling_price', 'Selling_Price') if c in df.columns),
    None,
)
if TARGET is None:
    raise ValueError('Could not auto-detect target column. Please set TARGET manually to your target column name.')

# Build a float target. A text-typed column (e.g. values with thousands separators
# or placeholder strings) would make a plain astype(float) raise, so it is coerced
# instead and unparsable entries become NaN.
target_raw = df[TARGET]
if pd.api.types.is_numeric_dtype(target_raw):
    target_numeric = target_raw.astype(float)
else:
    target_numeric = pd.to_numeric(
        target_raw.astype(str).str.replace(',', '', regex=False).str.strip(),
        errors='coerce',
    ).astype(float)

n_unusable = int(target_numeric.isna().sum())
if n_unusable:
    print(f'Dropping {n_unusable} rows with a missing or non-numeric {TARGET!r} value.')

# Drop rows with missing target (df itself is left untouched; assign() returns a new frame)
df_model = df.assign(**{TARGET: target_numeric}).dropna(subset=[TARGET])
if df_model.empty:
    raise ValueError(f'No rows with a usable numeric {TARGET!r} value; cannot train a model.')

X = df_model.drop(columns=[TARGET])
y = df_model[TARGET]

# Columns are routed by dtype; anything that is neither is dropped by the
# ColumnTransformer below (remainder='drop').
numeric_features = X.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object','category']).columns.tolist()

print('Numeric features:', numeric_features)
print('Categorical features:', categorical_features)

# Numeric: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Newer scikit-learn renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and later removed the old name; try the new spelling first, fall back for old releases.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', onehot_encoder)
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ], remainder='drop'
)

# Models to try
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42, n_jobs=-1)
}
if xgb_available:
    models['XGBoost'] = XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train each model and collect metrics
results = []
pipelines = {}

for name, model in models.items():
    # clone() gives every pipeline its own preprocessor; otherwise all pipelines
    # would hold (and re-fit) the very same ColumnTransformer instance.
    pipe = Pipeline(steps=[('preprocessor', clone(preprocessor)), ('model', model)])
    print(f'\nTraining {name}...')
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    # RMSE computed as sqrt(MSE): the `squared=False` shortcut is not available
    # in every scikit-learn release, while this form is.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    mae = mean_absolute_error(y_test, y_pred)
    results.append({'model': name, 'r2': r2, 'rmse': rmse, 'mae': mae})
    pipelines[name] = pipe

results_df = pd.DataFrame(results).sort_values('r2', ascending=False).reset_index(drop=True)
display(results_df)

# Choose best model by R2 (row 0 after the descending sort)
best_name = results_df.loc[0,'model']
best_pipeline = pipelines[best_name]
print('\nBest model:', best_name)


In [None]:

# Persist the selected pipeline (preprocessor + model in one object) for the web app.
output_path = Path('/mnt/data/model_pipeline.pkl')
joblib.dump(best_pipeline, output_path)
print('Saved pipeline to', output_path)

# Companion metadata: feature lists, target name and chosen model, stored next to
# the pipeline (useful for web app form generation).
meta = dict(
    numeric_features=numeric_features,
    categorical_features=categorical_features,
    target=TARGET,
    model_name=best_name,
)
joblib.dump(meta, Path('/mnt/data/model_meta.pkl'))
print('Saved metadata to /mnt/data/model_meta.pkl')


In [None]:

# Feature importances (for tree-based models)
best_model = best_pipeline.named_steps['model']
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_

    # Rebuild the column names after preprocessing: numeric columns keep their
    # names, categorical columns expand into one name per one-hot category.
    cat_names = []
    if categorical_features:
        cat_ohe = best_pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
        if hasattr(cat_ohe, 'get_feature_names_out'):
            cat_names = list(cat_ohe.get_feature_names_out(categorical_features))
        else:
            # older scikit-learn exposes the same information as get_feature_names
            cat_names = list(cat_ohe.get_feature_names(categorical_features))
    feature_names = list(numeric_features) + cat_names

    # pd.Series raises when index and data lengths differ, so fall back to
    # positional labels instead of crashing if the names cannot be matched up.
    if len(feature_names) != len(importances):
        print(
            f'Could not match {len(feature_names)} feature names to '
            f'{len(importances)} importances; using positional labels.'
        )
        feature_names = [f'feature_{i}' for i in range(len(importances))]

    # Keep only the 30 most important features so the chart stays readable.
    fi = pd.Series(importances, index=feature_names).sort_values(ascending=False).head(30)
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=fi.values, y=fi.index, ax=ax)
    ax.set_title('Top feature importances')
    plt.show()
else:
    print('Best model has no feature_importances_ attribute.')


## Streamlit deployment example

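Below is a minimal Streamlit app snippet that loads `model_pipeline.pkl` and `model_meta.pkl` and provides a simple form for prediction. Save it as `app.py` in the same directory as the two `.pkl` files (the notebook writes them to `/mnt/data/`, and the app loads them by relative path), then run `streamlit run app.py` from that directory.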


In [None]:

# streamlit_app example (save as app.py) - run externally
# The app source lives in a raw triple-quoted string so it can be both printed and
# written to disk. The quotes must be real quotes: escaped ones (\"\"\") outside a
# string are a SyntaxError and would stop this cell from running at all.
streamlit_code = r"""
import streamlit as st
import pandas as pd
import joblib
from pathlib import Path

MODEL_PATH = Path('model_pipeline.pkl')
META_PATH = Path('model_meta.pkl')

pipeline = joblib.load(MODEL_PATH)
meta = joblib.load(META_PATH)

st.title('Car Price Prediction')

# Build form
with st.form('predict_form'):
    inputs = {}
    for col in meta['numeric_features']:
        inputs[col] = st.number_input(col, value=0.0)
    for col in meta['categorical_features']:
        inputs[col] = st.text_input(col, value='')
    submitted = st.form_submit_button('Predict')
    if submitted:
        X = pd.DataFrame([inputs])
        pred = pipeline.predict(X)[0]
        st.success(f'Predicted {meta["target"]}: {pred}')
"""
print(streamlit_code)
# Write file for convenience
Path('/mnt/data/streamlit_app_example.py').write_text(streamlit_code)
print('Saved Streamlit example to /mnt/data/streamlit_app_example.py')


### Notes & Next steps

- If `xgboost` isn't installed, you can install it with `pip install xgboost` and re-run the notebook.
- Tweak preprocessing and feature selection depending on the actual dataset columns.
- For production, consider saving a sklearn `ColumnTransformer` + model as a single pipeline (this notebook does that).
- A possible extension is a ready-to-deploy Streamlit app with better input widgets (dropdowns, validation) and example screenshots.