In [3]:
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
import pickle
import shap
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Project paths.
# The original author's checkout location remains the default. Set the
# PRICE_ANOMALY_PROJECT_ROOT environment variable to run this notebook from
# another machine or directory without editing the code; an empty value is
# treated the same as an unset one.
PROJECT_ROOT = os.environ.get('PRICE_ANOMALY_PROJECT_ROOT') or (
    r'C:\Users\shrey\Desktop\Projects\Explainable Price Anomaly Detector for Indian Second-hand Marketplace'
)
DATA_PATH = os.path.join(PROJECT_ROOT, 'data', 'cleaned_engineered.csv')
MODEL_PATH = os.path.join(PROJECT_ROOT, 'models', 'baseline_model.pkl')
SCALER_PATH = os.path.join(PROJECT_ROOT, 'models', 'scaler.pkl')
FEATURE_PATH = os.path.join(PROJECT_ROOT, 'models', 'feature_names.pkl')
REPORTS_PATH = os.path.join(PROJECT_ROOT, 'reports')
# Plots and the anomaly CSV are written into reports/, so create it up front.
os.makedirs(REPORTS_PATH, exist_ok=True)

# Read the cleaned dataset and normalise the headers (trimmed, lower-case) so
# every later column lookup can rely on a single spelling.
df = pd.read_csv(DATA_PATH, low_memory=False)
df.columns = df.columns.str.strip().str.lower()

# The model works on log(1 + price); predictions are mapped back with expm1.
df['log_price'] = np.log1p(df['listed_price'])

# Smoothed target encoding for the high-cardinality categoricals: each
# category's mean log-price is pulled towards the global mean, with k acting
# as the weight (in pseudo-observations) given to the global mean.
# NOTE(review): the encoding is derived from this dataset's own prices, so a
# row's price feeds into its own feature — confirm this matches how the
# pickled model was trained.
mean_global = df['log_price'].mean()
k = 5  # smoothing factor
for key_col in ['oem', 'model', 'city']:
    per_group = df.groupby(key_col)['log_price'].agg(['mean', 'count'])
    weighted_sum = per_group['mean'] * per_group['count'] + mean_global * k
    shrunk_mean = weighted_sum / (per_group['count'] + k)
    df[f'{key_col}_target_enc'] = df[key_col].map(shrunk_mean)

# Frequency encoding: replace each category by how many rows carry it.
for key_col in ['oem', 'model', 'city']:
    df[f'{key_col}_freq_enc'] = df[key_col].map(df[key_col].value_counts())

# Interaction features built from pairs of existing columns.
df['brand_age'] = df['car_age'].mul(df['oem_target_enc'])
df['km_per_year_age'] = df['km_per_year'].mul(df['car_age'])
df['power_weight_ratio'] = df['max power delivered'].div(df['kerb weight'])

# Load model, scaler, and feature names
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*.

    Unpickling can execute arbitrary code, so only point this at artefacts
    produced by this project.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


model, scaler, feature_names = (
    _load_pickle(artifact_path)
    for artifact_path in (MODEL_PATH, SCALER_PATH, FEATURE_PATH)
)

# Feature lists (same as in feature_engineering_experiments.ipynb).
# NOTE(review): the pickled feature_names object is not consulted when X is
# built below — confirm that this column order matches what the model expects.
num_cols = [
    'km', 'car_age', 'km_per_year', 'max power delivered', 'alloy wheel size',
    'length', 'width', 'height', 'wheel base', 'front tread', 'rear tread',
    'kerb weight', 'gross weight', 'top speed', 'acceleration', 'bore',
    'oem_target_enc', 'model_target_enc', 'city_target_enc',
    'brand_age', 'km_per_year_age', 'power_weight_ratio'
]
cat_cols = [
    'transmission', 'fuel', 'owner_type', 'drive type', 'steering type',
    'front brake type', 'rear brake type', 'tyre type'
]

# Drop any listed feature that the dataset does not actually contain,
# keeping the declared order of the remaining ones.
available = set(df.columns)
num_cols = [name for name in num_cols if name in available]
cat_cols = [name for name in cat_cols if name in available]

# Feature matrix (numeric columns first, then categorical) and log-price target.
y = df['log_price']
X = df[num_cols + cat_cols].copy()

# Categorical features are passed to the model as pandas 'category' columns.
X = X.astype({name: 'category' for name in cat_cols})

# Apply the already-fitted scaler loaded from disk to the numeric columns.
X[num_cols] = scaler.transform(X[num_cols])

# Predict on the log scale, then invert the log1p transform so both the
# predictions and the targets are back in price units.
y_pred = model.predict(X)
y_pred_actual, y_actual = np.expm1(y_pred), np.expm1(y)

# Absolute pricing error per listing.
residuals = (y_actual - y_pred_actual).abs()

# A listing is an anomaly when its error lies more than two standard
# deviations above the mean error.
residual_mean, residual_std = residuals.mean(), residuals.std()
anomaly_threshold = residual_mean + 2 * residual_std
anomalies = residuals.gt(anomaly_threshold)
print(f'Number of anomalies detected: {anomalies.sum()}')

# Explain the model with SHAP's tree explainer. shap_values is indexed by row
# position below, in step with the rows of X.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Summary plot (feature importance)
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values, X, show=False)
plt.title('SHAP Feature Importance')
plt.savefig(os.path.join(REPORTS_PATH, 'shap_summary_plot.png'))
plt.close()

# Dependence plot for top feature (e.g., 'width' from RandomForest importance).
# dependence_plot opens its own figure unless it is handed an axes, which used
# to leave an empty 10x6 figure behind; drawing on our axes applies the
# requested size and lets us close exactly the figure we saved.
dep_fig, dep_ax = plt.subplots(figsize=(10, 6))
shap.dependence_plot('width', shap_values, X, ax=dep_ax, show=False)
dep_ax.set_title('SHAP Dependence Plot for Width')
dep_fig.savefig(os.path.join(REPORTS_PATH, 'shap_dependence_width.png'))
plt.close(dep_fig)

# Force plot for a sample anomaly: the first flagged row. If no row crossed
# the threshold, fall back to the row with the largest residual instead of
# failing with an IndexError, so the rest of the report can still be produced.
anomaly_positions = np.flatnonzero(np.asarray(anomalies))
if anomaly_positions.size:
    anomaly_idx = anomaly_positions[0]  # First anomaly
else:
    anomaly_idx = np.asarray(residuals).argmax()
    print('No residual exceeded the anomaly threshold; '
          'explaining the largest-residual row instead.')

# The matplotlib force plot also creates its own figure, so the size is passed
# through figsize rather than via a separate plt.figure() call.
shap.force_plot(
    explainer.expected_value,
    shap_values[anomaly_idx],
    X.iloc[anomaly_idx],
    matplotlib=True,
    show=False,
    figsize=(12, 4),
)
force_fig = plt.gcf()
plt.title(f'SHAP Force Plot for Anomaly (Index {anomaly_idx})')
force_fig.savefig(os.path.join(REPORTS_PATH, f'shap_force_plot_anomaly_{anomaly_idx}.png'))
plt.close(force_fig)

# Predicted vs. actual prices on log-log axes, anomalies drawn in red on top
# of the normal listings, with a dashed y = x reference line.
fig, ax = plt.subplots(figsize=(10, 6))
normal = ~anomalies
ax.scatter(y_actual[normal], y_pred_actual[normal], c='blue', alpha=0.5, label='Normal')
ax.scatter(y_actual[anomalies], y_pred_actual[anomalies], c='red', alpha=0.5, label='Anomaly')
price_span = [y_actual.min(), y_actual.max()]
ax.plot(price_span, price_span, 'k--')
ax.set_xlabel('Actual Price (₹)')
ax.set_ylabel('Predicted Price (₹)')
ax.set_title('Predicted vs. Actual Prices with Anomalies Highlighted')
ax.legend()
ax.set_yscale('log')
ax.set_xscale('log')
fig.savefig(os.path.join(REPORTS_PATH, 'predicted_vs_actual.png'))
plt.close(fig)

# Export the flagged listings together with the model's price estimate and
# the absolute error that caused each one to be flagged.
anomalies_csv = os.path.join(REPORTS_PATH, 'anomalies.csv')
anomaly_df = df.loc[anomalies].assign(
    predicted_price=y_pred_actual[anomalies],
    residual=residuals[anomalies],
)
anomaly_df.to_csv(anomalies_csv, index=False)
print(f'Anomaly data saved to {os.path.join(REPORTS_PATH, "anomalies.csv")}')

# Append a summary of this run to the project README.
# The file is opened in append mode, so every execution of this cell adds
# another copy of the section.
readme_path = os.path.join(PROJECT_ROOT, 'README.md')
readme_content = f"""
# SHAP Explainability Summary
- Used SHAP to explain LightGBM model predictions.
- Identified {anomalies.sum()} anomalies based on residuals > 2 standard deviations (threshold: ₹{anomaly_threshold:,.0f}).
- Generated SHAP summary plot, dependence plot for 'width', force plot for a sample anomaly, and predicted vs. actual scatter plot.
- Visuals saved in reports/: shap_summary_plot.png, shap_dependence_width.png, shap_force_plot_anomaly_{anomaly_idx}.png, predicted_vs_actual.png.
- Anomaly data saved to reports/anomalies.csv.
- Next steps: Implement business logic and rule-based checks for anomaly validation.
"""
with open(readme_path, mode='a', encoding='utf-8') as readme_file:
    readme_file.write(readme_content)
print('README.md updated with SHAP explainability summary.')

Number of anomalies detected: 1217
Anomaly data saved to C:\Users\shrey\Desktop\Projects\Explainable Price Anomaly Detector for Indian Second-hand Marketplace\reports\anomalies.csv
README.md updated with SHAP explainability summary.


<Figure size 1000x600 with 0 Axes>

<Figure size 1200x400 with 0 Axes>