In [1]:
import os
import pandas as pd
import numpy as np
import pickle
import shap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Project paths
# The root defaults to the original author's local checkout. Set the
# PRICE_ANOMALY_PROJECT_ROOT environment variable to run this notebook from
# another machine or directory without editing the cell.
PROJECT_ROOT = os.environ.get(
    'PRICE_ANOMALY_PROJECT_ROOT',
    r'C:\Users\shrey\Desktop\Projects\Explainable Price Anomaly Detector for Indian Second-hand Marketplace',
)
DATA_PATH = os.path.join(PROJECT_ROOT, 'data', 'cleaned_engineered.csv')
ANOMALIES_PATH = os.path.join(PROJECT_ROOT, 'reports', 'anomalies.csv')
MODEL_PATH = os.path.join(PROJECT_ROOT, 'models', 'baseline_model.pkl')
SCALER_PATH = os.path.join(PROJECT_ROOT, 'models', 'scaler.pkl')
FEATURE_PATH = os.path.join(PROJECT_ROOT, 'models', 'feature_names.pkl')
REPORTS_PATH = os.path.join(PROJECT_ROOT, 'reports')
# Create the reports folder up front; exist_ok makes re-running the cell safe.
os.makedirs(REPORTS_PATH, exist_ok=True)

# Read the full engineered dataset and the previously flagged anomalies.
# Only the full dataset's column names are normalised (trimmed + lower-cased).
df = pd.read_csv(DATA_PATH, low_memory=False)
anomaly_df = pd.read_csv(ANOMALIES_PATH, low_memory=False)
trimmed_columns = df.columns.str.strip()
df.columns = trimmed_columns.str.lower()


def _load_pickle(path):
    """Return the object deserialised from the pickle file at *path*."""
    # SECURITY: unpickling can execute arbitrary code, so only point this at
    # artifacts produced by this project, never at untrusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Trained model, fitted scaler, and the stored feature-name list
model = _load_pickle(MODEL_PATH)
scaler = _load_pickle(SCALER_PATH)
feature_names = _load_pickle(FEATURE_PATH)

# Rebuild the engineered features on the full dataset
# (to match feature_engineering_experiments.ipynb).
# NOTE(review): the encodings below are recomputed from this dataset rather
# than loaded from the training run - confirm that matches how the model was fit.
df['log_price'] = np.log1p(df['listed_price'])
mean_global = df['log_price'].mean()
k = 5  # smoothing factor
encoded_cols = ['oem', 'model', 'city']

# Smoothed target encoding: each category's mean log-price is pulled toward
# the global mean, with k acting as the weight of the global prior.
for col in encoded_cols:
    stats = df.groupby(col)['log_price'].agg(['mean', 'count'])
    smooth = (stats['mean'] * stats['count'] + mean_global * k) / (stats['count'] + k)
    df[f'{col}_target_enc'] = df[col].map(smooth)

# Frequency encoding: number of rows sharing the same category value.
for col in encoded_cols:
    df[f'{col}_freq_enc'] = df[col].map(df[col].value_counts())

# Interaction features
df['brand_age'] = df['car_age'].mul(df['oem_target_enc'])
df['km_per_year_age'] = df['km_per_year'].mul(df['car_age'])
df['power_weight_ratio'] = df['max power delivered'].div(df['kerb weight'])

# Candidate model inputs: numeric columns (these are passed through the scaler later)
num_cols = [
    'km', 'car_age', 'km_per_year',
    'max power delivered', 'alloy wheel size',
    'length', 'width', 'height',
    'wheel base', 'front tread', 'rear tread',
    'kerb weight', 'gross weight',
    'top speed', 'acceleration', 'bore',
    'oem_target_enc', 'model_target_enc', 'city_target_enc',
    'brand_age', 'km_per_year_age', 'power_weight_ratio',
]
# Candidate model inputs: categorical columns
cat_cols = [
    'transmission', 'fuel', 'owner_type',
    'drive type', 'steering type',
    'front brake type', 'rear brake type',
    'tyre type',
]

# Silently drop any candidate that the loaded dataset does not contain
available_columns = set(df.columns)
num_cols = [name for name in num_cols if name in available_columns]
cat_cols = [name for name in cat_cols if name in available_columns]

# Build the model input matrix: numeric columns first, then categoricals.
# NOTE(review): the loaded `feature_names` list is not consulted here - confirm
# that num_cols + cat_cols matches the column order the model was trained on.
X = df[num_cols + cat_cols].copy()
X = X.astype({name: 'category' for name in cat_cols})
X[num_cols] = scaler.transform(X[num_cols])

# Select the rows to explain by matching index labels against anomaly_df.
# NOTE(review): both frames are read with pd.read_csv and no index_col, so
# anomaly_df.index is presumably a fresh 0..n-1 range rather than the original
# row labels of the flagged listings - verify this picks the intended rows.
is_anomaly_label = df.index.isin(anomaly_df.index)
anomaly_indices = df.index[is_anomaly_label].tolist()
X_anomalies = X.loc[anomaly_indices]

# One row of SHAP values per row of X_anomalies, in the same order
explainer = shap.TreeExplainer(model)
shap_values_anomalies = explainer.shap_values(X_anomalies)

# Define business rules for anomaly validation
def apply_business_rules(row, shap_contrib=None, shap_features=None, shap_threshold=0.5):
    """Return the names of the business rules violated by one anomaly row.

    Parameters
    ----------
    row : pandas.Series
        One anomaly record providing ``listed_price``, ``km`` and ``car_age``.
        ``row.name`` (its index label) is used to find the row's SHAP values
        when ``shap_contrib`` is not supplied.
    shap_contrib : sequence of float, optional
        Per-feature SHAP values for this row. When omitted, the row is looked
        up in the module-level ``shap_values_anomalies`` via the position of
        ``row.name`` in ``X_anomalies.index``.
    shap_features : sequence of str, optional
        Feature names aligned with ``shap_contrib``. Defaults to
        ``X_anomalies.columns``.
    shap_threshold : float, optional
        Absolute SHAP value above which the ``width`` / ``myear`` rules fire.
        The default of 0.5 is an arbitrary cut-off.

    Returns
    -------
    list of str
        Violated rule names, in rule order; empty when no rule fires.

    Raises
    ------
    KeyError
        If a required column is missing from ``row``, or if ``shap_contrib``
        is omitted and ``row.name`` is not a label of ``X_anomalies``.
    """
    violations = []
    price = row['listed_price']

    # Rules 1-2: low price (< 100,000) on a low-mileage (< 10,000 km) or
    # recent (car_age < 3 years) car
    if price < 100_000:
        if row['km'] < 10_000:
            violations.append('low_price_low_km')
        if row['car_age'] < 3:
            violations.append('low_price_recent_car')

    # Rules 3-4: high price (> 1,000,000) on an old (car_age > 15 years) or
    # high-mileage (> 100,000 km) car
    if price > 1_000_000:
        if row['car_age'] > 15:
            violations.append('high_price_old_car')
        if row['km'] > 100_000:
            violations.append('high_price_high_km')

    # Rule 5: high SHAP contribution for 'width' or 'myear'
    # (top features from anomaly_scoring.ipynb)
    if shap_contrib is None:
        # SHAP rows follow X_anomalies' row order, so the position must come
        # from X_anomalies.index; positions in anomaly_df can differ.
        shap_row = X_anomalies.index.get_loc(row.name)
        shap_contrib = shap_values_anomalies[shap_row]
    if shap_features is None:
        shap_features = X_anomalies.columns
    shap_names = list(shap_features)
    for feature in ('width', 'myear'):
        # A feature absent from the model inputs simply never triggers its rule
        if feature in shap_names:
            if abs(shap_contrib[shap_names.index(feature)]) > shap_threshold:
                violations.append(f'high_shap_{feature}')

    return violations

# Evaluate every anomaly row against the business rules and keep only the
# rows that break at least one of them.
anomaly_df['rule_violations'] = anomaly_df.apply(apply_business_rules, axis=1)
anomaly_df['num_violations'] = anomaly_df['rule_violations'].map(len)
has_violation = anomaly_df['num_violations'].gt(0)
validated_anomalies = anomaly_df.loc[has_violation].copy()

# Tally how often each rule fired; keys appear in first-seen order.
rule_counts = {}
for violated_rules in validated_anomalies['rule_violations']:
    for rule_name in violated_rules:
        rule_counts.setdefault(rule_name, 0)
        rule_counts[rule_name] += 1

print("Rule violation counts:")
for rule_name, n_hits in rule_counts.items():
    print(f"{rule_name}: {n_hits}")

# Draw one bar per rule and write the chart to the reports folder
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(list(rule_counts), list(rule_counts.values()))
ax.set_title('Count of Business Rule Violations for Anomalies')
ax.set_xlabel('Rule')
ax.set_ylabel('Number of Violations')
# Slant the rule names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
fig.savefig(os.path.join(REPORTS_PATH, 'rule_violations_bar.png'))
# Close the figure so it is not kept open after saving
plt.close(fig)

# Persist the anomalies that violated at least one rule (row index not written)
validated_csv_path = os.path.join(REPORTS_PATH, 'validated_anomalies.csv')
validated_anomalies.to_csv(validated_csv_path, index=False)
print(f'Validated anomaly data saved to {validated_csv_path}')

# Append a summary section to the project README.
# NOTE(review): the file is opened in append mode, so every run of this cell
# adds another copy of the summary.
readme_content = f"""
# Business Rules Summary
- Applied business rules to validate {len(validated_anomalies)} out of {len(anomaly_df)} anomalies from SHAP analysis.
- Rules include: low price for low mileage, low price for recent cars, high price for old cars, high price for high mileage, and high SHAP contributions for 'width' or 'myear'.
- Rule violation counts: {rule_counts}.
- Bar plot of rule violations saved to reports/rule_violations_bar.png.
- Validated anomalies saved to reports/validated_anomalies.csv.
- Next steps: Build Streamlit demo skeleton for interactive visualization.
"""
readme_path = os.path.join(PROJECT_ROOT, 'README.md')
with open(readme_path, mode='a', encoding='utf-8') as readme_file:
    readme_file.write(readme_content)
print('README.md updated with business rules summary.')

Rule violation counts:
high_price_high_km: 97
high_price_old_car: 7
Validated anomaly data saved to C:\Users\shrey\Desktop\Projects\Explainable Price Anomaly Detector for Indian Second-hand Marketplace\reports\validated_anomalies.csv
README.md updated with business rules summary.
