In [12]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import lightgbm as lgb
import pickle

# Project paths
PROJECT_ROOT = r'C:\Users\shrey\Desktop\Projects\Explainable Price Anomaly Detector for Indian Second-hand Marketplace'
DATA_PATH = os.path.join(PROJECT_ROOT, 'data', 'cleaned_engineered.csv')
MODEL_PATH = os.path.join(PROJECT_ROOT, 'models', 'baseline_model.pkl')
SCALER_PATH = os.path.join(PROJECT_ROOT, 'models', 'scaler.pkl')
FEATURE_PATH = os.path.join(PROJECT_ROOT, 'models', 'feature_names.pkl')
os.makedirs(os.path.join(PROJECT_ROOT, 'models'), exist_ok=True)

# Load cleaned dataset
df = pd.read_csv(DATA_PATH, low_memory=False)
df.columns = df.columns.str.strip().str.lower()

# Drop unnecessary columns to reduce memory
drop_cols = ['usedcarskuid', 'images', 'ip', 'dvn']
df = df.drop(columns=[col for col in drop_cols if col in df.columns])

# Define numerical and categorical columns
num_cols = ['km', 'car_age', 'km_per_year', 'max power delivered', 'alloy wheel size', 
            'wheel base', 'no of cylinder', 'length', 'width', 'height', 'top speed', 
            'acceleration', 'kerb weight', 'gross weight', 'front tread', 'rear tread', 
            'turning radius', 'cargo volume', 'max torque delivered', 'max power at', 
            'max torque at', 'bore']
cat_cols = ['fuel', 'transmission', 'city', 'oem', 'model', 'variant', 'color', 
            'engine type', 'owner_type', 'drive type', 'steering type', 
            'front brake type', 'rear brake type', 'tyre type']

# Keep only existing columns
num_cols = [col for col in num_cols if col in df.columns]
cat_cols = [col for col in cat_cols if col in df.columns]

# Features and target
X = df[num_cols + cat_cols].copy()
y = np.log1p(df['listed_price'])

# Convert categorical columns to category dtype
for col in cat_cols:
    X[col] = X[col].astype('category')

print(f'Using {len(num_cols)} numerical and {len(cat_cols)} categorical features.')

# Scale numerical features
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
with open(SCALER_PATH, 'wb') as f:
    pickle.dump(scaler, f)

# Save feature names
feature_names = X.columns.tolist()
with open(FEATURE_PATH, 'wb') as f:
    pickle.dump(feature_names, f)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train LightGBM (categorical features passed as indices)
cat_indices = [X.columns.get_loc(col) for col in cat_cols]



model = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.05, random_state=42)

# Use callbacks for early stopping and logging
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=50)  # logs every 50 rounds
    ]
)

with open(MODEL_PATH, 'wb') as f:
    pickle.dump(model, f)

# Evaluate
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Convert to actual prices
rmse_actual = np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred)))
mae_actual = mean_absolute_error(np.expm1(y_test), np.expm1(y_pred))

print(f'RMSE (log scale): {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}')
print(f'RMSE (₹): {rmse_actual:,.0f}, MAE (₹): {mae_actual:,.0f}')

# Write README safely
readme_content = """
# Optimized Baseline Model Summary
- Trained LightGBM Regressor with {} numerical and {} categorical features.
- Log-transformed target (listed_price) due to skew.
- Train/test split: 80/20.
- Performance (log scale): RMSE={:.4f}, MAE={:.4f}, R²={:.4f}.
- Performance (actual price, ₹): RMSE={:,.0f}, MAE={:,.0f}.
- Model and scaler saved to models/baseline_model.pkl and models/scaler.pkl.
- Feature names saved to models/feature_names.pkl.
- Next steps: Feature engineering, SHAP-based anomaly scoring.
""".format(len(num_cols), len(cat_cols), rmse, mae, r2, rmse_actual, mae_actual)

with open(os.path.join(PROJECT_ROOT, 'README.md'), 'a', encoding='utf-8') as f:
    f.write(readme_content)

print('README.md updated with optimized baseline model summary.')


Using 22 numerical and 14 categorical features.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003222 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6342
[LightGBM] [Info] Number of data points in the train set: 30250, number of used features: 36
[LightGBM] [Info] Start training from score 13.197339
Training until validation scores don't improve for 50 rounds
[50]	valid_0's rmse: 0.201061	valid_0's l2: 0.0404254
[100]	valid_0's rmse: 0.165729	valid_0's l2: 0.027466
[150]	valid_0's rmse: 0.161138	valid_0's l2: 0.0259655
[200]	valid_0's rmse: 0.159664	valid_0's l2: 0.0254926
[250]	valid_0's rmse: 0.158968	valid_0's l2: 0.0252708
[300]	valid_0's rmse: 0.15857	valid_0's l2: 0.0251445
[350]	valid_0's rmse: 0.158372	valid_0's l2: 0.0250817
[400]	valid_0's rmse: 0.158202	valid_0's l2: 0.0250278
[450]	valid_0's rmse: 0.158138	valid_0's l2: 0.0250077
[500]	valid_0's rmse: 0.157926	valid_0's l2: 0.0249405
[