In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import pickle
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# -------------------------
# Load cleaned data
# -------------------------
df = pd.read_csv(
    r'C:\Users\shrey\Desktop\Projects\Explainable Price Anomaly Detector for Indian Second-hand Marketplace\data\cleaned_engineered.csv'
)
df.columns = df.columns.str.strip().str.lower()

# -------------------------
# Log-transform target
# -------------------------
df['log_price'] = np.log1p(df['listed_price'])

# -------------------------
# Smoothed target encoding for high-cardinality categories
# -------------------------
mean_global = df['log_price'].mean()
k = 5  # smoothing factor
for col in ['oem', 'model', 'city']:
    target_mean = df.groupby(col)['log_price'].mean()
    count = df.groupby(col)['log_price'].count()
    smooth = (target_mean * count + mean_global * k) / (count + k)
    df[f'{col}_target_enc'] = df[col].map(smooth)

# -------------------------
# Frequency encoding for categorical variables
# -------------------------
for col in ['oem', 'model', 'city']:
    freq = df[col].value_counts()
    df[f'{col}_freq_enc'] = df[col].map(freq)

# -------------------------
# Interaction features
# -------------------------
df['brand_age'] = df['car_age'] * df['oem_target_enc']
df['km_per_year_age'] = df['km_per_year'] * df['car_age']
df['power_weight_ratio'] = df['max power delivered'] / df['kerb weight']

# -------------------------
# Numerical & categorical columns
# -------------------------
num_cols = [
    'km', 'car_age', 'km_per_year', 'max power delivered', 'alloy wheel size',
    'length', 'width', 'height', 'wheel base', 'front tread', 'rear tread',
    'kerb weight', 'gross weight', 'top speed', 'acceleration', 'bore',
    'oem_target_enc', 'model_target_enc', 'city_target_enc',
    'brand_age', 'km_per_year_age', 'power_weight_ratio'
]

cat_cols = [
    'transmission', 'fuel', 'owner_type', 'drive type', 'steering type',
    'front brake type', 'rear brake type', 'tyre type'
]

# -------------------------
# Features & target
# -------------------------
X = df[num_cols + cat_cols].copy()
y = df['log_price']

# -------------------------
# Train-test split
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -------------------------
# Ensure categorical dtype for LightGBM
# -------------------------
for col in cat_cols:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')

# -------------------------
# Scale numerical features
# -------------------------
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# -------------------------
# Save scaler & feature names
# -------------------------
with open(
    r'C:\Users\shrey\Desktop\Projects\Explainable Price Anomaly Detector for Indian Second-hand Marketplace\models\scaler.pkl',
    'wb'
) as f:
    pickle.dump(scaler, f)

feature_names = X_train.columns.tolist()
with open(
    r'C:\Users\shrey\Desktop\Projects\Explainable Price Anomaly Detector for Indian Second-hand Marketplace\models\feature_names.pkl',
    'wb'
) as f:
    pickle.dump(feature_names, f)

# -------------------------
# Train LightGBM using categorical features
# -------------------------
lgb_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    random_state=42
)

lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    categorical_feature=cat_cols,
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(50)]
)

# -------------------------
# Save model
# -------------------------
with open(
    r'C:\Users\shrey\Desktop\Projects\Explainable Price Anomaly Detector for Indian Second-hand Marketplace\models\baseline_model.pkl',
    'wb'
) as f:
    pickle.dump(lgb_model, f)

# -------------------------
# Evaluate
# -------------------------
y_pred = lgb_model.predict(X_test)
y_test_actual = np.expm1(y_test)
y_pred_actual = np.expm1(y_pred)

rmse_actual = np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
mae_actual = mean_absolute_error(y_test_actual, y_pred_actual)
r2 = r2_score(y_test_actual, y_pred_actual)

print(f'RMSE (₹): {rmse_actual:,.0f}, MAE (₹): {mae_actual:,.0f}, R²: {r2:.4f}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002516 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4023
[LightGBM] [Info] Number of data points in the train set: 30250, number of used features: 30
[LightGBM] [Info] Start training from score 13.197339
Training until validation scores don't improve for 50 rounds
[50]	valid_0's rmse: 0.212791	valid_0's l2: 0.0452799
[100]	valid_0's rmse: 0.177867	valid_0's l2: 0.0316367
[150]	valid_0's rmse: 0.169089	valid_0's l2: 0.028591
[200]	valid_0's rmse: 0.165705	valid_0's l2: 0.0274583
[250]	valid_0's rmse: 0.163317	valid_0's l2: 0.0266724
[300]	valid_0's rmse: 0.16217	valid_0's l2: 0.0262993
[350]	valid_0's rmse: 0.161441	valid_0's l2: 0.0260631
[400]	valid_0's rmse: 0.16087	valid_0's l2: 0.0258791
[450]	valid_0's rmse: 0.160447	valid_0's l2: 0.0257434
[500]	valid_0's rmse: 0.160241	valid_0's l2: 0.0256771
[550]	valid_0's rmse: 0.159777	valid_0's l2: 0.025

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import pickle

# Project paths
PROJECT_ROOT = r'C:\Users\shrey\Desktop\Projects\Explainable Price Anomaly Detector for Indian Second-hand Marketplace'
DATA_PATH = os.path.join(PROJECT_ROOT, 'data', 'cleaned_engineered.csv')
SCALER_PATH = os.path.join(PROJECT_ROOT, 'models', 'scaler.pkl')

# Load dataset
df = pd.read_csv(DATA_PATH, low_memory=False)
df.columns = df.columns.str.strip().str.lower()

# Feature engineering (same as in your scripts)
df['log_price'] = np.log1p(df['listed_price'])
mean_global = df['log_price'].mean()
k = 5
for col in ['oem', 'model', 'city']:
    target_mean = df.groupby(col)['log_price'].mean()
    count = df.groupby(col)['log_price'].count()
    smooth = (target_mean * count + mean_global * k) / (count + k)
    df[f'{col}_target_enc'] = df[col].map(smooth)
for col in ['oem', 'model', 'city']:
    freq = df[col].value_counts()
    df[f'{col}_freq_enc'] = df[col].map(freq)
df['brand_age'] = df['car_age'] * df['oem_target_enc']
df['km_per_year_age'] = df['km_per_year'] * df['car_age']
df['power_weight_ratio'] = df['max power delivered'] / df['kerb weight']

# Define features
num_cols = [
    'km', 'car_age', 'km_per_year', 'max power delivered', 'alloy wheel size',
    'length', 'width', 'height', 'wheel base', 'front tread', 'rear tread',
    'kerb weight', 'gross weight', 'top speed', 'acceleration', 'bore',
    'oem_target_enc', 'model_target_enc', 'city_target_enc',
    'brand_age', 'km_per_year_age', 'power_weight_ratio'
]
cat_cols = [
    'transmission', 'fuel', 'owner_type', 'drive type', 'steering type',
    'front brake type', 'rear brake type', 'tyre type'
]
num_cols = [col for col in num_cols if col in df.columns]
cat_cols = [col for col in cat_cols if col in df.columns]

# Prepare features
X = df[num_cols + cat_cols].copy()
for col in cat_cols:
    X[col] = X[col].astype('category')

# Train and save scaler
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
with open(SCALER_PATH, 'wb') as f:
    pickle.dump(scaler, f)
print(f"Scaler retrained and saved to {SCALER_PATH}")

Scaler retrained and saved to C:\Users\shrey\Desktop\Projects\Explainable Price Anomaly Detector for Indian Second-hand Marketplace\models\scaler.pkl
