In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the cleaned, feature-engineered CSV into a DataFrame.
# NOTE(review): this is an absolute path on one user's machine; the script
# only runs where that exact file exists.
df = pd.read_csv(
    r"C:\Users\shrey\Desktop\Projects\Explainable Price Anomaly Detector for Indian Second-hand Marketplace\data\cleaned_engineered.csv"
)

# ============================
# Basic Cleaning
# ============================
# Remove rows that exactly duplicate an earlier row.
df = df.drop_duplicates()

# Fill missing values from the previous row, then back-fill whatever is still
# missing. ffill() alone cannot fill a gap that begins at the first row (there
# is no earlier value to copy), so those NaNs would otherwise survive into the
# feature matrix.
# NOTE(review): row-order imputation copies values from a neighbouring row in
# every column; confirm that is the intended strategy for this dataset.
# A column that is entirely NaN is still left as-is.
df = df.ffill().bfill()

# ============================
# Feature Engineering
# ============================
# Each derived feature is only created when its source columns are present.

# Year against which vehicle age is measured.
REFERENCE_YEAR = 2025

# Convert year column to car age. Ages are floored at 0 so a row whose year is
# later than REFERENCE_YEAR cannot yield a negative age (which, for
# year == REFERENCE_YEAR + 1, would make the km_per_year denominator zero).
if 'year' in df.columns:
    df['car_age'] = (REFERENCE_YEAR - df['year']).clip(lower=0)

# Kilometers per year of age; the +1 keeps the denominator non-zero for any
# non-negative car_age.
if 'kilometers' in df.columns and 'car_age' in df.columns:
    df['km_per_year'] = df['kilometers'] / (df['car_age'] + 1)

# Power-to-weight ratio if columns exist.
if 'engine_power' in df.columns and 'kerb_weight' in df.columns:
    # Mask zero weights before dividing: x / 0 would give +/-inf (or NaN for
    # 0 / 0), and non-finite values in the feature matrix are a problem for
    # model fitting.
    nonzero_weight = df['kerb_weight'].where(df['kerb_weight'] != 0)
    ratio = df['engine_power'] / nonzero_weight
    # Undefined ratios fall back to the median of the defined ones.
    df['power_weight_ratio'] = ratio.fillna(ratio.median())

# ============================
# Encode Categorical Columns
# ============================
# Every object-dtype column is replaced in place by integer codes.
categorical_cols = df.select_dtypes(include=['object']).columns

# One fitted encoder per column, keyed by column name. Re-fitting a single
# shared encoder would overwrite its mapping on every iteration, leaving no
# way to decode the codes (inverse_transform) or to encode new rows the same
# way later.
label_encoders = {}

# Bound up front so `le` exists even when there are no categorical columns;
# after the loop it refers to the encoder of the last column processed.
le = LabelEncoder()

for col in categorical_cols:
    le = LabelEncoder()
    # Values are cast to str before fitting, so every entry in the column is
    # encoded by its string form.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# ============================
# Define Features and Target
# ============================
# Name of the column the model is trained to predict.
target = 'listed_price'

if target in df.columns:
    # Features: every remaining column of the dataframe.
    X = df.drop(columns=target)
    # Target is modelled as log(1 + price) because of its skew; predictions
    # are mapped back with expm1 at evaluation time.
    y = np.log1p(df[target])
else:
    # Stop early with an explicit message when the target column is absent.
    raise ValueError(f"Target column '{target}' not found in dataframe.")

# ============================
# Train-Test Split
# ============================
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ============================
# Model Training
# ============================
# Random forest of 200 trees fitted on the log-transformed target.
# fit() returns the estimator itself, so construction and fitting are chained.
model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
).fit(X_train, y_train)

# ============================
# Evaluation
# ============================
# Predictions come out on the log1p scale the model was trained on.
y_pred = model.predict(X_test)

# Undo the log1p transform so the metrics are computed on the original
# target scale.
y_pred_actual = np.expm1(y_pred)
y_test_actual = np.expm1(y_test)

r2 = r2_score(y_test_actual, y_pred_actual)
mse = mean_squared_error(y_test_actual, y_pred_actual)
rmse = np.sqrt(mse)

print(f"RMSE (₹): {rmse:,.0f}")
print(f"R2 Score: {r2:.4f}")

# Feature Importance
# Pair each importance score from the fitted forest with its feature name,
# then print the ten largest.
importances = pd.Series(data=model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).iloc[:10])

RMSE (₹): 162,694
R2 Score: 0.9579
width                   0.415995
myear                   0.223884
alloy wheel size        0.081287
max power delivered     0.049478
car_age                 0.040554
wheel base              0.033771
max torque delivered    0.015374
top speed               0.009955
gear box                0.008613
safety_features         0.008407
dtype: float64
