# In [None]:  (Jupyter cell prompt left over from the notebook export)
# train_ann_model.py

import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# -----------------------------------------------------
# STEP 1: DATA LOADING AND CLEANING
# -----------------------------------------------------
print("Loading and cleaning data...")

# Listings priced at or above this value are dropped as outliers.
PRICE_OUTLIER_CUTOFF = 6e6


def _numeric_text_mask(column):
    """Return a strictly boolean mask of the entries made up only of digits.

    ``Series.str.isnumeric()`` yields NaN for missing values, and pandas
    refuses to filter rows with a mask that contains NaN.  Casting to ``str``
    first makes missing entries evaluate as "not numeric", so those rows are
    dropped instead of aborting the script.
    """
    return column.astype(str).str.isnumeric()


data = pd.read_csv('quikr_car.csv')

# Clean year: keep only rows whose year is purely numeric text.
# `.copy()` after each filter gives an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
data = data[_numeric_text_mask(data['year'])].copy()
data['year'] = data['year'].astype(int)

# Clean price: drop rows without a usable price (missing, or the
# 'Ask For Price' placeholder), then strip thousands separators.
has_price = data['Price'].notna() & (data['Price'] != 'Ask For Price')
data = data[has_price].copy()
data['Price'] = data['Price'].str.replace(',', '', regex=False).astype(int)

# Clean kms driven: keep the first space-separated token, strip commas,
# and discard rows where what remains is not a plain number.
data['kms_driven'] = (
    data['kms_driven'].str.split(' ').str.get(0).str.replace(',', '', regex=False)
)
data = data[_numeric_text_mask(data['kms_driven'])].copy()
data['kms_driven'] = data['kms_driven'].astype(int)

# Remove missing and outliers
data = data[data['fuel_type'].notna()].copy()
# Shorten the listing name to its first three words to limit the number of
# distinct categories that will be one-hot encoded.
data['name'] = data['name'].str.split(' ').str.slice(0, 3).str.join(' ')
data = data[data['Price'] < PRICE_OUTLIER_CUTOFF].reset_index(drop=True)

# -----------------------------------------------------
# STEP 2: FEATURE SELECTION
# -----------------------------------------------------
# 'Price' is the regression target; every remaining column is a model input.
target_column = 'Price'
y = data[target_column]
X = data.drop(columns=[target_column])

# -----------------------------------------------------
# STEP 3: ENCODING
# -----------------------------------------------------
print("Encoding categorical features...")

categorical_columns = ['name', 'company', 'fuel_type']

# Fit a stand-alone encoder on the whole feature table to collect the
# category vocabulary; the same categories are then pinned inside the
# column transformer below.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X[categorical_columns])

column_trans = make_column_transformer(
    (OneHotEncoder(categories=ohe.categories_, handle_unknown='ignore'),
     categorical_columns),
    remainder='passthrough',
    # Always return a dense array.  With the default threshold the one-hot
    # output is stacked as a sparse matrix, and StandardScaler (which centres
    # the data by default) raises ValueError on sparse input.
    sparse_threshold=0,
)

X_transformed = column_trans.fit_transform(X)

# -----------------------------------------------------
# STEP 4: SPLIT DATA & SCALING
# -----------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.1, random_state=42
)

# Standardize features (ANNs perform better on scaled data).
# The scaler is fitted on the training rows only, so the mean/variance of the
# held-out test rows cannot leak into training or into the reported metrics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# -----------------------------------------------------
# STEP 5: ANN MODEL DESIGN
# -----------------------------------------------------
print("Building and training ANN model...")

# Width of the input layer = number of columns in the training matrix.
n_features = X_train.shape[1]

# Fully connected stack 128 -> 64 -> 32 -> 1 with ReLU hidden units, dropout
# after the two widest layers, and a single linear unit for the regression
# output.
network_layers = [
    Dense(128, activation='relu', input_dim=n_features),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='linear'),
]
model = Sequential(network_layers)

# Optimise mean squared error; mean absolute error is tracked as a metric.
model.compile(loss='mse', optimizer='adam', metrics=['mae'])

# Stop once the validation loss has not improved for 10 epochs and roll the
# model back to the best weights seen.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# -----------------------------------------------------
# STEP 6: MODEL TRAINING
# -----------------------------------------------------
# Fit settings: 10% of the training rows are held back for validation (the
# early-stopping callback monitors that loss), for at most 100 epochs with
# mini-batches of 32.
training_options = {
    'validation_split': 0.1,
    'epochs': 100,
    'batch_size': 32,
    'callbacks': [early_stop],
    'verbose': 1,
}

history = model.fit(X_train, y_train, **training_options)

# -----------------------------------------------------
# STEP 7: EVALUATION
# -----------------------------------------------------
print("\nEvaluating model performance...")

# Flatten the model output to a 1-D vector of predictions for the metrics.
y_pred = model.predict(X_test).flatten()

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# Ad-hoc summary score, not a classification F-beta despite its label:
# 1 / (1 + RMSE / mean test price).  It equals 1 when RMSE is 0 and shrinks
# as the error grows relative to the average price.
f2_like = 1 / (1 + (rmse / np.mean(y_test)))

print("\n---------- ANN MODEL PERFORMANCE ----------")
# "\u00b2" is the superscript-two character; the escape keeps the label
# intact regardless of the encoding the file is saved or read with.
print(f"R\u00b2 Score: {r2:.4f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"Custom F2-like Score: {f2_like:.4f}")

# -----------------------------------------------------
# STEP 8: SAVE MODEL & ENCODERS
# -----------------------------------------------------
print("\nSaving model and encoders...")

# Trained network, written in HDF5 format.
model.save('best_ann_model.h5')

# Preprocessing artifacts, pickled one file per object: the one-hot category
# lists and the fitted feature scaler.
pickled_artifacts = {
    'encoder_categories.pkl': ohe.categories_,
    'scaler.pkl': scaler,
}
for artifact_path, artifact in pickled_artifacts.items():
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print("Model, encoder, and scaler saved successfully!")
