# Final Code

In [1]:
# --- Import Libraries ---
import pandas as pd
import numpy as np
import pickle
import gzip
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- Load Dataset ---
# The first CSV column is read as the row index, not as a feature.
data = pd.read_csv("cardekho_imputated.csv", index_col=[0])  # Replace with your dataset file name

# Drop columns that are not used as features.
# A non-mutating drop (rather than inplace=True) keeps each statement producing
# a fresh, explicitly named result.
data = data.drop(columns=['car_name', 'brand'])

# Separate features and target
X = data.drop(columns="selling_price")
y = data["selling_price"]

# Integer-encode the 'model' column.
# NOTE(review): the encoder is fitted on the full dataset *before* the
# train/test split, so it has already seen the test rows' model names.
# LabelEncoder.transform raises on labels it was not fitted on, so any consumer
# of label_encoder.pkl.gz must handle model names absent from this CSV.
le = LabelEncoder()
X['model'] = le.fit_transform(X['model'])

# Identify numeric and categorical columns.
# This is evaluated after the encoding above, so the now-integer 'model' column
# is part of num_features and gets standardised with the other numeric columns.
num_features = X.select_dtypes(exclude="object").columns
onehot_columns = ['seller_type', 'fuel_type', 'transmission_type']

# Preprocessing: OneHotEncoding and Scaling
numeric_transformer = StandardScaler()
# drop='first' removes one dummy column per categorical feature.
# NOTE(review): no handle_unknown is set, so transform() raises on a category
# that was not present in the training split — confirm this is intended.
oh_transformer = OneHotEncoder(drop='first')

# Columns listed in neither group are forwarded unchanged (remainder='passthrough').
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformer, num_features)
    ], remainder='passthrough'
)

# --- Split Data into Training and Test Sets ---
# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Apply preprocessing ---
# Fit the preprocessor on the training data only
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)  # Apply transform to X_test (no fitting)

# Persist the fitted preprocessor and the label encoder as gzip-compressed pickles
with gzip.open('preprocessor.pkl.gz', 'wb') as f:
    pickle.dump(preprocessor, f)

with gzip.open('label_encoder.pkl.gz', 'wb') as f:
    pickle.dump(le, f)

# --- Train the Random Forest model ---
# random_state fixes the bootstrap sampling and the per-split feature
# sub-sampling (max_features=5), so re-running the notebook rebuilds the same
# forest, the same pickled artifact and the same evaluation metrics. The seed
# matches the one used for train_test_split.
model = RandomForestRegressor(
    n_estimators=1000,
    min_samples_split=2,
    max_features=5,
    max_depth=15,
    n_jobs=-1,  # use all available CPU cores
    random_state=42,
)

model.fit(X_train_transformed, y_train)

# Save the trained model using gzip
with gzip.open('random_forest_model.pkl.gz', 'wb') as f:
    pickle.dump(model, f)

# --- Evaluate Model ---
def evaluate_model(y_true, y_pred):
    """Score regression predictions against the true target values.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        (square root of the mean squared error) and the R² score.
    """
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )

# Predict on the held-out test features
y_test_pred = model.predict(X_test_transformed)

# Score the predictions and print a four-line report (one print call,
# lines joined with newlines; every value shown to four decimal places)
mae, rmse, r2 = evaluate_model(y_test, y_test_pred)
print(
    "\nModel Evaluation on Test Data:",
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)



Model Evaluation on Test Data:
Mean Absolute Error (MAE): 97172.2966
Root Mean Squared Error (RMSE): 206262.6250
R² Score: 0.9435
