## Step 1: Read Data and Rename Columns

In [None]:
import os

import pandas as pd

# Default dataset location, relative to the notebook's working directory.
DATA_PATH = "data/Automobile_data.csv"

# Resolve the CSV path: use the default when present, otherwise ask the user
# through an interactive file picker.
if os.path.exists(DATA_PATH):
    file_path = DATA_PATH
else:
    import tkinter as tk
    from tkinter import filedialog

    root = tk.Tk()
    root.withdraw()  # hide the empty root window; only the dialog is shown
    try:
        file_path = filedialog.askopenfilename(
            title="Select Automobile_data.csv",
            filetypes=[("CSV files", "*.csv")],
        )
    finally:
        root.destroy()  # release the Tk instance so no hidden window lingers
    if not file_path:
        raise FileNotFoundError("No file selected. Please select a valid CSV file.")

df = pd.read_csv(file_path)

# Strip stray whitespace from the headers *before* renaming, so the keys below
# match even when the CSV header carries leading/trailing spaces.
df.columns = df.columns.str.strip()

# Give the abbreviated source headers descriptive snake_case names.
# 'Fuel consumption' is deliberately NOT renamed: the outlier, encoding and
# modelling cells further down all reference the column by that exact name.
df = df.rename(columns={
    'r': 'range_km',
    'm (kg)': 'mass_kg',
    'Mt': 'co2_emission_tons',
    'Ewltp (g/km)': 'co2_wltp_g_per_km',
    'Ft': 'fuel_type',
    'Fm': 'fuel_mix',
    'ec (cm3)': 'engine_capacity_cc',
    'ep (KW)': 'engine_power_kw',
    'z (Wh/km)': 'energy_consumption_whpkm',
    'Erwltp (g/km)': 'co2_reduction_wltp_gpkm',
    'Electric range (km)': 'electric_range_km'
})

print(df.columns.tolist())

# Last expression of the cell, so the notebook renders the preview table.
df.head()


## Step 2: Handle Missing Values

In [3]:
# NOTE(review): the medians/modes below are computed on the full frame, i.e.
# before the train/validation/test split done in Step 7, so they also see the
# held-out rows (and the numeric pass covers every numeric column, which
# presumably includes the target). Consider moving imputation into the
# modelling pipeline - confirm whether this is acceptable for the analysis.

# Fill numerical columns with their column median
num_cols = df.select_dtypes(include='number').columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill categorical columns with their most frequent value
cat_cols = df.select_dtypes(include='object').columns
cat_modes = df[cat_cols].mode()
# mode() returns an empty frame when there are no object columns (or none of
# them holds a value); indexing its first row would then raise IndexError.
if not cat_modes.empty:
    df[cat_cols] = df[cat_cols].fillna(cat_modes.iloc[0])

## Step 3: Feature Engineering

In [4]:
# Derived feature: engine power divided by vehicle mass.
# Computed from the values as they stand in this cell, i.e. before the
# outlier capping applied in Step 4.
df['power_to_weight'] = df['engine_power_kw'].div(df['mass_kg'])

## Step 4: Outlier Capping (IQR Clipping)

In [5]:
import numpy as np
# Remove leading/trailing whitespace from every column label so later cells
# can reference columns by their clean names.
df.columns = [label.strip() for label in df.columns]
def cap_outliers(df, cols, factor=1.5):
    """Clip each listed column to its IQR fences and return the capped frame.

    For every column in ``cols`` the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``; values outside them are replaced by the nearest
    fence. Rows are never removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is left untouched; the work is done on a copy so that
        calling this function has no side effect on the caller's frame.
    cols : iterable of str
        Names of the numeric columns to cap.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns clipped.
    """
    capped = df.copy()
    for col in cols:
        q1 = capped[col].quantile(0.25)
        q3 = capped[col].quantile(0.75)
        spread = q3 - q1
        capped[col] = capped[col].clip(q1 - factor * spread, q3 + factor * spread)
    return capped

# Columns whose extreme values are clipped to their IQR fences
cap_columns = [
    'Fuel consumption',
    'mass_kg',
    'engine_power_kw',
    'co2_wltp_g_per_km',
]
df = cap_outliers(df, cols=cap_columns)


In [6]:
# Sanity check: list every column that still contains missing values
missing_summary = df.isna().sum()
still_missing = missing_summary.loc[missing_summary.gt(0)]
print("Missing Values After Imputation:\n", still_missing)


Missing Values After Imputation:
 Series([], dtype: int64)


In [7]:
# Drop 'range_km' when it holds a single distinct value and therefore carries
# no information. The membership check keeps the cell safe to re-run: once the
# column is gone, looking it up again would otherwise raise KeyError.
if 'range_km' in df.columns and df['range_km'].nunique() == 1:
    df = df.drop(columns=['range_km'])


In [8]:
def iqr_outlier_summary(data, columns):
    """Print, for each column, the number of values outside its 1.5*IQR fences.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to inspect.
    columns : iterable of str
        Names of the numeric columns to summarise.

    Nothing is returned; two lines are printed per column (the outlier count
    and the fence values), followed by a blank line.
    """
    for col in columns:
        values = data[col]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile
        lower_bound = first_quartile - 1.5 * spread
        upper_bound = third_quartile + 1.5 * spread
        # Rows strictly below the lower fence or strictly above the upper one
        too_low = values.lt(lower_bound)
        too_high = values.gt(upper_bound)
        outliers = data.loc[too_low | too_high]
        print(f"{col}: {len(outliers)} outliers remaining")
        print(f"IQR bounds: ({lower_bound:.2f}, {upper_bound:.2f})\n")

# Numerical columns to re-check after capping
iqr_columns = [
    'Fuel consumption',
    'mass_kg',
    'engine_power_kw',
    'co2_wltp_g_per_km',
]
iqr_outlier_summary(data=df, columns=iqr_columns)


Fuel consumption: 0 outliers remaining
IQR bounds: (4.30, 6.70)

mass_kg: 0 outliers remaining
IQR bounds: (527.50, 2507.50)

engine_power_kw: 0 outliers remaining
IQR bounds: (-13.00, 219.00)

co2_wltp_g_per_km: 0 outliers remaining
IQR bounds: (44.00, 196.00)



## Step 5: Encoding and Scaling

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Target is the fuel-consumption column; every other column is a feature
y = df['Fuel consumption']
X = df.drop(columns='Fuel consumption')

# Split the feature names by dtype: numeric vs. object (string) columns
num_features = list(X.select_dtypes(include='number').columns)
cat_features = list(X.select_dtypes(include='object').columns)

# Preprocessor: standardise numeric features, one-hot encode categorical ones.
# Categories not seen during fit are ignored at transform time.
numeric_step = ("num", StandardScaler(), num_features)
categorical_step = ("cat", OneHotEncoder(handle_unknown='ignore'), cat_features)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

## Step 6: PCA

In [None]:
from sklearn.base import clone

# Apply PCA after preprocessing.
#
# PCA with a fractional n_components (keep 95% of the variance) needs a dense
# matrix, but the ColumnTransformer switches to a sparse result when the
# one-hot columns dominate. A private copy of the preprocessor with
# sparse_threshold=0 always returns a dense array. Cloning also means the
# shared `preprocessor` object is not fitted as a side effect of this cell.
# NOTE(review): a dense one-hot matrix can be large when the categorical
# columns have many distinct values - confirm it fits in memory.
dense_preprocessor = clone(preprocessor).set_params(sparse_threshold=0)

pca_pipeline = Pipeline([
    ('preprocessor', dense_preprocessor),
    ('pca', PCA(n_components=0.95))
])

X_pca = pca_pipeline.fit_transform(X)
print("Original shape:", X.shape)
print("Reduced shape:", X_pca.shape)

## Step 7: Model Building and Hyperparameter Tuning

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
import numpy as np

# Split data: 700k rows for training, then the remainder is divided 67/33
# into validation and test (roughly 200k / 100k on a ~1M-row dataset).
TRAIN_ROWS = 700_000
TUNING_ROWS = 100_000

# An absolute train_size must leave rows over for the other split, so fall
# back to a 70% share when the dataset is not larger than TRAIN_ROWS.
train_size = TRAIN_ROWS if len(X) > TRAIN_ROWS else 0.7
X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=train_size, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.33, random_state=42)

# Sample for tuning, capped at the number of training rows actually available
# (DataFrame.sample raises when asked for more rows than exist).
X_sample = X_train.sample(n=min(TUNING_ROWS, len(X_train)), random_state=42)
y_sample = y_train.loc[X_sample.index]

def tune_model(name, model, params):
    """Run a randomized hyperparameter search for one regressor.

    The regressor is placed behind the notebook-level ``preprocessor`` in a
    pipeline and searched on the notebook-level ``X_sample`` / ``y_sample``.

    Parameters
    ----------
    name : str
        Label used in the printed summary line.
    model : estimator
        Unfitted regressor used as the pipeline's final ``'model'`` step.
    params : dict
        Parameter distributions, keyed with the ``model__`` prefix.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    search_settings = dict(
        param_distributions=params,
        n_iter=10,
        cv=3,
        scoring='r2',
        verbose=1,
        n_jobs=-1,
        random_state=42,
    )
    estimator = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    search = RandomizedSearchCV(estimator, **search_settings)
    search.fit(X_sample, y_sample)
    print(f"{name} best R² score: {search.best_score_:.4f}")
    return search

# Parameter grids. Every key carries the `model__` prefix so it addresses the
# pipeline step named 'model'.

# Random forest
param_rf = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[10, None],
    model__min_samples_split=[2, 5],
)

# Gradient boosting
param_gbr = dict(
    model__n_estimators=[100, 200],
    model__learning_rate=[0.05, 0.1],
    model__max_depth=[3, 5],
)

# Histogram-based gradient boosting
param_hgb = dict(
    model__max_iter=[100, 200],
    model__learning_rate=[0.05, 0.1],
)

# XGBoost
param_xgb = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[3, 5],
    model__learning_rate=[0.05, 0.1],
    model__subsample=[0.8, 1.0],
    model__colsample_bytree=[0.8, 1.0],
)

# Model tuning. Every estimator is seeded so that a fresh Restart & Run All
# reproduces the same search results (previously only XGBoost was seeded).
# NOTE(review): HistGradientBoostingRegressor presumably needs dense input; if
# the preprocessor emits a sparse matrix that search may raise - confirm.
search_rf = tune_model("RandomForest", RandomForestRegressor(random_state=42), param_rf)
search_gbr = tune_model("GradientBoosting", GradientBoostingRegressor(random_state=42), param_gbr)
search_hgb = tune_model("HistGradientBoosting", HistGradientBoostingRegressor(random_state=42), param_hgb)
search_xgb = tune_model("XGBoost", XGBRegressor(objective='reg:squarederror', random_state=42), param_xgb)


## Step 8: Evaluation

In [None]:
def evaluate_model(name, model, X_data, y_data, dataset_name):
    """Print RMSE, MAE and R2 of ``model`` on one dataset.

    Parameters
    ----------
    name : str
        Model label shown in the printed line.
    model : fitted estimator
        Any object exposing ``predict``.
    X_data, y_data
        Features and true target values to score against.
    dataset_name : str
        Dataset label shown in the printed line.

    Nothing is returned; the metrics are only printed.
    """
    predictions = model.predict(X_data)
    squared_error = mean_squared_error(y_data, predictions)
    rmse = np.sqrt(squared_error)  # root of the mean squared error
    mae = mean_absolute_error(y_data, predictions)
    r2 = r2_score(y_data, predictions)
    print(f"{name} on {dataset_name} set: RMSE={rmse:.2f}, MAE={mae:.2f}, R2={r2:.4f}")

# Fitted search objects, keyed by the label used in the report
models = {
    "RandomForest": search_rf,
    "GradientBoosting": search_gbr,
    "HistGradientBoosting": search_hgb,
    "XGBoost": search_xgb
}

# Datasets to score, in reporting order: (label, features, target)
evaluation_sets = [
    ("Train", X_train, y_train),
    ("Validation", X_valid, y_valid),
    ("Test (Production)", X_test, y_test),
]

for model_name, model in models.items():
    for split_label, X_split, y_split in evaluation_sets:
        evaluate_model(model_name, model, X_split, y_split, split_label)
