In [3]:
# ================================
# FEATURE ENGINEERING - STANDALONE SCRIPT (Updated)
# ================================

import pandas as pd
from pathlib import Path

# -----------------------------
# Paths
# -----------------------------
# Project root. Raw inputs are read from data/bosch; engineered outputs are
# written to data/processed.
project_dir = Path(r"C:\Users\Admin\Downloads\Internship\Bosch_PMP")
data_dir = project_dir.joinpath("data", "bosch")
processed_dir = project_dir.joinpath("data", "processed")
# Create the output folder (and any missing parents) up front; this is a
# no-op when the folder already exists.
processed_dir.mkdir(parents=True, exist_ok=True)

# Zipped CSV inputs
num_fp = data_dir.joinpath("train_numeric.csv.zip")
date_fp = data_dir.joinpath("train_date.csv.zip")
cat_fp = data_dir.joinpath("train_categorical.csv.zip")

# -----------------------------
# 1. Load numeric data & labels
# -----------------------------
print("Loading numeric data...")
# Only the first 50,000 rows are read to bound memory use; adjust ``nrows``
# to match the memory available.
num_df = pd.read_csv(num_fp, nrows=50000, compression="zip")
# Keep the row identifier next to the label, exposing "Response" as "target".
labels = pd.DataFrame({"Id": num_df["Id"], "target": num_df["Response"]})

# -----------------------------
# 2. Compute cycle time from date data
# -----------------------------
print("Loading date/time data...")
date_df = pd.read_csv(date_fp, nrows=200000, compression="zip")  # adjust nrows
# Every column except the row identifier is treated as a date/time value.
date_cols = [name for name in date_df.columns if name != "Id"]
stamps = date_df[date_cols]
# Row-wise smallest and largest values. NaNs are skipped (the pandas default),
# so a row with no values at all yields NaN.
date_min = stamps.min(axis=1)
date_max = stamps.max(axis=1)
# Cycle time = span between a row's smallest and largest date values.
cycle_time = date_max.sub(date_min).astype("float32")
date_features = date_df[["Id"]].assign(cycle_time=cycle_time)

# -----------------------------
# 3. Compute station flags from categorical data
# -----------------------------
print("Loading categorical data...")
# Read every column as text so pandas does not warn about mixed dtypes.
cat_df = pd.read_csv(cat_fp, dtype=str, nrows=200000, compression="zip")

# Bucket the feature columns by the second "_"-separated token of their name
# (used here as the station label). Dicts preserve insertion order, so the
# flag columns come out in order of first appearance.
station_columns = {}
for feature in cat_df.columns:
    if feature == "Id":
        continue
    tokens = feature.split("_")
    if len(tokens) < 2:
        continue  # no station token in this column name
    station_columns.setdefault(f"flag_{tokens[1]}", []).append(feature)

# A station's flag is 1 when any of its columns holds a non-missing value.
station_flags = pd.DataFrame({"Id": cat_df["Id"]})
for flag_name, members in station_columns.items():
    station_flags[flag_name] = cat_df[members].notna().any(axis=1).astype("int64")

# -----------------------------
# 4. Numeric aggregates (mean, median, missing count)
# -----------------------------
# Row-wise summaries are taken over the measurement columns only, so the
# identifier and the label (when present) are excluded first.
agg_df = num_df.loc[:, ~num_df.columns.isin(["Id", "Response"])]
num_mean = agg_df.mean(axis=1)  # NaNs are skipped by default
num_median = agg_df.median(axis=1)
num_missing = agg_df.isna().sum(axis=1)  # number of NaN cells per row

numeric_agg = num_df[["Id"]].assign(
    num_mean=num_mean,
    num_median=num_median,
    num_missing=num_missing,
)


# -----------------------------
# 5. Merge everything
# -----------------------------
print("Converting Id columns to int64 for merging...")
# Give every table the same key dtype before joining on it.
for table in (labels, date_features, station_flags, numeric_agg):
    table["Id"] = table["Id"].astype("int64")

print("Merging features...")
# Left joins keep every labelled row; features missing for an Id become NaN.
df = labels
for features in (date_features, station_flags, numeric_agg):
    df = df.merge(features, on="Id", how="left")

# -----------------------------
# 6. Save processed dataset
# -----------------------------
output_fp = processed_dir.joinpath("bosch_clean.csv")
# index=False: the positional index is not written; "Id" is already a column.
df.to_csv(path_or_buf=output_fp, index=False)

print(f"✅ Feature engineering complete! Saved: {output_fp}")
print("Processed dataset shape:", df.shape)


Loading numeric data...
Loading date/time data...
Loading categorical data...
Converting Id columns to int64 for merging...
Merging features...
✅ Feature engineering complete! Saved: C:\Users\Admin\Downloads\Internship\Bosch_PMP\data\processed\bosch_clean.csv
Processed dataset shape: (50000, 40)
