# Data Preparation — Manutenção Preditiva (CNC)

In [None]:
import os, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Project folders, expressed relative to the current working directory.
DATA_RAW = "../data/raw"          # the source CSV is read from here
DATA_INTERIM = "../data/interim"  # the split CSVs are written here

# Make sure the output folder exists; exist_ok=True keeps re-runs of this
# cell from failing when the folder is already there.
os.makedirs(DATA_INTERIM, exist_ok=True)

# Load the training table into a DataFrame.
train_csv_path = os.path.join(DATA_RAW, "bootcamp_train.csv")
df = pd.read_csv(train_csv_path)
# Derived feature: torque divided by rotational speed. Zero speeds are swapped
# for NaN first, so those rows get a missing value rather than inf.
rpm_without_zeros = df["velocidade_rotacional"].replace(0, np.nan)
df["torque_por_rpm"] = df["torque"] / rpm_without_zeros

# Label columns; every other column of the frame is kept as a feature.
target_cols = ["falha_maquina", "FDF", "FDC", "FP", "FTE", "FA"]
X = df.drop(columns=target_cols)
Y = df[target_cols]

# Partition the feature columns by dtype: numeric vs. everything else.
numeric_features = X.select_dtypes(include=[np.number])
other_features = X.select_dtypes(exclude=[np.number])
num_cols = numeric_features.columns.tolist()
cat_cols = other_features.columns.tolist()
# Numeric branch: fill missing values with the column median, then standardize.
num_pipe = Pipeline(
    steps=[
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler()),
    ]
)

# Non-numeric branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown="ignore" keeps transform() from raising on
# categories that were not present when the encoder was fitted.
cat_pipe = Pipeline(
    steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own branch. The transformer is only
# constructed here; nothing in this cell fits it.
preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ]
)
# Hold out 20% of the rows for validation. The split is stratified on the
# "falha_maquina" label and seeded so it is reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y["falha_maquina"],
)

# Write each piece of the split to the interim folder. index=False drops the
# row index; features and labels stay aligned by row order within each split.
splits_to_save = {
    "X_train.csv": X_train,
    "X_valid.csv": X_valid,
    "y_train.csv": y_train,
    "y_valid.csv": y_valid,
}
for csv_name, frame in splits_to_save.items():
    frame.to_csv(os.path.join(DATA_INTERIM, csv_name), index=False)