In [None]:
import numpy as np
import pandas as pd
import math

import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

# Single seed shared by NumPy's global RNG and TensorFlow's global RNG.
SEED = 7
for _sembrar in (np.random.seed, tf.random.set_seed):
    _sembrar(SEED)

# --- Load the CSV ---
#############################################
#############################################
#############################################

# Path of the input CSV file. Fill this in before running the cell.
RUTA_CSV = ""

#############################################
#############################################
#############################################

# Fail fast with an actionable message when the placeholder was left empty.
# The exception type matches what pandas raises for a missing file, so any
# surrounding error handling keeps working.
if not RUTA_CSV:
    raise FileNotFoundError(
        "RUTA_CSV está vacío: escribe la ruta del archivo CSV antes de ejecutar esta celda."
    )

# Raw dataset; every later cell derives its inputs from this frame.
df = pd.read_csv(RUTA_CSV)

# Show the available column names so the target and feature lists can be chosen.
df.columns


In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

In [None]:
# --- Choose the target column ---

#######################################################
#######################################################
#######################################################

# Name of the column in df that holds the label to predict. Fill this in.
objetivo = ""

#######################################################
#######################################################
#######################################################

# An empty or misspelled name would otherwise surface as a bare KeyError('').
# Raise the same exception type, but list the columns that actually exist.
if objetivo not in df.columns:
    raise KeyError(
        f"La columna objetivo {objetivo!r} no existe en df. "
        f"Columnas disponibles: {list(df.columns)}"
    )

# y: the target Series; X: every remaining column.
y = df[objetivo]
X = df.drop(columns=[objetivo])

In [None]:
# =========================================================
# Split: train_full / test first, then train / val
# =========================================================
# Both splits hold out 20% of their input and share the same seed;
# each one is stratified on the labels it receives.
_split_opts = {"test_size": 0.20, "random_state": SEED}

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, stratify=y, **_split_opts
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, stratify=y_train_full, **_split_opts
)

# Show the feature names available for the selection cell below.
X_train.columns

In [None]:
# --- Choose the numeric and the categorical columns ---

#######################################################
#######################################################
#######################################################

cols_num = []  # names of the numeric feature columns
cols_cat = []  # names of the categorical feature columns

#######################################################
#######################################################
#######################################################

# NOTE: the training slice must come from X_train (not from X_train_full).
# Each partition is sliced with the same column lists, in train / val / test order.
X_train_num, X_val_num, X_test_num = (
    particion[cols_num] for particion in (X_train, X_val, X_test)
)
X_train_cat, X_val_cat, X_test_cat = (
    particion[cols_cat] for particion in (X_train, X_val, X_test)
)

# Echo the categorical list; it is split into nominal/ordinal in the next cell.
cols_cat

In [None]:
# --- Choose the nominal and the ordinal columns ---

#######################################################
#######################################################
#######################################################

########## categorical encoding
# Column groups
cols_onehot: list = []   # NOMINAL -> One-Hot
cols_ordinal: list = []  # ORDINAL -> ordinal encoding

# Ordered categories for the ordinal columns: one inner list per column,
# given in the same order as cols_ordinal.
categorias_ordinales: list = []  # list of lists

#######################################################
#######################################################
#######################################################

# Nominal branch: fill missing values with the most frequent category, then
# one-hot encode into a dense array without dropping any level.
# handle_unknown="ignore" keeps transform from raising on unseen categories.
_pipe_onehot = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(sparse_output=False, drop=None, handle_unknown="ignore")),
])

# Ordinal branch: same imputation, then encode using the explicit category
# order supplied in categorias_ordinales; unknown categories are encoded as -1.
_pipe_ordinal = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder(
        categories=categorias_ordinales,
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )),
])

# Columns not listed in either group are dropped; output names carry no
# transformer-name prefix.
preprocessor_cat = ColumnTransformer(
    transformers=[
        ("onehot", _pipe_onehot, cols_onehot),
        ("ordinal", _pipe_ordinal, cols_ordinal),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

# Fit on the training split only, then apply the fitted transformer to all three splits.
X_train_cat_proc = preprocessor_cat.fit(X_train_cat).transform(X_train_cat)
X_val_cat_proc = preprocessor_cat.transform(X_val_cat)
X_test_cat_proc = preprocessor_cat.transform(X_test_cat)

# Output column names, in the order of the transformed arrays.
cols_out_cat = list(preprocessor_cat.get_feature_names_out())

# -----------------------------------------
# Rename One-Hot outputs to the col___category format
# -----------------------------------------
# The map is built from the fitted encoder's own (column, category) pairs
# rather than by prefix-matching the generated names. Prefix matching is
# ambiguous when one column name is a prefix of another: with columns "a" and
# "a_b", the output "a_b_x" of column "a_b" would be claimed by column "a"
# and renamed "a___b_x".
rename_map = {}
if len(cols_onehot) > 0:
    ohe = preprocessor_cat.named_transformers_["onehot"].named_steps["encoder"]

    # categories_ holds one array per encoded column, in the order of cols_onehot.
    for col, categorias in zip(cols_onehot, ohe.categories_):
        for cat in categorias:
            # The encoder names each output "<column>_<category>".
            rename_map[f"{col}_{str(cat)}"] = f"{col}___{str(cat)}"

# Names that are not One-Hot outputs (the ordinal columns) pass through unchanged.
cols_out_cat = [rename_map.get(c, c) for c in cols_out_cat]

# Wrap each encoded array in a DataFrame that carries the renamed columns and
# the row index of the split it came from (train / val / test order).
df_train_cat_encode, df_val_cat_encode, df_test_cat_encode = (
    pd.DataFrame(matriz, columns=cols_out_cat, index=origen.index)
    for matriz, origen in (
        (X_train_cat_proc, X_train_cat),
        (X_val_cat_proc, X_val_cat),
        (X_test_cat_proc, X_test_cat),
    )
)

df_train_cat_encode

In [None]:
# =========================================================
# Numeric features: imputation + scaling
# =========================================================
# Cast every numeric split to float32 (the results are still DataFrames).
X_train_num, X_val_num, X_test_num = (
    bloque.astype(np.float32) for bloque in (X_train_num, X_val_num, X_test_num)
)

# Median imputation followed by standardization.
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# The pipeline is fitted on the training split only.
num_pipe.fit(X_train_num)

# Column labels reused for all three transformed frames.
num_cols_out = X_train_num.columns

# Transform each split and restore its column labels and row index.
T_train_num, T_val_num, T_test_num = (
    pd.DataFrame(num_pipe.transform(bloque), columns=num_cols_out, index=bloque.index)
    for bloque in (X_train_num, X_val_num, X_test_num)
)

T_train_num

In [None]:
# =========================================================
# Join numeric + categorical features
# =========================================================
# Column-wise concatenation per split: numeric columns first, encoded
# categorical columns after them.
X_train_final_df, X_val_final_df, X_test_final_df = (
    pd.concat([numericas, categoricas], axis="columns")
    for numericas, categoricas in (
        (T_train_num, df_train_cat_encode),
        (T_val_num, df_val_cat_encode),
        (T_test_num, df_test_cat_encode),
    )
)

# float32 numpy arrays for Keras (in case they are needed later).
X_train_final, X_val_final, X_test_final = (
    tabla.to_numpy(dtype=np.float32)
    for tabla in (X_train_final_df, X_val_final_df, X_test_final_df)
)

# =========================================================
# Re-attach the target column (aligned by index)
# =========================================================
# assign() returns a new frame, so the feature-only frames stay untouched.
# Labels are looked up by each frame's own index and stored under "target".
train_final = X_train_final_df.assign(
    target=y_train.loc[X_train_final_df.index].to_numpy()
)
val_final = X_val_final_df.assign(
    target=y_val.loc[X_val_final_df.index].to_numpy()
)
test_final = X_test_final_df.assign(
    target=y_test.loc[X_test_final_df.index].to_numpy()
)

# Report the shape of each feature matrix.
for _etiqueta, _matriz in (
    ("X_train_final:", X_train_final),
    ("X_val_final  :", X_val_final),
    ("X_test_final :", X_test_final),
):
    print(_etiqueta, _matriz.shape)

In [None]:
# =====================================================
# Save artifacts + final datasets
# =====================================================
import os
import joblib
import json

# Output folder; created if missing, reused if it already exists.
ARTIFACT_DIR = "artifacts_preprocesamiento"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# Final feature order (numeric columns first, then encoded categorical ones).
_nombres_features = list(X_train_final_df.columns)

# 1) Fitted transformers and the feature-name list
for _archivo, _objeto in (
    ("num_pipe.joblib", num_pipe),
    ("cat_preprocessor.joblib", preprocessor_cat),
    ("feature_names.joblib", _nombres_features),
):
    joblib.dump(_objeto, f"{ARTIFACT_DIR}/{_archivo}")

# 2) Final datasets WITH the target column (row index is not written)
for _archivo, _tabla in (
    ("train_final.csv", train_final),
    ("val_final.csv", val_final),
    ("test_final.csv", test_final),
):
    _tabla.to_csv(f"{ARTIFACT_DIR}/{_archivo}", index=False)

# 3) Column bookkeeping in a JSON file
metadata = {
    "cols_num": cols_num,
    "cols_cat": cols_cat,
    "cols_onehot": cols_onehot,
    "cols_ordinal": cols_ordinal,
    # everything after the numeric block: the already-renamed categorical columns
    "cat_out_cols": _nombres_features[len(cols_num):],
    # numeric + categorical, in final order
    "feature_names": list(_nombres_features),
}

with open(f"{ARTIFACT_DIR}/metadata_preprocesamiento.json", "w") as salida:
    salida.write(json.dumps(metadata, indent=2))


print("Todos los artefactos y datasets fueron guardados correctamente.")


In [None]:
import zipfile
import os

# Folder to pack and the archive to produce (written to the working directory).
ARTIFACT_DIR = "artifacts_preprocesamiento"
ZIP_NAME = "artifacts_preprocesamiento.zip"

# Walk the artifact folder and store each file under its path relative to
# that folder, so the archive has no leading directory component.
with zipfile.ZipFile(ZIP_NAME, mode="w", compression=zipfile.ZIP_DEFLATED) as paquete:
    for carpeta, _subcarpetas, nombres in os.walk(ARTIFACT_DIR):
        for nombre in nombres:
            ruta = os.path.join(carpeta, nombre)
            paquete.write(ruta, arcname=os.path.relpath(ruta, start=ARTIFACT_DIR))

print("ZIP final creado correctamente:", ZIP_NAME)
