# Tarea 4 - Modelamiento
Roles involucrados: Ciencia de datos
Encargada: Ana Sofía Arizmendi - 202115093


### Instalar librerías

In [1]:
%pip install -q pandas numpy scikit-learn dash plotly joblib
import os, json, joblib, numpy as np, pandas as pd
from pathlib import Path
%pip install -q "scikit-learn<1.4"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### Cargar datos

In [2]:
# Location of the cleaned CSV produced in Tarea 3.
# Set the INCIDENT_CSV_PATH environment variable to use a different copy of
# the file; when it is not set, the original local path is used.
CSV_PATH = Path(
    os.environ.get(
        "INCIDENT_CSV_PATH",
        "/Users/anasofiaarizmendi/Desktop/U/20252/Analitica/Proyecto 1/Repo_Proyecto_1/Tarea_3/incident_event_log_clean.csv",
    )
)
if not CSV_PATH.is_file():
    # Same exception type pd.read_csv would raise, with a hint on how to fix it.
    raise FileNotFoundError(
        f"No se encontró el CSV en '{CSV_PATH}'. "
        "Define la variable de entorno INCIDENT_CSV_PATH con la ruta correcta."
    )
df = pd.read_csv(CSV_PATH)
df.head(3)

Unnamed: 0,number,incident_state,active,reassignment_count,reopen_count,sys_mod_count,made_sla,caller_id,opened_by,opened_at,...,resolved_at,closed_at,resolution_time,closure_time,is_outlier_reassignment,is_outlier_reopen,is_invalid_time,is_outlier_resolution_time,is_outlier_closure_time,resolution_time_log
0,INC0000045,New,True,0,0,0,True,Caller 2403,Opened by 8,2016-02-29 01:16:00,...,2016-02-29 11:29:00,2016-05-03 12:00:00,0.425694,64.447222,False,False,False,False,False,0.354659
1,INC0000045,Resolved,True,0,0,2,True,Caller 2403,Opened by 8,2016-02-29 01:16:00,...,2016-02-29 11:29:00,2016-05-03 12:00:00,0.425694,64.447222,False,False,False,False,False,0.354659
2,INC0000045,Resolved,True,0,0,3,True,Caller 2403,Opened by 8,2016-02-29 01:16:00,...,2016-02-29 11:29:00,2016-05-03 12:00:00,0.425694,64.447222,False,False,False,False,False,0.354659


### Variables

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
import pandas as pd

# -- Variable definitions
# Candidate categorical columns (filtered later by presence and cardinality).
CANDIDATE_CAT = [
    "incident_state",
    "contact_type",
    "location",
    "category",
    "subcategory",
    "notify",
    "closed_code",
    "resolved_by",
    "opened_by",
    "sys_updated_by",
]

# Numeric count columns.
NUM_COLS = [
    "reassignment_count",
    "reopen_count",
    "sys_mod_count",
]

# Boolean columns (converted to 0/1 floats during preparation).
BOOL_COLS = [
    "active",
    "made_sla",
    "knowledge",
    "u_priority_confirmation",
    "is_outlier_reassignment",
    "is_outlier_reopen",
    "is_invalid_time",
    "is_outlier_resolution_time",
    "is_outlier_closure_time",
]

# -- Time features
def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with calendar features derived from ``opened_at``.

    ``opened_at`` is parsed with ``errors="coerce"``, so values that cannot be
    parsed become ``NaT``. When the column is absent, the copy is returned
    without any new columns.

    Added columns, in order: ``opened_year``, ``opened_month``, ``opened_day``,
    ``opened_hour``, ``opened_dow``, ``opened_dom`` (same value as
    ``opened_day``), ``opened_week`` (ISO week number), ``opened_is_weekend``,
    ``opened_eom`` and ``opened_bom`` (0/1 flags).

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new DataFrame with the extra columns appended.
    """
    out = df.copy()
    if "opened_at" not in out.columns:
        return out

    # Parse as UTC, then drop the timezone so the .dt accessors work on naive stamps.
    dt = pd.to_datetime(out["opened_at"], errors="coerce", utc=True).dt.tz_localize(None)

    out["opened_year"] = dt.dt.year
    out["opened_month"] = dt.dt.month
    out["opened_day"] = dt.dt.day
    out["opened_hour"] = dt.dt.hour
    out["opened_dow"] = dt.dt.dayofweek
    out["opened_dom"] = dt.dt.day

    # isocalendar() yields a nullable integer column that holds <NA> for NaT
    # rows; casting that straight to int raises. Keep the integer dtype when
    # every date parsed, and fall back to float (NaN for missing) otherwise.
    iso_week = dt.dt.isocalendar().week
    if iso_week.isna().any():
        out["opened_week"] = iso_week.astype(float)
    else:
        out["opened_week"] = iso_week.astype(int)

    # dayofweek values 5 and 6 are Saturday and Sunday.
    out["opened_is_weekend"] = (out["opened_dow"] >= 5).astype(int)
    out["opened_eom"] = (dt.dt.is_month_end).astype(int)
    out["opened_bom"] = (dt.dt.is_month_start).astype(int)
    return out

# -- Keep only the variables that are actually present in the frame
def select_columns(df: pd.DataFrame, max_cardinality: int = 80):
    """Split the available feature columns into numeric, boolean and categorical.

    Args:
        df: Frame to inspect (expected to already contain the ``opened_*``
            time features if they are wanted).
        max_cardinality: A categorical candidate is kept only when its number
            of distinct non-null values does not exceed this limit.

    Returns:
        A tuple ``(numeric, boolean, categorical)`` of column-name lists, where
        ``numeric`` is the present ``NUM_COLS`` followed by the present time
        features.
    """
    present = set(df.columns.tolist())
    time_candidates = (
        "opened_year", "opened_month", "opened_day", "opened_hour",
        "opened_dow", "opened_dom", "opened_week", "opened_is_weekend",
        "opened_eom", "opened_bom",
    )

    numeric = [name for name in NUM_COLS if name in present]
    booleans = [name for name in BOOL_COLS if name in present]
    # Drop high-cardinality categoricals so one-hot encoding stays manageable.
    categoricals = [
        name
        for name in CANDIDATE_CAT
        if name in present and df[name].nunique(dropna=True) <= max_cardinality
    ]
    time_numeric = [name for name in time_candidates if name in present]

    return numeric + time_numeric, booleans, categoricals

# -- Build the feature matrix
def prepare_X(df_raw: pd.DataFrame, max_cardinality: int = 80):
    """Build the feature frame and its column schema from a raw frame.

    Time features are added, the usable columns are selected, boolean columns
    are converted to floats via the nullable ``Int64`` dtype, and numeric
    columns are coerced with ``pd.to_numeric`` (invalid values become NaN).

    Args:
        df_raw: Raw input frame; it is not modified.
        max_cardinality: Forwarded to ``select_columns``.

    Returns:
        ``(X, schema)`` where ``schema`` maps ``"numeric"``, ``"boolean"`` and
        ``"categorical"`` to the corresponding column-name lists.
    """
    enriched = add_time_features(df_raw)
    numeric_cols, bool_cols, cat_cols = select_columns(
        enriched, max_cardinality=max_cardinality
    )
    schema = {
        "numeric": numeric_cols,
        "boolean": bool_cols,
        "categorical": cat_cols,
    }

    X = enriched[numeric_cols + bool_cols + cat_cols].copy()
    for col in bool_cols:
        # Going through Int64 first lets missing values survive as NaN.
        X[col] = X[col].astype("Int64").astype(float)
    for col in numeric_cols:
        X[col] = pd.to_numeric(X[col], errors="coerce")

    return X, schema

def build_preprocess(schema):
    """Create the ColumnTransformer that preprocesses the feature frame.

    Args:
        schema: Mapping with the keys ``"numeric"``, ``"boolean"`` and
            ``"categorical"``, each holding a list of column names.

    Returns:
        A ``ColumnTransformer`` with three branches:
        ``num`` (median imputation + ``RobustScaler``),
        ``boo`` (most-frequent imputation) and
        ``cat`` (most-frequent imputation + dense one-hot encoding that
        ignores unseen categories).
    """
    numeric, boolean, categorical = (
        schema[key] for key in ("numeric", "boolean", "categorical")
    )

    # Newer scikit-learn releases take `sparse_output`; older ones reject that
    # keyword with a TypeError and expect `sparse` instead.
    try:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

    numeric_branch = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", RobustScaler()),
    ])
    boolean_branch = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ])
    categorical_branch = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", encoder),
    ])

    return ColumnTransformer([
        ("num", numeric_branch, numeric),
        ("boo", boolean_branch, boolean),
        ("cat", categorical_branch, categorical),
    ])


### Modelo y Evaluación

In [6]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import train_test_split
import numpy as np, json
from pathlib import Path

# Output folder for the Tarea 4 artifacts; created on first run, reused afterwards.
ARTIFACTS = Path("models")
ARTIFACTS.mkdir(parents=False, exist_ok=True)

def evaluate(y_true, y_pred):
    """Compute regression metrics for one set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, same length as ``y_true``.

    Returns:
        Dict with the keys ``"MAE"``, ``"RMSE"`` and ``"R2"`` as plain Python
        floats (JSON-serializable).
    """
    # Take the square root explicitly instead of passing `squared=False`: that
    # keyword is deprecated in scikit-learn 1.4 and removed in later releases.
    # For the single-output targets used here the result is identical.
    mse = mean_squared_error(y_true, y_pred)
    return {
        "MAE": float(mean_absolute_error(y_true, y_pred)),
        "RMSE": float(np.sqrt(mse)),
        "R2": float(r2_score(y_true, y_pred)),
    }

def time_split_indices(dates, frac_train=0.80):
    """Build boolean train/test masks for a chronological split.

    Rows dated at or before the ``frac_train`` quantile go to train, later
    rows go to test. Rows without a date (NaT) are assigned to train, so the
    two masks always partition the input.

    Args:
        dates: Datetime Series (may contain NaT), or ``None`` when no date
            column is available.
        frac_train: Quantile of the valid dates used as the cutoff.

    Returns:
        ``(train_mask, test_mask)`` as boolean NumPy arrays, or ``None`` when a
        time split is not possible (no dates, fewer valid dates than
        ``max(50, 10% of rows)``, or one side would be empty). Callers are
        expected to fall back to a random split in that case.
    """
    if dates is None:
        return None
    if dates.notna().sum() < max(50, int(0.1 * len(dates))):
        # Too many NaT values: let the caller use a random split instead.
        return None

    cutoff = dates.quantile(frac_train)
    # Comparisons against NaT evaluate to False (never NaN), so filling the
    # comparison result cannot route undated rows anywhere. Deriving train as
    # the complement of test sends them to train explicitly.
    te = (dates > cutoff).values
    tr = ~te
    if tr.sum() == 0 or te.sum() == 0:
        return None
    return tr, te

def train_target(df_in: pd.DataFrame, target_col: str, tag: str):
    """Train, evaluate and persist a regression pipeline for one target column.

    Steps: keep rows with a finite, non-negative target; cap the target at its
    99th percentile; build features with ``prepare_X``; split chronologically
    on ``opened_at`` when possible (random 80/20 split otherwise); fit a
    ``HistGradientBoostingRegressor`` on the ``log1p``-transformed target; and
    write ``{tag}_model.pkl``, ``{tag}_metrics.json`` and ``{tag}_schema.json``
    into ``ARTIFACTS``.

    Args:
        df_in: Source frame; it is not modified.
        target_col: Name of the numeric target column.
        tag: Prefix used for log messages and artifact file names.

    Returns:
        The metrics dict produced by ``evaluate`` on the test split.

    Raises:
        ValueError: If no row has a finite, non-negative target.
    """
    # Keep valid targets only. The explicit copy gives the capping assignment
    # below its own frame, so it does not trigger SettingWithCopyWarning.
    target = df_in[target_col]
    d = df_in.loc[np.isfinite(target) & (target >= 0)].copy()
    if d.empty:
        raise ValueError(
            f"[{tag}] No hay registros con '{target_col}' finito y no negativo; no se puede entrenar."
        )
    if len(d) < 100:
        print(f"[{tag}] Muy pocos registros ({len(d)}). Se entrena igual, pero revisa calidad.")

    # Cap label outliers at the 99th percentile.
    cap = d[target_col].quantile(0.99)
    d[target_col] = np.minimum(d[target_col], cap)

    X, schema = prepare_X(d)
    y = d[target_col].astype(float).values

    # Chronological split when opened_at exists and is usable; otherwise fall
    # back to a seeded random split.
    idx = None
    if "opened_at" in d.columns:
        opened = pd.to_datetime(d["opened_at"], errors="coerce")
        idx = time_split_indices(opened, frac_train=0.80)
    if idx is not None:
        tr, te = idx
        X_train, X_test = X.iloc[tr], X.iloc[te]
        y_train, y_test = y[tr], y[te]
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.20, random_state=42
        )

    pre = build_preprocess(schema)
    base_model = HistGradientBoostingRegressor(
        loss="squared_error", learning_rate=0.06,
        max_depth=None, max_leaf_nodes=31, min_samples_leaf=20, random_state=42
    )
    # Fit on log1p(y) and map predictions back with expm1 (target is >= 0 here).
    model = TransformedTargetRegressor(regressor=base_model, func=np.log1p, inverse_func=np.expm1)

    pipe = Pipeline([("pre", pre), ("est", model)])
    pipe.fit(X_train, y_train)
    yhat = pipe.predict(X_test)
    metrics = evaluate(y_test, yhat)

    # Persist artifacts; recreate the folder in case it was removed after setup.
    ARTIFACTS.mkdir(parents=True, exist_ok=True)
    joblib.dump(pipe, ARTIFACTS / f"{tag}_model.pkl")
    with open(ARTIFACTS / f"{tag}_metrics.json", "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)
    with open(ARTIFACTS / f"{tag}_schema.json", "w", encoding="utf-8") as f:
        json.dump(schema, f, indent=2)

    print(f"[{tag}] MAE={metrics['MAE']:.3f}  RMSE={metrics['RMSE']:.3f}  R2={metrics['R2']:.3f}")
    return metrics

# Train one model per available target and collect the metrics in `report`.
report = {}

has_resolution = "resolution_time" in df.columns
if not has_resolution:
    print("No se encontró 'resolution_time'.")
else:
    report["resolution_time"] = train_target(df, "resolution_time", "resolution_time")

# The closure model needs the column to exist and hold at least one value.
has_closure = "closure_time" in df.columns and df["closure_time"].notna().any()
if not has_closure:
    print("No se encontró 'closure_time' con valores; se omite entrenamiento de cierre.")
else:
    report["closure_time"] = train_target(df, "closure_time", "closure_time")

# Save the combined metrics next to the per-target artifacts.
report_path = ARTIFACTS / "report.json"
with open(report_path, "w") as f:
    json.dump(report, f, indent=2)

report


[resolution_time] MAE=4.773  RMSE=11.152  R2=0.381
[closure_time] MAE=27.521  RMSE=35.689  R2=0.676


{'resolution_time': {'MAE': 4.773339109795819,
  'RMSE': 11.151547263513972,
  'R2': 0.3814864893093579},
 'closure_time': {'MAE': 27.520750990882785,
  'RMSE': 35.68941856842456,
  'R2': 0.6762742905902279}}