# PASO 1 — Cargar, chequear y preparar features 

In [11]:
# === Celda 1: imports y paths ===
import os
import pandas as pd
import numpy as np

RAW = "../data/raw/status_clean.parquet"
OUT = "../data/curated/ecobici_model_ready.parquet"

assert os.path.exists(RAW), f"No existe {RAW}"


In [12]:
# === Celda 2: lectura y chequeos mínimos ===
df = pd.read_parquet(RAW)

# Recomendado: estos nombres suelen estar (ajustá si difieren)
CAND_TS = "ts_local"
CAND_ID = "station_id"
CAND_Y  = "num_bikes_available"   # <-- si tu target se llama distinto, cámbialo aquí

# Parseo de timestamp
if CAND_TS in df.columns:
    df[CAND_TS] = pd.to_datetime(df[CAND_TS], errors="coerce")

# Orden temporal y limpieza básica
df = df.sort_values([CAND_ID, CAND_TS]).reset_index(drop=True)
df = df.drop_duplicates(subset=[CAND_ID, CAND_TS])

# Normalizar station_id (todo int si se puede)
def _coerce_station_id_dtype(s):
    s = s.astype(str)
    if s.str.fullmatch(r"\d+").all():
        return pd.to_numeric(s, errors="coerce").astype("Int64")
    return s
df[CAND_ID] = _coerce_station_id_dtype(df[CAND_ID])

# Reporte rápido
print("Shape:", df.shape)
print("Cols:", df.columns.tolist())
print(df[[CAND_ID, CAND_TS, CAND_Y]].head())

Shape: (219128, 21)
Cols: ['station_id', 'num_bikes_available', 'num_bikes_available_types', 'num_bikes_disabled', 'num_docks_available', 'num_docks_disabled', 'last_reported', 'is_charging_station', 'status', 'is_installed', 'is_renting', 'is_returning', 'traffic', '_file_last_updated', 'ts_local', 'name', 'lat', 'lon', 'capacity', 'address', 'is_closed']
    station_id                  ts_local  num_bikes_available
0          101 2025-10-03 08:46:42-03:00                    3
2          101 2025-10-03 08:49:43-03:00                    3
5          101 2025-10-03 08:52:43-03:00                    3
7          101 2025-10-03 08:55:45-03:00                    2
10         101 2025-10-03 08:58:45-03:00                    2


In [13]:
# === Celda 3: features mínimas sin fuga ===
# Señales temporales
df["hour"]      = df[CAND_TS].dt.hour
df["dow"]       = df[CAND_TS].dt.dayofweek      # 0=Lunes
df["is_weekend"]= df["dow"].isin([5,6]).astype(int)
df["month"]     = df[CAND_TS].dt.month

# Cíclicas (opcional y útil)
df["hour_sin"]  = np.sin(2*np.pi*df["hour"]/24)
df["hour_cos"]  = np.cos(2*np.pi*df["hour"]/24)

# Lags y rolling por estación (evita fuga agrupando)
df["y_lag1"] = df.groupby(CAND_ID)[CAND_Y].shift(1)
df["y_lag2"] = df.groupby(CAND_ID)[CAND_Y].shift(2)
df["y_ma3"]  = (df.groupby(CAND_ID)[CAND_Y]
                  .transform(lambda s: s.shift(1).rolling(3).mean()))  # media móvil usando info previa

# (Opc) clip/llenado conservador: NO rellenar target; sí features derivadas si deseas
for c in ["y_lag1","y_lag2","y_ma3"]:
    df[c] = df[c].astype(float)

# Drop filas sin lags (primeras por estación)
min_lags = df[["y_lag1","y_lag2","y_ma3"]].isna().any(axis=1)
df_model = df.loc[~min_lags].copy()

print("Después de features:", df_model.shape)

Después de features: (217949, 30)


In [4]:
# === Celda 4: guardar dataset canónico de modelado ===
os.makedirs(os.path.dirname(OUT), exist_ok=True)
df_model.to_parquet(OUT, index=False)
print(f"Guardado: {OUT}")

Guardado: ../data/curated/ecobici_model_ready.parquet


In [14]:
# Para el df['ts_local']
df['ts_local'] = pd.to_datetime(df['ts_local'])

# Extraer solo la fecha (sin hora)
df['fecha'] = df['ts_local'].dt.date

# Contar días distintos
dias_distintos = df['fecha'].nunique()
print(dias_distintos)

3


In [15]:
df['fecha'].unique()

array([datetime.date(2025, 10, 3), datetime.date(2025, 10, 5),
       datetime.date(2025, 10, 6)], dtype=object)