In [1]:
# --- Importar librerías ---
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import holidays
from datetime import timedelta

# --- Cargar dataset ---
pkl_path = r'C:\\Users\\ignac\\Python\\asd\\data_limpia_llamadas.pkl'
df = pd.read_pickle(pkl_path)
df = df.sort_values('Fecha').reset_index(drop=True)

# --- Crear nuevas features temporales ---
cl_holidays = holidays.Chile()
df['anio'] = df['Fecha'].dt.year
df['mes'] = df['Fecha'].dt.month
df['dia'] = df['Fecha'].dt.day
df['dia_semana'] = df['Fecha'].dt.weekday  # 0=lunes, 6=domingo
df['fin_de_semana'] = df['dia_semana'].isin([5, 6]).astype(int)
df['es_feriado'] = df['Fecha'].isin(cl_holidays).astype(int)
df['es_laboral'] = (~df['fin_de_semana'].astype(bool) & ~df['es_feriado'].astype(bool)).astype(int)

# --- Agregar lag features ---
df['Llamadas_Lag1'] = df['Llamadas_Recibidas'].shift(1)
df['Llamadas_Lag7'] = df['Llamadas_Recibidas'].shift(7)
df['Llamadas_Rolling_Mean7'] = df['Llamadas_Recibidas'].rolling(window=7).mean()
df['Llamadas_Rolling_Std7'] = df['Llamadas_Recibidas'].rolling(window=7).std()

# Eliminar filas con NaN generados por las lag features
df = df.dropna().reset_index(drop=True)

# --- Variables predictoras y objetivo ---
features = ['Es_Cyber', 'anio', 'mes', 'dia', 'dia_semana', 'fin_de_semana', 'es_feriado', 'es_laboral',
            'Llamadas_Lag1', 'Llamadas_Lag7', 'Llamadas_Rolling_Mean7', 'Llamadas_Rolling_Std7']
X = df[features]
y = df['Llamadas_Recibidas']

# --- Optimización de hiperparámetros con GridSearchCV ---
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

model = XGBRegressor(random_state=42)
tscv = TimeSeriesSplit(n_splits=5)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=tscv,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X, y)
best_model = grid_search.best_estimator_
print(f"Mejores hiperparámetros: {grid_search.best_params_}")

# --- Evaluar el modelo mejorado ---
mae_scores = []
rmse_scores = []
r2_scores = []

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    mae_scores.append(mean_absolute_error(y_test, y_pred))
    rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    r2_scores.append(r2_score(y_test, y_pred))

print(f"MAE promedio: {np.mean(mae_scores):.2f}")
print(f"RMSE promedio: {np.mean(rmse_scores):.2f}")
print(f"R² promedio: {np.mean(r2_scores):.4f}")

# --- Predicciones futuras ---
ultima_fecha = df['Fecha'].max()
futuras_fechas = pd.date_range(start=ultima_fecha + timedelta(days=1), periods=35)

df_futuro = pd.DataFrame({'Fecha': futuras_fechas})
df_futuro['anio'] = df_futuro['Fecha'].dt.year
df_futuro['mes'] = df_futuro['Fecha'].dt.month
df_futuro['dia'] = df_futuro['Fecha'].dt.day
df_futuro['dia_semana'] = df_futuro['Fecha'].dt.weekday
df_futuro['fin_de_semana'] = df_futuro['dia_semana'].isin([5, 6]).astype(int)
df_futuro['es_feriado'] = df_futuro['Fecha'].isin(cl_holidays).astype(int)
df_futuro['es_laboral'] = (~df_futuro['fin_de_semana'].astype(bool) & ~df_futuro['es_feriado'].astype(bool)).astype(int)

# Usar valores promedio para lag features en predicciones futuras
df_futuro['Llamadas_Lag1'] = df['Llamadas_Recibidas'].iloc[-1]
df_futuro['Llamadas_Lag7'] = df['Llamadas_Recibidas'].iloc[-7:].mean()
df_futuro['Llamadas_Rolling_Mean7'] = df['Llamadas_Recibidas'].iloc[-7:].mean()
df_futuro['Llamadas_Rolling_Std7'] = df['Llamadas_Recibidas'].iloc[-7:].std()

df_futuro['Es_Cyber'] = 0  # Asumir que no hay días Cyber por defecto
X_futuro = df_futuro[features]
df_futuro['Llamadas_Predichas'] = best_model.predict(X_futuro)

# --- Mostrar predicciones ---
print(df_futuro[['Fecha', 'Llamadas_Predichas']])

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Mejores hiperparámetros: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
MAE promedio: 343.20
RMSE promedio: 547.32
R² promedio: 0.7754
        Fecha  Llamadas_Predichas
0  2025-04-01         3506.692871
1  2025-04-02         3425.605225
2  2025-04-03         3347.482666
3  2025-04-04         3269.905273
4  2025-04-05         2223.238281
5  2025-04-06         2070.725830
6  2025-04-07         3343.952881
7  2025-04-08         3284.282471
8  2025-04-09         3297.670898
9  2025-04-10         3212.987061
10 2025-04-11         3173.546631
11 2025-04-12         2217.300049
12 2025-04-13         2077.391846
13 2025-04-14         3344.852539
14 2025-04-15         3259.940430
15 2025-04-16         3220.159668
16 2025-04-17         3159.148682
17 2025-04-18         3125.781982
18 2025-04-19         2268.188477
19 2025-04-20         2130.325439
20 2025-04-21         3410.1176

In [2]:
# Guardar como pickle
pkl_path = r'C:\Users\ignac\Python\asd\data_limpia_llamadas2.pkl'
df.to_pickle(pkl_path)
print(f"\n✅ Archivo guardado como '{pkl_path}'")


✅ Archivo guardado como 'C:\Users\ignac\Python\asd\data_limpia_llamadas2.pkl'
