In [3]:
import os
import pandas as pd
import numpy as np
import naiveautoml
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, log_loss

BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), "../.."))
DATA_PATH = os.path.join(BASE_DIR, "dataset", "data", "premier_dataset_final.csv")
# --- cargar y ordenar ---
df = pd.read_csv(DATA_PATH)
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values("date").reset_index(drop=True)

split_idx = len(df) // 2
train_df = df.iloc[:split_idx].copy()
valid_df = df.iloc[split_idx:].copy()

TARGET = "result"

# --- columnas post-partido (fuga directa) ---
post_match = [
    "home_goals","away_goals","total_goals","total_ht_goals","BTTS","CleanSheet",
    "home_elo_after","away_elo_after","elo_change_home","elo_change_away",
    "HS","AS","HST","AST","total_shots","total_shots_on_target",
    "shots_per_goal","shots_per_goal_home","shots_per_goal_away",
    "total_shot_accuracy","total_shot_accuracy_home","total_shot_accuracy_away",
    "shot_conversion","shot_conversion_home","shot_conversion_away",
    "shots_on_target_diff"
]

# --- columnas de mercado (pre-partido pero muy fuertes) ---
market_cols = [c for c in df.columns if any(k in c.lower() for k in [
    "odd_", "p_home", "p_draw", "p_away", "overround", "fair",
    "avgh","avgd","avga", "avg>2.5", "avg<2.5", "over_", "under_"
])]

# --- construir X,y sin 'date', sin post-partido y sin mercado ---
drop_cols = [TARGET, "date"] + [c for c in post_match if c in df.columns] + market_cols
X_train = train_df.drop(columns=drop_cols, errors="ignore").replace([np.inf, -np.inf], np.nan)
y_train = train_df[TARGET]
X_valid = valid_df.drop(columns=drop_cols, errors="ignore").replace([np.inf, -np.inf], np.nan)
y_valid = valid_df[TARGET]

# Eliminar columnas constantes
const_cols = [c for c in X_train.columns if X_train[c].nunique(dropna=False) <= 1]
X_train = X_train.drop(columns=const_cols, errors="ignore")
X_valid  = X_valid.drop(columns=const_cols, errors="ignore")

# Separar tipos
num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
cat_cols = [c for c in X_train.columns if c not in num_cols]

naml = naiveautoml.NaiveAutoML(
    show_progress=True,             
)

naml.fit(X_train, y_train, categorical_features=cat_cols, )

y_pred = naml.predict(X_valid)

print("Accuracy:", accuracy_score(y_valid, y_pred))
print("F1-macro:", f1_score(y_valid, y_pred, average="macro"))
try:
    y_proba = naml.predict_proba(X_valid)
    print("LogLoss:", log_loss(y_valid, y_proba, labels=naml.classes_))
except Exception:
    pass

print("\nReporte de clasificación:\n", classification_report(y_valid, y_pred))
print("Matriz de confusión:\n", confusion_matrix(y_valid, y_pred))
print("\nCols eliminadas por constantes:", const_cols)
print("Núm. numéricas:", len(num_cols), " | Núm. categóricas:", len(cat_cols))

Progress for algorithm selection:


100%|██████████| 32/32 [00:32<00:00,  1.03s/it]


Progress for hyperparameter optimization:


100%|██████████| 100/100 [13:00<00:00,  7.81s/it]

Accuracy: 1.0
F1-macro: 1.0

Reporte de clasificación:
               precision    recall  f1-score   support

           A       1.00      1.00      1.00        70
           D       1.00      1.00      1.00        41
           H       1.00      1.00      1.00        79

    accuracy                           1.00       190
   macro avg       1.00      1.00      1.00       190
weighted avg       1.00      1.00      1.00       190

Matriz de confusión:
 [[70  0  0]
 [ 0 41  0]
 [ 0  0 79]]

Cols eliminadas por constantes: ['season', 'max_round', 'is_first_half', 'season_half']
Núm. numéricas: 23  | Núm. categóricas: 2



