# Model Training and Evaluation

In this notebook:
- Train baseline models (e.g., Random Forest, XGBoost)
- Compare performance metrics
- Log experiments using MLflow

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import mlflow

# Load the processed dataset. The frame read here already holds both the
# feature columns and the 'is_delay' target (see the drop/select below), so it
# must not be rebuilt from X_train/y_train: those names do not exist until
# train_test_split runs, and referencing them here raised
# "NameError: name 'X_train' is not defined".
df = pd.read_csv("../data/processed/flights_clean.csv")

# Separate the features from the target column.
X = df.drop(columns=['is_delay'])
y = df['is_delay']

# Hold out 20% of the rows for evaluation. stratify=y keeps the class
# proportions of the target the same in both splits; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Start MLflow run
mlflow.set_experiment("flight_delay_prediction")
with mlflow.start_run(run_name="RandomForest_baseline"):
    # Baseline model: 100 trees, fixed seed, n_jobs=-1 to use all CPU cores.
    clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    clf.fit(X_train, y_train)

    # Hard class predictions for the threshold-based metrics, and the
    # probability of the second class (column 1) for ROC AUC.
    # NOTE(review): assumes 'is_delay' is a binary 0/1 target — confirm.
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    # Log metrics
    mlflow.log_metrics({"accuracy": acc, "precision": prec, "recall": rec, "f1_score": f1, "roc_auc": auc})

    # Log model
    mlflow.sklearn.log_model(clf, "model")

    print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}")


NameError: name 'X_train' is not defined