# ElasticNet Hyperparameter Tuning (Extended Grid Search)

This notebook runs an extended grid search over the hyperparameters of an ElasticNet-based model for soccer match draw prediction. The implementation includes:

- Model creation and configuration
- Training with early stopping
- Threshold optimization
- Hyperparameter tuning
- Model evaluation
- MLflow integration for experiment tracking

## Setup and Imports

In [11]:
# --- Cell 1: Extended hypertune_elasticnet() function definition ---
import mlflow
import os
import sys
import numpy as np
import pandas as pd
from typing import Any, Dict, Tuple
from pathlib import Path
from datetime import datetime
from itertools import product
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import random
# Add project root to Python path.
# The root is computed as four directory levels above the current working
# directory, so the result depends on where the kernel was started.
# NOTE(review): confirm the notebook is always launched from its own folder.
project_root = str(Path().absolute().parent.parent.parent.parent)
# Guard against appending a duplicate entry when this cell is re-run.
if project_root not in sys.path:
    sys.path.append(project_root)
# Prepend the root to the PYTHONPATH environment variable as well, keeping any
# value that was already set. Presumably this is for subprocesses spawned by
# the project code — TODO confirm.
os.environ["PYTHONPATH"] = project_root + os.pathsep + os.environ.get("PYTHONPATH", "")
# Initialize the ExperimentLogger
# This import is deliberately placed after the sys.path update above;
# presumably `utils` is a project-local package that is only importable once
# the project root is on the path — verify before moving it to the import block.
from utils.logger import ExperimentLogger
logger = ExperimentLogger("elasticnet_hypertuning_extended")

from models.StackedEnsemble.meta_learners.elasticnet_model import ElasticNetModel

def hypertune_elasticnet(X_train, y_train, X_test, y_test, X_val, y_val, logger,
                         target_precision=0.45):
    """
    Grid-search ElasticNet hyperparameters and log the best candidate to MLflow.

    Every combination of the hard-coded candidate lists below is evaluated by
    building an ``ElasticNetModel``, fitting it, and reading the ``'precision'``
    entry of the metrics returned by ``fit()``. A candidate becomes the new best
    only if its precision is at least ``target_precision`` and strictly greater
    than the best precision seen so far. A candidate that raises at any stage
    (model construction or fitting) is logged and skipped, so a single bad
    combination does not abort the rest of the search.

    Parameters:
        X_train, y_train: Training data.
        X_test, y_test: Testing data.
        X_val, y_val: Validation data.
        logger: ExperimentLogger instance; its ``info`` and ``error`` methods
            are used.
        target_precision: Minimum precision a candidate must reach to be
            eligible as the best candidate. Defaults to 0.45.

    Returns:
        best_params: Dictionary of the best candidate hyperparameters, or
        ``None`` if no candidate reached ``target_precision``.
    """
    # Extended grid of candidate hyperparameters
    candidate_alphas = [0.005, 0.01, 0.015, 0.02, 0.025]
    candidate_l1_ratios = [0.8, 0.85, 0.9, 0.95, 1.0]
    candidate_max_iters = [150000, 200000, 250505]
    candidate_poly_degrees = [1, 2, 3]
    candidate_feature_selection_thresholds = [0.01, 0.014333562560261739, 0.02]

    best_precision = 0.0
    best_params = None
    evaluated_count = 0
    failed_count = 0

    # Loop over all combinations in the extended grid
    for alpha, l1_ratio, max_iter, poly_degree, feat_sel_thresh in product(
            candidate_alphas,
            candidate_l1_ratios,
            candidate_max_iters,
            candidate_poly_degrees,
            candidate_feature_selection_thresholds):

        evaluated_count += 1

        # A fresh dict per candidate, so storing it as best_params is safe.
        candidate_params = {
            'alpha': alpha,
            'alpha_grid_size': 170,  # Fixed value; adjust if needed
            'l1_ratio': l1_ratio,
            'max_iter': max_iter,
            'tol': 4.836479789228169e-05,
            'eps': 5.283984885624899e-05,
            'positive': False,
            'feature_selection_threshold': feat_sel_thresh,
            'imputation_strategy': 'median',
            'scaling_method': 'robust',
            'poly_degree': poly_degree,
            'random_seed': 42  # Fixed seed for reproducibility
        }

        # One description shared by the info and error logs so that a failing
        # candidate can be identified by all of its varied hyperparameters.
        candidate_desc = (f"alpha={alpha}, l1_ratio={l1_ratio}, "
                          f"max_iter={max_iter}, poly_degree={poly_degree}, "
                          f"feature_selection_threshold={feat_sel_thresh}")
        logger.info(f"Evaluating candidate: {candidate_desc}")

        try:
            # Construction happens inside the try block: a parameter
            # combination the model rejects must not abort the whole grid.
            model_instance = ElasticNetModel(
                experiment_name='elasticnet_hypertune',
                logger=logger,
                random_seed=candidate_params['random_seed']
            )
            # Create the model using candidate parameters.
            model_instance.model = model_instance._create_model(**candidate_params)

            # Fit the model to obtain evaluation metrics.
            # (Assume that the fit() method returns a dictionary with at least the 'precision' metric.)
            metrics = model_instance.fit(X_train, y_train, X_test, y_test, X_val, y_val)
            candidate_precision = metrics.get('precision', 0)
            logger.info(f"Candidate precision: {candidate_precision:.4f}")

            # Update best candidate if target precision is met and candidate's precision is highest so far.
            if candidate_precision >= target_precision and candidate_precision > best_precision:
                best_precision = candidate_precision
                best_params = candidate_params
                logger.info(f"New best candidate found: {best_params} with precision {best_precision:.4f}")

        except Exception as e:
            failed_count += 1
            logger.error(f"Error evaluating candidate with {candidate_desc}: {e}")

    # Surface failures in aggregate so a systematic error (every candidate
    # failing for the same reason) is not buried in per-candidate log lines.
    if failed_count:
        logger.error(f"{failed_count} of {evaluated_count} candidates raised errors during evaluation.")

    # Log the best candidate to MLflow if a valid candidate was found
    if best_params is not None:
        with mlflow.start_run(run_name=f"elasticnet_hypertuning_best_{datetime.now().strftime('%Y%m%d_%H%M')}"):
            mlflow.log_params(best_params)
            mlflow.log_metric("best_precision", best_precision)
            logger.info("Logged best candidate to MLflow.")
    else:
        logger.error("No candidate met the target precision requirement.")

    return best_params


In [12]:
# --- Cell 2: Example usage of the extended hypertune_elasticnet() function ---
# Import data loading utilities
from models.StackedEnsemble.shared.data_loader import DataLoader

# Build the data loader for the ElasticNet experiment.
data_loader = DataLoader(experiment_name="elasticnet_model")

# Pull the train / test / validation splits in a single call.
(X_train, y_train,
 X_test, y_test,
 X_val, y_val) = data_loader.load_data()

# Search the extended grid, then report the outcome through the shared logger.
best_candidate = hypertune_elasticnet(
    X_train, y_train,
    X_test, y_test,
    X_val, y_val,
    logger,
)
if not best_candidate:
    logger.error("Hypertuning did not find any valid candidate.")
else:
    logger.info(f"Best candidate hyperparameters: {best_candidate}")


2025-02-20 23:17:54,062 | INFO     | elasticnet_hypertuning_extended | Loading data splits according to ensemble strategy
2025-02-20 23:17:54,066 | INFO     | elasticnet_hypertuning_extended | Returning features common to all models
2025-02-20 23:17:54,069 | INFO     | elasticnet_hypertuning_extended | Loaded 102 selected features
2025-02-20 23:17:54,138 | INFO     | elasticnet_hypertuning_extended | Loaded training data from parquet: c:\Users\szita\Documents\TheDrawCode\data\api_training_final.parquet
2025-02-20 23:17:54,275 | INFO     | elasticnet_hypertuning_extended | Loaded training/test data:
 - Training samples: 22343
 - Test samples: 5586
2025-02-20 23:17:54,277 | INFO     | elasticnet_hypertuning_extended | Loading training data from: c:\Users\szita\Documents\TheDrawCode\data\prediction\api_prediction_eval.xlsx
2025-02-20 23:18:13,718 | INFO     | elasticnet_hypertuning_extended | Ensemble evaluation set created with shape: (3374, 200)
2025-02-20 23:18:13,720 | INFO     | elas

NameError: name 'random' is not defined