# # CatBoost Model Implementation with CPU Optimization
# 
# This notebook implements a CatBoost-based model for soccer match draw prediction with CPU optimization. The implementation includes:
# 
# - Model creation and configuration 
# - Training with early stopping
# - Threshold optimization
# - Hyperparameter tuning
# - Model evaluation
# - MLflow integration for experiment tracking
# 
# ## Setup and Imports

In [86]:
# Block 1: Imports and Setup
import numpy as np
import pandas as pd
from pathlib import Path
import catboost as cb
from catboost import Pool
import joblib
import json
import os
import sys
import time
import optuna
import mlflow
import mlflow.catboost
import random
from typing import Any, Dict, Tuple
from datetime import datetime
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from sklearn.metrics import roc_auc_score
import yaml

# Add project root to Python path
# The root is taken to be four directory levels above the current working
# directory, so it only resolves as intended when the notebook is started
# from its own folder.
project_root = str(Path().absolute().parent.parent.parent.parent)
if project_root not in sys.path:
    sys.path.append(project_root)
# Also prepend the root to PYTHONPATH (presumably so child processes can
# import the project packages too -- TODO confirm).
os.environ["PYTHONPATH"] = project_root + os.pathsep + os.environ.get("PYTHONPATH", "")
# NOTE(review): presumably switches off Arrow's S3 filesystem support -- verify.
os.environ["ARROW_S3_DISABLE"] = "1"

# Project-local imports have to come after the sys.path tweak above.
from utils.logger import ExperimentLogger
experiment_name = "catboost_soccer_prediction"
# Module-level logger shared by every function in this notebook.
logger = ExperimentLogger(experiment_name)

from utils.create_evaluation_set import setup_mlflow_tracking
from models.StackedEnsemble.utils.metrics import calculate_metrics
from models.StackedEnsemble.shared.data_loader import DataLoader

# Load data
# `mlruns_dir` is reused later as the base folder for saved Optuna studies.
mlruns_dir = setup_mlflow_tracking(experiment_name)
dataloader = DataLoader()
# The six splits become module-level globals that the tuning and training
# functions below read directly.
X_train, y_train, X_test, y_test, X_eval, y_eval = dataloader.load_data()


2025-02-20 00:39:38,440 | INFO     | catboost_soccer_prediction | Setting up MLflow tracking for experiment: catboost_soccer_prediction
mlflow local_path_uri: c:/Users/szita/Documents/TheDrawCode/mlruns
2025-02-20 00:39:39,475 | INFO     | catboost_soccer_prediction | Using existing experiment: catboost_soccer_prediction experiment_id: 360091729405522611
2025-02-20 00:39:39,475 | INFO     | catboost_soccer_prediction | MLflow tracking configured successfully at: c:/Users/szita/Documents/TheDrawCode/mlruns
2025-02-20 00:39:39,481 | INFO     | catboost_soccer_prediction | Loading data splits according to ensemble strategy
2025-02-20 00:39:39,501 | INFO     | catboost_soccer_prediction | Returning features common to all models
2025-02-20 00:39:39,503 | INFO     | catboost_soccer_prediction | Loaded 102 selected features
2025-02-20 00:39:39,578 | INFO     | catboost_soccer_prediction | Loaded training data from parquet: c:\Users\szita\Documents\TheDrawCode\data\api_training_final.parquet
2

In [87]:
# Block 2: Configuration Loading
def load_hyperparameter_space() -> Dict[str, Dict[str, Any]]:
    """Return the Optuna search space for the CatBoost hyperparameters.

    Each entry maps a CatBoost constructor argument to a spec with the keys
    ``type`` (``'float'`` or ``'int'``), ``low``, ``high`` and, optionally,
    ``log`` (sample on a log scale). The insertion order is the order in
    which ``objective`` asks the trial for values.

    Returns:
        A freshly built dictionary on every call, so callers may mutate it.
        The function never returns ``None``: building a literal cannot fail,
        so the former catch-all error branch was removed.
    """
    return {
        'learning_rate': {
            'type': 'float',
            'low': 0.01,
            'high': 0.1,
            'log': True,
        },
        'depth': {  # equivalent to max_depth
            'type': 'int',
            'low': 3,
            'high': 12,
        },
        'min_data_in_leaf': {  # equivalent to min_child_weight
            'type': 'int',
            'low': 1,
            'high': 100,
        },
        # Row sampling rate per tree. This is CatBoost's own `subsample`
        # option; it is not the same thing as `random_strength`.
        'subsample': {
            'type': 'float',
            'low': 0.3,
            'high': 0.8,
        },
        'colsample_bylevel': {  # equivalent to colsample_bytree
            'type': 'float',
            'low': 0.3,
            'high': 1.0,
        },
        'reg_lambda': {  # L2 regularization
            'type': 'float',
            'low': 1.0,
            'high': 15.0,
            'log': True,
        },
        'leaf_estimation_iterations': {  # controls node value calculation
            'type': 'int',
            'low': 1,
            'high': 10,
        },
        # NOTE(review): CatBoost documents `bagging_temperature` as a setting
        # of the Bayesian bootstrap and `subsample` as a setting of the other
        # bootstrap types -- confirm that tuning both together is intended.
        'bagging_temperature': {  # controls randomness
            'type': 'float',
            'low': 0.0,
            'high': 5.0,
        },
        'scale_pos_weight': {  # class weights
            'type': 'float',
            'low': 1.0,
            'high': 15.0,
        },
        'early_stopping_rounds': {
            'type': 'int',
            'low': 50,
            'high': 100,
        },
    }


hyperparameter_space = load_hyperparameter_space()


In [88]:
# Block 3: Model Creation
def create_model(**kwargs):
    """Build a CatBoostClassifier from CPU defaults merged with ``kwargs``.

    Args:
        **kwargs: CatBoost constructor arguments. Any key given here replaces
            the default of the same name.

    Returns:
        An unfitted ``cb.CatBoostClassifier``.

    Raises:
        Exception: Whatever the CatBoost constructor raises; it is logged
            and then re-raised.
    """
    defaults = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'task_type': 'CPU',
        'thread_count': -1,
        'verbose': False
    }

    try:
        # Caller-supplied values win over the defaults.
        merged = {**defaults, **kwargs}
        return cb.CatBoostClassifier(**merged)
    except Exception as e:
        logger.error(f"Error creating catboost model: {str(e)}")
        raise


In [89]:
# Block 6: Prediction Functions
def predict(model, X, threshold=0.5):
    """Return binary class labels for ``X`` at the given probability cut-off.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X: Feature rows to score.
        threshold: A row is labelled 1 when its positive-class probability
            is greater than or equal to this value.

    Returns:
        Integer array of 0/1 labels. If scoring fails the error is logged
        and an all-zero array of length ``len(X)`` is returned (best-effort
        fallback), with the same integer dtype as the normal result.

    Raises:
        RuntimeError: If ``model`` is ``None``.
    """
    if model is None:
        raise RuntimeError("Model must be trained before prediction")

    try:
        probas = model.predict_proba(X)[:, 1]
        return (probas >= threshold).astype(int)

    except Exception as e:
        logger.error(f"Error in model prediction: {str(e)}")
        # Keep the fallback dtype aligned with the success path (ints, not
        # floats) so downstream comparisons behave the same either way.
        return np.zeros(len(X), dtype=int)

def predict_proba(model, X):
    """Return the positive-class probability for every row of ``X``.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X: Feature rows to score.

    Returns:
        1-D array taken from column 1 of ``model.predict_proba(X)``. If
        scoring fails the error is logged and zeros of length ``len(X)``
        are returned instead.

    Raises:
        RuntimeError: If ``model`` is ``None``.
    """
    if model is None:
        raise RuntimeError("Model must be trained before prediction")

    try:
        class_probabilities = model.predict_proba(X)
        positive_column = class_probabilities[:, 1]
    except Exception as e:
        logger.error(f"Error in probability prediction: {str(e)}")
        positive_column = np.zeros(len(X))
    return positive_column


In [90]:
def evaluate(model, X: Any, y: Any, best_threshold: float) -> Dict[str, float]:
    """Score ``model`` on ``(X, y)`` at a fixed decision threshold.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X: Feature rows.
        y: True binary labels aligned with ``X``.
        best_threshold: Probability cut-off for labelling a row positive.

    Returns:
        Dict with ``precision``, ``recall``, ``f1``, ``auc``,
        ``brier_score`` and ``threshold``. If anything fails while scoring,
        the error is logged and a worst-case dict is returned (zeros, with
        ``brier_score`` set to 1.0).

    Raises:
        RuntimeError: If ``model`` is ``None``.
    """
    if model is None:
        raise RuntimeError("Model must be trained before evaluation")

    try:
        scores = model.predict_proba(X)[:, 1]
        flagged = (scores >= best_threshold).astype(int)

        # Confusion-matrix cells needed for precision / recall / F1.
        actual_positive = (y == 1)
        true_pos = np.sum(actual_positive & (flagged == 1))
        false_pos = np.sum((y == 0) & (flagged == 1))
        false_neg = np.sum(actual_positive & (flagged == 0))

        # The 1e-10 terms guard against division by zero.
        return {
            'precision': true_pos / (true_pos + false_pos + 1e-10),
            'recall': true_pos / (true_pos + false_neg + 1e-10),
            'f1': 2 * true_pos / (2 * true_pos + false_pos + false_neg + 1e-10),
            'auc': roc_auc_score(y, scores),
            'brier_score': np.mean((scores - y) ** 2),
            'threshold': best_threshold
        }

    except Exception as e:
        logger.error(f"Error in model evaluation: {str(e)}")
        return dict(
            precision=0.0,
            recall=0.0,
            f1=0.0,
            auc=0.0,
            brier_score=1.0,
            threshold=best_threshold,
        )


In [91]:
def optimize_threshold(model, y_true: np.ndarray, y_prob: np.ndarray) -> Dict[str, float]:
    """Pick the most precise threshold with recall >= 15% and report metrics.

    Scans 51 evenly spaced thresholds in [0.3, 0.8] and keeps the one with
    the highest precision among those whose recall is at least 0.15. If no
    threshold qualifies, 0.5 is used. The metrics at the chosen threshold
    are then computed from the same ``y_true`` / ``y_prob`` that were passed
    in, rather than from module-level evaluation data.

    Args:
        model: Unused; kept so existing call sites keep working.
        y_true: True binary labels.
        y_prob: Predicted positive-class probabilities aligned with
            ``y_true``.

    Returns:
        Dict with ``precision``, ``recall``, ``f1``, ``auc``,
        ``brier_score`` and ``threshold``. The return value is always a
        dict: on failure the error is logged and a worst-case dict with
        ``threshold`` 0.5 is returned, so callers that index into the
        result do not break.
    """
    try:
        labels = np.asarray(y_true)
        scores = np.asarray(y_prob)
        is_positive = labels == 1
        is_negative = labels == 0

        def _confusion(cutoff):
            """Return (tp, fp, fn) when rows with score >= cutoff are flagged."""
            flagged = scores >= cutoff
            return (
                np.sum(is_positive & flagged),
                np.sum(is_negative & flagged),
                np.sum(is_positive & ~flagged),
            )

        best_threshold = 0.5
        best_precision = 0.0

        # Search through thresholds
        for threshold in np.linspace(0.3, 0.8, 51):
            tp, fp, fn = _confusion(threshold)
            # The 1e-10 terms guard against division by zero.
            precision = tp / (tp + fp + 1e-10)
            recall = tp / (tp + fn + 1e-10)

            # Only consider thresholds that maintain recall above 15%
            if recall >= 0.15 and precision > best_precision:
                best_precision = precision
                best_threshold = threshold

        logger.info(f"Optimized threshold: {best_threshold:.3f} with precision: {best_precision:.3f}")

        tp, fp, fn = _confusion(best_threshold)
        return {
            'precision': float(tp / (tp + fp + 1e-10)),
            'recall': float(tp / (tp + fn + 1e-10)),
            'f1': float(2 * tp / (2 * tp + fp + fn + 1e-10)),
            'auc': float(roc_auc_score(labels, scores)),
            'brier_score': float(np.mean((scores - labels) ** 2)),
            'threshold': float(best_threshold),
        }

    except Exception as e:
        logger.error(f"Error optimizing threshold: {str(e)}")
        return {
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0,
            'auc': 0.0,
            'brier_score': 1.0,
            'threshold': 0.5,
        }


In [92]:
# Block 5: Training Function
def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, **kwargs):
    """Fit a CatBoost classifier and tune its decision threshold.

    Args:
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Passed to ``fit`` as ``eval_set``.
        X_eval, y_eval: Data scored after fitting and handed to
            ``optimize_threshold``.
        **kwargs: Forwarded unchanged to ``create_model``.

    Returns:
        Tuple ``(model, metrics)`` where ``metrics`` is whatever
        ``optimize_threshold`` returns.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    try:
        classifier = create_model(**kwargs)

        fit_pool = Pool(X_train, y_train)
        monitor_pool = Pool(X_test, y_test)
        classifier.fit(fit_pool, eval_set=monitor_pool, verbose=100)

        # Positive-class probabilities on the held-out evaluation split.
        eval_scores = classifier.predict_proba(X_eval)[:, 1]
        return classifier, optimize_threshold(classifier, y_eval, eval_scores)

    except Exception as e:
        logger.error(f"Error training catboost model: {str(e)}")
        raise


In [93]:
# Block 7: Model Save/Load
def save_model(model, path, threshold=None):
    """Persist the model with joblib and, optionally, its decision threshold.

    Args:
        model: Trained CatBoost model; must expose ``get_params()`` when a
            threshold is given.
        path: Destination file for the serialized model. Missing parent
            directories are created.
        threshold: Decision threshold to store next to the model in
            ``threshold.json``. ``None`` means "do not write the file"; any
            numeric value, including ``0.0``, is written.

    Raises:
        Exception: Any failure while writing is logged and re-raised.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    try:
        # Save model
        joblib.dump(model, path)

        # Save threshold. Compare against None explicitly: a truthiness
        # check would silently skip a legitimate threshold of 0.0.
        if threshold is not None:
            threshold_path = path.parent / "threshold.json"
            with open(threshold_path, 'w', encoding='utf-8') as f:
                json.dump({
                    # float() turns NumPy scalars into plain JSON numbers.
                    'threshold': float(threshold),
                    'model_type': 'catboost',
                    'params': model.get_params()
                }, f, indent=2)

        logger.info(f"Model saved to {path}")

    except Exception as e:
        logger.error(f"Error saving model: {str(e)}")
        raise

def load_model(path):
    """Load a joblib-serialized model and its stored decision threshold.

    Args:
        path: Location of the serialized model file.

    Returns:
        Tuple ``(model, threshold)``. The threshold comes from
        ``threshold.json`` in the same directory and falls back to 0.5 when
        that file, or its ``threshold`` key, is missing.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        Exception: Any failure while reading is logged and re-raised.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"No model file found at {path}")

    try:
        # NOTE(review): joblib.load unpickles the file, so only load
        # artifacts from a trusted location.
        model = joblib.load(path)

        threshold = 0.5
        sidecar = path.parent / "threshold.json"
        if sidecar.exists():
            with open(sidecar, 'r') as f:
                stored = json.load(f)
            threshold = stored.get('threshold', 0.5)

        logger.info(f"Model loaded from {path}")
        return model, threshold

    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        raise


In [94]:
# Block 8: Hyperparameter Tuning
def objective(trial):
    """Optuna objective: precision, subject to a minimum recall of 15%.

    Samples one value per entry of ``load_hyperparameter_space()``, trains a
    model on the module-level data splits and scores it.

    Args:
        trial: The Optuna trial to sample hyperparameters from.

    Returns:
        The precision reported by ``train_model`` when recall is at least
        0.15, otherwise 0.0.

    Raises:
        optuna.TrialPruned: When the study's pruner stops the trial. This is
            normal control flow and is not logged as an error.
        Exception: Any other failure is logged and re-raised.
    """
    try:
        params = {
            'loss_function': 'Logloss',
            'eval_metric': 'AUC',
            'task_type': 'CPU',
            'thread_count': -1,
            'verbose': 100
        }

        # Add hyperparameters from config
        for param_name, param_config in load_hyperparameter_space().items():
            if param_config['type'] == 'float':
                params[param_name] = trial.suggest_float(
                    param_name,
                    param_config['low'],
                    param_config['high'],
                    log=param_config.get('log', False)
                )
            elif param_config['type'] == 'int':
                params[param_name] = trial.suggest_int(
                    param_name,
                    param_config['low'],
                    param_config['high']
                )

        # Train model and get metrics (the fitted model itself is not needed)
        _, metrics = train_model(
            X_train, y_train,
            X_test, y_test,
            X_eval, y_eval,
            **params
        )

        recall = metrics.get('recall', 0.0)
        precision = metrics.get('precision', 0.0)

        # Optimize for precision while maintaining minimum recall
        score = float(precision) if recall >= 0.15 else 0.0

        # Attach the metrics before any pruning decision so that pruned
        # trials keep them; plain floats keep the attributes serializable.
        for metric_name, metric_value in metrics.items():
            trial.set_user_attr(metric_name, float(metric_value))

        # Report the value the study actually optimizes, so the pruner
        # compares trials on the same quantity as the final objective.
        trial.report(score, step=1)
        if trial.should_prune():
            raise optuna.TrialPruned()

        logger.info(f"Trial {trial.number}:")
        logger.info(f"  Params: {params}")
        logger.info(f"  Score: {score}")

        return score

    except optuna.TrialPruned:
        # Pruning is expected behaviour, so it must not be reported as an error.
        logger.info(f"Trial {trial.number} pruned")
        raise
    except Exception as e:
        logger.error(f"Error in trial {trial.number}: {str(e)}")
        raise


In [95]:
# Block 9: Hypertuning Function
def hypertune_catboost(experiment_name: str, n_trials: int = 100, timeout: int = 7200) -> float:
    """Run the Optuna search, retrain with the best parameters, log to MLflow.

    Uses the module-level data splits and ``mlruns_dir``.

    Args:
        experiment_name: Sub-folder of ``mlruns_dir`` under which the pickled
            Optuna study is stored.
        n_trials: Maximum number of Optuna trials (default 100).
        timeout: Time budget for the search in seconds (default 7200, i.e.
            two hours).

    Returns:
        Precision of the final model retrained with the best parameters.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    try:
        # One timestamp for the study, the run, the registered model and the
        # study file, so the names cannot drift apart across a minute boundary.
        stamp = datetime.now().strftime('%Y%m%d_%H%M')

        # Create study
        # NOTE(review): `objective` reports a single value at step=1 while
        # the pruner has n_warmup_steps=5, so pruning presumably never
        # triggers -- confirm whether that is intended.
        study = optuna.create_study(
            study_name=f"catboost_optimization_{stamp}",
            direction="maximize",
            sampler=TPESampler(seed=42),
            pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=5)
        )

        # Start MLflow run
        with mlflow.start_run(run_name=f"catboost_optimization_{stamp}"):
            # Log dataset info
            mlflow.log_params({
                "train_samples": len(X_train),
                "test_samples": len(X_test),
                "eval_samples": len(X_eval),
                "features": X_train.shape[1]
            })

            # Set tags
            mlflow.set_tags({
                "model_type": "catboost_base",
                "optimization": "optuna",
                "cpu_only": True
            })

            # Optimize
            study.optimize(objective, n_trials=n_trials, timeout=timeout)

            # Log best trial info
            logger.info(f"Best trial value: {study.best_value}")
            logger.info(f"Best parameters found: {study.best_params}")

            # Train final model with best parameters
            logger.info("Training final model with best parameters")
            final_model, final_metrics = train_model(
                X_train, y_train,
                X_test, y_test,
                X_eval, y_eval,
                **study.best_params
            )

            # Log best parameters and metrics (as plain floats)
            mlflow.log_params(study.best_params)
            mlflow.log_metrics({name: float(value) for name, value in final_metrics.items()})

            # Create and log model signature
            input_example = pd.DataFrame(X_train[:1].copy())
            signature = mlflow.models.infer_signature(
                model_input=input_example,
                model_output=final_model.predict_proba(input_example)
            )

            # Log model. The CatBoost flavor names its model argument
            # `cb_model`; the former `xgb_model=` keyword left the required
            # argument unset.
            mlflow.catboost.log_model(
                cb_model=final_model,
                artifact_path="catboost_base_model",
                registered_model_name=f"catboost_base_{stamp}",
                signature=signature,
                input_example=input_example
            )

            # Save study results
            study_path = Path(mlruns_dir) / experiment_name / "optuna_studies"
            study_path.mkdir(parents=True, exist_ok=True)
            joblib.dump(study, study_path / f"study_{stamp}.pkl")

            logger.info(f"Training completed with precision: {final_metrics['precision']:.4f}")
            return final_metrics['precision']

    except Exception as e:
        logger.error(f"Error in hyperparameter optimization: {str(e)}")
        raise


In [96]:
def train_with_precision_target(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    X_eval: np.ndarray,
    y_eval: np.ndarray,
    logger: ExperimentLogger,
    target_precision: float = 0.48,
    max_seeds: int = 399) -> Tuple[Any, float, float, Dict[str, Any]]:
    """Retrain with successive random seeds until a precision target is met.

    Seeds ``1..max_seeds`` are tried in order with a fixed parameter set.
    The search stops at the first seed whose precision reaches
    ``target_precision``; otherwise the best model seen is returned.

    Args:
        X_train, y_train: Training split.
        X_test, y_test: Split passed to ``train_model`` as its test data.
        X_eval, y_eval: Split on which precision and recall are measured.
        logger: Logger used for progress and error messages.
        target_precision: Precision at which the search stops (default 0.48).
        max_seeds: Number of seeds to try (default 399, i.e. seeds 1-399).

    Returns:
        Tuple ``(best_model, best_precision, best_recall, best_params)``.
        ``best_model`` and ``best_params`` are ``None`` when no seed produced
        a precision above zero, for example because every fit failed.
    """
    best_precision = 0.0
    best_recall = 0.0
    best_params = None
    best_seed = 0
    best_model = None

    # Base parameters from previous optimization, spelled with CatBoost
    # argument names. The values were carried over from XGBoost-style names
    # (max_depth -> depth, min_child_weight -> min_data_in_leaf,
    # colsample_bytree -> colsample_bylevel), which the CatBoost constructor
    # does not accept. Settings without a counterpart here (reg_alpha, gamma,
    # tree_method, n_jobs, objective='binary:logistic') are dropped;
    # `create_model` already supplies loss_function, task_type and
    # thread_count.
    # NOTE(review): the values were tuned under the old names -- re-tune if
    # results differ.
    base_params = {
        'learning_rate': 0.0460229201936893,
        'depth': 11,
        'min_data_in_leaf': 72,
        'subsample': 0.46392442652907506,
        'colsample_bylevel': 0.6671272425643389,
        'reg_lambda': 9.320178296187327,
        'early_stopping_rounds': 327,
        'scale_pos_weight': 2.4844767951297175,
        'eval_metric': 'AUC',
        'verbose': 100,
    }

    for random_seed in range(1, max_seeds + 1):
        logger.info(f"Using sequential random seed: {random_seed}")

        # Set all random seeds
        os.environ['PYTHONHASHSEED'] = str(random_seed)
        np.random.seed(random_seed)
        random.seed(random_seed)
        seeded_params = {**base_params, 'random_seed': random_seed}

        try:
            # Create and train model
            model, metrics = train_model(
                X_train, y_train,
                X_test, y_test,
                X_eval, y_eval,
                **seeded_params
            )
            precision = metrics['precision']
            recall = metrics['recall']
        except Exception as e:
            logger.error(f"Error training with seed {random_seed}: {str(e)}")
            continue

        # Update best model if precision improved
        if precision > best_precision:
            best_precision = precision
            best_recall = recall
            best_params = seeded_params
            best_seed = random_seed
            best_model = model
            logger.info(f"New best precision: {precision:.4f} with seed {best_seed}")

        # Check if target precision reached
        if precision >= target_precision:
            logger.info(f"Target precision achieved: {precision:.4f}")
            return best_model, best_precision, best_recall, best_params

        logger.info(
            f"Current precision: {precision:.4f}, "
            f"target: {target_precision:.4f}, highest precision: {best_precision:.4f}, "
            f"best seed: {best_seed}"
        )

        # Drop the reference so a non-best model can be freed before the
        # next fit.
        model = None

    # Target not reached after all seeds: fall back to the best model seen.
    logger.info(f"Target precision not reached, using best seed: {best_seed}")
    return best_model, best_precision, best_recall, best_params


In [97]:
def log_to_mlflow(model: object, precision: float, recall: float, params: dict,
                  experiment_name: str, input_example: Any = None) -> str:
    """Log a trained model with its parameters and metrics to MLflow.

    Args:
        model: Trained catboost model.
        precision: Precision of the model, logged as a metric.
        recall: Recall of the model, logged as a metric.
        params: Model parameters used for training.
        experiment_name: Name of the MLflow experiment.
        input_example: Optional feature rows used to infer the model
            signature. Defaults to the first row of the module-level
            ``X_train``, as ``hypertune_catboost`` does.

    Returns:
        The ID of the MLflow run that was created.
    """
    setup_mlflow_tracking(experiment_name)

    # One timestamped label for both the run and the registered model.
    model_name = f"catboost_base_{datetime.now().strftime('%Y%m%d_%H%M')}"

    # Start MLflow run
    with mlflow.start_run(run_name=model_name) as run:

        # Log parameters
        mlflow.log_params(params)

        # Log metrics
        mlflow.log_metrics({
            "precision": float(precision),
            "recall": float(recall)
        })

        # The signature has to be inferred from real feature rows; the
        # previous code built the example from a feature *name* instead.
        if input_example is None:
            input_example = pd.DataFrame(X_train[:1].copy())
        signature = mlflow.models.infer_signature(
            model_input=input_example,
            model_output=predict_proba(model, input_example)
        )

        # The CatBoost flavor names its model argument `cb_model`; the
        # former `xgb_model=` keyword left the required argument unset.
        mlflow.catboost.log_model(
            cb_model=model,
            artifact_path="model",
            registered_model_name=model_name,
            signature=signature
        )

        # Log run ID
        run_id = run.info.run_id
        logger.info(f"Run ID: {run_id}")
        return run_id


In [98]:
def train_seed_model():
    """Run the seed search on the module-level splits and log the winner.

    Prints the precision that was reached. The model is sent to MLflow only
    when the search actually produced one.
    """
    outcome = train_with_precision_target(
        X_train, y_train,
        X_test, y_test,
        X_eval, y_eval,
        logger,
    )
    model, precision, recall, best_params = outcome
    print(f"Training completed with precision: {precision:.4f}")

    # Nothing to log when no seed produced a usable model.
    if model is None:
        return
    log_to_mlflow(model, precision, recall, best_params, experiment_name)


In [99]:
# Entry point: when executed as a script (or as a notebook cell, where
# __name__ is "__main__"), run the full Optuna search for this experiment.
if __name__ == "__main__":
    hypertune_catboost(experiment_name)


[I 2025-02-20 00:39:57,382] A new study created in memory with name: catboost_optimization_20250220_0039
[W 2025-02-20 00:42:07,491] Trial 0 failed with parameters: {'learning_rate': 0.023688639503640783, 'depth': 12, 'min_data_in_leaf': 74, 'subsample': 0.5993292420985183, 'colsample_bylevel': 0.40921304830970556, 'reg_lambda': 1.525681189806849, 'leaf_estimation_iterations': 1, 'bagging_temperature': 4.330880728874676, 'scale_pos_weight': 9.415610164404923, 'early_stopping_rounds': 86} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "c:\Users\szita\.conda\envs\soccerpredictor_env\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\szita\AppData\Local\Temp\ipykernel_8052\1933250848.py", line 32, in objective
    model, metrics = train_model(
  File "C:\Users\szita\AppData\Local\Temp\ipykernel_8052\477178428.py", line 12, in train_model
    model.fit(
  File "c:\Users\s

KeyboardInterrupt: 