# # CatBoost Model Implementation with CPU Optimization
# 
# This notebook implements a CatBoost-based model for soccer match draw prediction with CPU optimization. The implementation includes:
# 
# - Model creation and configuration 
# - Training with early stopping
# - Threshold optimization
# - Hyperparameter tuning
# - Model evaluation
# - MLflow integration for experiment tracking
# 
# ## Setup and Imports

In [87]:
# Block 1: Imports and Setup
import numpy as np
import pandas as pd
from pathlib import Path
import catboost as cb
from catboost import Pool
import joblib
import json
import os
import sys
import time
import optuna
import mlflow
import mlflow.catboost
import random
from typing import Any, Dict, Tuple
from datetime import datetime
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from sklearn.metrics import roc_auc_score
import yaml

# Make the project importable: the root is taken to be four directory levels
# above the current working directory (Path() resolves relative to the CWD,
# not to this file), and is appended to sys.path only if missing.
project_root = str(Path().absolute().parent.parent.parent.parent)
if project_root not in sys.path:
    sys.path.append(project_root)
# Also prepend the root to PYTHONPATH so child processes inherit it.
os.environ["PYTHONPATH"] = project_root + os.pathsep + os.environ.get("PYTHONPATH", "")
# NOTE(review): presumably disables Arrow's S3 filesystem support when reading
# parquet - confirm against the pyarrow version in use.
os.environ["ARROW_S3_DISABLE"] = "1"
# Point GitPython at an explicit git binary, but only when the environment
# names one and that path actually exists.
git_executable = os.environ.get("GIT_PYTHON_GIT_EXECUTABLE")
if git_executable and os.path.exists(git_executable):
    import git
    git.refresh(git_executable)

# Project imports must come after the sys.path tweak above.
from utils.logger import ExperimentLogger
experiment_name = "catboost_soccer_prediction"
# Module-level logger shared by every function in this notebook.
logger = ExperimentLogger(experiment_name)

from utils.create_evaluation_set import setup_mlflow_tracking
from models.StackedEnsemble.utils.metrics import calculate_metrics
from models.StackedEnsemble.shared.data_loader import DataLoader
from models.ensemble.data_utils import balance_and_clean_dataset

# Configure MLflow tracking for this experiment, then load the data splits.
mlruns_dir = setup_mlflow_tracking(experiment_name)
dataloader = DataLoader()
# Three splits are used below: train (fitting), test (eval_set passed to
# CatBoost during fitting) and eval (threshold search and reported metrics).
X_train, y_train, X_test, y_test, X_eval, y_eval = dataloader.load_data()

# Minimum recall a threshold / trial must reach to be considered at all
# (read by optimize_threshold and objective).
min_recall = 0.20
# Number of Optuna trials per study (read by hypertune_catboost).
n_trials = 200
# CatBoost constructor defaults shared by create_model and objective.
base_params = {
            'loss_function': 'Logloss',
            'eval_metric': 'AUC',
            'task_type': 'CPU',
            'thread_count': -1,
            'verbose': 100
        }


2025-03-02 22:17:37,505 | INFO     | create_evaluation_set | Setting up MLflow tracking for experiment: catboost_soccer_prediction
mlflow local_path_uri: C:/Users/szita/Documents/TheDrawCode/mlruns
2025-03-02 22:17:37,630 | INFO     | create_evaluation_set | Using existing experiment: catboost_soccer_prediction experiment_id: 360091729405522611
2025-03-02 22:17:37,634 | INFO     | create_evaluation_set | MLflow tracking configured successfully at: C:/Users/szita/Documents/TheDrawCode/mlruns
2025-03-02 22:17:37,638 | INFO     | create_evaluation_set | Loading data splits according to ensemble strategy
2025-03-02 22:17:37,643 | INFO     | create_evaluation_set | Returning features common to all models
2025-03-02 22:17:37,648 | INFO     | create_evaluation_set | Loaded 102 selected features
2025-03-02 22:17:37,782 | INFO     | create_evaluation_set | Loaded training data from parquet: C:\Users\szita\Documents\TheDrawCode\data\api_training_final.parquet
2025-03-02 22:17:37,969 | INFO     |

In [88]:
# Block 2: Configuration Loading
def load_hyperparameter_space():
    """Build the search space that the Optuna objective samples from.

    Returns:
        A dict keyed by CatBoost parameter name, in sampling order. Each value
        is a spec dict with 'type' ('float' or 'int'), 'low' and 'high', plus
        'log': True for the float parameters that are sampled on a log scale.
        If building the dict raises, the error is logged and None is returned.
    """
    def _float_spec(low, high, log=False):
        # 'log' is only present when requested, so consumers can use .get().
        spec = {'type': 'float', 'low': low, 'high': high}
        if log:
            spec['log'] = True
        return spec

    def _int_spec(low, high):
        return {'type': 'int', 'low': low, 'high': high}

    try:
        return {
            'learning_rate': _float_spec(0.005, 0.06, log=True),
            'depth': _int_spec(4, 10),
            'min_data_in_leaf': _int_spec(15, 40),
            'subsample': _float_spec(0.60, 0.85),
            'colsample_bylevel': _float_spec(0.35, 0.55),
            'reg_lambda': _float_spec(1.0, 5.0, log=True),
            'leaf_estimation_iterations': _int_spec(1, 5),
            'bagging_temperature': _float_spec(2.5, 5.0),
            'scale_pos_weight': _float_spec(3.0, 10.0),
            'early_stopping_rounds': _int_spec(50, 300),
        }
    except Exception as e:
        logger.error(f"Error creating hyperparameter space: {str(e)}")
        return None


# Module-level copy of the search space, built once at import time.
hyperparameter_space = load_hyperparameter_space()


In [89]:
# Block 3: Model Creation
def create_model(model_params):
    """Create an unfitted CatBoost classifier from defaults plus overrides.

    Args:
        model_params: Mapping of CatBoost constructor arguments. Entries here
            take precedence over the module-level ``base_params`` defaults.
            ``None`` is treated as "no overrides".

    Returns:
        A new ``cb.CatBoostClassifier`` configured with the merged parameters.

    Raises:
        Exception: Whatever CatBoost raises for invalid parameters; the error
            is logged and re-raised unchanged.
    """
    try:
        # Merge into a fresh dict. Binding ``base_params`` directly and calling
        # update() on it would write every call's hyperparameters into the
        # shared defaults and leak them into all later models.
        params = {**base_params, **(model_params or {})}

        return cb.CatBoostClassifier(**params)

    except Exception as e:
        logger.error(f"Error creating catboost model: {str(e)}")
        raise


In [90]:
# Block 6: Prediction Functions
def predict(model, X, threshold=0.5):
    """Return hard 0/1 labels by thresholding the positive-class probability.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X: Feature rows to score.
        threshold: A row is labelled 1 when its probability is >= this value.

    Returns:
        Integer array of labels. If scoring raises, the error is logged and an
        all-zero array of length ``len(X)`` is returned instead.

    Raises:
        RuntimeError: If ``model`` is None.
    """
    if model is None:
        raise RuntimeError("Model must be trained before prediction")

    try:
        # Column 1 of predict_proba holds the positive-class probability.
        positive_scores = model.predict_proba(X)[:, 1]
        is_positive = positive_scores >= threshold
        return is_positive.astype(int)
    except Exception as e:
        logger.error(f"Error in model prediction: {str(e)}")
        return np.zeros(len(X))

def predict_proba(model, X):
    """Return the positive-class probability for every row of ``X``.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X: Feature rows to score.

    Returns:
        1-D array taken from column 1 of ``model.predict_proba(X)``. If scoring
        raises, the error is logged and zeros of length ``len(X)`` are returned.

    Raises:
        RuntimeError: If ``model`` is None.
    """
    if model is None:
        raise RuntimeError("Model must be trained before prediction")

    try:
        class_probabilities = model.predict_proba(X)
        return class_probabilities[:, 1]
    except Exception as e:
        logger.error(f"Error in probability prediction: {str(e)}")
        return np.zeros(len(X))


In [91]:
def evaluate(model, X: Any, y: Any, best_threshold: float) -> Dict[str, float]:
    """Score ``model`` on ``(X, y)`` at a fixed decision threshold.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X: Feature rows.
        y: True binary labels aligned with ``X``.
        best_threshold: Probability cut-off; rows at or above it are predicted 1.

    Returns:
        Dict with 'precision', 'recall', 'f1', 'auc', 'brier_score' and the
        'threshold' that was applied. If anything raises, the error is logged
        and a worst-case dict (zeros, brier_score 1.0) is returned instead.

    Raises:
        RuntimeError: If ``model`` is None.
    """
    if model is None:
        raise RuntimeError("Model must be trained before evaluation")

    try:
        # Positive-class probabilities and the labels they imply.
        y_prob = model.predict_proba(X)[:, 1]
        y_pred = (y_prob >= best_threshold).astype(int)

        # Confusion-matrix counts (true negatives are not needed below).
        actual_positive = (y == 1)
        flagged = (y_pred == 1)
        tp = np.sum(actual_positive & flagged)
        fp = np.sum((y == 0) & flagged)
        fn = np.sum(actual_positive & (y_pred == 0))

        # The 1e-10 terms keep the ratios defined when a denominator is zero.
        precision = tp / (tp + fp + 1e-10)
        recall = tp / (tp + fn + 1e-10)
        f1 = 2 * tp / (2 * tp + fp + fn + 1e-10)

        return {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'auc': roc_auc_score(y, y_prob),
            'brier_score': np.mean((y_prob - y) ** 2),
            'threshold': best_threshold
        }

    except Exception as e:
        logger.error(f"Error in model evaluation: {str(e)}")
        worst_case = dict.fromkeys(('precision', 'recall', 'f1', 'auc'), 0.0)
        worst_case['brier_score'] = 1.0
        worst_case['threshold'] = best_threshold
        return worst_case


In [92]:
def optimize_threshold(model, y_true: np.ndarray, y_prob: np.ndarray) -> Dict[str, float]:
    """Pick the most precise threshold that still meets the recall floor.

    Thresholds from 0.30 to 0.80 in steps of 0.01 are scanned. Among those
    whose recall is at least the module-level ``min_recall``, the one with the
    highest precision wins (the first one on ties). If none qualifies the
    threshold stays at 0.5.

    Args:
        model: Fitted classifier, passed on to ``evaluate``.
        y_true: True binary labels for the rows that produced ``y_prob``.
        y_prob: Positive-class probabilities aligned with ``y_true``.

    Returns:
        The metrics dict produced by ``evaluate`` at the chosen threshold
        (keys: precision, recall, f1, auc, brier_score, threshold). On any
        error a worst-case dict with the same keys and threshold 0.5 is
        returned, so callers can always index the result as a dict.

    Note:
        The final metrics are computed on the module-level ``X_eval`` /
        ``y_eval``, not on ``y_true`` / ``y_prob``; callers are expected to
        pass probabilities for that same evaluation split.
    """
    default_threshold = 0.5
    try:
        best_threshold = default_threshold
        best_precision = 0.0

        # These masks do not depend on the threshold; compute them once.
        is_positive = (y_true == 1)
        is_negative = (y_true == 0)

        for threshold in np.linspace(0.3, 0.8, 51):
            y_pred = (y_prob >= threshold).astype(int)

            tp = np.sum(is_positive & (y_pred == 1))
            fp = np.sum(is_negative & (y_pred == 1))
            fn = np.sum(is_positive & (y_pred == 0))

            precision = tp / (tp + fp + 1e-10)
            recall = tp / (tp + fn + 1e-10)

            # Only thresholds that keep recall at or above min_recall compete.
            if recall >= min_recall and precision > best_precision:
                best_precision = precision
                # Plain float keeps the value JSON/log friendly downstream.
                best_threshold = float(threshold)

        logger.info(f"Optimized threshold: {best_threshold:.3f} with precision: {best_precision:.3f}")

        return evaluate(model, X_eval, y_eval, best_threshold)

    except Exception as e:
        logger.error(f"Error optimizing threshold: {str(e)}")
        # Same shape as evaluate()'s own fallback so callers never receive a
        # bare float where they expect a metrics dict.
        return {
            'precision': 0.0,
            'recall': 0.0,
            'f1': 0.0,
            'auc': 0.0,
            'brier_score': 1.0,
            'threshold': default_threshold
        }


In [93]:
# Block 5: Training Function
def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params):
    """Fit a CatBoost model and score it at its optimized threshold.

    Args:
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Passed to CatBoost as ``eval_set`` during fitting.
        X_eval, y_eval: Split whose probabilities drive the threshold search.
        model_params: CatBoost parameters forwarded to ``create_model``.

    Returns:
        Tuple of (fitted model, result of ``optimize_threshold``).

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    try:
        classifier = create_model(model_params)

        # Wrap both splits in CatBoost pools before fitting.
        monitor_pool = Pool(X_test, y_test)
        fit_pool = Pool(X_train, y_train)
        classifier.fit(fit_pool, eval_set=monitor_pool, verbose=100)

        # Positive-class probabilities on the evaluation split feed the
        # threshold search, which also produces the returned metrics.
        eval_scores = classifier.predict_proba(X_eval)[:, 1]
        threshold_metrics = optimize_threshold(classifier, y_eval, eval_scores)

        return classifier, threshold_metrics

    except Exception as e:
        logger.error(f"Error training catboost model: {str(e)}")
        raise


In [94]:
# Block 7: Model Save/Load
def save_model(model, path, threshold=None):
    """Persist a CatBoost model and, optionally, its decision threshold.

    The model is written with joblib to ``path``. When ``threshold`` is given,
    a ``threshold.json`` file holding the threshold, the model type and the
    model's parameters is written next to it.

    Args:
        model: Model object to serialize.
        path: Destination file; missing parent directories are created.
        threshold: Decision threshold to store, or None to skip the JSON file.

    Raises:
        Exception: Any serialization or I/O error is logged and re-raised.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    try:
        joblib.dump(model, path)

        # Compare against None explicitly: a truthiness test would silently
        # drop a legitimate threshold of 0.0.
        if threshold is not None:
            threshold_path = path.parent / "threshold.json"
            with open(threshold_path, 'w') as f:
                json.dump({
                    # float() also accepts NumPy scalars that json cannot
                    # serialize directly.
                    'threshold': float(threshold),
                    'model_type': 'catboost',
                    'params': model.get_params()
                }, f, indent=2)

        logger.info(f"Model saved to {path}")

    except Exception as e:
        logger.error(f"Error saving model: {str(e)}")
        raise

def load_model(path):
    """Load a joblib-saved model together with its stored threshold.

    Args:
        path: Location of the serialized model file.

    Returns:
        Tuple of (model, threshold). The threshold is read from a
        ``threshold.json`` file in the same directory; 0.5 is used when that
        file, or its 'threshold' key, is missing.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        Exception: Any load/parse error is logged and re-raised.
    """
    model_file = Path(path)
    if not model_file.exists():
        raise FileNotFoundError(f"No model file found at {model_file}")

    try:
        model = joblib.load(model_file)

        # Start from the default and override it only if the sidecar exists.
        threshold = 0.5
        sidecar = model_file.parent / "threshold.json"
        if sidecar.exists():
            with open(sidecar, 'r') as f:
                stored = json.load(f)
            threshold = stored.get('threshold', 0.5)

        logger.info(f"Model loaded from {model_file}")
        return model, threshold

    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        raise


In [95]:
# Block 8: Hyperparameter Tuning
def objective(trial):
    """Optuna objective: precision on the eval split, gated by minimum recall.

    One value is sampled for every entry of ``load_hyperparameter_space()``, a
    model is trained with those values on top of ``base_params``, and its
    precision is returned if recall reaches ``min_recall`` (otherwise 0.0).
    All metrics of the trial are stored as user attributes.

    Args:
        trial: The ``optuna.Trial`` being evaluated.

    Returns:
        The score to maximize.

    Raises:
        optuna.TrialPruned: When the pruner asks to stop the trial.
        Exception: Any other failure is logged and re-raised.
    """
    try:
        # Work on a copy so sampled values never leak into the shared
        # module-level defaults.
        params = dict(base_params)

        search_space = load_hyperparameter_space()
        for param_name, param_config in search_space.items():
            if param_config['type'] == 'float':
                params[param_name] = trial.suggest_float(
                    param_name,
                    param_config['low'],
                    param_config['high'],
                    log=param_config.get('log', False)
                )
            elif param_config['type'] == 'int':
                params[param_name] = trial.suggest_int(
                    param_name,
                    param_config['low'],
                    param_config['high']
                )

        # The fitted model itself is not needed here, only its metrics.
        _, metrics = train_model(
            X_train, y_train,
            X_test, y_test,
            X_eval, y_eval,
            params
        )

        recall = metrics.get('recall', 0.0)
        precision = metrics.get('precision', 0.0)

        # Precision only counts when the recall floor is met.
        score = precision if recall >= min_recall else 0.0

        # Report the constrained score (not raw precision) so a pruned trial
        # is never recorded with a value that ignores the recall floor.
        trial.report(score, step=1)
        if trial.should_prune():
            raise optuna.TrialPruned()

        logger.info(f"Trial {trial.number}:")
        logger.info(f"  Params: {params}")
        logger.info(f"  Score: {score}")

        for metric_name, metric_value in metrics.items():
            trial.set_user_attr(metric_name, metric_value)
        return score
    except optuna.TrialPruned:
        # Pruning is normal control flow for Optuna, not an error to log.
        raise
    except Exception as e:
        logger.error(f"Error in trial {trial.number}: {str(e)}")
        raise


In [96]:
# Block 9: Hypertuning Function
def hypertune_catboost(experiment_name: str) -> Tuple[float, Dict[str, Any]]:
    """Run one Optuna study inside an MLflow run and retrain with the winner.

    Args:
        experiment_name: Kept for interface compatibility; not read here.

    Returns:
        Tuple of (precision of the retrained final model, parameters it was
        trained with, i.e. ``base_params`` overlaid with the best trial's
        sampled values).

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    try:
        # One timestamp for both names, so they cannot differ across a
        # minute boundary.
        run_stamp = datetime.now().strftime('%Y%m%d_%H%M')

        # No pruner is passed, so Optuna applies its default pruner.
        study = optuna.create_study(
            study_name=f"catboost_optimization_{run_stamp}",
            direction="maximize",
            sampler=optuna.samplers.TPESampler(
                n_startup_trials=20,
                prior_weight=0.4
            )
        )

        with mlflow.start_run(run_name=f"catboost_optimization_{run_stamp}"):
            mlflow.log_params({
                "train_samples": len(X_train),
                "test_samples": len(X_test),
                "eval_samples": len(X_eval),
                "features": X_train.shape[1]
            })

            best_score = -float('inf')
            best_params = {}

            def callback(study, trial):
                """Track the best completed trial seen so far."""
                nonlocal best_score
                nonlocal best_params
                logger.info(f"Current best score: {best_score:.4f}")
                # Pruned or failed trials may carry no value, or one that is
                # not a final score; only completed trials may win.
                if trial.state != optuna.trial.TrialState.COMPLETE or trial.value is None:
                    return
                if trial.value > best_score:
                    best_score = trial.value
                    # Copy so later merging never touches the trial's own dict.
                    best_params = dict(trial.params)
                    logger.info(f"New best score found in trial {trial.number}: {best_score:.4f}")

            # The study stops after n_trials or 10000 s (~2.8 h), whichever
            # comes first.
            study.optimize(objective, n_trials=n_trials,
                           timeout=10000, show_progress_bar=True, callbacks=[callback])

            if not best_params:
                logger.info("No completed trial found; final model uses base parameters only")

            # Defaults first, tuned values last: the best trial's values must
            # win over anything present in base_params.
            best_params = {**base_params, **best_params}
            logger.info(f"Best trial value: {best_score}")
            logger.info(f"Best parameters found: {best_params}")

            logger.info("Training final model with best parameters")
            final_model, final_metrics = train_model(
                X_train, y_train,
                X_test, y_test,
                X_eval, y_eval,
                best_params
            )

            logger.info(f"Training completed with precision: {final_metrics['precision']:.4f}")
            return final_metrics['precision'], best_params

    except Exception as e:
        logger.error(f"Error in hyperparameter optimization: {str(e)}")
        raise


In [97]:
def train_with_precision_target(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    X_eval: np.ndarray,
    y_eval: np.ndarray,
    logger: ExperimentLogger,
    target_precision: float = 0.48,
    n_seeds: int = 399) -> Tuple[Any, float, float, Dict[str, Any]]:
    """Retrain fixed hyperparameters under successive seeds until precision is met.

    Seeds 1..``n_seeds`` are tried in order. The search stops at the first seed
    whose precision reaches ``target_precision``; otherwise the best model seen
    is returned once all seeds are used.

    Args:
        X_train, y_train, X_test, y_test, X_eval, y_eval: Splits forwarded to
            ``train_model``.
        logger: Logger used for progress and error messages.
        target_precision: Precision at which the search stops early.
        n_seeds: Number of sequential seeds to try, starting at 1.

    Returns:
        Tuple of (model, precision, recall, params). The model and params are
        None when no seed produced a precision above zero.
    """
    best_precision = 0
    best_recall = 0
    best_params = None
    best_seed = 0
    best_model = None

    # Hyperparameters from a previous optimization run; only the seed varies.
    tuned_params = {
        'learning_rate': 0.039154254905889536,
        'depth': 8,
        'min_data_in_leaf': 30,
        'subsample': 0.7752200875207855,
        'colsample_bylevel': 0.44370220186780523,
        'reg_lambda': 3.4083316844158364,
        'leaf_estimation_iterations': 2,
        'bagging_temperature': 3.7784573605971032,
        'scale_pos_weight': 8.520701738840673,
        'early_stopping_rounds': 89,
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'task_type': 'CPU',
        'thread_count': -1,
        'verbose': 100
    }

    for random_seed in range(1, n_seeds + 1):
        logger.info(f"Using sequential random seed: {random_seed}")

        # Seed every source of randomness in reach.
        # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so this
        # assignment likely only affects child processes - confirm intent.
        os.environ['PYTHONHASHSEED'] = str(random_seed)
        np.random.seed(random_seed)
        random.seed(random_seed)
        seed_params = {**tuned_params, 'random_state': random_seed}

        try:
            # train_model takes the parameters as ONE dict argument; unpacking
            # them with ** raises TypeError on every call.
            model, metrics = train_model(
                X_train, y_train,
                X_test, y_test,
                X_eval, y_eval,
                seed_params
            )
            precision = metrics['precision']
            recall = metrics['recall']
        except Exception as e:
            logger.error(f"Error training with seed {random_seed}: {str(e)}")
            continue

        if precision > best_precision:
            best_precision = precision
            best_recall = recall
            best_params = seed_params
            best_seed = random_seed
            best_model = model
            logger.info(f"New best precision: {precision:.4f} with seed {best_seed}")

        if precision >= target_precision:
            logger.info(f"Target precision achieved: {precision:.4f}")
            return model, precision, recall, seed_params

        logger.info(
            f"Current precision: {precision:.4f}, "
            f"target: {target_precision:.4f}, highest precision: {best_precision:.4f}, "
            f"best seed: {best_seed}"
        )

        # Drop the reference so a non-best model can be freed before the next fit.
        model = None

    logger.info(f"Target precision not reached, using best seed: {best_seed}")
    return best_model, best_precision, best_recall, best_params


In [98]:
def log_to_mlflow(model: object, precision: float, recall: float, params: dict,
                  experiment_name: str, input_example: Any = None) -> str:
    """Log a trained CatBoost model with its parameters and metrics to MLflow.

    Args:
        model: Trained CatBoost model.
        precision: Precision to record as a metric.
        recall: Recall to record as a metric.
        params: Model parameters used for training.
        experiment_name: Name of the MLflow experiment.
        input_example: Optional sample of real feature rows (for example a few
            rows of the evaluation set). When given, a model signature is
            inferred from it; when omitted, the model is logged without one.

    Returns:
        The ID of the MLflow run that was created.
    """
    from utils.create_evaluation_set import setup_mlflow_tracking

    # Called for its side effect of configuring MLflow tracking.
    setup_mlflow_tracking(experiment_name)

    # One timestamp for both names, so the run and the registered model
    # cannot end up with different suffixes.
    model_name = f"catboost_base_{datetime.now().strftime('%Y%m%d_%H%M')}"

    with mlflow.start_run(run_name=model_name) as run:
        mlflow.log_params(params or {})

        mlflow.log_metrics({
            "precision": precision,
            "recall": recall
        })

        # A signature is only meaningful when inferred from real feature rows;
        # a frame built from feature *names* describes nothing the model accepts.
        signature = None
        if input_example is not None:
            signature = mlflow.models.infer_signature(
                model_input=input_example,
                model_output=model.predict_proba(input_example)[:, 1]
            )
        else:
            logger.info("No input example supplied; logging model without a signature")

        # The CatBoost flavor names its model argument ``cb_model``.
        mlflow.catboost.log_model(
            cb_model=model,
            artifact_path="model",
            registered_model_name=model_name,
            signature=signature
        )

        run_id = run.info.run_id
        logger.info(f"Run ID: {run_id}")
        return run_id


In [99]:
def train_seed_model():
    """Run the seed search on the module-level splits and log the result.

    Prints the achieved precision, then logs the model to MLflow unless the
    search returned no model.
    """
    outcome = train_with_precision_target(
        X_train, y_train,
        X_test, y_test,
        X_eval, y_eval,
        logger
    )
    model, precision, recall, best_params = outcome
    print(f"Training completed with precision: {precision:.4f}")

    # Nothing to log when the search produced no model.
    if model is None:
        return
    log_to_mlflow(model, precision, recall, best_params, experiment_name)


In [86]:
if __name__ == "__main__":
    # Run five independent studies and remember the one whose retrained
    # final model reached the highest precision.
    best_precision = 0
    best_params = {}

    for run_number in range(1, 6):
        logger.info(f"Starting hypertuning run {run_number}/5")
        precision, params = hypertune_catboost(experiment_name)
        logger.info(f"Run {run_number} completed with precision: {precision:.4f}")

        # Strictly-greater comparison: on a tie the earlier run is kept.
        if precision > best_precision:
            best_precision, best_params = precision, params
    logger.info(f"Best precision: {best_precision:.4f} and best Parameters {best_params}")
    # train_seed_model()


2025-03-02 22:18:04,562 | INFO     | create_evaluation_set | Starting hypertuning run 1/5


[I 2025-03-02 22:18:04,564] A new study created in memory with name: catboost_optimization_20250302_2218


  0%|          | 0/200 [00:00<?, ?it/s]

0:	test: 0.5212100	best: 0.5212100 (0)	total: 38.2ms	remaining: 38.1s
100:	test: 0.5992610	best: 0.5992610 (100)	total: 2.43s	remaining: 21.6s
200:	test: 0.6043299	best: 0.6045438 (199)	total: 4.83s	remaining: 19.2s
300:	test: 0.6047871	best: 0.6060191 (242)	total: 7.21s	remaining: 16.7s
Stopped by overfitting detector  (63 iterations wait)

bestTest = 0.6060191313
bestIteration = 242

Shrink model to first 243 iterations.
2025-03-02 22:18:12,384 | INFO     | create_evaluation_set | Optimized threshold: 0.780 with precision: 0.347
2025-03-02 22:18:12,405 | INFO     | create_evaluation_set | Trial 0:
2025-03-02 22:18:12,408 | INFO     | create_evaluation_set |   Params: {'loss_function': 'Logloss', 'eval_metric': 'AUC', 'task_type': 'CPU', 'thread_count': -1, 'verbose': 100, 'learning_rate': 0.04286001140194367, 'depth': 7, 'min_data_in_leaf': 35, 'subsample': 0.7284042098873166, 'colsample_bylevel': 0.37523896470973556, 'reg_lambda': 1.7220117803776371, 'leaf_estimation_iterations': 1,

[I 2025-03-02 23:29:48,373] A new study created in memory with name: catboost_optimization_20250302_2329


  0%|          | 0/200 [00:00<?, ?it/s]

0:	test: 0.5275282	best: 0.5275282 (0)	total: 13.7ms	remaining: 13.7s
100:	test: 0.5890108	best: 0.5908639 (53)	total: 1.25s	remaining: 11.1s
200:	test: 0.5926566	best: 0.5932959 (134)	total: 2.54s	remaining: 10.1s
300:	test: 0.5974501	best: 0.5976937 (293)	total: 3.68s	remaining: 8.56s
400:	test: 0.6001970	best: 0.6001970 (400)	total: 4.85s	remaining: 7.25s
500:	test: 0.6011116	best: 0.6012115 (475)	total: 6.08s	remaining: 6.05s
600:	test: 0.6018275	best: 0.6018549 (599)	total: 7.46s	remaining: 4.95s
700:	test: 0.6023147	best: 0.6023400 (697)	total: 8.63s	remaining: 3.68s
800:	test: 0.6027898	best: 0.6028059 (797)	total: 9.8s	remaining: 2.44s
900:	test: 0.6034541	best: 0.6034926 (890)	total: 11s	remaining: 1.21s
999:	test: 0.6043294	best: 0.6043514 (996)	total: 12.1s	remaining: 0us

bestTest = 0.6043513777
bestIteration = 996

Shrink model to first 997 iterations.
2025-03-02 23:30:00,911 | INFO     | create_evaluation_set | Optimized threshold: 0.760 with precision: 0.347
2025-03-02 2