In [2]:
# First, let's check currently installed packages
import pkg_resources
import subprocess
import sys

def get_installed_packages():
    """Get list of installed packages and their versions"""
    installed_packages = pkg_resources.working_set
    return {i.key: i.version for i in installed_packages}

def check_and_install_packages():
    """Check and install required packages"""
    required_packages = {
        'pandas': '1.2.4',
        'numpy': '1.19.5',
        'scikit-learn': '0.24.2',
        'xgboost': '1.4.2',
        'seaborn': '0.11.1',
        'matplotlib': '3.4.2',
        'plotly': '5.1.0',
        'joblib': '1.0.1'
    }
    
    installed = get_installed_packages()
    
    print("Checking required packages...")
    packages_to_install = []
    
    for package, version in required_packages.items():
        if package not in installed:
            print(f"{package} not found - will install version {version}")
            packages_to_install.append(f"{package}=={version}")
        else:
            print(f"{package} found - version {installed[package]}")
    
    if packages_to_install:
        print("\nInstalling missing packages...")
        for package in packages_to_install:
            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", package])
                print(f"Successfully installed {package}")
            except subprocess.CalledProcessError:
                print(f"Failed to install {package}")
    
    print("\nFinal package versions:")
    installed = get_installed_packages()
    for package in required_packages:
        print(f"{package}: {installed.get(package, 'Not installed')}")

def check_gpu_availability():
    """Check if GPU is available"""
    try:
        import torch
        print("\nGPU Availability:")
        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"GPU device: {torch.cuda.get_device_name(0)}")
    except ImportError:
        try:
            import tensorflow as tf
            print("\nGPU Availability:")
            print(f"GPU devices: {tf.config.list_physical_devices('GPU')}")
        except ImportError:
            print("\nNeither PyTorch nor TensorFlow is installed - cannot check GPU availability")

# Run the checks
if __name__ == "__main__":
    print("=== Package Setup and Verification ===\n")
    check_and_install_packages()
    check_gpu_availability()
    
    # Check memory availability
    import psutil
    memory = psutil.virtual_memory()
    print(f"\nAvailable Memory: {memory.available / (1024 * 1024 * 1024):.2f} GB")
    print(f"Total Memory: {memory.total / (1024 * 1024 * 1024):.2f} GB")

  import pkg_resources


=== Package Setup and Verification ===

Checking required packages...
pandas found - version 2.2.3
numpy found - version 1.26.4
scikit-learn found - version 1.2.2
xgboost found - version 2.0.3
seaborn found - version 0.12.2
matplotlib found - version 3.7.5
plotly found - version 5.24.1
joblib found - version 1.4.2

Final package versions:
pandas: 2.2.3
numpy: 1.26.4
scikit-learn: 1.2.2
xgboost: 2.0.3
seaborn: 0.12.2
matplotlib: 3.7.5
plotly: 5.24.1
joblib: 1.4.2

GPU Availability:
CUDA available: True
GPU device: Tesla T4

Available Memory: 29.88 GB
Total Memory: 31.35 GB


In [3]:
import torch

if torch.cuda.is_available():
    print("GPU is available!")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print(f"Current GPU: {torch.cuda.get_device_name(0)}")  # Prints the name of the first GPU
else:
    print("GPU is not available.")

# If you have multiple GPUs and want to use them with XGBoost, you'll need to configure XGBoost appropriately.
# XGBoost will automatically use all available GPUs by default if built with GPU support.
# If you need more fine-grained control, you can set the 'n_gpus' parameter in your XGBoost training parameters.

GPU is available!
Number of GPUs: 2
Current GPU: Tesla T4


In [4]:
# Module 1: Imports and Setup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import log_loss, roc_auc_score, precision_recall_curve, auc, roc_curve
from sklearn.metrics import confusion_matrix, classification_report
import xgboost as xgb
from joblib import Parallel, delayed
import warnings
warnings.filterwarnings('ignore')
import os
import time
import logging
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [6]:
# Module 2: Data Loading and Initial Processing
def load_and_preprocess_data(train_features_path, train_targets_path, test_features_path):
    """
    Load and preprocess the data files
    """
    print("Loading data...")
    train_features = pd.read_csv("/kaggle/input/my-other-dataset/train_features.csv")
    train_targets_scored = pd.read_csv("/kaggle/input/my-other-dataset/train_targets_scored.csv")
    test_features = pd.read_csv("/kaggle/input/my-other-dataset/test_features.csv")
    
    # Separate features and targets
    X = train_features.drop(columns=['sig_id'])
    y = train_targets_scored.drop(columns=['sig_id'])
    
    # Handle control perturbations
    control_mask = X['cp_type'] == 'ctrl_vehicle'
    X = X[~control_mask]
    y = y[~control_mask]
    
    return X, y, test_features, control_mask

In [8]:
# Module 3: Feature Engineering
def engineer_features(X):
    """
    Perform feature engineering on the dataset
    """
    print("Engineering features...")
    # Statistical features
    X['g_mean'] = X.filter(regex='^g-').mean(axis=1)
    X['g_std'] = X.filter(regex='^g-').std(axis=1)
    X['c_mean'] = X.filter(regex='^c-').mean(axis=1)
    X['c_std'] = X.filter(regex='^c-').std(axis=1)
    
    # Optional: Comment these out if they're causing issues
    X['g_skew'] = X.filter(regex='^g-').skew(axis=1)
    X['c_skew'] = X.filter(regex='^c-').skew(axis=1)
    X['g_kurtosis'] = X.filter(regex='^g-').kurtosis(axis=1)
    X['c_kurtosis'] = X.filter(regex='^c-').kurtosis(axis=1)
    
    # Drop cp_type and encode categoricals
    X = X.drop(columns=['cp_type'])
    X = pd.get_dummies(X, columns=['cp_time', 'cp_dose'], drop_first=True)
    
    return X

In [11]:
# Module 4: Visualization Functions
def plot_target_distribution(y, save_path='target_distribution.png'):
    """
    Plot distribution of target variables
    """
    plt.figure(figsize=(15, 6))
    means = y.mean().sort_values(ascending=True)
    plt.bar(range(len(means)), means)
    plt.title('Distribution of Target Variables')
    plt.xlabel('Target Index')
    plt.ylabel('Mean Value')
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

def plot_feature_importance(feature_importance_df, top_n=20, save_path='feature_importance.png'):
    """
    Plot top N most important features
    """
    plt.figure(figsize=(12, 8))
    sns.barplot(x='mean_importance', 
                y=feature_importance_df.index,
                data=feature_importance_df.sort_values('mean_importance', ascending=True).tail(top_n))
    plt.title(f'Top {top_n} Most Important Features')
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

def plot_roc_curves(y_true, y_pred, target_name, save_path=None):
    """
    Plot ROC curve for a target
    """
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)
    
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {target_name}')
    plt.legend(loc="lower right")
    
    if save_path:
        plt.savefig(save_path)
    plt.close()

In [12]:
# Module 5: Model Training and Evaluation
class ConstantPredictor:
    def __init__(self, value):
        self.value = float(value)
    
    def predict_proba(self, X):
        n_samples = len(X)
        probs = np.zeros((n_samples, 2))
        probs[:, 0] = 1 - self.value
        probs[:, 1] = self.value
        return probs
    
    def predict(self, X):
        return np.ones(len(X)) * self.value
    
    def fit(self, X, y):
        return self

def evaluate_predictions(y_true, y_pred, target_name):
    """
    Calculate various metrics for model evaluation
    """
    metrics = {}
    metrics['log_loss'] = log_loss(y_true, y_pred)
    
    # Convert probabilities to binary predictions using 0.5 threshold
    y_pred_binary = (y_pred > 0.5).astype(int)
    
    # Calculate additional metrics
    metrics['roc_auc'] = roc_auc_score(y_true, y_pred)
    
    # Precision-Recall AUC
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    metrics['pr_auc'] = auc(recall, precision)
    
    # Classification report
    report = classification_report(y_true, y_pred_binary, output_dict=True)
    metrics.update({
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1_score': report['1']['f1-score']
    })
    
    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred_binary)
    metrics['confusion_matrix'] = cm
    
    return metrics

def train_target_model(target, X_train, y_train, X_val, y_val, xgb_params):
    """
    Train and evaluate model for a single target with GPU acceleration and multi-GPU support
    """
    result = {
        'target': target,
        'metrics': None,
        'model': None,
        'predictions': None,
        'feature_importance': None
    }
    
    print(f"\nProcessing target: {target}")
    
    # Check if target has variation
    if len(np.unique(y_train[target])) == 1:
        constant_value = float(y_train[target].iloc[0])
        print(f"Target {target} has only one class (constant value: {constant_value})")
        result['model'] = ConstantPredictor(constant_value)
        result['predictions'] = np.ones(len(y_val)) * constant_value
    else:
        try:
            # Create DMatrix for XGBoost
            dtrain = xgb.DMatrix(X_train, label=y_train[target])
            dval = xgb.DMatrix(X_val, label=y_val[target])
            
            # Calculate scale_pos_weight for imbalanced data
            pos_weight = (y_train[target] == 0).sum() / max(1, (y_train[target] == 1).sum())
            local_params = xgb_params.copy()
            local_params['scale_pos_weight'] = pos_weight
            
            # Train the model
            print(f"Training target {target} on GPU(s)...")
            model = xgb.train(
                local_params,
                dtrain,
                num_boost_round=200,
                evals=[(dval, 'eval')],
                early_stopping_rounds=20,
                verbose_eval=False
            )
            
            # Store the trained model and predictions
            result['model'] = model
            val_preds = model.predict(dval)
            result['predictions'] = val_preds
            
            # Calculate evaluation metrics
            result['metrics'] = evaluate_predictions(y_val[target], val_preds, target)
            
            # Get feature importance
            result['feature_importance'] = model.get_score(importance_type='gain')
            
            # Generate and save ROC curve
            # Need to import roc_curve from sklearn.metrics
            try:
                from sklearn.metrics import roc_curve
                fpr, tpr, _ = roc_curve(y_val[target], val_preds)
                plt.figure(figsize=(8, 6))
                plt.plot(fpr, tpr, label=f'ROC curve (AUC = {result["metrics"]["roc_auc"]:.2f})')
                plt.plot([0, 1], [0, 1], 'k--')
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.05])
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate')
                plt.title(f'ROC Curve - {target}')
                plt.legend(loc="lower right")
                plt.savefig(f'roc_curves/{target}_roc.png')
                plt.close()
            except Exception as e:
                print(f"Error creating ROC curve for {target}: {str(e)}")
            
        except Exception as e:
            # Handle errors during training
            print(f"Error in training target {target}: {str(e)}")
            majority_class = float(y_train[target].mode()[0])
            result['model'] = ConstantPredictor(majority_class)
            result['predictions'] = np.ones(len(y_val)) * majority_class
    
    return result

In [13]:
# Module 6: Main Pipeline
def run_moa_pipeline(train_features_path, train_targets_path, test_features_path):
    """
    Run the complete MoA prediction pipeline
    """
    # Create directories for outputs
    os.makedirs('plots', exist_ok=True)
    os.makedirs('roc_curves', exist_ok=True)
    os.makedirs('metrics', exist_ok=True)
    
    # Load and preprocess data
    X, y, test_features, control_mask = load_and_preprocess_data(
        train_features_path, train_targets_path, test_features_path
    )
    
    # Engineer features
    X = engineer_features(X)
    
    # Plot initial target distribution
    plot_target_distribution(y, save_path='plots/target_distribution.png')
    
    # Split data
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y.iloc[:, 0]
    )
    
    # Scale features
    scaler = RobustScaler()
    numerical_cols = [col for col in X.columns if col.startswith(('g-', 'c-'))]
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
    
    # XGBoost parameters - FIX: Removed duplicate tree_method
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'tree_method': 'gpu_hist',  # Use GPU-accelerated tree method
        'max_depth': 6,
        'learning_rate': 0.05,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 3,
        'gamma': 0.1,
        'reg_alpha': 0.1,
        'reg_lambda': 1.0,
        'verbosity': 3,
        'n_gpus': 2  # Use both GPUs
    }
    
    # Train models in parallel
    start_time = time.time()
    results = Parallel(n_jobs=-1, verbose=10)(
        delayed(train_target_model)(
            target, X_train, y_train, X_val, y_val, xgb_params
        ) for target in y_train.columns
    )
    
    # Process results and create visualizations
    process_and_visualize_results(results, X_train.columns)
    
    # Prepare and make predictions on test set
    test_predictions = predict_test_set(
        test_features, control_mask, X_train.columns, 
        numerical_cols, scaler, results
    )
    
    # Save final results
    save_final_results(test_predictions, test_features['sig_id'], results)
    
    print(f"\nTotal execution time: {(time.time() - start_time)/60:.2f} minutes")

In [15]:
# Module 7: Results Processing and Visualization

def compile_feature_importance(results, feature_columns):
    """
    Compile feature importance scores from all trained models
    """
    feature_importance = []

    for result in results:
        target = result['target']
        importance = result['feature_importance']
        
        if importance:
            # Normalize importance values
            total_importance = sum(importance.values())
            normalized_importance = {k: v / total_importance for k, v in importance.items()}
            
            # Append to the list
            for feature, score in normalized_importance.items():
                feature_importance.append({
                    'feature': feature,
                    'importance': score,
                    'target': target
                })
    
    # Create a DataFrame
    feature_importance_df = pd.DataFrame(feature_importance)
    
    # Aggregate importance across all targets
    aggregated_importance = (
        feature_importance_df
        .groupby('feature')['importance']
        .mean()
        .reset_index()  # Reset index to avoid issues
        .rename(columns={'importance': 'mean_importance'})
    )
    
    # Sort by mean importance
    aggregated_importance = aggregated_importance.sort_values(by='mean_importance', ascending=False)
    
    return aggregated_importance


def plot_metrics_distribution(metrics_df, save_dir):
    """
    Create distribution plots for various metrics
    """
    # List of metrics to visualize
    metrics_to_plot = ['roc_auc', 'pr_auc', 'log_loss', 'f1_score']
    
    # Create a directory for saving plots if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    
    for metric in metrics_to_plot:
        if metric in metrics_df.columns:
            plt.figure(figsize=(10, 6))
            sns.histplot(data=metrics_df[metric], bins=50, kde=True)
            plt.title(f'Distribution of {metric}')
            plt.xlabel(metric)
            plt.ylabel('Count')
            plt.tight_layout()
            plt.savefig(f'{save_dir}/{metric}_distribution.png')
            plt.close()


def plot_feature_importance(feature_importance_df, top_n=20, save_path='feature_importance.png'):
    """
    Plot top N most important features
    """
    plt.figure(figsize=(12, 8))
    
    # Select top N features
    top_features = feature_importance_df.head(top_n)
    
    # Plot bar chart
    sns.barplot(
        x='mean_importance',
        y='feature',  # Use the 'feature' column instead of index
        data=top_features,
        orient='h'
    )
    
    plt.title(f'Top {top_n} Most Important Features')
    plt.xlabel('Mean Importance')
    plt.ylabel('Feature')
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()


def process_and_visualize_results(results, feature_columns):
    """
    Process results and create visualizations
    """
    # Compile metrics
    metrics_df = pd.DataFrame([
        {
            'target': r['target'],
            **r['metrics']
        } for r in results if r['metrics'] is not None
    ])
    
    # Create metrics visualizations
    plot_metrics_distribution(metrics_df, save_dir='plots')
    
    # Process feature importance
    feature_importance_df = compile_feature_importance(results, feature_columns)
    plot_feature_importance(feature_importance_df, save_path='plots/feature_importance.png')
    
    # Save detailed metrics
    metrics_df.to_csv('metrics/detailed_metrics.csv', index=False)
    feature_importance_df.to_csv('metrics/feature_importance.csv', index=False)

In [16]:
# Module 8: Test Set Prediction
from sklearn.metrics import roc_curve  # Add this import

def predict_test_set(test_features, control_mask, train_columns, 
                    numerical_cols, scaler, results):
    """
    Make predictions on the test set
    """
    # Preprocess test features
    test_features_processed = engineer_features(test_features[~control_mask].copy())
    
    # Align columns with training data
    test_features_processed = align_test_features(
        test_features_processed, train_columns
    )
    
    # Scale features
    test_features_processed[numerical_cols] = scaler.transform(
        test_features_processed[numerical_cols]
    )
    
    # Make predictions
    predictions = make_test_predictions(
        test_features_processed, results, test_features.index, control_mask
    )
    
    return predictions

In [17]:
# Module 9: Utility Functions
def align_test_features(test_features, train_columns):
    """
    Align test features with training features
    """
    missing_cols = set(train_columns) - set(test_features.columns)
    for col in missing_cols:
        test_features[col] = 0
    
    extra_cols = set(test_features.columns) - set(train_columns)
    test_features = test_features.drop(columns=list(extra_cols))
    
    return test_features[train_columns]

def make_test_predictions(test_features, results, index, control_mask):
    """
    Make predictions for test set
    """
    predictions = pd.DataFrame(index=index, 
                             columns=[r['target'] for r in results], 
                             dtype=float)
    
    # Make predictions for non-control samples
    for result in results:
        target = result['target']
        model = result['model']
        
        try:
            if isinstance(model, xgb.Booster):
                dtest = xgb.DMatrix(test_features)
                predictions.loc[~control_mask, target] = model.predict(dtest)
            else:
                predictions.loc[~control_mask, target] = model.predict(test_features)
        except Exception as e:
            print(f"Error predicting {target}: {str(e)}")
            if isinstance(model, ConstantPredictor):
                predictions[target] = float(model.value)
            else:
                predictions[target] = 0.0
    
    # Set control predictions to 0
    predictions.loc[control_mask, :] = 0.0
    
    # Fix: Use less aggressive clipping as per competition guidelines
    # According to: max(min(p,1−10−15),10−15)
    predictions = predictions.clip(1e-15, 1 - 1e-15)
    
    # Print prediction stats for verification
    pred_values = predictions.values.flatten()
    print("\nPrediction Statistics After Clipping:")
    print(f"Mean: {pred_values.mean():.6f}")
    print(f"Min: {pred_values.min():.6f}")
    print(f"Max: {pred_values.max():.6f}")
    print(f"Values <0.001: {(pred_values < 0.001).sum()} ({(pred_values < 0.001).sum()/len(pred_values)*100:.2f}%)")
    
    return predictions

In [18]:
# Module 10: Final Results and Metrics Output
def save_final_results(predictions, sig_ids, results, output_dir='outputs'):
    """
    Save final results and print detailed metrics
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Create submission file
    submission = pd.DataFrame({'sig_id': sig_ids})
    submission = pd.concat([submission, predictions], axis=1)
    submission.to_csv(f'{output_dir}/submission.csv', index=False)
    
    # Compile and print detailed metrics
    print("\n=== Model Performance Metrics ===")
    
    # Overall metrics
    all_metrics = [r['metrics'] for r in results if r['metrics'] is not None]
    metrics_summary = {
        'roc_auc': [m['roc_auc'] for m in all_metrics if 'roc_auc' in m],
        'pr_auc': [m['pr_auc'] for m in all_metrics if 'pr_auc' in m],
        'log_loss': [m['log_loss'] for m in all_metrics if 'log_loss' in m],
        'f1_score': [m['f1_score'] for m in all_metrics if 'f1_score' in m]
    }
    
    print("\nOverall Metrics:")
    for metric, values in metrics_summary.items():
        print(f"{metric}:")
        print(f"  Mean: {np.mean(values):.4f}")
        print(f"  Std:  {np.std(values):.4f}")
        print(f"  Min:  {np.min(values):.4f}")
        print(f"  Max:  {np.max(values):.4f}")
    
    # Submission statistics
    print("\nSubmission Statistics:")
    print(f"Number of samples in submission: {len(submission)}")
    print(f"Number of MoA targets: {len(predictions.columns)}")
    
    # Prediction value ranges
    prediction_values = predictions.values.flatten()
    print("\nPrediction value ranges:")
    print(f"Mean: {prediction_values.mean():.6f}")
    print(f"Std: {prediction_values.std():.6f}")
    print(f"Min: {prediction_values.min():.6f}")
    print(f"Max: {prediction_values.max():.6f}")
    
    print("\nNumber of predictions by value range:")
    print(f"0-0.001: {np.sum((prediction_values > 0) & (prediction_values <= 0.001))}")
    print(f"0.001-0.01: {np.sum((prediction_values > 0.001) & (prediction_values <= 0.01))}")
    print(f"0.01-0.1: {np.sum((prediction_values > 0.01) & (prediction_values <= 0.1))}")
    print(f"0.1-0.5: {np.sum((prediction_values > 0.1) & (prediction_values <= 0.5))}")
    print(f"0.5-1.0: {np.sum((prediction_values > 0.5) & (prediction_values <= 1.0))}")
    
    # Validation checks
    print("\nValidation Checks:")
    print("Missing values:", submission.isnull().sum().sum())
    print("Infinite values:", np.isinf(predictions.values).sum())
    print("Values outside [0,1]:", np.sum((predictions.values < 0) | (predictions.values > 1)))
    
    # Save detailed metrics per target
    detailed_metrics = []
    for result in results:
        if result['metrics'] is not None:
            metrics_dict = {
                'target': result['target'],
                **result['metrics']
            }
            detailed_metrics.append(metrics_dict)
    
    detailed_metrics_df = pd.DataFrame(detailed_metrics)
    detailed_metrics_df.to_csv(f'{output_dir}/detailed_metrics.csv', index=False)
    
    # Print top and bottom performing targets
    print("\nTop 5 performing targets (by ROC-AUC):")
    print(detailed_metrics_df.nlargest(5, 'roc_auc')[['target', 'roc_auc', 'pr_auc', 'log_loss']])
    
    print("\nBottom 5 performing targets (by ROC-AUC):")
    print(detailed_metrics_df.nsmallest(5, 'roc_auc')[['target', 'roc_auc', 'pr_auc', 'log_loss']])

In [19]:
# Module 11: Main Execution
if __name__ == "__main__":
    # Set random seed for reproducibility
    np.random.seed(42)
    
    # Define paths
    TRAIN_FEATURES_PATH = '/kaggle/input/my-other-dataset/train_features.csv'
    TRAIN_TARGETS_PATH = '/kaggle/input/my-other-dataset/train_targets_scored.csv'
    TEST_FEATURES_PATH = '/kaggle/input/my-other-dataset/test_features.csv'
    OUTPUT_DIR = '/kaggle/working/moa_outputs'  # Updated to Kaggle's writable directory
    
    # Create output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    
    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(f'{OUTPUT_DIR}/pipeline.log'),
            logging.StreamHandler()
        ]
    )
    
    try:
        # Verify file paths before running the pipeline
        if not os.path.exists(TRAIN_FEATURES_PATH):
            raise FileNotFoundError(f"Train features file not found at: {TRAIN_FEATURES_PATH}")
        if not os.path.exists(TRAIN_TARGETS_PATH):
            raise FileNotFoundError(f"Train targets file not found at: {TRAIN_TARGETS_PATH}")
        if not os.path.exists(TEST_FEATURES_PATH):
            raise FileNotFoundError(f"Test features file not found at: {TEST_FEATURES_PATH}")
        
        # Run the complete pipeline
        run_moa_pipeline(
            TRAIN_FEATURES_PATH,
            TRAIN_TARGETS_PATH,
            TEST_FEATURES_PATH
        )
        
        logging.info("Pipeline completed successfully!")
        
    except Exception as e:
        logging.error(f"Pipeline failed with error: {str(e)}")
        raise

Loading data...
Engineering features...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   32.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed: 10

Engineering features...

Prediction Statistics After Clipping:
Mean: 0.005170
Min: 0.000000
Max: 0.999925
Values <0.001: 560780 (68.36%)

=== Model Performance Metrics ===

Overall Metrics:
roc_auc:
  Mean: 0.7207
  Std:  0.1875
  Min:  0.0557
  Max:  1.0000
pr_auc:
  Mean: 0.1293
  Std:  0.2529
  Min:  0.0001
  Max:  1.0000
log_loss:
  Mean: 0.0179
  Std:  0.0202
  Min:  0.0005
  Max:  0.1087
f1_score:
  Mean: 0.1153
  Std:  0.2518
  Min:  0.0000
  Max:  1.0000

Submission Statistics:
Number of samples in submission: 3982
Number of MoA targets: 206

Prediction value ranges:
Mean: 0.005170
Std: 0.033372
Min: 0.000000
Max: 0.999925

Number of predictions by value range:
0-0.001: 560780
0.001-0.01: 194036
0.01-0.1: 59001
0.1-0.5: 5695
0.5-1.0: 780

Validation Checks:
Missing values: 0
Infinite values: 0
Values outside [0,1]: 0

Top 5 performing targets (by ROC-AUC):
                               target   roc_auc    pr_auc  log_loss
34             atp_synthase_inhibitor  1.000000  1.0000