In [None]:
!pip install "git+https://github.com/ray-project/xgboost_ray.git#egg=xgboost_ray"

In [1]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import ray
from xgboost_ray import RayDMatrix, RayParams, train, predict
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.xgboost import TuneReportCheckpointCallback as XGBoostCallback
from ray.tune.integration.lightgbm import TuneReportCheckpointCallback as LightGBMCallback

ray.init(ignore_reinit_error=True)


# ============================================
# 🧠 XGBOOST SECTION (Raw categorical features)
# ============================================

def load_fda_train_data():
    df = pd.read_parquet("/kaggle/input/fd-data-revamp-2/fd_train_df.parquet")
    for col in ["country", "reaction", "drug", "age_group"]:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    X = df.drop(columns=["reaction_outcome"])
    y = df["reaction_outcome"].astype(int)
    # y = label_encoder.fit_transform(y)
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=df["reaction_outcome"])

def load_fda_val_data():
    df = pd.read_parquet("/kaggle/input/fd-data-revamp-2/fd_val_df.parquet")
    df = df.reset_index(drop=True)
    for col in ["country", "reaction", "drug", "age_group"]:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    X = df.drop(columns=["reaction_outcome"])
    y = df["reaction_outcome"].astype(int)
    # y = label_encoder.fit_transform(y)
    return X, y

def train_xgb(config):
    train_x, test_x, train_y, test_y = load_fda_train_data()
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dtest = xgb.DMatrix(test_x, label=test_y)
    xgb.train(
        config,
        dtrain,
        evals=[(dtest, "eval")],
        verbose_eval=False,
        callbacks=[XGBoostCallback(frequency=1)],
    )

def run_xgb_tuning():
    search_space = {
        "objective": "multi:softprob",
        "eval_metric": ["mlogloss"],
        "num_class": 6,
        "max_depth": tune.randint(3, 8),
        "min_child_weight": tune.choice([1, 2, 3]),
        "subsample": tune.uniform(0.5, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1),
        "tree_method": "hist",
        "device": "cuda", # Explicitly use GPU
    }
    # search_space = {
    #         "objective": "multi:softprob",
    #         "eval_metric": ["mlogloss"],
    #         "num_class": 6,
    #         "max_depth": tune.randint(3, 15),
    #         "min_child_weight": tune.randint(1, 10),
    #         "subsample": tune.uniform(0.5, 1.0),
    #         "colsample_bytree": tune.uniform(0.5, 1.0),
    #         "eta": tune.loguniform(1e-4, 1e-1),
    #         "gamma": tune.uniform(0, 5),
    #         "tree_method": "hist",
    #         "device": "cuda",      
    # }
    tuner = tune.Tuner(
        # train_xgb,
        tune.with_resources(train_xgb, {"cpu": 4, "gpu": 1}),
        # resources_per_trial=ray_params.get_tune_resources(),
        tune_config=tune.TuneConfig(
            metric="eval-mlogloss", mode="min",
            scheduler=ASHAScheduler(
                max_t=100,  # Maximum number of iterations
                grace_period=10,  # Minimum number of iterations
                reduction_factor=2  # Reduction factor for stopping trials
            ),
            num_samples=5
            # max_concurrent_trials=2
        ),
        param_space=search_space,
    )
    results = tuner.fit()
    best_result = results.get_best_result()
    best_model = XGBoostCallback.get_model(best_result.checkpoint)
    val_x, val_y = load_fda_val_data()
    y_pred = np.argmax(best_model.predict(xgb.DMatrix(val_x)), axis=1)
    print("✅ XGBoost Best Params:", best_result.config)
    print("🎯 XGBoost Accuracy:", accuracy_score(val_y, y_pred))
    print(classification_report(val_y, y_pred))
    best_model.save_model("fda_best_xgboost_model.json")

2025-07-12 16:59:33,174	INFO worker.py:1917 -- Started a local Ray instance.


In [2]:
print("🚀 Running XGBoost Tuning...")
run_xgb_tuning()

0,1
Current time:,2025-07-12 17:03:28
Running for:,00:03:54.03
Memory:,3.8/31.4 GiB

Trial name,status,loc,eta,max_depth,min_child_weight,subsample,iter,total time (s),eval-mlogloss
train_xgb_96564_00000,TERMINATED,172.19.2.2:3273,0.04132,3,1,0.737775,10,42.8676,1.46931
train_xgb_96564_00001,TERMINATED,172.19.2.2:3378,0.0178838,4,2,0.535441,10,42.5368,1.61678
train_xgb_96564_00002,TERMINATED,172.19.2.2:3473,0.00161641,4,2,0.928591,10,42.7193,1.77349
train_xgb_96564_00003,TERMINATED,172.19.2.2:3565,0.00307973,3,1,0.504781,10,42.8188,1.75907
train_xgb_96564_00004,TERMINATED,172.19.2.2:3658,0.0949067,6,1,0.849916,10,43.563,1.17442


[36m(train_xgb pid=3273)[0m   if ray.train.get_context().get_world_rank() in (0, None):
[36m(train_xgb pid=3273)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_xgb_2025-07-12_16-59-34/train_xgb_96564_00000_0_eta=0.0413,max_depth=3,min_child_weight=1,subsample=0.7378_2025-07-12_16-59-34/checkpoint_000000)
[36m(train_xgb pid=3273)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_xgb_2025-07-12_16-59-34/train_xgb_96564_00000_0_eta=0.0413,max_depth=3,min_child_weight=1,subsample=0.7378_2025-07-12_16-59-34/checkpoint_000001)
[36m(train_xgb pid=3273)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_xgb_2025-07-12_16-59-34/train_xgb_96564_00000_0_eta=0.0413,max_depth=3,min_child_weight=1,subsample=0.7378_2025-07-12_16-59-34/checkpoint_000002)
[36m(train_xgb pid=3273)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=

✅ XGBoost Best Params: {'objective': 'multi:softprob', 'eval_metric': ['mlogloss'], 'num_class': 6, 'max_depth': 6, 'min_child_weight': 1, 'subsample': 0.8499162438945151, 'eta': 0.09490665818154173, 'tree_method': 'hist', 'device': 'cuda'}
🎯 XGBoost Accuracy: 0.36397258972299856
              precision    recall  f1-score   support

           0       0.05      0.12      0.07    402685
           1       0.51      0.03      0.06    151600
           2       0.68      0.38      0.49   1159802
           3       0.06      0.01      0.02     10562
           4       0.71      0.17      0.28   2578289
           5       0.34      0.77      0.47   1527284

    accuracy                           0.36   5830222
   macro avg       0.39      0.25      0.23   5830222
weighted avg       0.55      0.36      0.35   5830222



In [None]:
# !pip install lazypredict

In [None]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

# Load your prepared data
train_df = pd.read_parquet("/kaggle/input/fd-data-revamp-2/fd_train_df.parquet")
# val_df = pd.read_parquet("../data/fd_val_df.parquet")

# Combine train and validation for LazyPredict (you can split again)
# Or use your existing split if you prefer
X = train_df.drop(columns=['reaction_outcome'])  # Features
y = train_df['reaction_outcome']  # Target

# Split data (unless you want to use your predefined val_df)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=train_df["reaction_outcome"], random_state=42
)

# Initialize LazyClassifier
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
    predictions=False,
    random_state=42,
    classifiers='all'  # or specify particular ones
)

# Fit and evaluate models
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display results
print(models)

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from lazypredict.Supervised import LazyClassifier

# Prepare features with VectorAssembler
feature_cols = [c for c in df.columns if c != "reaction_outcome"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [None]:
# Convert to pandas
pandas_df = df.toPandas()
X = pandas_df[feature_cols]
y = pandas_df["reaction_outcome"]

In [None]:
# Use LazyPredict
clf = LazyClassifier()
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Show model performance
print(models)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

from xgboost_ray import RayDMatrix, RayParams, train, predict
from ray import tune
import ray

# Initialize Ray with 2 GPUs
ray.init(num_cpus=4, num_gpus=2, ignore_reinit_error=True)

# ============================================
# 🧠 DATA LOADING & PROCESSING
# ============================================

def load_fda_train_data():
    df = pd.read_parquet("/kaggle/input/fda-data-revamp/fd_train_df.parquet")
    for col in ["country", "reaction", "drug", "age_group"]:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    X = df.drop(columns=["reaction_outcome"])
    y = df["reaction_outcome"].astype(int)
    y = LabelEncoder().fit_transform(y)
    return train_test_split(X, y, test_size=0.2, random_state=42)

def load_fda_val_data():
    df = pd.read_parquet("/kaggle/input/fda-data-revamp/fd_val_df.parquet")
    for col in ["country", "reaction", "drug", "age_group"]:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    X = df.drop(columns=["reaction_outcome"])
    y = df["reaction_outcome"].astype(int)
    y = LabelEncoder().fit_transform(y)
    return X, y

# ============================================
# 🎯 RAY TUNE TRIAL FUNCTION
# ============================================

# num_actors = 2  # 2 actors for 2 GPUs
ray_params = RayParams(
    num_actors=2,
    cpus_per_actor=2,
    gpus_per_actor=1
)


def train_model(config):
    train_x, test_x, train_y, test_y = load_fda_train_data()

    dtrain = RayDMatrix(train_x, train_y)
    dtest = RayDMatrix(test_x, test_y)

    evals_result = {}
    booster = train(
        params=config,
        dtrain=dtrain,
        evals=[(dtest, "eval")],
        evals_result=evals_result,
        ray_params=ray_params,
        verbose_eval=False,
    )

    # Save best model
    booster.save_model("model.xgb")

    # Compute accuracy
    y_pred_prob = predict(booster, dtest, ray_params=ray_params)
    y_pred = np.argmax(y_pred_prob, axis=1)
    acc = accuracy_score(test_y, y_pred)

    tune.report(eval_error=1 - acc)  # or use mlogloss

# ============================================
# 🔍 HYPERPARAMETER SEARCH SPACE
# ============================================

search_space = {
    "objective": "multi:softprob",
    "num_class": 6,
    "eval_metric": "mlogloss",
    "tree_method": "gpu_hist",
    "eta": tune.loguniform(1e-4, 1e-1),
    "subsample": tune.uniform(0.5, 1.0),
    "colsample_bytree": tune.uniform(0.5, 1.0),
    "gamma": tune.uniform(0, 5),
    "max_depth": tune.randint(3, 15),
    "min_child_weight": tune.randint(1, 10),
}

# ============================================
# 🧪 RUN TUNING JOB
# ============================================
tuner = tune.Tuner(
    # tune.with_resources(train_model, {"cpu": 2, "gpu": 1} ),
    tune.with_resources(train_model, ray_params.get_tune_resources()),
    tune_config=tune.TuneConfig(
        metric="eval-mlogloss",
        mode="min",
        num_samples=10,
    ),
    run_config=tune.RunConfig(name="xgboost_ray_multi_gpu"),
    param_space=search_space,
)

results = tuner.fit()
print("✅ Best Config:", results.get_best_result().config)

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss

import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
# We won't use the direct TuneReportCheckpointCallback for K-fold average reporting.
# Instead, we'll manually report the average metric.
# from ray.tune.integration.xgboost import TuneReportCheckpointCallback as XGBoostCallback

ray.init(ignore_reinit_error=True)

# ============================================
# 🧠 Data Loading Functions
# ============================================

def load_fda_full_train_data():
    """Loads the full training data for K-fold cross-validation."""
    df = pd.read_parquet("/kaggle/input/fda-data-revamp/fd_train_df.parquet")
    for col in ["country", "reaction", "drug", "age_group"]:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    X = df.drop(columns=["reaction_outcome"])
    y = df["reaction_outcome"].astype(int)
    y = LabelEncoder().fit_transform(y)  # maps to [0, 1, 2, ..., n_classes-1]
    return X, y

def load_fda_val_data():
    """Loads the separate validation data for final evaluation."""
    df = pd.read_parquet("/kaggle/input/fda-data-revamp/fd_val_df.parquet")
    for col in ["country", "reaction", "drug", "age_group"]:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    X = df.drop(columns=["reaction_outcome"])
    y = df["reaction_outcome"].astype(int)
    y = LabelEncoder().fit_transform(y)
    return X, y

# ============================================
# 🧠 XGBOOST SECTION (Raw categorical features)
# ============================================

def train_xgb_kfold(config):
    X, y = load_fda_full_train_data()
    num_classes = len(np.unique(y))
    config["num_class"] = num_classes
    
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    mlogloss_scores = []
    accuracy_scores = []

    for fold_idx, (train_index, val_index) in enumerate(kf.split(X, y)):
        train_x, val_x = X.iloc[train_index], X.iloc[val_index]
        train_y, val_y = y[train_index], y[val_index]

        dtrain = xgb.DMatrix(train_x, label=train_y)
        dval = xgb.DMatrix(val_x, label=val_y)

        model = xgb.train(
            config,
            dtrain,
            evals=[(dval, "validation")],
            verbose_eval=False,
        )

        val_preds_proba = model.predict(dval)
        val_preds_labels = np.argmax(val_preds_proba, axis=1)

        mlogloss_scores.append(log_loss(val_y, val_preds_proba))
        accuracy_scores.append(accuracy_score(val_y, val_preds_labels))

    avg_mlogloss = np.mean(mlogloss_scores)
    avg_accuracy = np.mean(accuracy_scores)
    
    # Correct way to report metrics:
    tune.report(avg_mlogloss=avg_mlogloss, avg_accuracy=avg_accuracy)

def run_xgb_tuning_kfold():
    search_space = {
        "objective": "multi:softprob",
        "eval_metric": ["mlogloss"], # Note: 'eval_metric' for XGBoost internal monitoring, not directly for Tune
        "num_class": 6,
        "max_depth": tune.randint(3, 8),
        "min_child_weight": tune.choice([1, 2, 3]),
        "subsample": tune.uniform(0.5, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1),
        "tree_method": "hist",
        "device": "cuda", # Explicitly use GPU
    }
    tuner = tune.Tuner(
        tune.with_resources(train_xgb_kfold, resources={"cpu": 4, "gpu": 1}), # Allocate GPU for each trial
        tune_config=tune.TuneConfig(
            metric="avg_mlogloss", mode="min", # Tune based on averaged mlogloss
            scheduler=ASHAScheduler(), num_samples=10), # Adjust num_samples based on resources
        param_space=search_space,
    )
    results = tuner.fit()
    best_result = results.get_best_result()

    print("✅ XGBoost Best Params (from K-fold tuning):", best_result.config)
    print("🎯 XGBoost Average mlogloss (on K-fold validation sets):", best_result.metrics.get("avg_mlogloss"))
    print("🎯 XGBoost Average Accuracy (on K-fold validation sets):", best_result.metrics.get("avg_accuracy"))


    # --- Final Evaluation on the Separate Validation Set (fd_val_df.parquet) ---
    print("\n--- Final XGBoost Model Training and Evaluation ---")
    val_x, val_y = load_fda_val_data() # Load the truly unseen validation data

    # Train a final model with the best parameters on the full training data
    X_full_train, y_full_train = load_fda_full_train_data()
    d_full_train = xgb.DMatrix(X_full_train, label=y_full_train)

    final_xgb_model = xgb.train(
        best_result.config,
        d_full_train,
        num_boost_round=1000, # Example: Train for more rounds
        callbacks=[xgb.callback.EarlyStopping(rounds=50, metric_name="mlogloss", data_name="train", maximize=False, save_best=True)] # Simple early stopping if desired
    )

    y_pred_val = np.argmax(final_xgb_model.predict(xgb.DMatrix(val_x)), axis=1)
    print("🎯 XGBoost Accuracy on Final Unseen Validation Set:", accuracy_score(val_y, y_pred_val))
    print(f"🎯 XGBoost Log Loss on Final Unseen Validation Set: {log_loss(val_y, final_xgb_model.predict(xgb.DMatrix(val_x))):.4f}")
    print("\nClassification Report (XGBoost on Final Unseen Validation Set):\n", classification_report(val_y, y_pred_val))
    print("\nConfusion Matrix (XGBoost on Final Unseen Validation Set):\n", confusion_matrix(val_y, y_pred_val))
    final_xgb_model.save_model("fda_best_xgboost_final_model.json")

In [None]:
print("Starting XGBoost K-fold Tuning...")
run_xgb_tuning_kfold()

In [None]:
ray.shutdown()