In [29]:
import xgboost as xgb
import torch
import joblib
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, ClassifierMixin
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models import NodeConfig
import torchvision
import torch.nn as nn
from torch.nn import functional as F
import pandas as pd
from torch.autograd import Function
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import argparse
from pytorch_tabular.models import FTTransformerConfig

In [2]:
df_train = pd.read_excel(r"C:\Users\joshu\train_imputed.xlsx", index_col = "Unnamed: 0")
df_val = pd.read_excel(r"C:\Users\joshu\train_imputed.xlsx", index_col = "Unnamed: 0")
df_test = pd.read_excel(r"C:\Users\joshu\test_imputed.xlsx", index_col = "Unnamed: 0")

labs = {
    "51221": "Hematocrit",
    "51265": "Platelet Count",
    "50912": "Creatinine",
    "50971": "Potassium",
    "51222": "Hemoglobin",
    "51301": "White Blood Cells",
    "51249": "MCHC",
    "51279": "Red Blood Cells",
    "51250": "MCV",
    "51248": "MCH",
    "51277": "RDW",
    "51006": "Urea Nitrogen",
    "50983": "Sodium",
    "50902": "Chloride",
    "50882": "Bicarbonate",
    "50868": "Anion Gap",
    "50931": "Glucose",
    "50960": "Magnesium",
    "50893": "Calcium, Total",
    "50970": "Phosphate",
    "51237": "INR(PT)",
    "51274": "PT",
    "51275": "PTT",
    "51146": "Basophils",
    "51256": "Neutrophils",
    "51254": "Monocytes",
    "51200": "Eosinophils",
    "51244": "Lymphocytes",
    "52172": "RDW-SD",
    "50934": "H",
    "51678": "L",
    "50947": "I",
    "50861": "Alanine Aminotransferase (ALT)",
    "50878": "Asparate Aminotransferase (AST)",
    "50813": "Lactate",
    "50863": "Alkaline Phosphatase",
    "50885": "Bilirubin, Total",
    "50820": "pH",
    "50862": "Albumin",
    "50802": "Base Excess",
    "50821": "pO2",
    "50804": "Calculated Total CO2",
    "50818": "pCO2",
    "52075": "Absolute Neutrophil Count",
    "52073": "Absolute Eosinophil Count",
    "52074": "Absolute Monocyte Count",
    "52069": "Absolute Basophil Count",
    "51133": "Absolute Lymphocyte Count",
    "50910": "Creatine Kinase (CK)",
    "52135": "Immature Granulocytes"
}
labs_reversed = {value: key for key, value in labs.items()}

total_feats = ['Hematocrit',
 'PTT',
 'Asparate Aminotransferase (AST)',
 'Chloride',
 'White Blood Cells',
 'Potassium',
 'Calcium, Total',
 'Phosphate',
 'Monocytes',
 'Eosinophils',
 'Urea Nitrogen',
 'pH',
 'pCO2']

encode = lambda x: [labs_reversed[i] for i in x]
decode = lambda x: [labs[i] for i in x]

cols = decode(df_train.columns.to_list())
targets = list(set(cols) - set(total_feats))

In [4]:
X_train = torch.tensor(df_train[encode(total_feats)].values).type(torch.float)
Y_train = torch.tensor(df_train[encode(targets)].values).type(torch.float)

Y_test =  torch.tensor(df_test[encode(targets)].values).type(torch.float)
X_test =  torch.tensor(df_test[encode(total_feats)].values).type(torch.float)

X_val = torch.tensor(df_val[encode(total_feats)].values).type(torch.float)
Y_val = torch.tensor(df_val[encode(targets)].values).type(torch.float)

In [5]:
# credits to Yandex https://github.com/Qwicen/node/blob/master/lib/nn_utils.py
def _make_ix_like(input, dim=0):
    d = input.size(dim)
    rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype)
    view = [1] * input.dim()
    view[0] = -1
    return rho.view(view).transpose(0, dim)


class SparsemaxFunction(Function):
    """
    An implementation of sparsemax (Martins & Astudillo, 2016). See
    :cite:`DBLP:journals/corr/MartinsA16` for detailed description.
    By Ben Peters and Vlad Niculae
    """

    @staticmethod
    def forward(ctx, input, dim=-1):
        """sparsemax: normalizing sparse transform (a la softmax)

        Parameters
        ----------
        ctx : torch.autograd.function._ContextMethodMixin
        input : torch.Tensor
            any shape
        dim : int
            dimension along which to apply sparsemax

        Returns
        -------
        output : torch.Tensor
            same shape as input

        """
        ctx.dim = dim
        max_val, _ = input.max(dim=dim, keepdim=True)
        input -= max_val  # same numerical stability trick as for softmax
        tau, supp_size = SparsemaxFunction._threshold_and_support(input, dim=dim)
        output = torch.clamp(input - tau, min=0)
        ctx.save_for_backward(supp_size, output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        supp_size, output = ctx.saved_tensors
        dim = ctx.dim
        grad_input = grad_output.clone()
        grad_input[output == 0] = 0

        v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze()
        v_hat = v_hat.unsqueeze(dim)
        grad_input = torch.where(output != 0, grad_input - v_hat, grad_input)
        return grad_input, None

    @staticmethod
    def _threshold_and_support(input, dim=-1):
        """Sparsemax building block: compute the threshold

        Parameters
        ----------
        input: torch.Tensor
            any dimension
        dim : int
            dimension along which to apply the sparsemax

        Returns
        -------
        tau : torch.Tensor
            the threshold value
        support_size : torch.Tensor

        """

        input_srt, _ = torch.sort(input, descending=True, dim=dim)
        input_cumsum = input_srt.cumsum(dim) - 1
        rhos = _make_ix_like(input, dim)
        support = rhos * input_srt > input_cumsum

        support_size = support.sum(dim=dim).unsqueeze(dim)
        tau = input_cumsum.gather(dim, support_size - 1)
        tau /= support_size.to(input.dtype)
        return tau, support_size


sparsemax = SparsemaxFunction.apply


class Sparsemax(nn.Module):

    def __init__(self, dim=-1):
        self.dim = dim
        super(Sparsemax, self).__init__()

    def forward(self, input):
        return sparsemax(input, self.dim)


class Entmax15Function(Function):
    """
    An implementation of exact Entmax with alpha=1.5 (B. Peters, V. Niculae, A. Martins). See
    :cite:`https://arxiv.org/abs/1905.05702 for detailed description.
    Source: https://github.com/deep-spin/entmax
    """

    @staticmethod
    def forward(ctx, input, dim=-1):
        ctx.dim = dim

        max_val, _ = input.max(dim=dim, keepdim=True)
        input = input - max_val  # same numerical stability trick as for softmax
        input = input / 2  # divide by 2 to solve actual Entmax

        tau_star, _ = Entmax15Function._threshold_and_support(input, dim)
        output = torch.clamp(input - tau_star, min=0) ** 2
        ctx.save_for_backward(output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        Y, = ctx.saved_tensors
        gppr = Y.sqrt()  # = 1 / g'' (Y)
        dX = grad_output * gppr
        q = dX.sum(ctx.dim) / gppr.sum(ctx.dim)
        q = q.unsqueeze(ctx.dim)
        dX -= q * gppr
        return dX, None

    @staticmethod
    def _threshold_and_support(input, dim=-1):
        Xsrt, _ = torch.sort(input, descending=True, dim=dim)

        rho = _make_ix_like(input, dim)
        mean = Xsrt.cumsum(dim) / rho
        mean_sq = (Xsrt ** 2).cumsum(dim) / rho
        ss = rho * (mean_sq - mean ** 2)
        delta = (1 - ss) / rho

        # NOTE this is not exactly the same as in reference algo
        # Fortunately it seems the clamped values never wrongly
        # get selected by tau <= sorted_z. Prove this!
        delta_nz = torch.clamp(delta, 0)
        tau = mean - torch.sqrt(delta_nz)

        support_size = (tau <= Xsrt).sum(dim).unsqueeze(dim)
        tau_star = tau.gather(dim, support_size - 1)
        return tau_star, support_size


class Entmoid15(Function):
    """ A highly optimized equivalent of lambda x: Entmax15([x, 0]) """

    @staticmethod
    def forward(ctx, input):
        output = Entmoid15._forward(input)
        ctx.save_for_backward(output)
        return output

    @staticmethod
    def _forward(input):
        input, is_pos = abs(input), input >= 0
        tau = (input + torch.sqrt(F.relu(8 - input ** 2))) / 2
        tau.masked_fill_(tau <= input, 2.0)
        y_neg = 0.25 * F.relu(tau - input, inplace=True) ** 2
        return torch.where(is_pos, 1 - y_neg, y_neg)

    @staticmethod
    def backward(ctx, grad_output):
        return Entmoid15._backward(ctx.saved_tensors[0], grad_output)

    @staticmethod
    def _backward(output, grad_output):
        gppr0, gppr1 = output.sqrt(), (1 - output).sqrt()
        grad_input = grad_output * gppr0
        q = grad_input / (gppr0 + gppr1)
        grad_input -= q * gppr0
        return grad_input


entmax15 = Entmax15Function.apply
entmoid15 = Entmoid15.apply


class Entmax15(nn.Module):

    def __init__(self, dim=-1):
        self.dim = dim
        super(Entmax15, self).__init__()

    def forward(self, input):
        return entmax15(input, self.dim)

In [6]:
def initialize_glu(module, input_dim, output_dim):
    gain_value = np.sqrt((input_dim + output_dim) / np.sqrt(input_dim))
    torch.nn.init.xavier_normal_(module.weight, gain=gain_value)
    return

class GBN(torch.nn.Module):
    """
    Ghost Batch Normalization
    https://arxiv.org/abs/1705.08741
    """
    def __init__(self, input_dim, virtual_batch_size=512):
        super(GBN, self).__init__()
        self.input_dim = input_dim
        self.virtual_batch_size = virtual_batch_size
        self.bn = nn.BatchNorm1d(self.input_dim)

    def forward(self, x):
        if self.training == True:
            chunks = x.chunk(int(np.ceil(x.shape[0] / self.virtual_batch_size)), 0)
            res = [self.bn(x_) for x_ in chunks]
            return torch.cat(res, dim=0)
        else:
            return self.bn(x)

class LearnableLocality(nn.Module):

    def __init__(self, input_dim, k):
        super(LearnableLocality, self).__init__()
        self.register_parameter('weight', nn.Parameter(torch.rand(k, input_dim)))
        self.smax = Entmax15(dim=-1)

    def forward(self, x):
        mask = self.smax(self.weight)
        masked_x = torch.einsum('nd,bd->bnd', mask, x)  # [B, k, D]
        return masked_x

class AbstractLayer(nn.Module):
    def __init__(self, base_input_dim, base_output_dim, k, virtual_batch_size, bias=True):
        super(AbstractLayer, self).__init__()
        self.masker = LearnableLocality(input_dim=base_input_dim, k=k)
        self.fc = nn.Conv1d(base_input_dim * k, 2 * k * base_output_dim, kernel_size=1, groups=k, bias=bias)
        initialize_glu(self.fc, input_dim=base_input_dim * k, output_dim=2 * k * base_output_dim)
        self.bn = GBN(2 * base_output_dim * k, virtual_batch_size)
        self.k = k
        self.base_output_dim = base_output_dim

    def forward(self, x):
        b = x.size(0)
        x = self.masker(x)  # [B, D] -> [B, k, D]
        x = self.fc(x.view(b, -1, 1))  # [B, k, D] -> [B, k * D, 1] -> [B, k * (2 * D'), 1]
        x = self.bn(x)
        chunks = x.chunk(self.k, 1)  # k * [B, 2 * D', 1]
        x = sum([F.relu(torch.sigmoid(x_[:, :self.base_output_dim, :]) * x_[:, self.base_output_dim:, :]) for x_ in chunks])  # k * [B, D', 1] -> [B, D', 1]
        return x.squeeze(-1)


class BasicBlock(nn.Module):
    def __init__(self, input_dim, base_outdim, k, virtual_batch_size, fix_input_dim, drop_rate):
        super(BasicBlock, self).__init__()
        self.conv1 = AbstractLayer(input_dim, base_outdim // 2, k, virtual_batch_size)
        self.conv2 = AbstractLayer(base_outdim // 2, base_outdim, k, virtual_batch_size)

        self.downsample = nn.Sequential(
            nn.Dropout(drop_rate),
            AbstractLayer(fix_input_dim, base_outdim, k, virtual_batch_size)
        )

    def forward(self, x, pre_out=None):
        if pre_out == None:
            pre_out = x
        out = self.conv1(pre_out)
        out = self.conv2(out)
        identity = self.downsample(x)
        out += identity
        return F.leaky_relu(out, 0.01)


class DANet(nn.Module):
    def __init__(self, input_dim, num_classes, layer_num, base_outdim, k, virtual_batch_size, drop_rate=0.1):
        super(DANet, self).__init__()
        params = {'base_outdim': base_outdim, 'k': k, 'virtual_batch_size': virtual_batch_size,
                  'fix_input_dim': input_dim, 'drop_rate': drop_rate}
        self.init_layer = BasicBlock(input_dim, **params)
        self.lay_num = layer_num
        self.layer = nn.ModuleList()
        for i in range((layer_num // 2) - 1):
            self.layer.append(BasicBlock(base_outdim, **params))
        self.drop = nn.Dropout(0.1)

        self.fc = nn.Sequential(nn.Linear(base_outdim, 256),
                                nn.ReLU(inplace=True),
                                nn.Linear(256, 512),
                                nn.ReLU(inplace=True),
                                nn.Linear(512, num_classes))

    def forward(self, x):
        out = self.init_layer(x)
        for i in range(len(self.layer)):
            out = self.layer[i](x, out)
        out = self.drop(out)
        out = self.fc(out)
        return out

In [7]:
class MLP(torchvision.ops.MLP):
    
    def __init__(self, in_channels, hidden_channels, norm_layer, activation_layer, bias, dropout):
        
        super().__init__(in_channels, hidden_channels, norm_layer, activation_layer, bias, dropout)
        
    def train(self, X_train, Y_train, iters=1000, lr=0.001, verbose=False):
        
        loss_f = nn.MSELoss()

        optimizer = torch.optim.Adam(self.parameters(), lr=lr)

        for step in range(iters):

            y_preds = self(X_train)

            # Compute loss
            loss = loss_f(y_preds, Y_train)
            
            if step%100 == 0 and verbose:
        
                print(f"train loss: {loss.item()}")

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [8]:
in_channels = len(total_feats)

out_dim =  len(targets)

hidden_channels = [125]*11 + [out_dim]

norm_layer = nn.LayerNorm

activation_layer = nn.ReLU

bias = True

dropout = 0.189
    
mlp = MLP(in_channels=in_channels,hidden_channels=hidden_channels,
                         norm_layer=norm_layer,activation_layer=activation_layer, bias=bias, dropout=dropout)
mlp.train(X_train, Y_train, lr=0.000418, verbose=True, iters=300)

train loss: 0.588984489440918
train loss: 0.04886932298541069
train loss: 0.041369207203388214


In [9]:
params = {'eta': 0.06014477612764848, 
          'max_depth': 5, 
          'min_child_weight': 14, 
          'lambda': 0.23378311898486798, 
          'alpha': 0.00011202585063587642, 
          'gamma': 0.0009675173727657638, 
          'subsample': 0.6661968185586394, 
          'colsample_bytree': 0.871814732691916, 
          'grow_policy': 'depthwise', 
          'max_bin': 123}
# Train
dtrain = xgb.DMatrix(X_train, label=Y_train)
xgb_model = xgb.train(params, dtrain, num_boost_round=100)


# 2. Load PyTorch checkpoint (.ckpt)
# (Assuming this is a PyTorch Lightning checkpoint)
ftt = TabularModel.load_model('ftt')

node = TabularModel.load_model('node')

in_channels = len(total_feats)

out_dim =  len(targets)

hidden_channels = [125]*11 + [out_dim]

norm_layer = nn.LayerNorm

activation_layer = nn.ReLU

bias = True

dropout = 0.189

Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [14]:
class MetaModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(MetaModel, self).__init__()
        self.fc = nn.Linear(input_size, output_size)
        
    def forward(self, x):
        return self.fc(x)

In [10]:
import os
import torch.optim as optim
from tqdm import tqdm 
# Set GPU if available
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Use GPU 0 if multiple available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Convert to PyTorch tensors (no normalization)
train_dataset = TensorDataset(
    X_train,
    Y_train
)
val_dataset = TensorDataset(
    X_val,
    Y_val
)
test_dataset = TensorDataset(
    X_test,
    Y_test
)

# Model configuration
config = {
    'input_dim': len(total_feats),
    'num_classes': len(targets),
    'layer_num': 12,               # Must be even number
    'base_outdim': 256,           # Hidden layer size
    'k': 6,                       # Multiplicative factor
    'virtual_batch_size': 256,     # For Ghost BatchNorm
    'drop_rate': 0.05020187264748346,
    'batch_size': 128,
    'lr': 0.005787836702412583,
    'weight_decay': 4.6353263330458526e-07,
    'epochs': 50,
    'patience': 5
}

# Initialize model
model = DANet(
    input_dim=config['input_dim'],
    num_classes=config['num_classes'],
    layer_num=config['layer_num'],
    base_outdim=config['base_outdim'],
    k=config['k'],
    virtual_batch_size=config['virtual_batch_size'],
    drop_rate=config['drop_rate']
).to(device)

# Multi-target loss function
def multi_target_mse(preds, targets):
    return torch.mean((preds - targets) ** 2)

optimizer = optim.AdamW(model.parameters(), 
                       lr=config['lr'], 
                       weight_decay=config['weight_decay'])
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 
                                                mode='min',
                                                patience=5,
                                                factor=0.5)

# Data loaders
train_loader = DataLoader(train_dataset, 
                         batch_size=config['batch_size'], 
                         shuffle=True)
val_loader = DataLoader(val_dataset, 
                       batch_size=config['batch_size'])
test_loader = DataLoader(test_dataset,
                        batch_size=config['batch_size'])

# Training loop
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(config['epochs']):
    # Training
    model.train()
    train_loss = 0
    for X_batch, Y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
        
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = multi_target_mse(outputs, Y_batch)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * X_batch.size(0)
    
    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, Y_batch in val_loader:
            X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
            outputs = model(X_batch)
            val_loss += multi_target_mse(outputs, Y_batch).item() * X_batch.size(0)
    
    # Calculate metrics
    train_loss /= len(train_loader.dataset)
    val_loss /= len(val_loader.dataset)
    
    print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    
    # Learning rate scheduling
    scheduler.step(val_loss)
    
    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= config['patience']:
            print(f"Early stopping at epoch {epoch+1}")
            break

# Load best model
model.load_state_dict(torch.load('best_model.pth'))

# Evaluation function
def evaluate(model, loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for X_batch, Y_batch in loader:
            X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
            outputs = model(X_batch)
            total_loss += multi_target_mse(outputs, Y_batch).item() * X_batch.size(0)
    return total_loss / len(loader.dataset)

# Final evaluation
test_loss = evaluate(model, test_loader)
print(f"\nTest MSE: {test_loss:.4f}")
print(f"Test RMSE: {np.sqrt(test_loss):.4f}")

# Optional: Save predictions
def save_predictions(model, loader, filename):
    model.eval()
    all_preds = []
    with torch.no_grad():
        for X_batch, _ in loader:
            X_batch = X_batch.to(device)
            preds = model(X_batch).cpu().numpy()
            all_preds.append(preds)
    np.save(filename, np.concatenate(all_preds))

save_predictions(model, test_loader, 'test_predictions.npy')

Epoch 1: 100%|██████████| 79/79 [00:11<00:00,  6.88it/s]


Epoch 1: Train Loss: 0.2570 | Val Loss: 0.0581


Epoch 2: 100%|██████████| 79/79 [00:09<00:00,  8.60it/s]


Epoch 2: Train Loss: 0.0539 | Val Loss: 0.0509


Epoch 3: 100%|██████████| 79/79 [00:09<00:00,  8.46it/s]


Epoch 3: Train Loss: 0.0518 | Val Loss: 0.0502


Epoch 4: 100%|██████████| 79/79 [00:09<00:00,  8.25it/s]


Epoch 4: Train Loss: 0.0506 | Val Loss: 0.0500


Epoch 5: 100%|██████████| 79/79 [00:09<00:00,  8.51it/s]


Epoch 5: Train Loss: 0.0503 | Val Loss: 0.0498


Epoch 6: 100%|██████████| 79/79 [00:09<00:00,  8.32it/s]


Epoch 6: Train Loss: 0.0500 | Val Loss: 0.0480


Epoch 7: 100%|██████████| 79/79 [00:09<00:00,  8.50it/s]


Epoch 7: Train Loss: 0.0494 | Val Loss: 0.0482


Epoch 8: 100%|██████████| 79/79 [00:09<00:00,  8.41it/s]


Epoch 8: Train Loss: 0.0488 | Val Loss: 0.0471


Epoch 9: 100%|██████████| 79/79 [00:09<00:00,  8.63it/s]


Epoch 9: Train Loss: 0.0490 | Val Loss: 0.0477


Epoch 10: 100%|██████████| 79/79 [00:09<00:00,  8.37it/s]


Epoch 10: Train Loss: 0.0485 | Val Loss: 0.0474


Epoch 11: 100%|██████████| 79/79 [00:10<00:00,  7.28it/s]


Epoch 11: Train Loss: 0.0486 | Val Loss: 0.0469


Epoch 12: 100%|██████████| 79/79 [00:11<00:00,  6.99it/s]


Epoch 12: Train Loss: 0.0481 | Val Loss: 0.0475


Epoch 13: 100%|██████████| 79/79 [00:11<00:00,  7.14it/s]


Epoch 13: Train Loss: 0.0482 | Val Loss: 0.0476


Epoch 14: 100%|██████████| 79/79 [00:11<00:00,  7.18it/s]


Epoch 14: Train Loss: 0.0482 | Val Loss: 0.0459


Epoch 15: 100%|██████████| 79/79 [00:10<00:00,  7.51it/s]


Epoch 15: Train Loss: 0.0479 | Val Loss: 0.0511


Epoch 16: 100%|██████████| 79/79 [00:10<00:00,  7.56it/s]


Epoch 16: Train Loss: 0.0480 | Val Loss: 0.0461


Epoch 17: 100%|██████████| 79/79 [00:10<00:00,  7.78it/s]


Epoch 17: Train Loss: 0.0475 | Val Loss: 0.0466


Epoch 18: 100%|██████████| 79/79 [00:11<00:00,  6.99it/s]


Epoch 18: Train Loss: 0.0477 | Val Loss: 0.0464


Epoch 19: 100%|██████████| 79/79 [00:11<00:00,  6.84it/s]


Epoch 19: Train Loss: 0.0474 | Val Loss: 0.0463
Early stopping at epoch 19


  model.load_state_dict(torch.load('best_model.pth'))



Test MSE: 0.0453
Test RMSE: 0.2129


In [11]:
danet = model.cpu()

In [22]:
data_config = DataConfig(
    target=encode(targets),
    continuous_cols=encode(total_feats),
)
trainer_config = TrainerConfig(
    auto_lr_find=True,
    batch_size=100,
    accelerator="gpu"
)
optimizer_config = OptimizerConfig()

model_config = NodeConfig(
    num_layers=1,
    num_trees=2048,
    task="regression",  # or "regression"
    head="LinearHead",      # Using LinearHead with sigmoid
    head_config={
        "layers": None,    # No additional layers
        "activation": "Sigmoid"  # Sigmoid activation
    },
    #data_aware_init_batch_size=1000,
)

# Initialize and train the model
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config
)
tabular_model.fit(train=df_train, validation=df_val)

Y_test =  torch.tensor(df_test[encode(targets)].values).type(torch.float).cuda()

def compute_mse_per_covariate(predictions, targets):
    # Ensure predictions and targets are the same shape
    assert predictions.shape == targets.shape, "Shapes of predictions and targets must match"

    # Compute squared error per covariate and average over the batch (dim=0)
    mse_per_covariate = torch.mean((predictions - targets) ** 2, dim=0)

    return mse_per_covariate  # Returns a tensor of shape (15,)

y_pred = tabular_model.predict(df_test)

mse_per_covariate = compute_mse_per_covariate(torch.tensor(y_pred.values).type(torch.float).cuda(),Y_test)
    # Convert MSE tensor to numpy and pair with column names
print(f"Overall MSE: {mse_per_covariate.mean()}")
with torch.no_grad(): 
    mse_per_covariate_np = mse_per_covariate.cpu().numpy()  # If using GPU: .cpu().numpy()

# Display as a DataFrame for better readability
mse_df = pd.DataFrame({
    'Covariate': targets,
    'MSE': mse_per_covariate_np
})

order = ['Albumin',
 'Alkaline Phosphatase',
 'Neutrophils',
 'pO2',
 'Magnesium',
 'MCH',
 'Red Blood Cells',
 'Creatinine',
 'Platelet Count',
 'PT',
 'Alanine Aminotransferase (ALT)',
 'Base Excess',
 'MCV',
 'Hemoglobin',
 'RDW-SD',
 'Creatine Kinase (CK)',
 'Glucose',
 'Bicarbonate',
 'Bilirubin, Total',
 'INR(PT)',
 'Lymphocytes',
 'MCHC',
 'Sodium',
 'Anion Gap',
 'RDW',
 'Lactate',
 'Calculated Total CO2',
 'Basophils']

mse_df_reordered = mse_df.set_index('Covariate').reindex(order).reset_index()
mse_df_reordered

print(mse_df_reordered)



Seed set to 42




GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


You are using a CUDA device ('NVIDIA GeForce RTX 3070') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
C:\Users\joshu\anaconda3\envs\TT_net\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
C:\Users\joshu\anaconda3\envs\TT_net\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.07585775750291836
Restoring states from the checkpoint path at C:\Users\joshu\stacked_models\.lr_find_bc90b942-824b-4429-ae23-759ca4f119d6.ckpt
Restored all states from the checkpoint at C:\Users\joshu\stacked_models\.lr_find_bc90b942-824b-4429-ae23-759ca4f119d6.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

`Trainer.fit` stopped: `max_epochs=10` reached.


  return torch.load(f, map_location=map_location)


Overall MSE: 0.042143866419792175
                         Covariate       MSE
0                          Albumin  0.030001
1             Alkaline Phosphatase  0.048064
2                      Neutrophils  0.036125
3                              pO2  0.030278
4                        Magnesium  0.061373
5                              MCH  0.085020
6                  Red Blood Cells  0.015027
7                       Creatinine  0.030382
8                   Platelet Count  0.061968
9                               PT  0.051331
10  Alanine Aminotransferase (ALT)  0.020215
11                     Base Excess  0.006348
12                             MCV  0.085208
13                      Hemoglobin  0.005085
14                          RDW-SD  0.028423
15            Creatine Kinase (CK)  0.030024
16                         Glucose  0.076888
17                     Bicarbonate  0.042306
18                Bilirubin, Total  0.036690
19                         INR(PT)  0.041935
20                   

NameError: name 'args' is not defined

In [24]:
node = tabular_model

In [30]:
data_config = DataConfig(
    target=encode(targets),
    continuous_cols=encode(total_feats),
)
trainer_config = TrainerConfig(
    auto_lr_find=True,
    batch_size=256,
    accelerator="gpu",
    max_epochs=99
)
optimizer_config = OptimizerConfig()

model_config = FTTransformerConfig(
    num_heads=2,          # Number of attention heads
    num_attn_blocks=8,  # Number of transformer blocks
    input_embed_dim=512,
    embedding_dropout=0.1,            # Dropout for feature embeddings
    attn_dropout=0.1,            # Dropout for attention layers
    ff_dropout=0.1,                  # Dropout in feed-forward network
    task="regression",                # or "regression"
    target_range=[(0,1)]*len(targets)
)

# Initialize and train the model
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config
)
tabular_model.fit(train=df_train, validation=df_val)

Y_test =  torch.tensor(df_test[encode(targets)].values).type(torch.float).cuda()

def compute_mse_per_covariate(predictions, targets):
    # Ensure predictions and targets are the same shape
    assert predictions.shape == targets.shape, "Shapes of predictions and targets must match"

    # Compute squared error per covariate and average over the batch (dim=0)
    mse_per_covariate = torch.mean((predictions - targets) ** 2, dim=0)

    return mse_per_covariate  # Returns a tensor of shape (15,)

y_pred = tabular_model.predict(df_test)

mse_per_covariate = compute_mse_per_covariate(torch.tensor(y_pred.values).type(torch.float).cuda(), Y_test)
print(f"Overall MSE: {mse_per_covariate.mean()}")
with torch.no_grad(): 
    mse_per_covariate_np = mse_per_covariate.cpu().numpy()  # If using GPU: .cpu().numpy()

# Display as a DataFrame for better readability
mse_df = pd.DataFrame({
    'Covariate': targets,
    'MSE': mse_per_covariate_np
})

order = ['Albumin',
 'Alkaline Phosphatase',
 'Neutrophils',
 'pO2',
 'Magnesium',
 'MCH',
 'Red Blood Cells',
 'Creatinine',
 'Platelet Count',
 'PT',
 'Alanine Aminotransferase (ALT)',
 'Base Excess',
 'MCV',
 'Hemoglobin',
 'RDW-SD',
 'Creatine Kinase (CK)',
 'Glucose',
 'Bicarbonate',
 'Bilirubin, Total',
 'INR(PT)',
 'Lymphocytes',
 'MCHC',
 'Sodium',
 'Anion Gap',
 'RDW',
 'Lactate',
 'Calculated Total CO2',
 'Basophils']

mse_df_reordered = mse_df.set_index('Covariate').reindex(order).reset_index()
mse_df_reordered

print(mse_df_reordered)

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


C:\Users\joshu\anaconda3\envs\TT_net\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:654: Checkpoint directory C:\Users\joshu\stacked_models\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
C:\Users\joshu\anaconda3\envs\TT_net\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
C:\Users\joshu\anaconda3\envs\TT_net\Lib\site-packages\pytorch_lightning\loops\fit_loop.py:298: The number of training batches (40) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
C:\Users\joshu\anaconda3\envs\TT_net\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many w

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 1.3182567385564076e-05
Restoring states from the checkpoint path at C:\Users\joshu\stacked_models\.lr_find_4667d132-8fe2-4e2a-8b07-f299f671ee54.ckpt
Restored all states from the checkpoint at C:\Users\joshu\stacked_models\.lr_find_4667d132-8fe2-4e2a-8b07-f299f671ee54.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

  return torch.load(f, map_location=map_location)


Overall MSE: 0.03838161379098892
                         Covariate       MSE
0                          Albumin  0.029157
1             Alkaline Phosphatase  0.044323
2                      Neutrophils  0.034370
3                              pO2  0.027987
4                        Magnesium  0.055637
5                              MCH  0.075585
6                  Red Blood Cells  0.012180
7                       Creatinine  0.029232
8                   Platelet Count  0.057414
9                               PT  0.048430
10  Alanine Aminotransferase (ALT)  0.018610
11                     Base Excess  0.004358
12                             MCV  0.074474
13                      Hemoglobin  0.004577
14                          RDW-SD  0.024969
15            Creatine Kinase (CK)  0.025572
16                         Glucose  0.069630
17                     Bicarbonate  0.039498
18                Bilirubin, Total  0.032843
19                         INR(PT)  0.038799
20                    

In [31]:
ftt = tabular_model

In [50]:
class Stack:
    def __init__(self, danet, mlp, ftt, node, xgb_model):
        self.danet = danet
        self.mlp = mlp
        self.ftt = ftt
        self.node = node
        self.xgb_model = xgb_model
        self.MetaModel = MetaModel(5,1)
        
    def fit(self, X_train, df_train, Y_train):
        dtrain = xgb.DMatrix(X_train, label=Y_train)
        train_loader = DataLoader(X_train, 
                         batch_size=5000, 
                         shuffle=False)
        self.danet.eval()
        danet_preds = []
        with torch.no_grad():
            for X_batch in train_loader:
                X_batch = X_batch
                preds = self.danet(X_batch)
                danet_preds.append(preds)
        danet_preds = torch.cat(danet_preds, dim=0)
        with torch.no_grad():
            mlp_preds = self.mlp(X_train)
        ftt_preds = torch.Tensor(self.ftt.predict(df_train[encode(total_feats)]).values)
        node_preds = torch.Tensor(self.node.predict(df_train[encode(total_feats)]).values)
        xgboost_preds = torch.Tensor(self.xgb_model.predict(dtrain))
        combined = torch.stack([ftt_preds, node_preds, xgboost_preds, danet_preds, mlp_preds], dim=2)
        criterion = nn.MSELoss()
        
        danet_loss = criterion(danet_preds, Y_train).item()
        print(f"danet loss: {danet_loss}")
        
        node_loss = criterion(node_preds, Y_train).item()
        print(f"node loss: {node_loss}")
        
        xg_loss = criterion(xgboost_preds, Y_train).item()
        print(f"xg loss: {xg_loss}")
        
        ftt_loss = criterion(ftt_preds, Y_train).item()
        print(f"ftt loss: {ftt_loss}")
        
        mlp_loss = criterion(mlp_preds, Y_train).item()
        print(f"mlp loss: {mlp_loss}")
        
        optimizer = torch.optim.Adam(self.MetaModel.parameters(), lr=0.01)
        print(combined.shape)
        print(Y_train.shape)
        for epoch in range(10000):
            outputs = self.MetaModel(combined)
            outputs = outputs.squeeze(dim=2)
            loss = criterion(outputs, Y_train)
            optimizer.zero_grad()
            loss.backward()
            if epoch%100 == 0:
                print(loss.item())
            optimizer.step()
            
    def predict(self, X_test, df_test, Y_test):
        dtrain = xgb.DMatrix(X_test, label=Y_test)
        train_loader = DataLoader(X_test, 
                         batch_size=5000, 
                         shuffle=False)
        self.danet.eval()
        danet_preds = []
        with torch.no_grad():
            for X_batch in train_loader:
                X_batch = X_batch
                preds = self.danet(X_batch)
                danet_preds.append(preds)
        danet_preds = torch.cat(danet_preds, dim=0)
        with torch.no_grad():
            mlp_preds = self.mlp(X_test)
        ftt_preds = torch.Tensor(self.ftt.predict(df_test[encode(total_feats)]).values)
        node_preds = torch.Tensor(self.node.predict(df_test[encode(total_feats)]).values)
        xgboost_preds = torch.Tensor(self.xgb_model.predict(dtrain))
        combined = torch.stack([ftt_preds, node_preds, xgboost_preds, danet_preds, mlp_preds], dim=2)
        criterion = nn.MSELoss()
        
        danet_loss = criterion(danet_preds, Y_test).item()
        print(f"danet loss: {danet_loss}")
        
        node_loss = criterion(node_preds, Y_test).item()
        print(f"node loss: {node_loss}")
        
        xg_loss = criterion(xgboost_preds, Y_test).item()
        print(f"xg loss: {xg_loss}")
        
        ftt_loss = criterion(ftt_preds, Y_test).item()
        print(f"ftt loss: {ftt_loss}")
        
        mlp_loss = criterion(mlp_preds, Y_test).item()
        print(f"mlp loss: {mlp_loss}")
        print(combined.shape)
        print(Y_test.shape)
        
        with torch.no_grad():
            outputs = self.MetaModel(combined)
        return outputs
        
        

In [51]:
stack = Stack(danet, mlp, ftt, node, xgb_model)
stack.fit(X_train, df_train, Y_train)

danet loss: 0.04592215642333031
node loss: 0.027988247573375702
xg loss: 0.03253047168254852
ftt loss: 0.03677967190742493
mlp loss: 0.03960017114877701
torch.Size([10000, 28, 5])
torch.Size([10000, 28])
0.11069447547197342
0.032190680503845215
0.03031049855053425
0.02877306379377842
0.027724435552954674
0.027087774127721786
0.026723699644207954
0.0265158973634243
0.026391414925456047
0.02631155401468277
0.02625749073922634
0.026219988241791725
0.026193952187895775
0.02617613784968853
0.026164205744862556
0.02615642547607422
0.026151500642299652
0.02614847756922245
0.02614668942987919
0.02614566497504711
0.026145100593566895
0.02614480070769787
0.02614464983344078
0.02614457719027996
0.026144541800022125
0.02614452689886093
0.026144521310925484
0.026144517585635185
0.026144517585635185
0.026144517585635185
0.026144515722990036
0.026144515722990036
0.026144517585635185
0.026144519448280334
0.026144517585635185
0.026144517585635185
0.026144517585635185
0.026144515722990036
0.026144515722

In [52]:
type(Y_test)

torch.Tensor

In [62]:
def compute_mse_per_covariate(predictions, targets):
    # Ensure predictions and targets are the same shape
    assert predictions.shape == targets.shape, "Shapes of predictions and targets must match"

    # Compute squared error per covariate and average over the batch (dim=0)
    mse_per_covariate = torch.mean((predictions - targets) ** 2, dim=0)

    return mse_per_covariate  # Returns a tensor of shape (15,)

y_pred = stack.predict(X_test.cpu(), df_test, Y_test.cpu())

danet loss: 0.04533591866493225
node loss: 0.042143866419792175
xg loss: 0.03779824823141098
ftt loss: 0.03838161379098892
mlp loss: 0.04083168879151344
torch.Size([4640, 28, 5])
torch.Size([4640, 28])


In [63]:
y_pred = y_pred.squeeze()

In [64]:
def compute_mse_per_covariate(predictions, targets):
    # Ensure predictions and targets are the same shape
    assert predictions.shape == targets.shape, "Shapes of predictions and targets must match"

    # Compute squared error per covariate and average over the batch (dim=0)
    mse_per_covariate = torch.mean((predictions - targets) ** 2, dim=0)

    return mse_per_covariate  # Returns a tensor of shape (15,)

mse_per_covariate = compute_mse_per_covariate(y_pred, Y_test.cpu())
print(f"Overall MSE: {mse_per_covariate.mean()}")
with torch.no_grad(): 
    mse_per_covariate_np = mse_per_covariate.cpu().numpy()  # If using GPU: .cpu().numpy()

# Display as a DataFrame for better readability
mse_df = pd.DataFrame({
    'Covariate': targets,
    'MSE': mse_per_covariate_np
})

order = ['Albumin',
 'Alkaline Phosphatase',
 'Neutrophils',
 'pO2',
 'Magnesium',
 'MCH',
 'Red Blood Cells',
 'Creatinine',
 'Platelet Count',
 'PT',
 'Alanine Aminotransferase (ALT)',
 'Base Excess',
 'MCV',
 'Hemoglobin',
 'RDW-SD',
 'Creatine Kinase (CK)',
 'Glucose',
 'Bicarbonate',
 'Bilirubin, Total',
 'INR(PT)',
 'Lymphocytes',
 'MCHC',
 'Sodium',
 'Anion Gap',
 'RDW',
 'Lactate',
 'Calculated Total CO2',
 'Basophils']

mse_df_reordered = mse_df.set_index('Covariate').reindex(order).reset_index()
mse_df_reordered

print(mse_df_reordered)

Overall MSE: 0.043800074607133865
                         Covariate       MSE
0                          Albumin  0.030899
1             Alkaline Phosphatase  0.047540
2                      Neutrophils  0.038079
3                              pO2  0.033059
4                        Magnesium  0.064148
5                              MCH  0.087763
6                  Red Blood Cells  0.015655
7                       Creatinine  0.032206
8                   Platelet Count  0.064559
9                               PT  0.052126
10  Alanine Aminotransferase (ALT)  0.021747
11                     Base Excess  0.005186
12                             MCV  0.089636
13                      Hemoglobin  0.005692
14                          RDW-SD  0.030120
15            Creatine Kinase (CK)  0.031126
16                         Glucose  0.078515
17                     Bicarbonate  0.045638
18                Bilirubin, Total  0.038641
19                         INR(PT)  0.042347
20                   

In [None]:
print(Y_train.shape)

In [None]:
tensor1 = torch.randn(100, 5)  # 100 samples, 5 features
tensor2 = torch.randn(100, 3)  # 100 samples, 3 features
tensor3 = torch.randn(100, 2)  # 100 samples, 2 features

# Horizontal stacking (along feature dimension)
stacked_tensor = torch.cat([tensor1, tensor2, tensor3], dim=1)
print(stacked_tensor.shape)

In [None]:
stacked_model.fit(X_train, Y_train)