In [None]:
# ==================== Relevant Imports ====================

# Torch and Torch-related modules for deep learning
import torch  # Core PyTorch functionalities
from torch import nn
torch.autograd.set_detect_anomaly(True)  # Enables anomaly detection for debugging
from transformers import ViTHybridModel, AutoModelForImageClassification
from torchvision import models  # Pretrained models

# General-purpose libraries
import os  # OS-level operations (file paths, directory handling)
import random  # Random number generation

# c libraries
from skimage import exposure  # Module for image intensity adjustment and histogram-based operations, such as contrast enhancement and histogram equalization.
import cv2  # OpenCV for image processing
from PIL import Image  # Image handling
import pydicom  # Handling DICOM medical imaging files
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical computations and array operations
import matplotlib.pyplot as plt  # Plotting and visualization

# Progress bar visualization
from tqdm import tqdm  # Displays progress bars for loops and tasks

# Machine learning utilities
from sklearn.model_selection import KFold  # Cross-validation splitting strategy

# Custom configurations and modules
from config import HyperP  # Hyperparameter configuration object

# ==========================================================


In [None]:
# Instantiate the HyperP class with model-specific configurations
# 'slope_train_vit_simple' indicates the model is for slope prediction
hyp = HyperP(model_type="slope_train_vit_simple")  

# Load a range of images from a CSV file into a Pandas DataFrame
# Assumes "images.csv" contains metadata or a list of images
images_range = pd.read_csv("images.csv")

# ==================== Set Random Seed for Reproducibility ====================

# Retrieve the seed value from the HyperP object
seed = hyp.seed  

# Set the seed for Python's built-in random module
random.seed(seed)  

# Set the PYTHONHASHSEED environment variable for hash-based operations 
# to ensure deterministic results
os.environ['PYTHONHASHSEED'] = str(seed)  

# Set the seed for NumPy's random number generator
np.random.seed(seed)  

# Set the seed for PyTorch to ensure reproducibility of results
torch.manual_seed(seed)  

# Configure PyTorch's CUDA backend for deterministic results
# `deterministic = True`: Ensures consistent results by using deterministic algorithms
torch.backends.cudnn.deterministic = True  

# Disable the use of non-deterministic algorithms to avoid performance trade-offs
# `benchmark = False`: Avoids auto-tuning for optimization, ensuring repeatability
torch.backends.cudnn.benchmark = False  

In [None]:
# Retrieve the root directory for data from the HyperP configuration object
root_path = hyp.data_folder  

# Load training metadata from a CSV file located in the data folder
# This CSV likely contains information about the training dataset
train = pd.read_csv(f'{root_path}/train.csv')  

# Filter out rows from the training data where the Patient ID matches specific values
# These Patient IDs are likely identified as problematic
train_without_badid = train[(train.Patient != 'ID00011637202177653955184') & 
                             (train.Patient != 'ID00052637202186188008618')]  


In [None]:
# Function to generate tabular features from a DataFrame for a single patient
def get_tab(df):
    # Initialize a feature vector with the normalized age of the patient
    # Normalization is done using the mean and standard deviation of the training data's Age
    vector = [(df.Age.values[0] - train.Age.values.mean()) / train.Age.values.std()]
    
    # Add a binary encoding for the patient's gender
    # 0 for 'Male', 1 for 'Female'
    if df.Sex.values[0] == 'Male':
        vector.append(0)
    else:
        vector.append(1)
    
    # Add a one-hot encoded representation for the patient's smoking status
    # 'Never smoked' -> [0, 0]
    # 'Ex-smoker' -> [1, 1]
    # 'Currently smokes' -> [0, 1]
    # Any other status -> [1, 0]
    if df.SmokingStatus.values[0] == 'Never smoked':
        vector.extend([0, 0])
    elif df.SmokingStatus.values[0] == 'Ex-smoker':
        vector.extend([1, 1])
    elif df.SmokingStatus.values[0] == 'Currently smokes':
        vector.extend([0, 1])
    else:
        vector.extend([1, 0])
    
    # Convert the feature vector to a NumPy array and return it
    return np.array(vector)


In [None]:
# Function to load and preprocess a medical image from a DICOM file
def get_img(path):  
    # Read the DICOM file using pydicom
    d = pydicom.dcmread(path)
    
    # Extract the pixel array from the DICOM and enhance contrast using histogram equalization
    # Resize the processed image to a fixed resolution of 384x384 using OpenCV
    output = cv2.resize(exposure.equalize_hist(d.pixel_array), (384, 384))
    
    # Return the preprocessed image
    return output

# Function to load and preprocess a mask image
def get_mask(path):
    # Open the mask image using PIL (assumes it is a standard image format like PNG or JPEG)
    mask = Image.open(path)
    
    # Convert the mask to a NumPy array and resize it to 384x384
    mask = cv2.resize(np.array(mask), (384, 384))
    
    # Reshape the mask to ensure it has dimensions 384x384 (optional step for consistency)
    return mask.reshape(384, 384)


In [None]:
# Initialize dictionaries and list to store results
A = {}  # Dictionary to store the slopes (linear regression results) for each patient
TAB = {}  # Dictionary to store tabular features for each patient
P = []  # List to store patient IDs

# Iterate through each unique patient in the training dataset
for i, p in enumerate(tqdm(train.Patient.unique())):
    # Filter data for the current patient
    sub = train.loc[train.Patient == p, :] 
    
    # Extract Forced Vital Capacity (FVC) and Weeks (time points) for the patient
    fvc = sub.FVC.values
    weeks = sub.Weeks.values
    
    # For each patient, the code calculates the slope of FVC over time (Weeks) using linear regression, generates tabular features, and stores the results (slope, features, and patient ID) in dictionaries and a list.
    c = np.vstack([weeks, np.ones(len(weeks))]).T  # Add a column of ones for the intercept term

    a, _ = np.linalg.lstsq(c, fvc, rcond=None)[0]  # Returns the slope 'a' of the best fit line
    # ref: https://numpy.org/doc/stable/reference/generated/numpy.linalg.lstsq.html

    A[p] = a
    TAB[p] = get_tab(sub)
    P.append(p)


100%|██████████| 176/176 [00:00<00:00, 1318.74it/s]


In [None]:
# Define a custom dataset class for training in PyTorch
class OSICData_train(torch.utils.data.Dataset):
    """
        Custom dataset class for loading and processing training data in the OSIC dataset.
        
        This dataset class filters out bad patient IDs, loads image data, corresponding masks, slopes, and tabular features, 
        and provides a random sampling of training data.

        Attributes:
        -----------
        BAD_ID : list
            A list of patient IDs to exclude from the dataset.
        keys : list
            A list of patient IDs to include in the dataset after filtering out bad IDs.
        a : dict
            A dictionary containing slopes for each patient.
        tab : dict
            A dictionary containing tabular features for each patient.
        all_data : list
            A list of image file paths for all selected patients.
        train_data : dict
            A dictionary where the key is a patient ID, and the value is a list of corresponding image file paths.
        mask_data : dict
            A dictionary where the key is a patient ID, and the value is a list of corresponding mask file paths.
        ten_percent : list
            A random sample (50%) of the data, used for training.

        Methods:
        --------
        __len__():
            Returns the number of samples (50% of available data) in the dataset.
        
        __getitem__(idx):
            Retrieves a specific sample (image, mask, tabular features, slope, and patient ID) at the given index `idx`.
        """
    BAD_ID = ['ID00011637202177653955184', 'ID00052637202186188008618']
    def __init__(self, keys, a, tab):
            
        """
        Initializes the OSICData_train dataset.

        Parameters:
        -----------
        keys : list
            A list of patient IDs to include in the dataset.
        a : dict
            A dictionary containing slopes for each patient.
        tab : dict
            A dictionary containing tabular features for each patient.
        """
        self.keys = [k for k in keys if k not in self.BAD_ID]
        
        # Store slopes (a) and tabular features (tab)
        self.a = a
        self.tab = tab
        
        # Initialize empty lists and dictionaries for storing data
        self.all_data = []
        self.train_data = {}
        self.mask_data = {}
        
        # Loop through each patient ID in the filtered list of keys
        for p in self.keys:  
            # Get the image range (min and max slices) for the current patient from the images_range DataFrame
            properties = images_range[images_range['ID'] == p]
            min_slice = properties['min']
            max_slice = properties['max'] 

            # List the image files and mask files for the patient, excluding the first and last 15% slices
            self.train_data[p] = os.listdir(f'{root_path}/train/{p}/')[int(min_slice.iloc[0]):int(max_slice.iloc[0])] 
            self.mask_data[p] = os.listdir(f'{root_path}/mask_clear/{p}/')[int(min_slice.iloc[0]):int(max_slice.iloc[0])] 
            
            # Add each image file path to the all_data list
            for m in self.train_data[p]:
                self.all_data.append(p + '/' + m)
            
        # Sample 50% of the data randomly for training
        length = int(0.5 * len(self.all_data))
        self.ten_percent = random.sample(self.all_data, length)

    # Define the length of the dataset (number of samples)
    def __len__(self):
        """
        Returns the number of samples in the dataset.

        Returns:
        --------
        int : The number of samples in the dataset (50% of the total available data).
        """
        return len(self.ten_percent)
    
    # Method to retrieve a specific sample from the dataset
    def __getitem__(self, idx):
        """
        Returns the number of samples in the dataset.

        Returns:
        --------
        int : The number of samples in the dataset (50% of the total available data).
        """
        # Initialize empty lists to hold masks, images, slopes, and tabular features
        masks = []
        x = []
        a, tab = [], [] 
        
        # Get the patient ID and image file path for the current sample
        k = self.ten_percent[idx]
        i = k[:25]  # Extract patient ID from the image file path
       
        try:
            # Construct the corresponding mask file path by replacing the extension with .jpg
            j = k[:-4] + '.jpg'
            
            # Load the image and mask using the get_img and get_mask functions
            img = get_img(f'{root_path}/train/{k}')
            mask = get_mask(f'{root_path}/mask_clear/{j}')

            # Append the mask, image, slope, and tabular features to the respective lists
            masks.append(mask)
            x.append(img)
            a.append(self.a[i])
            tab.append(self.tab[i])
        except:
            # Print a message if there is an error loading the image or mask
            print(k, j)

        # Convert the masks, images, slopes, and tabular features into PyTorch tensors
        masks, x, a, tab = torch.tensor(np.asanyarray(masks), dtype=torch.float32), torch.tensor(np.asanyarray(x), dtype=torch.float32), torch.tensor(np.asanyarray(a), dtype=torch.float32), torch.tensor(np.asanyarray(tab), dtype=torch.float32)
        
        # Remove the extra dimension from the tabular feature tensor
        tab = torch.squeeze(tab, axis=0)

        # Return the mask, image, and tabular feature tensors, along with the slope and patient ID
        return [masks, x, tab], a, k 



In [None]:
class OSICData_test(torch.utils.data.Dataset):
    # List of patient IDs to exclude from the dataset
    BAD_ID = ['ID00011637202177653955184', 'ID00052637202186188008618']
    
    def __init__(self, keys, a, tab):
        # Filter out bad patient IDs from the dataset
        self.keys = [k for k in keys if k not in self.BAD_ID]
        self.a = a  # Additional metadata for patients
        self.tab = tab  # Tabular data associated with patients

        self.train_data = {}  # Dictionary to store training image file names per patient
        self.mask_data = {}   # Dictionary to store corresponding mask file names per patient
        
        for p in self.keys:  # Iterate over patient IDs
            # Retrieve slice range information for the current patient
            properties = images_range[images_range['ID'] == p]
            min_slice = properties['min']
            max_slice = properties['max'] 

            # Get the total number of slices available for the patient
            p_n = len(os.listdir(f'{root_path}/train/{p}/'))

            # Select a subset of slices by removing a percentage from the beginning and end
            self.train_data[p] = os.listdir(f'{root_path}/train/{p}/')[int(hyp.strip_ct * p_n):-int(hyp.strip_ct * p_n)]
            self.mask_data[p] = os.listdir(f'{root_path}/mask_clear/{p}/')[int(hyp.strip_ct * p_n):-int(hyp.strip_ct * p_n)]

    def __len__(self):
        # Return the total number of patients in the dataset
        return len(self.keys)
    
    def __getitem__(self, idx):
        masks = []  # List to store mask images
        x = []  # List to store input images
        a, tab = [], []  # Lists to store corresponding metadata and tabular data
        
        k = self.keys[idx]  # Retrieve patient ID based on the provided index

        try:
            # Randomly select an image from the available slices for the patient
            i = np.random.choice(self.train_data[k], size=1)[0]
            j = i[:-4] + '.jpg'  # Convert filename to mask format (assuming a different extension)

            # Load the image and corresponding mask
            img = get_img(f'{root_path}/train/{k}/{i}')
            mask = get_mask(f'{root_path}/mask_clear/{k}/{j}')

            # Append the data to respective lists
            masks.append(mask)
            x.append(img)
            a.append(self.a[k])
            tab.append(self.tab[k])
        except:
            print(k, i)  # Print the patient ID and image filename in case of an error

        # Convert lists to PyTorch tensors
        masks = torch.tensor(np.asanyarray(masks), dtype=torch.float32)
        x = torch.tensor(np.asanyarray(x), dtype=torch.float32)
        a = torch.tensor(np.asanyarray(a), dtype=torch.float32)
        tab = torch.tensor(np.asanyarray(tab), dtype=torch.float32)
        
        # Remove extra dimensions from tabular data
        tab = torch.squeeze(tab, axis=0)

        return [masks, x, tab], a, k  # Return the data along with the patient ID


In [11]:
# Identity class
class Identity(nn.Module):
    # credit: ptrblck
    def __init__(self):
        super(Identity, self).__init__()
        
    def forward(self, x):
        return x

In [12]:
class UnlearnableIdentityConvModel(nn.Module):
    def __init__(self):
        super(UnlearnableIdentityConvModel, self).__init__()
        self.conv = nn.Conv2d(
            in_channels=64,      # Number of input channels
            out_channels=64,     # Number of output channels
            kernel_size=1,       # 1x1 convolution
            stride=1,            # Stride of 1
            padding=0,           # No padding
            bias=False           # No bias term needed
        )
        
        # Initialize weights to be identity
        self.conv.weight.data = torch.eye(64).view(64, 64, 1, 1)
        
        # Freeze the weights
        self.conv.weight.requires_grad = False

    def forward(self, x):
        return self.conv(x)

In [14]:
# device
gpu = torch.device(f"cuda:{hyp.gpu_index}" if torch.cuda.is_available() else "cpu")
device = gpu

In [15]:
class Sparsemax(nn.Module):
    """
    This class is adapted from the TabNet implementation:
    https://github.com/dreamquark-ai/tabnet/blob/develop/pytorch_tabnet/tab_network.py

    Sparsemax activation function for neural networks.
    """
    def __init__(self, dim=None):
        super(Sparsemax, self).__init__()
        self.dim = -1 if dim is None else dim

    def forward(self, input):
        input = input.transpose(0, self.dim)
        original_size = input.size()
        input = input.reshape(input.size(0), -1)
        input = input.transpose(0, 1)
        dim = 1

        number_of_logits = input.size(dim)
        
        input = input - torch.max(input, dim=dim, keepdim=True)[0].expand_as(input)
        zs = torch.sort(input=input, dim=dim, descending=True)[0]
        range = torch.arange(start=1, end=number_of_logits + 1, device=device,step=1, dtype=input.dtype).view(1, -1)
        range = range.expand_as(zs)

        bound = 1 + range * zs
        cumulative_sum_zs = torch.cumsum(zs, dim)
        is_gt = torch.gt(bound, cumulative_sum_zs).type(input.type())
        k = torch.max(is_gt * range, dim, keepdim=True)[0]
        zs_sparse = is_gt * zs
        taus = (torch.sum(zs_sparse, dim, keepdim=True) - 1) / k
        taus = taus.expand_as(input)
        self.output = torch.max(torch.zeros_like(input), input - taus)
        output = self.output
        output = output.transpose(0, 1)
        output = output.reshape(original_size)
        output = output.transpose(0, self.dim)
        return output
    def backward(self, grad_output):
        dim = 1
        nonzeros = torch.ne(self.output, 0)
        sum = torch.sum(grad_output * nonzeros, dim=dim) / torch.sum(nonzeros, dim=dim)
        self.grad_input = nonzeros * (grad_output - sum.expand_as(grad_output))
        return self.grad_input

In [16]:
def initialize_non_glu(module,inp_dim,out_dim):
     """
    Initialize the weights of a given module using Xavier normal initialization.

    Reference:
    Adapted from TabNet implementation:
    https://github.com/dreamquark-ai/tabnet/blob/develop/pytorch_tabnet/tab_network.py

    Args:
        module (torch.nn.Module): The module whose weights need initialization.
        inp_dim (int): Input dimension of the layer.
        out_dim (int): Output dimension of the layer.

    Returns:
        None
    """
     gain = np.sqrt((inp_dim+out_dim)/np.sqrt(4*inp_dim))
     torch.nn.init.xavier_normal_(module.weight, gain=gain)

In [17]:
class GBN(nn.Module):
    """
    Ghost Batch Normalization (GBN).
    Adapted from TabNet implementation:
    https://github.com/dreamquark-ai/tabnet/blob/develop/pytorch_tabnet/tab_network.py
    """
    def __init__(self, inp, vbs=128, momentum=0.01):
        super().__init__()
        self.bn = nn.BatchNorm1d(inp, momentum=momentum)
        self.vbs = vbs
    
    def forward(self, x):
        chunk = torch.chunk(x, max(1, x.size(0) // self.vbs), 0)
        res = [self.bn(y) for y in chunk]
        return torch.cat(res, 0)

class GLU(nn.Module):
    """ 
    Gated Linear Unit (GLU) activation 
    Adapted from TabNet implementation:
    https://github.com/dreamquark-ai/tabnet/blob/develop/pytorch_tabnet/tab_network.py
    """
    def __init__(self, inp_dim, out_dim, fc=None, vbs=128):
        super().__init__()
        self.fc = fc if fc else nn.Linear(inp_dim, out_dim * 2)
        self.bn = GBN(out_dim * 2, vbs=vbs)
        self.od = out_dim
    
    def forward(self, x):
        x = self.bn(self.fc(x))
        return x[:, :self.od] * torch.sigmoid(x[:, self.od:])

class FeatureTransformer(nn.Module):
    """ 
    Feature transformation block using GLU layers 
    Adapted from TabNet implementation:
    https://github.com/dreamquark-ai/tabnet/blob/develop/pytorch_tabnet/tab_network.py
    """
    def __init__(self, inp_dim, out_dim, shared, n_ind, vbs=128):
        super().__init__()
        self.shared = nn.ModuleList([GLU(inp_dim, out_dim, shared[0], vbs=vbs)]) if shared else None
        self.independ = nn.ModuleList([GLU(out_dim, out_dim, vbs=vbs) for _ in range(n_ind)])
        self.scale = torch.sqrt(torch.tensor([.5], device=device))
    
    def forward(self, x):
        if self.shared:
            x = sum(glu(x) for glu in self.shared) * self.scale
        for glu in self.independ:
            x = (x + glu(x)) * self.scale
        return x

class AttentionTransformer(nn.Module):
    """ 
    Attention-based feature selection 
    Adapted from TabNet implementation:
    https://github.com/dreamquark-ai/tabnet/blob/develop/pytorch_tabnet/tab_network.py
    """
    def __init__(self, inp_dim, out_dim, relax, vbs=128):
        super().__init__()
        self.fc = nn.Linear(inp_dim, out_dim)
        self.bn = GBN(out_dim, vbs=vbs)
        self.r = torch.tensor([relax], device=device)
    
    def forward(self, a, priors):
        a = self.bn(self.fc(a))
        mask = torch.sigmoid(a * priors)
        priors = priors * (self.r - mask)
        return mask

class DecisionStep(nn.Module):
    """ 
    Decision step combining feature transformation and attention 
    Adapted from TabNet implementation:
    https://github.com/dreamquark-ai/tabnet/blob/develop/pytorch_tabnet/tab_network.py
    """
    def __init__(self, inp_dim, n_d, n_a, shared, n_ind, relax, vbs=128):
        super().__init__()
        self.fea_tran = FeatureTransformer(inp_dim, n_d + n_a, shared, n_ind, vbs)
        self.atten_tran = AttentionTransformer(n_a, inp_dim, relax, vbs)
    
    def forward(self, x, a, priors):
        mask = self.atten_tran(a, priors)
        loss = (-mask * torch.log(mask + 1e-10)).mean()
        x = self.fea_tran(x * mask)
        return x, loss

In [None]:
class TabCT(nn.Module):
    def __init__(self, cnn,num_features=4, feature_dim=40, output_dim=20, num_decision_steps=3,  # 2 boud
                    relaxation_factor=1, batch_momentum=0.1, epsilon=0.00001, vgg_npy_path = None
                    , n_shared=2, n_d=64, n_a=64, n_ind=2, n_steps=4,relax=1.2,vbs=128):
        super(TabCT, self).__init__()
        
        if n_shared>0:
            self.shared = nn.ModuleList()
            self.shared.append(nn.Linear(num_features,2*(n_d+n_a)))
            for x in range(n_shared-1):
                self.shared.append(nn.Linear(n_d+n_a,2*(n_d+n_a)))
        else:
            self.shared=None
        self.first_step = FeatureTransformer(num_features,n_d+n_a,self.shared,n_ind) 
        self.steps = nn.ModuleList()
        for x in range(n_steps-1):
            self.steps.append(DecisionStep(num_features,n_d,n_a,self.shared,n_ind,relax,vbs))
        self.fc_tab = nn.Linear(n_d,output_dim)
        self.bn = nn.BatchNorm1d(num_features)
        self.n_d = n_d

        # CT features
        cnn_dict = {'vit_base_patch16lung0' :None, 'vit_b_16':None, 'vgg16':models.vgg16, 'resnet18': models.resnet18, 'resnet34': models.resnet34, 'resnet50': models.resnet50,
                   'resnet101': models.resnet101, 'resnet152': models.resnet152, 'resnext50': models.resnext50_32x4d,
                   'resnext101': models.resnext101_32x8d}
        '''
        fully conected 
        self.fullyconected = nn.Linear(5, 25)
        '''



      


        self.num_features = num_features
        self.feature_dim = feature_dim
        self.output_dim = output_dim
        self.num_decision_steps = num_decision_steps
        self.relaxation_factor = relaxation_factor
        self.batch_momentum = batch_momentum
        # self.virtual_batch_size = virtual_batch_size
        # self.num_classes = num_classes
        self.epsilon = epsilon
        # feature dim
        self.out_dict = {'vit_base_patch16lung0':768, 'vit_b_16': 768, 'resnet18': 512, 'resnet34': 512, 'resnet50': 2048, 'resnet101': 2048, 'resnet152': 2048,
                         'resnext50': 2048, 'resnext101': 2048, "efnb0": 1280, "efnb1": 1280, "efnb2": 1408, 
                          "efnb3": 1536, "efnb4": 1792, "efnb5": 2048, "efnb6": 2304, "efnb7": 2560, "vgg16": 512}
        
        self.n_tab = hyp.n_tab # n tabular features
        
        # efficient net b0 to b7
        
        if cnn in cnn_dict.keys(): # resnet or resnext or vgg16
            if cnn == 'vgg16':
                # load vgg16 model badan inja be khatere none noudan ye if bezar

                self.ct_cnn = cnn_dict[cnn](pretrained = True).features[2:]
                self.conv = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                self.W = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty((64, 3, 3, 3)), mean=0, std=0.01))
                self.B = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty(64), mean=0, std=0.01))

            elif cnn == 'vit_b_16':


                # model
                self.con1 = ViTHybridModel.from_pretrained("google/vit-hybrid-base-bit-384")
                # print("model")          


                # change configuration 
                self.input_channel_after_mul = 64
                self.input_size_after_mul = 192

                self.con1.config.num_channels = self.input_channel_after_mul
                self.con1.config.image_size = self.input_size_after_mul 

                self.con1.embeddings.patch_embeddings.num_channels = self.input_channel_after_mul
                self.con1.embeddings.patch_embeddings.image_size = (self.input_size_after_mul , self.input_size_after_mul)

                self.con1.embeddings.patch_embeddings.backbone.bit.config.num_channels = self.input_channel_after_mul

                self.con1.embeddings.patch_embeddings.backbone.bit.embedder.num_channels = self.input_channel_after_mul

                # First CNN Layer
                self.conv = self.con1.embeddings.patch_embeddings.backbone.bit.embedder.convolution
                # print("First CNN Layer")

                # First Mask Layer
                self.W = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty((64, 3, 7, 7)), mean=0, std=0.01))
                self.B = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty(64), mean=0, std=0.01))
                self.mask = self.con1.embeddings.patch_embeddings.backbone.bit.embedder.convolution
                self.mask.weight = self.W
                self.mask.bias = self.B
                # print("First Mask Layer")

                # Rest of Embeddings
                self.embeddings = self.con1.embeddings
                self.embeddings.patch_embeddings.backbone.bit.embedder.convolution = UnlearnableIdentityConvModel()
                # print(self.embeddings)
                # print("Rest of Embeddings")

                # Encoder(ViT), layernorm, pooler layer
                self.ct_cnn = self.con1.encoder
                # print(self.ct_cnn)
                self.norm = self.con1.layernorm
                self.pooler = self.con1.pooler
                # print("Encoder(ViT), layernorm, pooler layer")



            elif cnn == 'vit_base_patch16lung0':
                self.con1 = AutoModelForImageClassification.from_pretrained("DunnBC22/vit-base-patch16-224-in21k_lung_and_colon_cancer").vit
                self.conv_layer = nn.Conv2d(in_channels=1, out_channels=3, kernel_size=3, stride=3, padding=80)
                self.conv = self.con1.embeddings.patch_embeddings.backbone.embedder# nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))# self.ct_cnn.conv_proj 
                self.ct_cnn = self.con1.encoder

                self.W = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty((768, 3, 3, 3)), mean=0, std=0.01))
                self.B = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty(768), mean=0, std=0.01))
                self.mask = self.con1.embeddings
                self.mask.patch_embeddings.projection.weight = self.W
                self.mask.patch_embeddings.projection.bias = self.B

                self.norm = self.con1.layernorm
                self.pooler = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k").pooler
            else:   
                self.ct_cnn = cnn_dict[cnn](pretrained = True)
                
                # make single channel
                self.conv = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
                self.ct_cnn.conv1 = nn.Identity()
                self.W = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty((64, 3, 3, 3)), mean=0, std=0.01))
                self.B = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty(64), mean=0, std=0.01))
                # self.ct_cnn.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
                
                # remove the fc layer/ add a simple linear layer
                self.ct_cnn.fc = nn.Linear(self.out_dict[cnn], hyp.cnn_dim)   # mapped to 64 dimensions, Identity()
            
        else:
            raise ValueError("cnn not recognized")
        
        # second feature extractor
        self.ct_cnn_s = models.resnet18(pretrained = True)
        self.conv_s = nn.Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        self.ct_cnn_s.conv1 = nn.Identity()
        self.W_s = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty((64, 3, 7, 7)), mean=0, std=0.01))
        self.B_s = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty(64), mean=0, std=0.01))
        self.mask_s = nn.Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        self.mask_s.weight = self.W_s
        self.mask_s.bias = self.B_s
        self.dropout = nn.Dropout(p=0.3)
        print("second feature extractor")
        
        self.fc_inter = nn.Linear(hyp.cnn_dim_s + self.n_tab + hyp.cnn_dim, hyp.fc_dim) 

        self.BN_fc_inter = nn.BatchNorm1d(hyp.fc_dim,
                                        momentum=self.batch_momentum)

        self.fc = nn.Linear(hyp.fc_dim, 1)


    def forward(self, x_ct, x_tab, masks):

        x_temp = self.bn(x_tab)
        x_a = self.first_step(x_temp)[:,self.n_d:]
        loss = torch.zeros(1).to(x_temp.device)
        out = torch.zeros(x_temp.size(0),self.n_d).to(x_temp.device)
        priors = torch.ones(x_temp.shape).to(x_temp.device)
        for step in self.steps:
            x_te,l = step(x_temp,x_a,priors)
            out += nn.functional.relu(x_te[:,:self.n_d])
            x_a = x_te[:,self.n_d:]
            loss += l
        # all_loss = []
        # self.all_loss.append(loss)
        # print("tabular finished")
        # 1 + 1 + 1
        x_ct = torch.cat((x_ct, torch.cat((x_ct, x_ct), 1)), 1)
        masks = torch.cat((masks, torch.cat((masks, masks), 1)), 1)
        # print("input concatenate")


        feature_map = self.conv(x_ct) # ViT
        feature_map_s = self.conv_s(x_ct) # CNN
        # print("first layer image")

        
        relevance_map_s = self.mask_s(masks) # CNN
        relevance_map = self.mask(masks)  #self.B # ViT
        # print("first layer mask")

        # multiple element-wise
        ct_att = torch.mul(feature_map, relevance_map) # ViT
        ct_att_s = torch.mul(feature_map_s, relevance_map_s) # CNN
        # print("multiply both")

        ct_f_s = self.ct_cnn_s(ct_att_s) # CNN 
        # print("rest of CNN")

        # ViT
        # print(ct_att.size())
        # print(ct_att.shape)
        ct_f = self.embeddings(ct_att)
        # print("embeddings")
        ct_f = self.ct_cnn(ct_f)
        # print("ct_cnn")
        # print(ct_f)
        # print(ct_f['last_hidden_state'].size())
        # print(type(ct_f))
        ct_f = self.norm(ct_f['last_hidden_state']) # ct features
        # print("norm")
        ct_f = self.pooler(ct_f)
        # print("pooler")
        # print("rest of ViT")

        # concatenate
        x = torch.cat((ct_f_s, self.fc_tab(out)), -1) # concat on last axis output_aggregated  # changed
        x = torch.cat((ct_f, x), -1) # concat on last axis #changed
        # print("concatenate outputs")

        # dropout
        x = self.dropout(x)

        x = self.fc_inter(x)

        x = self.fc(x)
        
        return x, loss

In [None]:
class TabCT(nn.Module):
    def __init__(self, cnn, num_features=4, output_dim=20, 
                 n_shared=2, n_d=64, n_a=64, n_ind=2, n_steps=4, relax=1.2, vbs=128):
        super(TabCT, self).__init__()

        # If shared layers are defined, create them
        if n_shared > 0:
            self.shared = nn.ModuleList()
            # First shared layer from num_features to 2*(n_d+n_a)
            self.shared.append(nn.Linear(num_features, 2*(n_d + n_a)))
            for _ in range(n_shared - 1):
                # Additional shared layers with dimensions (n_d+n_a)
                self.shared.append(nn.Linear(n_d + n_a, 2*(n_d + n_a)))
        else:
            self.shared = None
        
        # First feature transformer step
        self.first_step = FeatureTransformer(num_features, n_d + n_a, self.shared, n_ind)
        
        # Additional decision steps
        self.steps = nn.ModuleList()
        for _ in range(n_steps - 1):
            # Decision steps apply transformation to feature representation
            self.steps.append(DecisionStep(num_features, n_d, n_a, self.shared, n_ind, relax, vbs))
        
        # Fully connected layer for tabular features to output_dim
        self.fc_tab = nn.Linear(n_d, output_dim)
        
        # Batch normalization for tabular features
        self.bn = nn.BatchNorm1d(num_features)
        
        # Save the dimensions of features (n_d)
        self.n_d = n_d

        # # CNN Model dictionary with different backbone models (e.g., resnet, vgg, etc.)
        # cnn_dict = {'vit_base_patch16lung0': None, 'vit_b_16': None, 'vgg16': models.vgg16, 
        #             'resnet18': models.resnet18, 'resnet34': models.resnet34, 
        #             'resnet50': models.resnet50, 'resnet101': models.resnet101, 
        #             'resnet152': models.resnet152, 'resnext50': models.resnext50_32x4d, 
        #             'resnext101': models.resnext101_32x8d}
        
        # Dictionary for output dimensions of different models
        self.out_dict = {'vit_base_patch16lung0': 768, 'vit_b_16': 768, 'resnet18': 512, 'resnet34': 512, 
                         'resnet50': 2048, 'resnet101': 2048, 'resnet152': 2048, 'resnext50': 2048, 
                         'resnext101': 2048, "efnb0": 1280, "efnb1": 1280, "efnb2": 1408, "efnb3": 1536, 
                         "efnb4": 1792, "efnb5": 2048, "efnb6": 2304, "efnb7": 2560, "vgg16": 512}

        # Number of tabular features
        self.n_tab = hyp.n_tab  # n tabular features

        # Initialize the appropriate CNN model based on input 'cnn'
        # if cnn in cnn_dict.keys():
            # if cnn == 'vgg16':
            #     # For VGG16, customize CNN architecture by extracting layers
            #     self.ct_cnn = cnn_dict[cnn](pretrained=True).features[2:]
            #     self.conv = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            #     self.W = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty((64, 3, 3, 3)), mean=0, std=0.01))
            #     self.B = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty(64), mean=0, std=0.01))
            # elif cnn == 'vit_b_16':
                # For Vision Transformer (ViT), use pre-trained model and modify configuration
        self.con1 = ViTHybridModel.from_pretrained("google/vit-hybrid-base-bit-384")
        self.input_channel_after_mul = 64
        self.input_size_after_mul = 192
        # Adjust ViT configuration to work with our input size and channels
        self.con1.config.num_channels = self.input_channel_after_mul
        self.con1.config.image_size = self.input_size_after_mul
        self.ct_cnn = self.con1.encoder
        self.norm = self.con1.layernorm
        self.pooler = self.con1.pooler
            # elif cnn == 'vit_base_patch16lung0':
            #     # Load a pre-trained ViT model for lung and colon cancer classification
            #     self.con1 = AutoModelForImageClassification.from_pretrained("DunnBC22/vit-base-patch16-224-in21k_lung_and_colon_cancer").vit
            #     self.ct_cnn = self.con1.encoder
            #     self.norm = self.con1.layernorm
            #     self.pooler = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k").pooler
            # else:
            #     # For other models, initialize with pre-trained weights
            #     self.ct_cnn = cnn_dict[cnn](pretrained=True)
            #     self.conv = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            #     self.ct_cnn.conv1 = nn.Identity()
            #     self.W = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty((64, 3, 3, 3)), mean=0, std=0.01))
            #     self.B = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty(64), mean=0, std=0.01))
            #     self.ct_cnn.fc = nn.Linear(self.out_dict[cnn], hyp.cnn_dim)

        # Secondary feature extractor (ResNet18)
        self.ct_cnn_s = models.resnet18(pretrained=True)
        self.conv_s = nn.Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        self.ct_cnn_s.conv1 = nn.Identity()
        self.W_s = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty((64, 3, 7, 7)), mean=0, std=0.01))
        self.B_s = nn.Parameter(torch.nn.init.trunc_normal_(torch.empty(64), mean=0, std=0.01))
        self.mask_s = nn.Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        self.mask_s.weight = self.W_s
        self.mask_s.bias = self.B_s

        # Dropout layer for regularization
        self.dropout = nn.Dropout(p=0.3)

        # Intermediate fully connected layer
        self.fc_inter = nn.Linear(hyp.cnn_dim_s + self.n_tab + hyp.cnn_dim, hyp.fc_dim)
    

        # Final fully connected layer to predict output
        self.fc = nn.Linear(hyp.fc_dim, 1)

    def forward(self, x_ct, x_tab, masks):
        """
        Forward pass of the TabCT model.
        x_ct: Input CT scan images
        x_tab: Input tabular features (e.g., clinical data)
        masks: Masks for attention mechanisms in the model
        """
        # Batch normalization for tabular features
        x_temp = self.bn(x_tab)
        
        # First feature transformation step
        x_a = self.first_step(x_temp)[:, self.n_d:]
        
        # Initialize loss and output tensors
        loss = torch.zeros(1).to(x_temp.device)
        out = torch.zeros(x_temp.size(0), self.n_d).to(x_temp.device)
        priors = torch.ones(x_temp.shape).to(x_temp.device)
        
        # Process through additional decision steps
        for step in self.steps:
            x_te, l = step(x_temp, x_a, priors)
            out += nn.functional.relu(x_te[:, :self.n_d])
            x_a = x_te[:, self.n_d:]
            loss += l
        
        # Concatenate CT scan features and duplicate masks for processing
        x_ct = torch.cat((x_ct, torch.cat((x_ct, x_ct), 1)), 1)
        masks = torch.cat((masks, torch.cat((masks, masks), 1)), 1)
        
        # Pass the concatenated input through the first CNN layer
        feature_map = self.conv(x_ct)  # ViT model
        feature_map_s = self.conv_s(x_ct)  # CNN model

        # Apply attention mechanism using masks
        relevance_map_s = self.mask_s(masks)  # CNN mask
        relevance_map = self.mask(masks)  # ViT mask

        # Apply element-wise multiplication (attention) to the feature maps
        ct_att = torch.mul(feature_map, relevance_map)  # ViT
        ct_att_s = torch.mul(feature_map_s, relevance_map_s)  # CNN
        
        # Process CNN feature map
        ct_f_s = self.ct_cnn_s(ct_att_s)

        # Process ViT feature map through embeddings, encoder, normalization, and pooler
        ct_f = self.embeddings(ct_att)
        ct_f = self.ct_cnn(ct_f)
        ct_f = self.norm(ct_f['last_hidden_state'])  # ViT features
        ct_f = self.pooler(ct_f)

        # Concatenate both CNN and ViT outputs
        x = torch.cat((ct_f_s, self.fc_tab(out)), -1)  # Concatenated feature map
        x = torch.cat((ct_f, x), -1)  # Concatenate final feature map

        # Apply dropout for regularization
        x = self.dropout(x)

        # Final fully connected layers
        x = self.fc_inter(x)
        x = self.fc(x)
        
        return x, loss


In [19]:
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
def score(fvc_true, fvc_pred, sigma):
    sigma_clip = np.maximum(sigma, 70)
    delta = np.abs(fvc_true - fvc_pred)
    delta = np.minimum(delta, 1000)
    sq2 = np.sqrt(2)
    metric = (delta / sigma_clip)*sq2 + np.log(sigma_clip* sq2)
    return np.mean(metric)


def score_avg(p, a): # patient id, predicted a
    percent_true = train.Percent.values[train.Patient == p]
    fvc_true = train.FVC.values[train.Patient == p]
    weeks_true = train.Weeks.values[train.Patient == p]

    fvc = a * (weeks_true - weeks_true[0]) + fvc_true[0]
    percent = percent_true[0] - a * abs(weeks_true - weeks_true[0])
    return score(fvc_true, fvc, percent)

def rmse_avg(p, a): # patient id, predicted a
    percent_true = train.Percent.values[train.Patient == p]
    fvc_true = train.FVC.values[train.Patient == p]
    weeks_true = train.Weeks.values[train.Patient == p]

    fvc = a * (weeks_true - weeks_true[0]) + fvc_true[0]
    return root_mean_squared_error(fvc_true, fvc)



def smape(targets, outs):
    denominator = (np.abs(targets) + np.abs(outs)) / 2
    ape = np.abs(targets - outs) / denominator
    return np.mean(ape) * 100

# def test_r2_score(p, a):
#     fvc_true = train.FVC.values[train.Patient == p]
# test_r2_score = r2_score(p.numpy(), pred_a[p].numpy())
# test_mape_score = torch.mean(torch.abs((p - pred_a[p]) / p)) * 100
# test_mae_score = torch.mean(torch.abs(pred_a[p] - p))
# test_smape_score = smape(p, pred_a[p])

def plot_figures(figures, nrows = 1, ncols=1):
    """Plot a dictionary of figures.

    Parameters
    ----------
    figures : <title, figure> dictionary
    ncols : number of columns of subplots wanted in the display
    nrows : number of rows of subplots wanted in the figure
    """

    fig, axeslist = plt.subplots(ncols=ncols, nrows=nrows)
    for ind,title in enumerate(figures):
        axeslist.ravel()[ind].imshow(figures[title], cmap=plt.gray())
        axeslist.ravel()[ind].set_title(title)
        axeslist.ravel()[ind].set_axis_off()
    plt.tight_layout() # optional

In [20]:
# hyperparams
result_dir = hyp.results_dir

# training only resnet models on gpu 0
train_models = hyp.train_models 

In [21]:
from sklearn.model_selection import train_test_split
train_s, test = train_test_split(P, test_size=0.2, random_state=57)
len(train_s)

140

In [None]:
nfold = hyp.nfold # hyper

# removing noisy data
P = [p for p in P if p not in ['ID00011637202177653955184', 'ID00052637202186188008618']] # in bayad bere bala

for model in train_models:
    log = open(f"{result_dir}/september8th/{model}simplestep2hybrid.txt", "a+")
    kfold =KFold(n_splits=nfold)
    
    ifold = 0
    min_sq = {}

    for train_index, valid_index in kfold.split(train_s):  
        
        p_train = np.array(P)[train_index] 
        p_valid = np.array(P)[valid_index] 
        print(len(p_train))
        osic_train = OSICData_train(p_train, A, TAB)
        train_loader = torch.utils.data.DataLoader(osic_train, batch_size=hyp.batch_size, shuffle=True, num_workers=hyp.num_workers, drop_last=True)

        osic_val = OSICData_test(p_valid, A, TAB)
        val_loader = torch.utils.data.DataLoader(osic_val, batch_size=hyp.batch_size, shuffle=True, num_workers=hyp.num_workers)
        print(len(osic_train))
        print(len(train_loader))
        print(len(val_loader))

        tabct = TabCT(cnn = model).to(gpu)
        print(f"creating {model}")
        print(f"fold: {ifold}")
        log.write(f"fold: {ifold}\n")

        

        n_epochs = hyp.n_epochs # max 30 epochs, patience 5, find the suitable epoch number for later final training

        best_epoch = n_epochs # 30


        optimizer = torch.optim.AdamW(tabct.parameters())
        criterion = torch.nn.L1Loss()

        max_score = 99999999.0000 # here, max score ]= minimum score
        tot_rmse = []
        for epoch in range(n_epochs):  # loop over the dataset multiple times
            running_loss = 0.0
            tabct.train()

            tabular_loss = []
            for i, data in enumerate(tqdm(train_loader, 0)):

                [mask, x, t], a, _ = data

                x = x.to(gpu)
                mask = mask.to(gpu)
                t = t.to(gpu)
                a = a.to(gpu)
                # print(x)
                # print(t)
                # print(mask)
                # t hanoun tabe
                # ultimate = ultimate.to(gpu)
                # zero the parameter gradients
                optimizer.zero_grad()
                
                # forward + backward + optimize
                outputs, tab_loss = tabct(x, t, mask) # here
                # print(outputs.size())
                tabular_loss.append(tab_loss)
                loss = criterion(outputs, a)
                loss.backward()
                optimizer.step()

                # print statistics
                running_loss += loss.item()
            print(f"tabular loss: {tabular_loss}")
            print(f"epoch {epoch+1} train: {running_loss}")
            log.write(f"epoch {epoch+1} train: {running_loss}\n")


            running_loss = 0.0
            pred_a = {}
            tabct.eval()
            tabular_loss = []
            for i, data in enumerate(tqdm(val_loader, 0)):

                [mask, x, t], a, pid = data

                x = x.to(gpu)
                mask = mask.to(gpu)
                t = t.to(gpu)
                a = a.to(gpu)

                # forward
                outputs, tab_loss = tabct(x, t, mask)
                loss = criterion(outputs, a)
                tabular_loss.append(tab_loss)
                pids = pid
                preds_a = outputs.detach().cpu().numpy().flatten()

                for j, p_d in enumerate(pids):
                    pred_a[p_d] = preds_a[j]

               


                # print statistics
                running_loss += loss.item()
            print(tabular_loss)
            print(f"epoch {epoch+1} val: {running_loss}")
            log.write(f"epoch {epoch+1} val: {running_loss}\n")
            # score calculation
            # print(pred_a)
            # print(len(pred_a))
            # print(p_test)
            # print(len(p_test))

            # totals
            tot_r2_score = []
            tot_mape_score = []
            tot_mae_score = []
            tot_smape_score = []

            # everyone
            score_v = 0.
            rmse = 0.
            test_r2_score = []
            test_mape_score = []
            test_mae_score = []
            test_smape_score = []
            print(len(p_valid))
            # fvcs_true = []
            # fvcs_pred = []
            for p in p_valid:
                score_v += (score_avg(p, pred_a[p]))/len(p_valid)
                rmse += (rmse_avg(p, pred_a[p]))/len(p_valid)
                fvc_true = train.FVC.values[train.Patient == p]
                weeks_true = train.Weeks.values[train.Patient == p]
                fvc_pred = pred_a[p] * (weeks_true - weeks_true[0]) + fvc_true[0]

                test_r2_score.append(r2_score(fvc_true, fvc_pred))
                test_mape_score.append(np.mean(np.abs((fvc_true - fvc_pred) / fvc_true)) * 100)
                test_mae_score.append(np.mean(np.abs(fvc_pred - fvc_true)))
                test_smape_score.append(smape(fvc_true, fvc_pred))
            #------------------------
            tot_rmse.append(rmse)
            tot_r2_score.append(np.asanyarray(test_r2_score))
            tot_mape_score.append(np.asanyarray(test_mape_score))
            tot_mae_score.append(np.asanyarray(test_mae_score))
            tot_smape_score.append(np.asanyarray(test_smape_score))
            #------------------------

            print("this is rmse")
            print(tot_rmse)
            print("this is r2")
            print(tot_r2_score)
            print("this is mape")
            print(tot_mape_score)
            print("this is mae")
            print(tot_mae_score)
            print("this is smape")
            print(tot_smape_score)
            print(f"val score: {score_v}")
            log.write(f"val score: {score_v}\n")
            log.write(f"val rmse: {rmse}\n")

            if score_v <= max_score:
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': tabct.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'score': score_v
                    }, f"{result_dir}/september8th/{model}simplestep2hybrid.tar")
                max_score = score_v
                best_epoch = epoch + 1
        min_sq[ifold] = np.array(tot_rmse)
        # print("all mean square error")
        # print(min_sq)
        ifold += 1
        # destroy model
        del tabct
        torch.cuda.empty_cache()


# # ref: https://www.kaggle.com/miklgr500/linear-decay-based-on-resnet-cnn
# # https://pytorch.org/docs/stable/index.html



112
6021
752
4


Some weights of ViTHybridModel were not initialized from the model checkpoint at google/vit-hybrid-base-bit-384 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


second feature extractor
creating vit_b_16
fold: 0


100%|██████████| 752/752 [17:11<00:00,  1.37s/it]


tabular loss: [tensor([0.9053], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9045], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8873], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9140], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8982], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8871], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9089], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9055], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8971], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8963], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8953], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8857], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8986], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8960], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9097], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8925], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9209], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.09it/s]


[tensor([0.9846], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8660], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9588], device='cuda:0', grad_fn=<AddBackward0>), tensor([1.0207], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 1 val: 43.17375159263611
28
this is rmse
[350.229522457417]
this is r2
[array([-1.15813149e+00,  4.71121604e-01, -5.73835951e-01,  4.73609377e-01,
       -6.82230311e-02, -3.92423838e+00, -1.16209624e+01, -1.98904129e-01,
        4.26865272e-02, -6.41610806e-02, -1.02262768e+03, -1.77336546e+01,
       -9.41929986e-01, -6.25464495e-01, -2.25633652e+00, -5.11487742e+01,
       -6.09919668e-01,  2.13234836e-01, -6.76655819e+00,  4.76093447e-01,
        4.87942639e-01, -9.94184891e-02, -3.50976122e+00,  2.67210188e-01,
        3.56895185e-01, -3.72993740e-01, -4.64132806e-01,  4.39718751e-01])]
this is mape
[array([ 5.48340278,  2.80422079, 12.85681838,  3.12256813,  6.79260147,
       11.40542324, 15.06864976,  6.22554585,  6.27271241,  6.25696509,


100%|██████████| 752/752 [16:27<00:00,  1.31s/it]


tabular loss: [tensor([0.8951], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8877], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9031], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9006], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8892], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8942], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8991], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8995], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9056], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8969], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8939], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8952], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8982], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8930], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8984], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9104], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8949], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.14it/s]


[tensor([0.9708], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9878], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9534], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8411], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 2 val: 20.584007740020752
28
this is rmse
[350.229522457417, 189.1896070049379]
this is r2
[array([-1.10675235,  0.49335952, -1.03081483,  0.47890498,  0.044532  ,
        0.19280381, -1.91149273, -0.38523622,  0.08801708, -0.07053265,
       -5.85226844, -1.66927457, -0.10547582, -0.54787736,  0.28694764,
       -1.89280928, -0.62185573,  0.07883804, -7.45683536,  0.52623782,
        0.47827197, -0.10702787, -0.91930589,  0.30396039,  0.40265275,
       -0.5466539 , -0.42547665,  0.23485163])]
this is mape
[array([ 5.38280877,  2.75378941, 14.50853075,  3.13107031,  6.40874808,
        5.00556716,  7.39005146,  6.66076952,  5.98407362,  6.30080961,
        7.7870278 ,  8.24361475,  3.86606256,  6.9450946 ,  5.37049476,
        2.28490396,  4.6697

100%|██████████| 752/752 [16:44<00:00,  1.34s/it]


tabular loss: [tensor([0.9065], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9085], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8997], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9123], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9107], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9089], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8944], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9092], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9182], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9106], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9041], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9008], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9175], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9043], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9051], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9038], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9246], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.14it/s]


[tensor([0.9910], device='cuda:0', grad_fn=<AddBackward0>), tensor([1.0222], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8846], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9474], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 3 val: 26.517006397247314
28
this is rmse
[350.229522457417, 189.1896070049379, 249.05449149847453]
this is r2
[array([-7.82383770e-01,  6.06752709e-01, -7.06318352e-01,  4.31368075e-01,
        2.02280875e-01, -5.22633506e-02, -4.53859816e+00, -1.91714152e-01,
        2.19821064e-02, -8.87311402e-02, -2.43687326e+02, -2.41639766e+00,
       -1.21309333e+00, -4.20164117e-01,  5.67412685e-01, -3.33112731e+01,
       -1.36580431e+00,  2.01472135e-01, -7.75070808e+00,  4.18640396e-01,
        3.65497690e-01, -1.20097676e-01, -5.87661715e-01,  3.53153147e-01,
        4.94635703e-01, -9.90718525e-01, -3.48888466e-01,  4.58357412e-01])]
this is mape
[array([ 4.84257177,  2.38999756, 13.3648361 ,  3.29410699,  5.82659995,
        5.70297244, 10.10491282,

100%|██████████| 752/752 [16:30<00:00,  1.32s/it]


tabular loss: [tensor([0.9082], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9001], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8985], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9118], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9245], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8917], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9149], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9062], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9232], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8931], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8934], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9015], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8907], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8911], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9002], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8924], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8921], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.16it/s]


[tensor([0.9947], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9201], device='cuda:0', grad_fn=<AddBackward0>), tensor([1.0070], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9392], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 4 val: 25.0586199760437
28
this is rmse
[350.229522457417, 189.1896070049379, 249.05449149847453, 209.49160784387007]
this is r2
[array([-1.46974029e+00,  5.53944255e-01, -9.60744962e-01,  4.64101289e-01,
        1.87228119e-01,  1.14577320e-01, -2.43314231e+00, -1.39940198e-01,
       -3.30056852e-02, -8.43061709e-02, -5.15025818e+01, -1.89465802e+00,
       -2.49629319e-01, -4.69855352e-01,  3.18801519e-01, -2.98752567e+00,
       -7.08533242e-01,  1.80749798e-01, -8.57300299e+00,  3.70831485e-01,
        4.89460911e-01, -1.26918155e-01, -4.94781134e+00,  3.38498058e-01,
        5.14973977e-01, -1.28102520e+00, -3.04997453e-01,  4.61906815e-01])]
this is mape
[array([ 6.0345353 ,  2.59213222, 14.27207076,  3.21752726,  5.88146445,
        5.2478

100%|██████████| 752/752 [16:21<00:00,  1.30s/it]


tabular loss: [tensor([0.9073], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9294], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9298], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9132], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9265], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9020], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9163], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9011], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9063], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9055], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9042], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9112], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9031], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9223], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9090], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9189], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9053], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.14it/s]


[tensor([1.0137], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9849], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9764], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9557], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 5 val: 20.16757333278656
28
this is rmse
[350.229522457417, 189.1896070049379, 249.05449149847453, 209.49160784387007, 193.8070774120333]
this is r2
[array([-7.10419396e-01,  5.90570995e-01, -8.66507029e-01,  4.15041605e-01,
        1.90510404e-01,  2.09448701e-01, -3.00589060e+00, -1.43564079e-02,
       -4.55220349e-01, -1.08691994e-01, -1.42571671e+00, -2.14188237e+00,
       -4.05531595e-01, -3.69519157e-01,  3.66989780e-01, -2.12383641e+00,
       -7.09100801e-01,  2.56799909e-01, -9.82787286e+00, -1.08081648e-01,
        4.89102228e-01, -1.44428566e-01, -1.75092206e+01,  3.72175176e-01,
        5.95963889e-01, -2.07266226e+00, -2.13079564e-01,  4.81227979e-01])]
this is mape
[array([ 4.76094913,  2.46320991, 13.94543047,  3.41232149,  5.8695

Some weights of ViTHybridModel were not initialized from the model checkpoint at google/vit-hybrid-base-bit-384 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


second feature extractor
creating vit_b_16
fold: 1


100%|██████████| 828/828 [18:53<00:00,  1.37s/it]


tabular loss: [tensor([0.8898], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8935], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8901], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8842], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8991], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8944], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9158], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8883], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8941], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8959], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9221], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9139], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9095], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8881], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9197], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9134], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8960], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.07it/s]


[tensor([0.7605], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9314], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9562], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9353], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 1 val: 23.769132614135742
28
this is rmse
[191.16542838563194]
this is r2
[array([-3.76698648e+00, -6.68791195e+00,  6.59350598e-01, -8.74670153e-03,
       -2.05922248e+00,  5.43280218e-01, -4.35757296e+00, -9.75382763e-02,
       -2.90905585e-01, -4.76676411e-02, -6.92893888e-01,  1.66566831e-01,
        7.50193213e-01, -1.26537057e-02, -5.22593187e-01, -1.20068024e+00,
        2.74793854e-01, -1.16385653e-01, -3.02888977e-01, -2.09907756e+01,
       -7.86119073e-01,  2.45055029e-01,  6.43170812e-01,  3.93238568e-01,
       -1.69543967e+00, -6.08869214e-01,  6.43110446e-01, -1.23159031e+00])]
this is mape
[array([14.31122934,  7.63638017,  3.17907768,  1.44095908,  4.99670725,
        6.20603768, 14.9924004 ,  6.99174436,  1.8156238 ,  3.6291651

100%|██████████| 828/828 [19:00<00:00,  1.38s/it]


tabular loss: [tensor([0.9084], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9027], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9087], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8994], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9039], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8995], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9016], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9061], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9036], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8979], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8967], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9239], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9011], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9189], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9076], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9016], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9108], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.02it/s]


[tensor([0.8782], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9158], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9619], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9810], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 2 val: 23.231244564056396
28
this is rmse
[191.16542838563194, 189.58277940290097]
this is r2
[array([-2.45703256, -6.3997972 ,  0.6306859 , -0.01237978, -1.25483031,
        0.23571956, -4.64223308, -0.10166771, -0.40968462,  0.10612116,
       -0.74770919,  0.14950086,  0.69184619, -0.63287932, -0.34788536,
       -1.22530632,  0.02805975, -0.0624951 , -0.17118943, -9.54758773,
       -1.05011213,  0.28709987,  0.58145878,  0.32337104, -2.20327686,
       -0.61130641,  0.49605603, -0.44202736])]
this is mape
[array([12.33609207,  7.50807873,  3.175486  ,  1.45549333,  4.408996  ,
        7.75629237, 15.42621008,  7.01181205,  1.89786972,  3.46341764,
       11.59118615,  6.96993688,  2.46001492,  3.95984991,  7.8664164 ,
       45.15091856,  4.1

100%|██████████| 828/828 [19:05<00:00,  1.38s/it]


tabular loss: [tensor([0.9158], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9251], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9128], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9057], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9318], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9135], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9281], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9183], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9183], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9108], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9096], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9234], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9027], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9034], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9423], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9313], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9277], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.10it/s]


[tensor([0.8744], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9335], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8918], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9883], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 3 val: 25.067634105682373
28
this is rmse
[191.16542838563194, 189.58277940290097, 205.17182957171843]
this is r2
[array([ -3.47290275, -16.81529989,   0.76579212,  -2.58389542,
        -2.30278684,   0.84075887,  -4.15714714,  -0.09906929,
        -0.18457346,  -0.16013033,  -0.6892479 ,   0.42294626,
         0.75747631,  -0.04492769,  -0.43788969,  -1.11724336,
         0.17106796,   0.16335713,  -4.02344073, -12.28993811,
        -0.70563802,  -1.77065745,   0.6364685 ,   0.46495905,
        -2.7361256 , -10.27610841,   0.72773891,  -1.70536663])]
this is mape
[array([13.89577019, 11.10314768,  2.97277462,  2.29563249,  5.15970964,
        3.51762272, 14.67316797,  6.999378  ,  1.65461899,  3.7257995 ,
       11.38029533,  5.82483112,  2.47754

100%|██████████| 828/828 [18:38<00:00,  1.35s/it]


tabular loss: [tensor([0.9136], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9052], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9290], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9227], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9054], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9038], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9200], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9074], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9098], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9192], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9052], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9254], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9343], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9269], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9099], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9338], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9153], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.07it/s]


[tensor([0.8479], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9846], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9342], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8304], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 4 val: 18.117009162902832
28
this is rmse
[191.16542838563194, 189.58277940290097, 205.17182957171843, 187.48217610332725]
this is r2
[array([-3.05534283, -4.15608567,  0.67900031, -1.15772897, -2.88488982,
        0.45062862, -4.32852851, -0.08385944, -0.45156684, -0.07584436,
       -0.71121354,  0.04748526,  0.70500978, -0.59936063, -0.41496142,
       -1.15064023,  0.20070546, -0.17614752, -0.27150154, -6.29896533,
       -0.88363205,  0.16697791,  0.62646203,  0.48536053, -2.03126833,
        0.29148298,  0.74306454, -1.76972612])]
this is mape
[array([13.28021754,  6.39285642,  3.07820359,  1.7737593 ,  5.54779187,
        6.72910703, 14.94688626,  6.90727358,  1.9213429 ,  3.6547819 ,
       11.46530299,  7.34414165,  2.39795329,  3.9060605

100%|██████████| 828/828 [18:42<00:00,  1.36s/it]


tabular loss: [tensor([0.9199], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9042], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9195], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9025], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9094], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9088], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9022], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9052], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9195], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9103], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9075], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9187], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9064], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9048], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9129], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9150], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9092], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.05it/s]


[tensor([0.8856], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8955], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8530], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8625], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 5 val: 18.614794731140137
28
this is rmse
[191.16542838563194, 189.58277940290097, 205.17182957171843, 187.48217610332725, 187.70354030041614]
this is r2
[array([-2.65130958e+00, -5.05064142e+00,  7.83314392e-01, -2.65542084e-01,
       -2.35840735e+00,  4.92995211e-01, -4.40455260e+00, -8.67813653e-02,
       -4.23545230e-01, -2.83755840e-02, -7.03446277e-01, -8.91588881e-02,
        7.02794501e-01, -1.05215979e+00, -3.82244042e-01, -1.26783511e+00,
        8.84760424e-02, -3.26055079e-01,  1.84278894e-03, -5.17799598e+00,
       -1.04414578e+00,  2.43111367e-01,  5.87010857e-01,  4.34299507e-01,
       -1.21982852e+00, -6.41812695e-01,  7.45624314e-01, -8.14580394e-01])]
this is mape
[array([12.65161444,  6.86586681,  2.98276229,  1.48347968,  5

Some weights of ViTHybridModel were not initialized from the model checkpoint at google/vit-hybrid-base-bit-384 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


second feature extractor
creating vit_b_16
fold: 2


100%|██████████| 753/753 [17:42<00:00,  1.41s/it]


tabular loss: [tensor([0.8940], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8978], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8975], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9115], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9025], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9065], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8925], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9070], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9043], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8970], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9072], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9066], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9015], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8883], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8982], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9036], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9033], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.13it/s]


[tensor([0.9538], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8502], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9302], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9299], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 1 val: 20.030126333236694
28
this is rmse
[215.38827600138973]
this is r2
[array([  0.30469708,  -0.78213522, -11.2164814 ,  -4.13505789,
         0.68887594, -17.67228641, -12.11153047,  -0.25597309,
        -1.04645484,  -3.39450493,   0.41533792,  -4.71814489,
       -10.42317704,   0.53482465,   0.28555326,   0.13390542,
        -3.36066085,  -9.42990138,  -2.0962063 ,   0.26095688,
        -2.44675513,   0.10990574,  -1.06919208,  -3.39175018,
        -4.92852174,  -1.33704067,  -0.32880998,  -7.46199494])]
this is mape
[array([ 7.71365908,  3.77636299,  5.53645639,  3.05510995,  2.93921882,
        8.41563049,  5.67365132,  2.050951  ,  4.81511046,  4.57100416,
        2.95916403,  4.81894987,  6.93027284,  3.56894029,  3.98897506,
        1

100%|██████████| 753/753 [19:04<00:00,  1.52s/it]


tabular loss: [tensor([0.8980], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8987], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8965], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9056], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8994], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9164], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8973], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8955], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9133], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8925], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9073], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8888], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8923], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9160], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8941], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9147], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8934], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:04<00:00,  1.07s/it]


[tensor([0.9327], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9537], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8617], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9189], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 2 val: 19.025124549865723
28
this is rmse
[215.38827600138973, 213.83802557944858]
this is r2
[array([  0.32779238,  -1.0061682 , -10.18956969,  -8.80188992,
         0.6044618 , -18.91186048,  -8.52592956,  -0.40551588,
        -0.97514853,  -2.96739618,   0.60229141,  -3.75069167,
        -7.37844925,   0.49818499,   0.16609618,  -0.38114726,
        -3.22816964,  -8.7406811 ,  -1.70319399,   0.40692945,
        -1.9805834 ,   0.10025717,  -1.13685832,  -2.6232379 ,
        -6.99134953,  -1.0798865 ,  -0.31591806,  -6.60999164])]
this is mape
[array([ 7.58096209,  3.99022565,  5.31095343,  4.26936384,  3.54079986,
        8.66970228,  4.90201184,  2.34030867,  4.6816416 ,  4.34863513,
        2.46765463,  4.4325257 ,  5.92206123,  3.29686147,  4

100%|██████████| 753/753 [18:21<00:00,  1.46s/it]


tabular loss: [tensor([0.9109], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9063], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9002], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8985], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9104], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8906], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9062], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9048], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9037], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9130], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8955], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8984], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8999], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9025], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9014], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9099], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8992], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.06it/s]


[tensor([0.9315], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9016], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8900], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9281], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 3 val: 17.00963258743286
28
this is rmse
[215.38827600138973, 213.83802557944858, 203.9660571414959]
this is r2
[array([ 2.81358780e-01,  2.24717258e-01, -9.26997994e+00, -1.11681519e+01,
        5.70291370e-01, -9.50463031e+00, -5.99498761e+00, -8.85224572e-01,
       -5.42598354e-01, -4.70126485e+00,  3.74617804e-01, -1.00955932e+00,
       -3.69989152e+00,  4.17326371e-01,  1.09387273e-02, -1.05897970e-01,
       -2.83532424e+00, -7.99819851e+00, -1.20019452e+00,  4.92429137e-01,
       -1.89715246e+00, -2.78386551e-01, -1.23704337e+00, -1.93707216e+00,
       -5.43967935e+00, -7.20347821e-01, -3.79865547e-01, -6.28512799e+00])]
this is mape
[array([ 7.84427785,  2.51323888,  5.0986538 ,  4.76606203,  3.75484852,
        6.43327166,  4.23900354

100%|██████████| 753/753 [17:50<00:00,  1.42s/it]


tabular loss: [tensor([0.8976], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9030], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8948], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8974], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9056], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8925], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9076], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8903], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8930], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9010], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8960], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8996], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8929], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9126], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8988], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8989], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9028], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.09it/s]


[tensor([0.9186], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9480], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9367], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8971], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 4 val: 22.50315761566162
28
this is rmse
[215.38827600138973, 213.83802557944858, 203.9660571414959, 224.900271648149]
this is r2
[array([  0.15499073,   0.21072441,  -9.76175868, -32.44598003,
         0.64642542, -17.17671138,  -6.62981753,  -3.16964337,
        -0.41120282,  -2.97026821,   0.34333686,  -1.84203444,
        -6.29307845,   0.4635484 ,   0.13052243,  -0.74930024,
        -2.06865029,  -8.79404892,  -3.08625414,   0.42428185,
        -3.06116481,   0.12382785,  -1.94727197,  -6.31759087,
        -4.76935972,  -1.03254322,  -0.31823091,  -9.7232082 ])]
this is mape
[array([ 8.50069395,  2.53707777,  5.21349955,  7.97892247,  3.25526407,
        8.31145916,  4.41844499,  4.53202295,  4.20992987,  4.35018288,
        3.16349042,  3.47

100%|██████████| 753/753 [17:42<00:00,  1.41s/it]


tabular loss: [tensor([0.9219], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8830], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9033], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9005], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9093], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9057], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8912], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9016], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9214], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8951], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9065], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9063], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8953], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9011], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8977], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8944], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9033], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.07it/s]


[tensor([0.9393], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9752], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9120], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9215], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 5 val: 18.804508686065674
28
this is rmse
[215.38827600138973, 213.83802557944858, 203.9660571414959, 224.900271648149, 213.05555558070537]
this is r2
[array([-7.87927533e-03, -2.83709850e-01, -1.05090499e+01, -2.12184336e+01,
        6.02595527e-01, -1.42293373e+01, -1.54561481e+01, -1.79581991e+00,
       -3.64355135e-01, -1.00244675e+00,  4.52922637e-01, -4.08574360e+00,
       -7.36500730e+00,  4.19139067e-01,  1.71616999e-01,  1.99744091e-01,
       -1.22360814e+00, -9.41868340e+00, -2.01646953e+00,  1.63295633e-01,
       -2.93394435e+00,  1.32046148e-01, -1.18002240e+00, -4.30048696e+00,
       -6.76977721e+00, -5.55054893e-01, -3.10819756e-01, -3.61538163e+00])]
this is mape
[array([ 9.25043707,  3.23422212,  5.38233588,  6.45762491,  3.55

Some weights of ViTHybridModel were not initialized from the model checkpoint at google/vit-hybrid-base-bit-384 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


second feature extractor
creating vit_b_16
fold: 3


100%|██████████| 832/832 [19:53<00:00,  1.43s/it]


tabular loss: [tensor([0.8901], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9062], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9027], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9112], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8941], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8925], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8969], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8995], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8956], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9036], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9215], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9000], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9088], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9082], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8871], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9383], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9165], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.11it/s]


[tensor([1.0147], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8622], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9721], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.5580], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 1 val: 16.182657718658447
28
this is rmse
[210.2033509594342]
this is r2
[array([ -0.43047458,  -5.34612309, -20.430582  ,  -0.59738691,
        -5.3782351 ,  -3.88678986,  -3.06342425,  -4.49734074,
        -0.94375972,  -0.02254063,  -2.06703675,  -4.87110516,
         0.58505913,  -0.06967197,  -0.22119215,  -1.12109081,
        -0.79230317, -14.06666846,  -0.13333911,   0.57414083,
         0.08004869,  -6.62842167,  -0.19489046,  -0.80985136,
        -0.24860224,  -2.12340645,   0.67293792,  -2.03127499])]
this is mape
[array([ 5.59371975, 14.02771887,  3.94304233,  3.73764872,  2.73425066,
       18.12133475,  7.96083358,  8.03489936,  4.60296371,  3.36851818,
        8.42095317, 53.80993742,  3.12621511,  2.53288928,  5.50244669,
        5.

100%|██████████| 832/832 [20:23<00:00,  1.47s/it]


tabular loss: [tensor([0.8997], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9294], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9286], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9337], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9303], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9171], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9405], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9315], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9144], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9250], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9216], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8997], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9256], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8963], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9343], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9110], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9274], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.06it/s]


[tensor([0.9584], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8042], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8308], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9246], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 2 val: 20.567355632781982
28
this is rmse
[210.2033509594342, 218.60901858515194]
this is r2
[array([-4.73845026e-01, -5.57116523e+00, -5.43655548e+01, -1.04530298e+00,
       -6.41559687e+00, -4.31048379e+00, -4.90723706e+00, -4.33951524e+00,
       -1.87119621e+00,  1.81982952e-01, -2.05393952e+00, -3.68044544e+00,
        4.94092813e-01, -3.67738088e-01, -7.40288844e-01, -8.91787098e-01,
       -6.63745236e-01, -1.55340583e+01, -6.79707751e-02, -4.73547499e-01,
       -2.73999609e-02, -1.08674676e+01, -1.10229755e-01,  5.41137963e-01,
       -6.41705396e-01, -4.51853619e+00,  6.64206769e-01, -2.30100003e+00])]
this is mape
[array([ 5.67287137, 14.30687846,  6.10894508,  4.0747068 ,  2.92604072,
       19.01162751,  9.56842268,  7.89030889,  5.7

100%|██████████| 832/832 [19:41<00:00,  1.42s/it]


tabular loss: [tensor([0.9244], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9062], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9111], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9107], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8958], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9173], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9145], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9153], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9154], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9335], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9230], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9060], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9104], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9149], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9063], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9023], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9147], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.12it/s]


[tensor([0.8197], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8743], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8980], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8197], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 3 val: 17.285756587982178
28
this is rmse
[210.2033509594342, 218.60901858515194, 219.12012130214157]
this is r2
[array([-1.18017662e+00, -5.34913407e+00, -3.41344935e+01, -1.58935540e+00,
       -5.12096674e+00, -5.11318915e+00, -2.66158527e+00, -4.53677123e+00,
        2.02858199e-03, -1.66560817e-02, -2.06724060e+00, -5.15926923e+00,
        3.89179349e-01, -2.97230635e-02,  6.15219364e-03, -1.13370412e+00,
       -7.87844116e-01, -1.40266884e+01, -9.51449097e-02, -2.83681701e-01,
        6.01182204e-02, -5.47682394e+00, -2.81727515e-01, -1.77830519e+00,
       -6.13142103e-01, -1.48564488e+00,  6.53239982e-01, -2.02858124e+00])]
this is mape
[array([ 6.81582757, 14.03152301,  4.94833955,  4.58575145,  2.68402027,
       20.49629836,  7.5489066

100%|██████████| 832/832 [19:16<00:00,  1.39s/it]


tabular loss: [tensor([0.9119], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9272], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9102], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9057], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9328], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9130], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9105], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9179], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9043], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9165], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9232], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9037], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9212], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9087], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9146], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9180], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9366], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.07it/s]


[tensor([0.9803], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8116], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9264], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9683], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 4 val: 19.759305000305176
28
this is rmse
[210.2033509594342, 218.60901858515194, 219.12012130214157, 226.61844942173352]
this is r2
[array([-1.11069982e+00, -5.79628775e+00, -7.48732941e+01, -8.61881414e-01,
       -7.76410232e+00, -5.11452130e+00, -3.78802789e+00, -4.22565514e+00,
       -3.37213375e+00,  1.12369457e-01, -2.38832421e+00, -3.92898847e+00,
        3.53610996e-01, -4.69356936e-01, -6.73192692e-01, -9.10815272e-01,
       -7.05606077e-01, -1.70651197e+01, -3.24711075e-02, -5.23988778e-01,
       -1.30962400e-01, -9.59827225e+00, -2.59101231e-01, -1.39018831e-01,
       -6.69194579e-01, -3.22456697e+00,  6.69515436e-01, -2.22026849e+00])]
this is mape
[array([ 6.71329674, 14.5763864 ,  7.07778643,  3.95724341,  3.15432574,
       20.

100%|██████████| 832/832 [19:31<00:00,  1.41s/it]


tabular loss: [tensor([0.9187], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9238], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9162], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9103], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9051], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9131], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9123], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9182], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9076], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9222], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9237], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9120], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9166], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9211], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8994], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9226], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9102], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.09it/s]


[tensor([0.9387], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9486], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9074], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8756], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 5 val: 151.8026008605957
28
this is rmse
[210.2033509594342, 218.60901858515194, 219.12012130214157, 226.61844942173352, 887.5608204288808]
this is r2
[array([-7.87903389e+01, -6.10411298e+00, -7.82954922e+01, -1.78958760e+00,
       -9.06993372e+00, -3.71745323e+00, -5.04138507e+02, -2.98712862e+00,
       -4.06590548e+02, -5.83338903e+02, -2.76759031e+02, -3.32075464e+00,
       -3.59376018e+02, -3.31469931e+02, -6.41946618e-01, -3.54926677e+01,
       -3.15784697e+01, -2.50987392e+02, -1.02957226e+00, -1.91234396e+01,
       -4.17657987e-01, -9.62332861e+00, -1.11130713e-01, -1.27128148e+00,
       -1.17469959e+00, -2.26066730e+01,  6.80488834e-01, -6.56893743e+02])]
this is mape
[array([38.05233419, 14.93082799,  7.22576325,  4.78672474,  3.35

Some weights of ViTHybridModel were not initialized from the model checkpoint at google/vit-hybrid-base-bit-384 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


second feature extractor
creating vit_b_16
fold: 4


100%|██████████| 873/873 [20:40<00:00,  1.42s/it]


tabular loss: [tensor([0.8974], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8897], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8828], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8883], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8829], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8936], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9156], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9102], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8973], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9103], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9075], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9125], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9070], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9006], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8972], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8994], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9016], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.11it/s]


[tensor([0.9146], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9206], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9246], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8360], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 1 val: 75.48791122436523
28
this is rmse
[438.01428499508273]
this is r2
[array([-2.57654507e+00, -2.42203966e-01, -1.37764468e+01, -1.10809425e+02,
       -5.59915703e+01, -7.84118590e+00, -2.55065930e+01,  4.99825279e-01,
       -2.01989953e+00, -2.39757148e+01, -2.50393968e+00, -1.04956875e+01,
        2.09364899e-02, -7.96249197e+00, -1.40222230e+01, -1.40900918e-01,
       -1.91581581e+00, -1.32143728e+00, -2.08958893e+00, -2.46551155e+01,
       -2.70053747e+01, -3.27427453e+01, -3.49664935e+00, -1.10997724e+01,
       -2.19304943e+01, -2.30727721e+00, -2.96827006e+01, -6.59200829e+01])]
this is mape
[array([17.26684599,  4.28391304,  7.0086467 , 24.46847807, 29.21078956,
       13.94354036, 33.03171252,  2.30919983,  5.86128229, 11.79556665

100%|██████████| 873/873 [20:21<00:00,  1.40s/it]


tabular loss: [tensor([0.8965], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9033], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8906], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8910], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9046], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9089], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9026], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8991], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8932], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8970], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8995], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8911], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9025], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8952], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9025], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8961], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9042], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:04<00:00,  1.04s/it]


[tensor([0.8921], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9527], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9082], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9498], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 2 val: 17.469101428985596
28
this is rmse
[438.01428499508273, 169.8022371505267]
this is r2
[array([ 0.54360271, -1.6978845 ,  0.75936144, -8.57007166, -2.99069829,
       -0.95853334, -3.17391358,  0.61992642,  0.2868995 , -0.48478728,
        0.70223336, -0.41550615, -0.18003366,  0.65758198, -1.94057827,
        0.4342596 ,  0.31388253, -0.20586353, -0.12960918, -5.75025939,
       -0.8861035 , -0.47017925, -3.76933374,  0.70854434, -0.68068736,
        0.42634942,  0.43348767, -0.01874531])]
this is mape
[array([ 5.72788398,  6.54387178,  1.06693261,  7.58761514,  9.28807528,
        8.7552239 , 12.98107505,  2.26299282,  2.99075287,  3.82032481,
        3.81625985,  8.26315458,  4.34317291,  3.04066047,  5.80313344,
        4.08281538,  6.39

100%|██████████| 873/873 [20:55<00:00,  1.44s/it]


tabular loss: [tensor([0.9015], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9107], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9095], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9063], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8992], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9106], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8998], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8970], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9102], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9039], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9089], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9128], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9017], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9111], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9083], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8934], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9081], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.04it/s]


[tensor([0.9636], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9069], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8926], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9251], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 3 val: 18.468429565429688
28
this is rmse
[438.01428499508273, 169.8022371505267, 171.17450376187284]
this is r2
[array([ 0.52001892, -1.97109165,  0.7116714 , -8.41540192, -2.67517318,
       -0.90668219, -2.82286799,  0.456103  ,  0.15684275, -0.37600872,
        0.64756832, -0.24258987, -0.37988236,  0.69251297, -1.50277921,
        0.09545543,  0.18843267, -0.17325212, -0.04075972, -5.20748557,
       -0.59613435,  0.06508858, -4.2184133 ,  0.64662518, -0.45192183,
        0.26900927,  0.44294007, -1.9722491 ])]
this is mape
[array([ 5.86699895,  6.88114736,  1.13744129,  7.52873582,  9.00779258,
        8.67296146, 12.39181056,  2.54723706,  3.46837072,  3.66120683,
        4.08686457,  7.76431359,  4.70698236,  2.98367577,  5.22917686,
     

100%|██████████| 873/873 [21:35<00:00,  1.48s/it]


tabular loss: [tensor([0.8998], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9008], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9004], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9180], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9101], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9023], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9113], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9052], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9002], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9048], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9043], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8936], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9112], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9050], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9096], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9148], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9071], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:04<00:00,  1.10s/it]


[tensor([0.9481], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8989], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9200], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9661], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 4 val: 17.34344482421875
28
this is rmse
[438.01428499508273, 169.8022371505267, 171.17450376187284, 166.6154039290472]
this is r2
[array([ 4.24330824e-01, -2.04949592e+00,  6.75648700e-01, -8.13549538e+00,
       -1.86652585e+00, -8.03668773e-01, -1.30240104e+00,  4.07865476e-01,
        1.03963906e-01, -3.43958927e-01,  5.44709895e-01, -2.00729890e-01,
       -4.88938617e-01,  7.03950621e-01, -5.78117808e-01,  2.28160213e-01,
        9.15982702e-02, -1.48297494e-01, -3.70802663e-03, -4.96853454e+00,
       -4.72005080e-01,  3.40591667e-03, -4.34022614e+00,  5.82050751e-01,
       -3.88147039e-01,  1.60204700e-01,  4.16616166e-01, -8.18216957e-01])]
this is mape
[array([ 6.34237186,  6.97313145,  1.16412071,  7.42079381,  8.1976499 ,
        8.50

100%|██████████| 873/873 [24:13<00:00,  1.66s/it]


tabular loss: [tensor([0.9175], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9007], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8975], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9234], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9004], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8997], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9181], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9194], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9304], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9164], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9120], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9105], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9206], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9113], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9028], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9033], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9187], device='cuda:0', grad_fn

100%|██████████| 4/4 [00:03<00:00,  1.03it/s]


[tensor([0.9602], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9676], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9704], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.7651], device='cuda:0', grad_fn=<AddBackward0>)]
epoch 5 val: 17.895178079605103
28
this is rmse
[438.01428499508273, 169.8022371505267, 171.17450376187284, 166.6154039290472, 163.59738854633414]
this is r2
[array([ 0.44216396, -1.63922269,  0.7597417 , -7.65143159, -1.97504247,
       -1.22118908, -0.76424286,  0.5928941 ,  0.29369711, -0.45180583,
        0.62663751, -0.40975609, -0.23076864,  0.6991609 , -0.7520485 ,
        0.58189572,  0.28128452, -0.2073905 , -0.10017375, -5.76032828,
       -0.95325469, -0.25331492, -3.80041858,  0.70703991, -0.66220181,
        0.34091072,  0.43960581, -0.50383788])]
this is mape
[array([ 6.26193559,  6.46761488,  1.06472441,  7.22965826,  8.31596193,
        9.13443542,  8.02389196,  2.31622867,  3.00544389,  3.78072863,
        4.22206723,  8.24301825,  4.44

In [None]:
test = [p for p in test if p not in ['ID00011637202177653955184', 'ID00052637202186188008618']]
osic_test = OSICData_test(test, A, TAB)
test_loader = torch.utils.data.DataLoader(osic_test, batch_size=1, num_workers=hyp.num_workers)


# load the best model
tabct = TabCT(cnn = model).to(gpu)
tabct.load_state_dict(torch.load(f"{result_dir}/september8th/{model}simplestep2hybrid.tar")["model_state_dict"])

running_loss = 0.0
pred_a = {}
tabct.eval()
tabular_loss = []
for i, data in enumerate(tqdm(test_loader, 0)):

    [mask, x, t], a, pid = data

    x = x.to(gpu)
    mask = mask.to(gpu)
    t = t.to(gpu)
    a = a.to(gpu)

    # forward
    outputs, tab_loss = tabct(x, t, mask)
    loss = criterion(outputs, a)
    tabular_loss.append(tab_loss)
    pids = pid
    preds_a = outputs.detach().cpu().numpy().flatten()
    print([outputs, pid])
    for j, p_d in enumerate(pids):
        pred_a[p_d] = preds_a[j]




    # print statistics
    running_loss += loss.item()
print(tabular_loss)
# print(f"epoch {epoch+1} val: {running_loss}")
# log.write(f"epoch {epoch+1} val: {running_loss}\n")
# score calculation
# print(pred_a)
# print(len(pred_a))
# print(p_test)
# print(len(p_test))

# totals
tot_r2_score = []
tot_mape_score = []
tot_mae_score = []
tot_smape_score = []

# everyone
score_v = 0.
rmse = 0.
test_r2_score = []
test_mape_score = []
test_mae_score = []
test_smape_score = []
# print(len(p_valid))
# fvcs_true = []
# fvcs_pred = []
for p in test:
    score_v += (score_avg(p, pred_a[p]))/len(test)
    rmse += (rmse_avg(p, pred_a[p]))/len(test)
    fvc_true = train.FVC.values[train.Patient == p]
    weeks_true = train.Weeks.values[train.Patient == p]
    fvc_pred = pred_a[p] * (weeks_true - weeks_true[0]) + fvc_true[0]

    test_r2_score.append(r2_score(fvc_true, fvc_pred))
    test_mape_score.append(np.mean(np.abs((fvc_true - fvc_pred) / fvc_true)) * 100)
    test_mae_score.append(np.mean(np.abs(fvc_pred - fvc_true)))
    test_smape_score.append(smape(fvc_true, fvc_pred))
#------------------------
tot_rmse.append(rmse)
tot_r2_score.append(np.asanyarray(test_r2_score))
tot_mape_score.append(np.asanyarray(test_mape_score))
tot_mae_score.append(np.asanyarray(test_mae_score))
tot_smape_score.append(np.asanyarray(test_smape_score))
#------------------------

print("this is rmse")
print(tot_rmse)
print("this is r2")
print(tot_r2_score)
print("this is mape")
print(tot_mape_score)
print("this is mae")
print(tot_mae_score)
print("this is smape")
print(tot_smape_score)
print(f"val score: {score_v}")
log.write(f"val score: {score_v}\n")
log.write(f"val rmse: {rmse}\n")

Some weights of ViTHybridModel were not initialized from the model checkpoint at google/vit-hybrid-base-bit-384 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


second feature extractor


  3%|▎         | 1/35 [00:01<00:46,  1.36s/it]

[tensor([[-4.9629]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00025637202179541264076',)]


  6%|▌         | 2/35 [00:02<00:35,  1.08s/it]

[tensor([[-6.9696]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00417637202310901214011',)]


  9%|▊         | 3/35 [00:03<00:31,  1.01it/s]

[tensor([[-5.1296]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00423637202312137826377',)]


 11%|█▏        | 4/35 [00:03<00:28,  1.08it/s]

[tensor([[-8.2493]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00078637202199415319443',)]


 14%|█▍        | 5/35 [00:04<00:26,  1.11it/s]

[tensor([[-4.4956]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00042637202184406822975',)]


 17%|█▋        | 6/35 [00:05<00:25,  1.15it/s]

[tensor([[-0.2551]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00210637202257228694086',)]


 20%|██        | 7/35 [00:06<00:24,  1.16it/s]

[tensor([[-5.7764]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00367637202296290303449',)]


 23%|██▎       | 8/35 [00:07<00:22,  1.19it/s]

[tensor([[-6.7125]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00283637202278714365037',)]


 26%|██▌       | 9/35 [00:08<00:21,  1.18it/s]

[tensor([[-4.6875]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00242637202264759739921',)]


 29%|██▊       | 10/35 [00:09<00:21,  1.16it/s]

[tensor([[-4.9741]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00329637202285906759848',)]


 31%|███▏      | 11/35 [00:09<00:20,  1.17it/s]

[tensor([[-5.1242]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00111637202210956877205',)]


 34%|███▍      | 12/35 [00:10<00:19,  1.19it/s]

[tensor([[-6.8789]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00135637202224630271439',)]


 37%|███▋      | 13/35 [00:11<00:18,  1.19it/s]

[tensor([[0.0043]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00341637202287410878488',)]


 40%|████      | 14/35 [00:12<00:17,  1.19it/s]

[tensor([[-5.0871]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00371637202296828615743',)]


 43%|████▎     | 15/35 [00:13<00:16,  1.21it/s]

[tensor([[-4.4883]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00048637202185016727717',)]


 46%|████▌     | 16/35 [00:13<00:15,  1.22it/s]

[tensor([[-6.0758]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00077637202199102000916',)]


 49%|████▊     | 17/35 [00:14<00:14,  1.21it/s]

[tensor([[-4.5736]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00305637202281772703145',)]


 51%|█████▏    | 18/35 [00:16<00:16,  1.04it/s]

[tensor([[-0.9980]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00331637202286306023714',)]


 54%|█████▍    | 19/35 [00:16<00:14,  1.07it/s]

[tensor([[-5.1421]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00110637202210673668310',)]


 57%|█████▋    | 20/35 [00:17<00:13,  1.12it/s]

[tensor([[-5.1296]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00319637202283897208687',)]


 60%|██████    | 21/35 [00:18<00:12,  1.15it/s]

[tensor([[-4.5319]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00336637202286801879145',)]


 63%|██████▎   | 22/35 [00:19<00:11,  1.17it/s]

[tensor([[-4.3320]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00400637202305055099402',)]


 66%|██████▌   | 23/35 [00:20<00:10,  1.19it/s]

[tensor([[-4.6363]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00161637202235731948764',)]


 69%|██████▊   | 24/35 [00:20<00:09,  1.22it/s]

[tensor([[-4.6875]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00364637202296074419422',)]


 71%|███████▏  | 25/35 [00:21<00:08,  1.23it/s]

[tensor([[-6.6904]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00344637202287684217717',)]


 74%|███████▍  | 26/35 [00:22<00:07,  1.26it/s]

[tensor([[-5.5766]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00138637202231603868088',)]


 77%|███████▋  | 27/35 [00:23<00:06,  1.28it/s]

[tensor([[-4.9740]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00337637202286839091062',)]


 80%|████████  | 28/35 [00:24<00:05,  1.28it/s]

[tensor([[-4.5861]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00381637202299644114027',)]


 83%|████████▎ | 29/35 [00:24<00:04,  1.30it/s]

[tensor([[-6.0951]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00047637202184938901501',)]


 86%|████████▌ | 30/35 [00:25<00:03,  1.29it/s]

[tensor([[-4.9871]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00383637202300493233675',)]


 89%|████████▊ | 31/35 [00:26<00:03,  1.30it/s]

[tensor([[-4.2614]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00026637202179561894768',)]


 91%|█████████▏| 32/35 [00:27<00:02,  1.31it/s]

[tensor([[-4.5861]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00168637202237852027833',)]


 94%|█████████▍| 33/35 [00:27<00:01,  1.32it/s]

[tensor([[-1.1288]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00117637202212360228007',)]


 97%|█████████▋| 34/35 [00:28<00:00,  1.34it/s]

[tensor([[-5.4980]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00407637202308788732304',)]


100%|██████████| 35/35 [00:29<00:00,  1.20it/s]

[tensor([[-4.5735]], device='cuda:0', grad_fn=<AddmmBackward0>), ('ID00123637202217151272140',)]
[tensor([1.0120], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9438], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9477], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.8658], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9694], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9567], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9329], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9822], device='cuda:0', grad_fn=<AddBackward0>), tensor([1.0142], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9850], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9477], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9587], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9328], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9616], device='cuda:0', grad_fn=<AddBackward0>), tensor([1.0336], device='cuda:0', grad_fn=<AddBackward0>), tensor([0.9741], 




29

In [25]:
print(pred_a)
print(test)

{'ID00025637202179541264076': -4.9629145, 'ID00417637202310901214011': -6.96957, 'ID00423637202312137826377': -5.1295543, 'ID00078637202199415319443': -8.249274, 'ID00042637202184406822975': -4.4956326, 'ID00210637202257228694086': -0.25509533, 'ID00367637202296290303449': -5.776394, 'ID00283637202278714365037': -6.7125416, 'ID00242637202264759739921': -4.6875415, 'ID00329637202285906759848': -4.974105, 'ID00111637202210956877205': -5.1242247, 'ID00135637202224630271439': -6.878868, 'ID00341637202287410878488': 0.0043280534, 'ID00371637202296828615743': -5.0871334, 'ID00048637202185016727717': -4.488312, 'ID00077637202199102000916': -6.0758114, 'ID00305637202281772703145': -4.573623, 'ID00331637202286306023714': -0.99795616, 'ID00110637202210673668310': -5.1420565, 'ID00319637202283897208687': -5.1295567, 'ID00336637202286801879145': -4.531871, 'ID00400637202305055099402': -4.3320036, 'ID00161637202235731948764': -4.636307, 'ID00364637202296074419422': -4.687541, 'ID0034463720228768421

In [None]:
# final training with optimized setting

osic_all = OSICData_test(P, A, TAB)
all_loader = torch.utils.data.DataLoader(osic_all, batch_size=2, shuffle=True, num_workers=hyp.num_workers)

# load the best model
tabct = TabCT(cnn = model).to(gpu)
tabct.load_state_dict(torch.load(f"{result_dir}/september8th/{model}simplestep2hybrid.tar")["model_state_dict"])

optimizer = torch.optim.AdamW(tabct.parameters(), lr = hyp.final_lr) # very small learning rate


print(f"Final training")
log.write(f"Final training\n")  
for epoch in range(best_epoch + 2):  # loop over the dataset multiple times

    running_loss = 0.0
    tabct.train()
    for i, data in enumerate(tqdm(all_loader, 0)):

        [mask, x, t], a, _ = data

        x = x.to(gpu)
        mask = mask.to(gpu)
        t = t.to(gpu)
        a = a.to(gpu)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs, _ = tabct(x, t, mask)
        loss = criterion(outputs, a)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
    print(f"epoch {epoch+1} train: {running_loss}")
    log.write(f"epoch {epoch+1} train: {running_loss}\n")
    torch.save({
        'epoch': best_epoch,
        'model_state_dict': tabct.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
        }, f"{result_dir}/september8th/{model}simplestep2hybrid.tar")

print('Finished Training')
# destroy model
del tabct
torch.cuda.empty_cache()


# plot_figures(min_sq, 2, 3)

# ref: https://www.kaggle.com/miklgr500/linear-decay-based-on-resnet-cnn
# https://pytorch.org/docs/stable/index.html

Some weights of ViTHybridModel were not initialized from the model checkpoint at google/vit-hybrid-base-bit-384 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


second feature extractor
Final training


100%|██████████| 87/87 [01:53<00:00,  1.30s/it]


epoch 1 train: 377.20458340644836


100%|██████████| 87/87 [01:54<00:00,  1.31s/it]


epoch 2 train: 377.4835966229439


100%|██████████| 87/87 [01:50<00:00,  1.27s/it]


epoch 3 train: 379.4568520784378


100%|██████████| 87/87 [01:47<00:00,  1.24s/it]


epoch 4 train: 368.7075105905533


100%|██████████| 87/87 [01:46<00:00,  1.23s/it]


epoch 5 train: 375.00669288635254


100%|██████████| 87/87 [01:48<00:00,  1.25s/it]


epoch 6 train: 378.1719209551811


100%|██████████| 87/87 [01:48<00:00,  1.24s/it]


epoch 7 train: 364.9025667309761
Finished Training


In [27]:
# load the best model
tabct = TabCT(cnn = model).to(gpu)
tabct.load_state_dict(torch.load(f"{result_dir}/september8th/{model}simplestep2hybrid.tar")["model_state_dict"])

Some weights of ViTHybridModel were not initialized from the model checkpoint at google/vit-hybrid-base-bit-384 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


second feature extractor


<All keys matched successfully>

# VIT

## CT

In [30]:
# C:\Users\Vison\Documents\Users\Dolatabadi\FibroCosaNet\train\ID00216637202257988213445\10.dcm
image = get_img("../train/ID00216637202257988213445/10.dcm")
image = np.stack((image, image, image), axis=-1)
image = torch.tensor(image, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0) .to(gpu)
out = tabct.conv(image).squeeze(0)
tensor = out  # Example: Replace with your actual tensor
print(type(tensor))
# Convert tensor to numpy if needed
tensor_np = tensor.to("cpu").detach().numpy()


# List of colormaps to visualize
colormaps = ['gray', 'bone', 'inferno', 'viridis', 'magma', 'jet', 'hot', 'coolwarm', 'plasma', 'cividis']

# Create a folder to save the images
save_dir = "slices_by_cmap/VIT/image"
os.makedirs(save_dir, exist_ok=True)

# Plot all slices for each colormap and save the output
for cmap in colormaps:
    # Create a figure with subplots
    fig, axes = plt.subplots(8, 8, figsize=(15, 15))  # 8x8 grid for 64 slices

    # Plot each slice in the grid
    for i in range(8):
        for j in range(8):
            slice_idx = i * 8 + j  # Calculate the current slice index
            axes[i, j].imshow(tensor_np[slice_idx], cmap=cmap)
            axes[i, j].set_title(f'Channel {slice_idx}')
            axes[i, j].axis('off')  # Hide the axis for clarity

    # Adjust layout
    plt.tight_layout()

    # Save the figure for the current colormap
    save_path = os.path.join(save_dir, f"channels_{cmap}.png")
    plt.savefig(save_path)

    # Close the figure to release memory
    plt.close(fig)

print(f"Images saved in directory: {save_dir}")

<class 'torch.Tensor'>
Images saved in directory: slices_by_cmap/VIT/image


## mask

In [31]:
# C:\Users\Vison\Documents\Users\Dolatabadi\FibroCosaNet\mask_clear\10.jpg
image = get_mask("../mask_clear/ID00216637202257988213445/10.jpg")
image = np.stack((image, image, image), axis=-1)
image = torch.tensor(image, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0) .to(gpu)
out = tabct.mask(image).squeeze(0)
tensor = out  # Example: Replace with your actual tensor
print(type(tensor))
# Convert tensor to numpy if needed
tensor_np = tensor.to("cpu").detach().numpy()


# List of colormaps to visualize
colormaps = ['gray', 'bone', 'inferno', 'viridis', 'magma', 'jet', 'hot', 'coolwarm', 'plasma', 'cividis']

# Create a folder to save the images
save_dir = "slices_by_cmap/VIT/mask"
os.makedirs(save_dir, exist_ok=True)

# Plot all slices for each colormap and save the output
for cmap in colormaps:
    # Create a figure with subplots
    fig, axes = plt.subplots(8, 8, figsize=(15, 15))  # 8x8 grid for 64 slices

    # Plot each slice in the grid
    for i in range(8):
        for j in range(8):
            slice_idx = i * 8 + j  # Calculate the current slice index
            axes[i, j].imshow(tensor_np[slice_idx], cmap=cmap)
            axes[i, j].set_title(f'Channel {slice_idx}')
            axes[i, j].axis('off')  # Hide the axis for clarity

    # Adjust layout
    plt.tight_layout()

    # Save the figure for the current colormap
    save_path = os.path.join(save_dir, f"Channels_{cmap}.png")
    plt.savefig(save_path)

    # Close the figure to release memory
    plt.close(fig)

print(f"Images saved in directory: {save_dir}")

<class 'torch.Tensor'>
Images saved in directory: slices_by_cmap/VIT/mask
