In [None]:
!pip install datasets
!pip install timm
!pip install triton

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

In [None]:
import os
import torch
import timm
import torch.nn as nn
from tqdm import tqdm
from timm.layers import SwiGLUPacked
from transformers import GPT2Tokenizer, GPT2Model, ViTModel
from torch.nn import CrossEntropyLoss
from torch.nn.utils.rnn import pad_sequence
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torchvision import transforms
from datasets import load_dataset
import numpy as np
import re
import torch._dynamo
torch._dynamo.config.suppress_errors = True

# ------------------- Dataset Class for VQA -------------------
# Preprocessing function to clean up the questions and answers
def clean_text(text):
    """Normalise a question or answer string.

    The text is lower-cased and trimmed, runs of spaces are collapsed into a
    single space, and the space in front of a question mark is removed.

    Args:
        text (str): raw text.

    Returns:
        str: the cleaned text.
    """
    lowered = text.lower().strip()
    single_spaced = re.sub(' +', ' ', lowered)
    without_gap = single_spaced.replace(" ?", "?")
    return without_gap.strip()


class VQA_Dataset(Dataset):
    """Torch dataset that turns rows of a VQA split into classification samples.

    Every item consists of the transformed image tensor, the tokenized question
    (``input_ids`` / ``attention_mask`` padded or truncated to ``max_seq_length``)
    and the integer class index of the answer.

    Args:
        hf_dataset: indexable collection whose rows provide ``'question'``,
            ``'answer'`` and ``'image'`` entries; the image must offer
            ``.convert('RGB')``.
        tokenizer_name (str): only ``'gpt2'`` is supported.
        max_seq_length (int): length every tokenized question is padded/truncated to.
        img_size (int): side length used by the default resize transform.
        transform: optional callable applied to the RGB image instead of the
            default resize / to-tensor / normalize pipeline.
        answer_to_index (dict | None): maps an answer string to its class index.
            Answers missing from the mapping (and every answer when the mapping
            is ``None``) are assigned class ``0``.

    Raises:
        NotImplementedError: if ``tokenizer_name`` is not ``'gpt2'``.
    """

    def __init__(self, hf_dataset, tokenizer_name='gpt2',
                 max_seq_length=256, img_size=224, transform=None, answer_to_index=None):

        self.data = hf_dataset
        self.img_size = img_size
        self.max_seq_length = max_seq_length

        # ------------------- Image Preprocessing Function -------------------
        # Default pipeline: resize, convert to tensor, normalize with the commonly used ImageNet statistics.
        default_transform = transforms.Compose([
            transforms.Resize((self.img_size, self.img_size)),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))])
        # A caller-supplied transform takes precedence over the default one.
        self.transform = transform or default_transform

        # ------------------- Text Preprocessing Function -------------------
        # An absent mapping is treated as empty so that __getitem__ falls back to class 0
        # instead of failing on ``answer in None``.
        self.answer_to_index = answer_to_index if answer_to_index is not None else {}

        # Fail early with a clear message instead of an AttributeError on ``None.pad_token``.
        if tokenizer_name != 'gpt2':
            raise NotImplementedError(f"Unsupported tokenizer_name: {tokenizer_name!r} (only 'gpt2' is implemented)")
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # GPT-2 ships without a padding token; reuse the end-of-sequence token for padding.
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the sample dictionary for row ``idx``.

        Returns:
            dict: ``'image'`` (transformed image), ``'input_ids'`` and
            ``'attention_mask'`` (1-D tensors of length ``max_seq_length``) and
            ``'labels'`` (0-dim ``torch.long`` tensor).
        """
        # Fetch the row once instead of once per field; row access on a Hugging Face
        # dataset typically decodes the image every time it is indexed.
        sample = self.data[idx]
        question = sample['question']
        answer = sample['answer']

        # Force three channels (greyscale / CMYK inputs are converted) before the transform.
        image = sample['image'].convert('RGB')
        img_tensor = self.transform(image)

        # Tokenize the question, padding / truncating to a fixed length.
        inputs = self.tokenizer(
            question,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_seq_length
        )

        # Unknown answers fall back to class 0 (kept from the original behaviour).
        answer_idx = self.answer_to_index.get(answer, 0)

        return {
            'image': img_tensor,
            # The tokenizer returns a leading batch dimension of 1; drop it.
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': torch.tensor(answer_idx, dtype=torch.long)
        }


# Custom Collate Function for Batch stacking
def custom_collate_fn(batch):
    """Merge a list of sample dictionaries into a single batch dictionary.

    Images are stacked along a new first dimension, ``input_ids`` and
    ``attention_mask`` are padded with zeros to the longest sequence in the
    batch (batch-first layout), and the labels are gathered into one tensor.

    Args:
        batch (list[dict]): samples with ``'image'``, ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.

    Returns:
        dict: the batched tensors under the same four keys.
    """
    image_list, id_list, mask_list, label_list = [], [], [], []
    for sample in batch:
        image_list.append(sample['image'])
        id_list.append(sample['input_ids'])
        mask_list.append(sample['attention_mask'])
        label_list.append(sample['labels'])

    return {
        'image': torch.stack(image_list),
        'input_ids': pad_sequence(id_list, batch_first=True, padding_value=0),
        'attention_mask': pad_sequence(mask_list, batch_first=True, padding_value=0),
        'labels': torch.tensor(label_list),
    }


# model and modules
# ------------------- Image Encoder (ViT) -------------------
# todo this will be called from PuzzleAI.ModelBase.ROI_models.Get_ROI_model
# Pre-processed image tensor is passed through the Vision Transformer (ViT), to obtain image embedding (ViT CLS token)
class ImageEncoder(nn.Module):
    """Image branch: a pre-trained ViT backbone followed by an optional projection.

    Args:
        embed_size (int): size of the returned embedding. When it differs from
            the backbone's ``embed_dim`` a linear layer maps between the two;
            otherwise the backbone output is returned unchanged.
    """

    def __init__(self, embed_size=768):
        super().__init__()

        # Pre-trained Vision Transformer created without a classification head (num_classes=0).
        self.Image_Encoder = timm.create_model('vit_base_patch16_224', pretrained=True, num_classes=0)

        backbone_dim = self.Image_Encoder.embed_dim
        if backbone_dim == embed_size:
            self.embed_convert = nn.Identity()
        else:
            self.embed_convert = nn.Linear(backbone_dim, embed_size)

    def forward(self, images):
        """Encode a batch of images and project the result to ``embed_size``."""
        # Per the original author's note the backbone yields the CLS-token embedding, shape [B, D].
        backbone_features = self.Image_Encoder(images)
        return self.embed_convert(backbone_features)


# ------------------- Text Encoder (GPT-2) -------------------
# todo this will be called from PuzzleAI.ModelBase.Get_Language_model
# After tokenisation, the query (question tokens) is passed through the GPT-2 model,
# generating a sequence of hidden states (intermediate representations of input text after learning)
# GPT-2 has no dedicated CLS token, so the final-layer hidden state at the end of the sequence is used as the
# question's vector representation.
# A dropout layer is applied to the text embeddings to prevent overfitting.

class TextEncoder(nn.Module):
    """Question branch: pre-trained GPT-2 pooled to one vector per question.

    GPT-2 has no CLS token, so the hidden state of the last *attended* token
    (according to ``attention_mask``) is used as the sentence representation.
    Simply taking position ``-1`` would read a padding position whenever the
    batch is right-padded, which is how the dataset in this notebook pads.

    Args:
        tokenizer_name (str): only ``'gpt2'`` is supported.
        embed_size (int): size of the returned embedding; a linear projection is
            added when it differs from the GPT-2 hidden size.
        dropout_rate (float): dropout applied to the pooled embedding.

    Raises:
        NotImplementedError: if ``tokenizer_name`` is not ``'gpt2'``.
    """

    def __init__(self, tokenizer_name='gpt2', embed_size=768, dropout_rate=0.1):
        super(TextEncoder, self).__init__()
        # Fail early with a clear message instead of an AttributeError on ``None.embed_dim``.
        if tokenizer_name != 'gpt2':
            raise NotImplementedError(f"Unsupported tokenizer_name: {tokenizer_name!r} (only 'gpt2' is implemented)")
        # Pre-trained GPT-2 (768)
        self.Text_Encoder = GPT2Model.from_pretrained('gpt2')
        self.dropout = nn.Dropout(dropout_rate)

        self.embed_convert = nn.Linear(self.Text_Encoder.embed_dim, embed_size) \
            if self.Text_Encoder.embed_dim != embed_size else nn.Identity()

    def forward(self, input_ids, attention_mask):
        """Return one embedding per question.

        Args:
            input_ids: token ids, [B, T].
            attention_mask: 1 for real tokens and 0 for padding, [B, T]; may be
                ``None``, in which case the final position is used.

        Returns:
            Tensor of shape [B, embed_size].
        """
        hidden_states = self.Text_Encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

        if attention_mask is None:
            pooled = hidden_states[:, -1, :]
        else:
            # Index of the last position whose mask is 1. Multiplying the position index by
            # the mask and taking the argmax works for both left- and right-padded batches.
            positions = torch.arange(hidden_states.size(1), device=hidden_states.device)
            last_token_idx = (positions * attention_mask.long()).argmax(dim=1)
            batch_idx = torch.arange(hidden_states.size(0), device=hidden_states.device)
            pooled = hidden_states[batch_idx, last_token_idx]

        pooled = self.dropout(pooled)

        return self.embed_convert(pooled)


# ------------------- Multiple Modality Fusion -------------------
# The text embeddings (query) are passed into the attention mechanism to attend to the image embeddings (key/value).
# The multi-head attention layer computes the attention weights that help the model focus on relevant visual features
# based on the textual query.
# The attended image and text features are concatenated together to form a unified representation of both modalities.
class MultiHeadAttention(nn.Module):
    """Batch-first wrapper around ``nn.MultiheadAttention`` with output dropout.

    In this VQA model the attention lets the question (query) look at the image
    (key / value) in order to answer the question.

    Args:
        embed_size (int): embedding dimension of query, key and value.
        heads (int): number of attention heads.
        dropout_rate (float): dropout applied to the attention output.
    """

    def __init__(self, embed_size=768, heads=8, dropout_rate=0.1):
        super().__init__()
        self.multihead_attn = nn.MultiheadAttention(embed_dim=embed_size, num_heads=heads)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, query, key, value):
        """Attend from ``query`` to ``key`` / ``value``.

        Query (Q): what is being matched (here the text embeddings).
        Key (K): the features compared against (here the image embeddings).
        Value (V): the data returned once attention is applied (here the image embeddings).

        All three inputs are [batch, seq_len, embed_size].

        Returns:
            tuple: the attended output in [batch, seq_len, embed_size] layout and
            the attention weights as returned by ``nn.MultiheadAttention``.
        """
        # The wrapped layer is fed [seq_len, batch, embed_size], so swap the first two axes.
        seq_first_q, seq_first_k, seq_first_v = (t.transpose(0, 1) for t in (query, key, value))

        attended, weights = self.multihead_attn(seq_first_q, seq_first_k, seq_first_v)

        # Apply dropout, then swap back to the batch-first layout.
        return self.dropout(attended).transpose(0, 1), weights


class MultipleModalityFusion(nn.Module):
    """Fuse one text embedding and one image embedding per sample.

    With ``fusion_method='MHSA'`` the text embedding queries the image
    embedding through multi-head attention and the attended result is
    concatenated with the original text embedding.

    Args:
        fusion_method (str): only ``'MHSA'`` is implemented; ``'clip'`` and any
            other value raise ``NotImplementedError``.
        embed_size (int): embedding size of both modalities.
        heads (int): number of attention heads.
        dropout_rate (float): dropout used inside the attention block.
    """

    def __init__(self, fusion_method='MHSA', embed_size=768, heads=8, dropout_rate=0.1):
        super().__init__()
        self.fusion_method = fusion_method
        if self.fusion_method != 'MHSA':
            # 'clip' is reserved for a future strategy; nothing but 'MHSA' exists yet.
            raise NotImplementedError
        self.attention = MultiHeadAttention(embed_size=embed_size, heads=heads, dropout_rate=dropout_rate)

    def forward(self, text_features, image_features):
        """Return the fused representation, shape [B, 2 * embed_size].

        Args:
            text_features: [B, embed_size] question embeddings (used as the query).
            image_features: [B, embed_size] image embeddings (used as key and value).
        """
        if self.fusion_method != 'MHSA':
            raise NotImplementedError

        # Give each embedding a sequence axis of length one: [B, 1, embed_size].
        # NOTE(review): with a single key/value token the attention has only one
        # position to attend to - confirm this is the intended design.
        text_query = text_features.unsqueeze(1)
        image_kv = image_features.unsqueeze(1)
        attended, _ = self.attention(text_query, image_kv, image_kv)

        # Drop the length-one sequence axis and append the untouched text embedding.
        return torch.cat((attended.squeeze(1), text_features), dim=1)


# ------------------- Answer Decoder (VQAbyCLS Classifier) -------------------
class AnswerDecoder_VQAbyCLS(nn.Module):
    """Linear classification head of the VQAbyCLS task.

    VQAbyCLS treats visual question answering as classification: the fused
    features (attended image information concatenated with the text
    representation) are mapped by one linear layer to a score per answer class.

    Args:
        embed_size (int): embedding size of a single modality; the layer input
            has ``2 * embed_size`` features.
        num_classes (int): number of answer classes; must be provided.
    """

    def __init__(self, embed_size=768, num_classes=None):
        assert num_classes is not None
        super().__init__()
        fused_dim = embed_size * 2
        self.classifier = nn.Linear(fused_dim, num_classes)

    def forward(self, combined_features):
        """Map fused features [B, 2 * embed_size] to raw logits [B, num_classes]."""
        return self.classifier(combined_features)


# ------------------- Full VQA Model -------------------
class VQAModel_VQAbyCLS(nn.Module):
    """Complete VQA classifier: image encoder + text encoder + fusion + answer head.

    Args:
        image_encoder (nn.Module): maps images to embeddings.
        text_encoder (nn.Module): maps ``(input_ids, attention_mask)`` to embeddings.
        fusion_method (str): forwarded to ``MultipleModalityFusion``.
        num_classes (int): number of answer classes; must be provided.
        embed_size (int): embedding size shared by both encoders.
        heads (int): number of attention heads in the fusion block.
        dropout_rate (float): dropout used in the fusion block.
    """

    def __init__(self, image_encoder, text_encoder, fusion_method='MHSA',
                 num_classes=None, embed_size=768, heads=8, dropout_rate=0.1):
        assert num_classes is not None
        super().__init__()

        self.image_encoder = image_encoder
        self.text_encoder = text_encoder

        # Other fusion strategies (e.g. clip) may be added later.
        self.fusion = MultipleModalityFusion(
            fusion_method=fusion_method,
            embed_size=embed_size,
            heads=heads,
            dropout_rate=dropout_rate,
        )
        self.answer_decoder = AnswerDecoder_VQAbyCLS(embed_size=embed_size, num_classes=num_classes)

    def forward(self, images, input_ids, attention_mask):
        """Return answer logits for a batch of images and tokenized questions."""
        visual = self.image_encoder(images)
        textual = self.text_encoder(input_ids, attention_mask)
        # Fused features -> one raw score per answer class.
        return self.answer_decoder(self.fusion(textual, visual))


# ------------------- Training and Evaluation -------------------

def train_and_validate(model, train_dataloader, val_dataloader, optimizer, loss_fn, device, epochs=10):
    """Train ``model`` for ``epochs`` epochs and validate it after every epoch.

    Args:
        model: module called as ``model(images, input_ids, attention_mask)`` and
            returning logits of shape [B, num_classes].
        train_dataloader: yields batches with ``'image'``, ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.
        val_dataloader: same batch format, passed to ``evaluate``.
        optimizer: optimizer over the model parameters.
        loss_fn: callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        device: device (or device string) the batches are moved to.
        epochs (int): number of passes over ``train_dataloader``.

    Returns:
        tuple: ``(train_losses, train_accuracies, val_losses, val_accuracies)``,
        four lists holding one value per epoch.
    """
    # Mixed precision only makes sense on CUDA; elsewhere the context is a no-op.
    use_amp = torch.device(device).type == 'cuda'

    best_val_accuracy, best_epoch = 0.0, None  # To track the best validation accuracy
    train_losses, val_losses, train_accuracies, val_accuracies = [], [], [], []

    for epoch in range(epochs):
        # Set model to training mode
        model.train()
        total_train_loss, correct_train, total_train = 0.0, 0, 0

        # Add tqdm to show the progress for each batch
        train_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}/{epochs}")

        for batch in train_bar:
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()  # Clear gradients

            # Only the forward pass and the loss belong inside autocast.
            with torch.amp.autocast('cuda', enabled=use_amp):
                logits = model(images, input_ids, attention_mask)
                loss = loss_fn(logits, labels)

            # Backward pass and optimizer step run outside the autocast region.
            # NOTE(review): float16 autocast is normally paired with a gradient scaler
            # (torch.amp.GradScaler) to avoid gradient underflow - consider adding one.
            loss.backward()
            optimizer.step()

            # Read the loss once; every .item() call synchronises with the device.
            batch_loss = loss.item()
            total_train_loss += batch_loss

            predicted = logits.detach().argmax(dim=1)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()

            # Update tqdm description with current loss and accuracy
            train_bar.set_postfix(loss=batch_loss, accuracy=correct_train / total_train)

        # Calculate average training loss and accuracy for this epoch
        # (both guarded so that an empty dataloader does not raise ZeroDivisionError).
        num_train_batches = len(train_dataloader)
        avg_train_loss = total_train_loss / num_train_batches if num_train_batches > 0 else 0.0
        train_accuracy = correct_train / total_train if total_train > 0 else 0
        train_losses.append(avg_train_loss)
        train_accuracies.append(train_accuracy)

        # Validation step at the end of each epoch
        val_loss, val_accuracy = evaluate(model, val_dataloader, loss_fn, device)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        print(f"Epoch [{epoch + 1}/{epochs}], "
              f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, "
              f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

        # Track the best validation accuracy
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_epoch = epoch + 1

    # Report the tracked best result (previously computed but never shown).
    if best_epoch is not None:
        print(f"Best Validation Accuracy: {best_val_accuracy:.4f} (epoch {best_epoch})")

    return train_losses, train_accuracies, val_losses, val_accuracies


# ------------------- Validation Function (evaluate) -------------------
def evaluate(model, dataloader, loss_fn, device):
    """Compute the average batch loss and the accuracy of ``model`` on ``dataloader``.

    The model is switched to evaluation mode and gradients are disabled.

    Args:
        model: module called as ``model(images, input_ids, attention_mask)``.
        dataloader: yields batches with ``'image'``, ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.
        loss_fn: callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        device: device the batches are moved to.

    Returns:
        tuple: ``(avg_loss, accuracy)``; both are ``0.0`` when the dataloader
        yields no batches.
    """
    model.eval()  # Set model to evaluation mode
    total_loss, correct, total, num_batches = 0.0, 0, 0, 0

    # Add tqdm to show progress for the validation/test loop
    val_bar = tqdm(dataloader, desc="Validating")

    with torch.no_grad():  # Disable gradient calculation for inference
        for batch in val_bar:
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(images, input_ids, attention_mask)
            loss = loss_fn(logits, labels)

            # Read the loss once; every .item() call synchronises with the device.
            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1

            predicted = logits.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Update tqdm description with current loss and accuracy
            val_bar.set_postfix(loss=batch_loss, accuracy=correct / total)

    # Guard both averages so that an empty dataloader does not raise ZeroDivisionError.
    avg_loss = total_loss / num_batches if num_batches > 0 else 0.0
    accuracy = correct / total if total > 0 else 0.0
    return avg_loss, accuracy




# sec 1

In [None]:
if __name__ == '__main__':
    # Constants
    IMG_SIZE = 224
    MAX_SEQ_LENGTH = 256  # Adjust based on typical question length
    BATCH_SIZE = 32  # 8 for small GPU
    EPOCHS = 10
    LEARNING_RATE = 0.0001
    DROP_RATE = 0.1
    HEADS = 8
    EMBED_SIZE = 768
    SEED = 42
    # Ask for at most 12 loader workers, but never more than the machine has CPU cores.
    num_workers = min(12, os.cpu_count() or 1)

    tokenizer_name = 'gpt2'

    # Seed the RNGs so shuffling, dropout and weight initialisation are repeatable.
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # ------------------- Prepare the PathVQA dataset -------------------
    # Download the PathVQA dataset from Hugging Face
    dataset = load_dataset("flaviagiammarino/path-vqa")
    # Preprocess and clean the dataset
    for split in ["train", "validation", "test"]:  # Use "validation" instead of "val"
        dataset[split] = dataset[split].map(lambda example: {
            'question': clean_text(example['question']),
            'answer': clean_text(example['answer'])
        })
    # Check available splits
    print(dataset)

    # Extract all answers from train, validation, and test splits
    all_answers = dataset['train']['answer'] + dataset['validation']['answer'] + dataset['test']['answer']
    # Get unique answers
    unique_answers = np.unique(all_answers)
    # Map each unique answer to an index
    answer_to_index = {ans: idx for idx, ans in enumerate(unique_answers)}
    # Number of classes (unique answers)
    num_classes = len(answer_to_index)
    print(f"Number of unique answers: {num_classes}")

    # ------------------- Create Datasets & DataLoaders -------------------
    # Pass the configuration constants explicitly; previously they were declared above
    # but the datasets silently used their own (coincidentally equal) defaults.
    dataset_kwargs = dict(tokenizer_name=tokenizer_name, max_seq_length=MAX_SEQ_LENGTH,
                          img_size=IMG_SIZE, answer_to_index=answer_to_index)
    train_dataset = VQA_Dataset(hf_dataset=dataset['train'], **dataset_kwargs)
    val_dataset = VQA_Dataset(hf_dataset=dataset['validation'], **dataset_kwargs)
    test_dataset = VQA_Dataset(hf_dataset=dataset['test'], **dataset_kwargs)

    # Settings shared by all three loaders; only the training loader shuffles.
    loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=custom_collate_fn, num_workers=num_workers)
    train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
    test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

    # ------------------- Build VQA model and task-------------------
    # Initialize model
    image_encoder = ImageEncoder(embed_size=EMBED_SIZE)
    text_encoder = TextEncoder(tokenizer_name=tokenizer_name, embed_size=EMBED_SIZE, dropout_rate=DROP_RATE)
    model = VQAModel_VQAbyCLS(image_encoder, text_encoder, fusion_method='MHSA',
                              embed_size=EMBED_SIZE, heads=HEADS, dropout_rate=DROP_RATE,
                              num_classes=num_classes)
    model = torch.compile(model)
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    loss_fn = CrossEntropyLoss()

    # ------------------- Training Loop-------------------
    train_losses, train_accuracies, val_losses, val_accuracies = train_and_validate(
        model, train_dataloader, val_dataloader, optimizer, loss_fn, device, epochs=EPOCHS
    )

    # ------------------- Test Code -------------------
    # Evaluate the model on the test dataset
    test_loss, test_accuracy = evaluate(model, test_dataloader, loss_fn, device)

    # Print out test results
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

    # Save the trained encoders after training VQAbyCLS model
    torch.save(model.image_encoder.state_dict(), 'image_encoder.pth')
    torch.save(model.text_encoder.state_dict(), 'text_encoder.pth')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.29k [00:00<?, ?B/s]

(…)-00000-of-00007-f2d0e9ef9f022d38.parquet:   0%|          | 0.00/42.8M [00:00<?, ?B/s]

(…)-00001-of-00007-47d8e0220bf6c933.parquet:   0%|          | 0.00/81.0M [00:00<?, ?B/s]

(…)-00002-of-00007-7fb5037c4c5da7be.parquet:   0%|          | 0.00/104M [00:00<?, ?B/s]

(…)-00003-of-00007-74b9b7b81cc55f90.parquet:   0%|          | 0.00/90.0M [00:00<?, ?B/s]

(…)-00004-of-00007-77eea90af4a55dce.parquet:   0%|          | 0.00/46.1M [00:00<?, ?B/s]

(…)-00005-of-00007-5332ec423be520bd.parquet:   0%|          | 0.00/55.8M [00:00<?, ?B/s]

(…)-00006-of-00007-637a58c700b604af.parquet:   0%|          | 0.00/57.3M [00:00<?, ?B/s]

(…)-00000-of-00003-90a5518d26493b67.parquet:   0%|          | 0.00/41.3M [00:00<?, ?B/s]

(…)-00001-of-00003-cbfe947a3418595c.parquet:   0%|          | 0.00/45.7M [00:00<?, ?B/s]

(…)-00002-of-00003-9ec816895bd3bc20.parquet:   0%|          | 0.00/64.7M [00:00<?, ?B/s]

(…)-00000-of-00003-e9adadb4799f44d3.parquet:   0%|          | 0.00/41.2M [00:00<?, ?B/s]

(…)-00001-of-00003-7ea98873fc919813.parquet:   0%|          | 0.00/45.3M [00:00<?, ?B/s]

(…)-00002-of-00003-1628308435019820.parquet:   0%|          | 0.00/69.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19654 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6259 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6719 [00:00<?, ? examples/s]

Map:   0%|          | 0/19654 [00:00<?, ? examples/s]

Map:   0%|          | 0/6259 [00:00<?, ? examples/s]

Map:   0%|          | 0/6719 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 19654
    })
    validation: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 6259
    })
    test: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 6719
    })
})
Number of unique answers: 4879


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

W0927 06:35:00.732000 136159341089408 torch/fx/experimental/symbolic_shapes.py:4449] [0/0] xindex is not in var_ranges, defaulting to unknown range.
Training Epoch 1/10: 100%|█████████▉| 614/615 [02:57<00:00, 10.22it/s, accuracy=0.426, loss=3.71]W0927 06:38:04.593000 136159341089408 torch/fx/experimental/symbolic_shapes.py:4449] [0/1] xindex is not in var_ranges, defaulting to unknown range.
W0927 06:38:35.906000 136159341089408 torch/fx/experimental/symbolic_shapes.py:4449] [0/1] ps1 is not in var_ranges, defaulting to unknown range.
W0927 06:38:38.987000 136159341089408 torch/fx/experimental/symbolic_shapes.py:4449] [0/1] ps1 is not in var_ranges, defaulting to unknown range.
Training Epoch 1/10: 100%|██████████| 615/615 [04:27<00:00,  2.30it/s, accuracy=0.426, loss=0.893]
W0927 06:39:23.785000 136159341089408 torch/fx/experimental/symbolic_shapes.py:4449] [0/2] xindex is not in var_ranges, defaulting to unknown range.
Validating: 100%|██████████| 196/196 [01:16<00:00,  2.57it/s, acc

Epoch [1/10], Train Loss: 3.4335, Train Accuracy: 0.4261, Validation Loss: 2.9980, Validation Accuracy: 0.4844


Training Epoch 2/10: 100%|██████████| 615/615 [01:33<00:00,  6.56it/s, accuracy=0.508, loss=1.95]
Validating: 100%|██████████| 196/196 [00:37<00:00,  5.21it/s, accuracy=0.504, loss=5.83]


Epoch [2/10], Train Loss: 2.6782, Train Accuracy: 0.5076, Validation Loss: 3.0584, Validation Accuracy: 0.5041


Training Epoch 3/10: 100%|██████████| 615/615 [01:33<00:00,  6.58it/s, accuracy=0.566, loss=2.8]
Validating: 100%|██████████| 196/196 [00:37<00:00,  5.23it/s, accuracy=0.509, loss=6.18]


Epoch [3/10], Train Loss: 2.1551, Train Accuracy: 0.5658, Validation Loss: 3.1409, Validation Accuracy: 0.5090


Training Epoch 4/10: 100%|██████████| 615/615 [01:34<00:00,  6.52it/s, accuracy=0.636, loss=1.63]
Validating: 100%|██████████| 196/196 [00:37<00:00,  5.20it/s, accuracy=0.54, loss=6.81]


Epoch [4/10], Train Loss: 1.5645, Train Accuracy: 0.6358, Validation Loss: 3.2800, Validation Accuracy: 0.5397


Training Epoch 5/10: 100%|██████████| 615/615 [01:34<00:00,  6.54it/s, accuracy=0.716, loss=2.4]
Validating: 100%|██████████| 196/196 [00:37<00:00,  5.24it/s, accuracy=0.542, loss=7]


Epoch [5/10], Train Loss: 1.0985, Train Accuracy: 0.7159, Validation Loss: 3.3104, Validation Accuracy: 0.5421


Training Epoch 6/10: 100%|██████████| 615/615 [01:33<00:00,  6.57it/s, accuracy=0.76, loss=0.274]
Validating: 100%|██████████| 196/196 [00:37<00:00,  5.24it/s, accuracy=0.539, loss=7.22]


Epoch [6/10], Train Loss: 0.8334, Train Accuracy: 0.7603, Validation Loss: 3.4170, Validation Accuracy: 0.5394


Training Epoch 7/10: 100%|██████████| 615/615 [01:33<00:00,  6.55it/s, accuracy=0.79, loss=0.778]
Validating: 100%|██████████| 196/196 [00:37<00:00,  5.24it/s, accuracy=0.539, loss=7.34]


Epoch [7/10], Train Loss: 0.6820, Train Accuracy: 0.7896, Validation Loss: 3.4718, Validation Accuracy: 0.5394


Training Epoch 8/10: 100%|██████████| 615/615 [01:33<00:00,  6.56it/s, accuracy=0.815, loss=0.088]
Validating: 100%|██████████| 196/196 [00:37<00:00,  5.22it/s, accuracy=0.538, loss=7.62]


Epoch [8/10], Train Loss: 0.5871, Train Accuracy: 0.8147, Validation Loss: 3.5370, Validation Accuracy: 0.5383


Training Epoch 9/10: 100%|██████████| 615/615 [01:34<00:00,  6.54it/s, accuracy=0.836, loss=0.44]
Validating: 100%|██████████| 196/196 [00:37<00:00,  5.21it/s, accuracy=0.538, loss=7.77]


Epoch [9/10], Train Loss: 0.5194, Train Accuracy: 0.8356, Validation Loss: 3.6817, Validation Accuracy: 0.5384


Training Epoch 10/10: 100%|██████████| 615/615 [01:33<00:00,  6.55it/s, accuracy=0.845, loss=0.385]
Validating: 100%|██████████| 196/196 [00:37<00:00,  5.18it/s, accuracy=0.542, loss=7.82]


Epoch [10/10], Train Loss: 0.4716, Train Accuracy: 0.8447, Validation Loss: 3.7447, Validation Accuracy: 0.5416


Validating: 100%|██████████| 210/210 [00:39<00:00,  5.27it/s, accuracy=0.545, loss=7.87]


Test Loss: 3.7270, Test Accuracy: 0.5452


In [None]:
# ------------------- Testing Function for Single Input -------------------
def test_model_with_input(model, dataset, answer_to_index, index_to_answer, device):
    """
    Test the model with a single input (image and question).
    This function will print the predicted answer.
    """
    # Set the model to evaluation mode
    model.eval()

    # Choose a random sample from the dataset (or you can pass an index)
    sample = dataset[0]  # Replace 0 with any valid index if you want to test specific samples
    image = sample['image'].unsqueeze(0).to(device)  # Add batch dimension
    input_ids = sample['input_ids'].unsqueeze(0).to(device)  # Add batch dimension
    attention_mask = sample['attention_mask'].unsqueeze(0).to(device)  # Add batch dimension

    with torch.no_grad():  # No need to track gradients during testing
        logits = model(image, input_ids, attention_mask)
        predicted_idx = torch.argmax(logits, dim=1).item()

    # Convert the predicted index back to the answer
    predicted_answer = index_to_answer[predicted_idx]

    # Print the question and the predicted answer
    question = dataset.tokenizer.decode(input_ids.squeeze(0), skip_special_tokens=True)
    print(f"Question: {question}")
    print(f"Predicted Answer: {predicted_answer}")


# Helper function to create the index-to-answer mapping
def create_index_to_answer_mapping(answer_to_index):
    """Invert the answer -> index dictionary into an index -> answer dictionary."""
    index_to_answer = {}
    for answer, index in answer_to_index.items():
        index_to_answer[index] = answer
    return index_to_answer

# ------------------- Single Input Testing -------------------
# Create reverse mapping: index to answer
# Build the reverse lookup (class index -> answer string), then push one sample of the
# test split through the trained model and print the question with its predicted answer.
index_to_answer = create_index_to_answer_mapping(answer_to_index)
test_model_with_input(
    model=model,
    dataset=test_dataset,
    answer_to_index=answer_to_index,
    index_to_answer=index_to_answer,
    device=device,
)

Question: what are positively charged, thus allowing the compaction of the negatively charged dna?
Predicted Answer: all three transverse sections of myocardium
