In [43]:
import torch
torch.cuda.empty_cache()

In [1]:
import json
import requests

# Directory for all images
img_dir = 'https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/'

# Directory for annotation files
ann_dir = 'https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations/'

train_annotation_path = '{}{}'.format(ann_dir, 'train.json')
val_annotation_path = '{}{}'.format(ann_dir, 'val.json')
test_annotation_path = '{}{}'.format(ann_dir, 'test.json')


In [2]:
# Train
train_data = requests.get(train_annotation_path, allow_redirects=True)
train_data = train_data.json()

# Validation
val_data = requests.get(val_annotation_path, allow_redirects=True)
val_data = val_data.json()

# Test
test_data = requests.get(test_annotation_path, allow_redirects=True)
test_data = test_data.json()

print('Train set size:', len(train_data))
print('Validation set size:', len(val_data))
print('Test set size:', len(test_data))

Train set size: 20523
Validation set size: 4319
Test set size: 8000


In [3]:
vq = train_data[1345]
a = val_data[0]
b = test_data[0]

# printing the entire set of annotation to see the structure
print('First sample:', vq)
print('Second sample:', a)
print('Third sample:', b)

First sample: {'image': 'VizWiz_train_00001345.jpg', 'question': 'Hi can you tell me if this package of coffee is caffeinated or decaffeinated? Thank you.', 'answers': [{'answer_confidence': 'yes', 'answer': 'i dont know'}, {'answer_confidence': 'yes', 'answer': 'decaffeinated'}, {'answer_confidence': 'yes', 'answer': 'decaffeinated'}, {'answer_confidence': 'yes', 'answer': 'decaf'}, {'answer_confidence': 'yes', 'answer': 'decaf'}, {'answer_confidence': 'yes', 'answer': 'decaffeinated'}, {'answer_confidence': 'yes', 'answer': 'decaf'}, {'answer_confidence': 'maybe', 'answer': 'decaffeinated'}, {'answer_confidence': 'yes', 'answer': 'decaf'}, {'answer_confidence': 'yes', 'answer': 'decaffeinated'}], 'answer_type': 'other', 'answerable': 1}
Second sample: {'image': 'VizWiz_val_00000000.jpg', 'question': 'Ok. There is another picture I hope it is a better one.', 'answers': [{'answer': 'unanswerable', 'answer_confidence': 'yes'}, {'answer': 'unanswerable', 'answer_confidence': 'yes'}, {'an

In [4]:
train_subset = train_data[:1200] 

In [5]:
print(train_subset[1119])

{'image': 'VizWiz_train_00001119.jpg', 'question': 'Hi can you tell me what this product is? ', 'answers': [{'answer_confidence': 'yes', 'answer': 'drink mix'}, {'answer_confidence': 'yes', 'answer': 'powdered green tea'}, {'answer_confidence': 'maybe', 'answer': 'unanswerable'}, {'answer_confidence': 'yes', 'answer': 'no'}, {'answer_confidence': 'yes', 'answer': 'no'}, {'answer_confidence': 'yes', 'answer': 'green tea'}, {'answer_confidence': 'maybe', 'answer': 'soup'}, {'answer_confidence': 'yes', 'answer': 'unanswerable'}, {'answer_confidence': 'no', 'answer': 'tea'}, {'answer_confidence': 'yes', 'answer': 'green tea'}], 'answer_type': 'other', 'answerable': 1}


In [6]:
import requests
from PIL import Image
from io import BytesIO
import torch
from torchvision import transforms
from concurrent.futures import ThreadPoolExecutor  # For parallel execution

# Image transformation pipeline
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to 224x224
    transforms.ToTensor(),          # Convert to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize
])

# Function to download & preprocess an image
def process_image(sample):
    image_url = img_dir + sample['image']
    try:
        response = requests.get(image_url, timeout=5)  # 5-second timeout
        response.raise_for_status()  # Raise error if failed
        img = Image.open(BytesIO(response.content)).convert("RGB")  # Ensure 3 channels
        return transform(img)
    except Exception as e:
        print(f"Failed to load {image_url}: {e}")
        return torch.zeros(3, 224, 224)  # Return a blank image instead of crashing

# Load images in parallel using ThreadPoolExecutor
num_workers = 8  # Adjust based on system performance
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    image_tensors = list(executor.map(process_image, train_subset))

# Stack tensors into a batch
image_tensors = torch.stack(image_tensors)
print('Train images shape:', image_tensors.shape)  # Expected: [num_samples, 3, 224, 224]


Train images shape: torch.Size([1200, 3, 224, 224])


In [7]:
import torch
import string
import spacy

# Load English tokenizer from spaCy
nlp = spacy.load('en_core_web_sm')

# Tokenize and preprocess questions
def tokenize_question(question):
    doc = nlp(question.lower())  # Convert to lowercase
    return [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

# Tokenize all questions
tokenized_questions = [tokenize_question(sample['question']) for sample in train_subset]

# Create a word-to-index vocabulary
word2idx = {"<PAD>": 0, "<UNK>": 1}  # Special tokens
for question in tokenized_questions:
    for token in question:
        if token not in word2idx:
            word2idx[token] = len(word2idx)

# Vocabulary size
vocab_size = len(word2idx)
print("Vocabulary size:", vocab_size)

# Convert tokenized questions into index sequences
max_length = 20  # Fixed length for padding
question_tensors = []
for question in tokenized_questions:
    seq = [word2idx.get(token, word2idx["<UNK>"]) for token in question]
    seq += [word2idx["<PAD>"]] * (max_length - len(seq))  # Pad if shorter
    seq = seq[:max_length]  # Truncate if longer
    question_tensors.append(torch.tensor(seq))

# Stack into a tensor
question_tensors = torch.stack(question_tensors)
print("Train questions shape:", question_tensors.shape)  # Expected: [num_samples, max_length]


Vocabulary size: 633
Train questions shape: torch.Size([1200, 20])


In [12]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_text(question, max_length=20):
    encoding = tokenizer(question, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt")
    return encoding["input_ids"].squeeze(0), encoding["attention_mask"].squeeze(0)


In [20]:
from torchvision import transforms

# Define Image Transformations (ViT-compatible)
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to 224x224 (ViT standard input size)
    transforms.ToTensor(),          # Convert image to tensor
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # Normalize (ViT prefers -1 to 1)
])


In [21]:
from torch.utils.data import Dataset
import torch
from PIL import Image

class VizWizViTBERTDataset(Dataset):
    def __init__(self, data, transform, max_length=20, base_url="https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/"):
        self.data = data
        self.transform = transform
        self.max_length = max_length
        self.base_url = base_url  # Use online images

    def __len__(self):  
        return len(self.data)  #  This fixes the error

    def __getitem__(self, idx):
        sample = self.data[idx]

        #  Load Image from Online URL
        image_url = self.base_url + sample["image"]  # Construct URL
        try:
            response = requests.get(image_url, timeout=5)
            response.raise_for_status()  # Ensure it's a valid response
            image = Image.open(BytesIO(response.content)).convert("RGB")
            image = self.transform(image)
        except Exception as e:
            print(f" Failed to load image: {image_url} ({e})")
            image = torch.zeros(3, 224, 224)  # Return a blank image on failure

        #  Tokenize Text
        input_ids, attention_mask = tokenize_text(sample["question"], self.max_length)

        #  Convert Label to Tensor
        label = torch.tensor(sample["answerable"], dtype=torch.long)

        return {
            "image": image,  #  Now a tensor
            "input_ids": input_ids,  #  Tensor
            "attention_mask": attention_mask,  #  Tensor
            "label": label  #  Tensor (0 or 1)
        }

# Create Dataset
train_dataset = VizWizViTBERTDataset(train_subset, transform=image_transform)


In [22]:
import torch

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [23]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    images = torch.stack([item["image"] for item in batch])  #  Stack image tensors
    input_ids = pad_sequence([item["input_ids"] for item in batch], batch_first=True, padding_value=0)
    attention_mask = pad_sequence([item["attention_mask"] for item in batch], batch_first=True, padding_value=0)
    labels = torch.tensor([item["label"] for item in batch], dtype=torch.long)

    return {
        "image": images,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "label": labels
    }

#  Create DataLoader with `collate_fn`
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0, collate_fn=collate_fn)


In [24]:
print(train_dataset[0])  # Check the dictionary keys


{'image': tensor([[[ 0.7020,  0.4118, -0.0431,  ...,  0.8118,  0.7961,  0.8510],
         [ 0.6471,  0.3255, -0.0745,  ...,  0.8118,  0.8353,  0.8118],
         [ 0.5843,  0.2392, -0.1137,  ...,  0.7804,  0.8353,  0.8275],
         ...,
         [-0.2706, -0.2706, -0.2706,  ..., -0.0667, -0.0745, -0.0431],
         [-0.2863, -0.2863, -0.3098,  ..., -0.0824, -0.0745, -0.0745],
         [-0.3098, -0.2941, -0.3020,  ..., -0.0980, -0.0824, -0.0902]],

        [[ 0.3961,  0.1529, -0.2863,  ...,  0.2863,  0.2627,  0.3098],
         [ 0.3490,  0.0588, -0.3098,  ...,  0.3020,  0.3098,  0.2863],
         [ 0.3098, -0.0196, -0.3412,  ...,  0.2941,  0.3412,  0.3098],
         ...,
         [-0.5922, -0.5765, -0.5608,  ..., -0.2235, -0.2392, -0.2000],
         [-0.6000, -0.5922, -0.6000,  ..., -0.2471, -0.2314, -0.2314],
         [-0.6157, -0.5922, -0.5922,  ..., -0.2549, -0.2314, -0.2314]],

        [[ 0.0902, -0.1137, -0.5137,  ..., -0.3725, -0.3725, -0.2941],
         [ 0.1294, -0.1686, -0.5294

In [25]:
#  Fetch a Batch
batch = next(iter(train_loader))

#  Check Shapes
print("Batch Image Shape:", batch["image"].shape)  # Expected: [32, 3, 224, 224]
print("Batch Input IDs Shape:", batch["input_ids"].shape)  # Expected: [32, max_length]
print("Batch Attention Mask Shape:", batch["attention_mask"].shape)  # Expected: [32, max_length]
print("Batch Label Shape:", batch["label"].shape)  # Expected: [32]


Batch Image Shape: torch.Size([32, 3, 224, 224])
Batch Input IDs Shape: torch.Size([32, 20])
Batch Attention Mask Shape: torch.Size([32, 20])
Batch Label Shape: torch.Size([32])


In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel
import timm  # For Vision Transformer


In [39]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel
import timm  

class CustomViTBERTClassifier(nn.Module):
    def __init__(self, bert_model="bert-base-uncased", vit_model="vit_base_patch16_224", num_classes=2):
        super().__init__()

        #  Vision Transformer (ViT) - REMOVE default classifier
        self.vit = timm.create_model(vit_model, pretrained=True, num_classes=0)  # Removes classifier
        vit_output_size = self.vit.num_features  # Correct output size from ViT

        #  Reduce ViT Output Dimensionality (to match BERT)
        self.vit_reduction = nn.Linear(vit_output_size, 768)  # Now input size is correct!

        #  BERT Model - REMOVE default classifier
        self.bert = BertModel.from_pretrained(bert_model)
        bert_output_size = self.bert.config.hidden_size  # 768

        #  Attention-Based Fusion (Custom Design)
        self.attention_layer = nn.MultiheadAttention(embed_dim=768, num_heads=8)

        #  Custom Feature Fusion & Processing
        self.fusion = nn.Sequential(
            nn.Linear(768, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256)
        )

        #  Custom Classification Head
        self.classifier = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),
            nn.Softmax(dim=1)  # Output probabilities
        )

    def forward(self, image, input_ids, attention_mask):
        #  Extract Image Features (ViT)
        img_features = self.vit(image)  # Now this is [batch, 768]
        img_features = self.vit_reduction(img_features)  # Reduce to 768 dims

        #  Extract Text Features (BERT)
        text_features = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output

        #  Apply Custom Attention Mechanism
        combined_features = torch.stack((img_features, text_features), dim=0)  # Stack instead of cat
        combined_features, _ = self.attention_layer(combined_features, combined_features, combined_features)
        combined_features = combined_features.mean(dim=0)  

        #  Custom Feature Fusion
        fused_features = self.fusion(combined_features)

        #  Classifier Output
        output = self.classifier(fused_features)

        return output


In [40]:
#  Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

#  Initialize Model & Move to GPU
model = CustomViTBERTClassifier().to(device)


Using device: cuda


In [41]:
criterion = nn.CrossEntropyLoss()  # Binary classification
optimizer = optim.Adam(model.parameters(), lr=2e-5)  # Fine-tuning with low LR


In [160]:
import torch
torch.cuda.empty_cache()
torch.cuda.ipc_collect()


In [161]:
import gc
gc.collect()
torch.cuda.empty_cache()


In [47]:
def train_model(model, train_loader, criterion, optimizer, num_epochs=5):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0
        TP, TN, FP, FN = 0, 0, 0, 0  # True Positives, True Negatives, False Positives, False Negatives

        for batch in train_loader:
            #  Move Data to GPU
            images = batch["image"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(images, input_ids, attention_mask)  # Forward pass
            loss = criterion(outputs, labels)  # Compute loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update model weights

            total_loss += loss.item()

            #  Predictions
            _, predicted = torch.max(outputs, 1)  # Get predicted class (0 or 1)

            #  Compute TP, TN, FP, FN
            TP += ((predicted == 1) & (labels == 1)).sum().item()
            TN += ((predicted == 0) & (labels == 0)).sum().item()
            FP += ((predicted == 1) & (labels == 0)).sum().item()
            FN += ((predicted == 0) & (labels == 1)).sum().item()

        #  Compute Accuracy using Correct Formula
        accuracy_cls = (TP + TN) / (TP + FP + TN + FN)

        #  Print Metrics for the Epoch (NO Debug Outputs)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}, Accuracy_cls: {accuracy_cls:.4f}")
        print(f"TP: {TP}, TN: {TN}, FP: {FP}, FN: {FN}\n")

#  Train the Model
train_model(model, train_loader, criterion, optimizer, num_epochs=5)


Epoch [1/5], Loss: 25.7323, Accuracy_cls: 0.5825
TP: 476, TN: 223, FP: 137, FN: 364

Epoch [2/5], Loss: 22.9753, Accuracy_cls: 0.7408
TP: 646, TN: 243, FP: 117, FN: 194

Epoch [3/5], Loss: 19.4737, Accuracy_cls: 0.8633
TP: 752, TN: 284, FP: 76, FN: 88

Epoch [4/5], Loss: 16.2384, Accuracy_cls: 0.9583
TP: 821, TN: 329, FP: 31, FN: 19

Epoch [5/5], Loss: 14.3011, Accuracy_cls: 0.9858
TP: 832, TN: 351, FP: 9, FN: 8



In [52]:
def collate_fn1(batch):
    images = torch.stack([item["image"] for item in batch])  
    input_ids = pad_sequence([item["input_ids"] for item in batch], batch_first=True, padding_value=0)
    attention_mask = pad_sequence([item["attention_mask"] for item in batch], batch_first=True, padding_value=0)

    # ðŸ”¹ Only include labels if they exist in the batch (for training)
    if "label" in batch[0]:  
        labels = torch.tensor([item["label"] for item in batch], dtype=torch.long)
        return {
            "image": images,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": labels
        }
    else:  
        return {  
            "image": images,
            "input_ids": input_ids,
            "attention_mask": attention_mask
        }


In [53]:
class VizWizTestDataset(Dataset):
    def __init__(self, data, transform, max_length=20, base_url="https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/"):
        self.data = data
        self.transform = transform
        self.max_length = max_length
        self.base_url = base_url  # Online images

    def __len__(self):  
        return len(self.data)  

    def __getitem__(self, idx):
        sample = self.data[idx]

        #  Load Image from Online URL
        image_url = self.base_url + sample["image"]
        try:
            response = requests.get(image_url, timeout=5)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content)).convert("RGB")
            image = self.transform(image)
        except Exception as e:
            print(f" Failed to load image: {image_url} ({e})")
            image = torch.zeros(3, 224, 224)  # Return a blank image on failure

        #  Tokenize Text
        input_ids, attention_mask = tokenize_text(sample["question"], self.max_length)

        return {
            "image": image,  
            "input_ids": input_ids,  
            "attention_mask": attention_mask  
        }

#  Create Test Dataset
test_dataset = VizWizTestDataset(test_data[:100], transform=image_transform)  # First 100 samples only

#  Create Test DataLoader
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0, collate_fn=collate_fn1)


In [54]:
def get_first_100_predictions(model, test_loader, device):
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in test_loader:
            images = batch["image"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(images, input_ids, attention_mask)
            _, predicted = torch.max(outputs, 1)  # Get predicted class (0 or 1)

            predictions.extend(predicted.cpu().tolist())

            if len(predictions) >= 100:  
                break  # Stop after first 100 samples

    return predictions[:100]  

#  Get First 100 Predictions
first_100_predictions = get_first_100_predictions(model, test_loader, device)


In [55]:
import pickle

# Convert to a PyTorch tensor
predictions_tensor = torch.tensor(first_100_predictions)  # 1D tensor

#  Save the tensor to a .pkl file
file_name = "Srinath_Muppala_challenge1.pkl"  # Change this to your actual name
torch.save(predictions_tensor, file_name)

print(f" Predictions saved successfully to {file_name}")


âœ… Predictions saved successfully to Srinath_Muppala_challenge1.pkl


In [56]:
import torch

# Load the predictions from the file
predictions = torch.load("Srinath_Muppala_challenge1.pkl")

# Print the shape and first few values
print("Predictions Shape:", predictions.shape)  # Expected: torch.Size([100])
print("First 10 Predictions:", predictions[:10].tolist())  # Convert to list for easy reading


Predictions Shape: torch.Size([100])
First 10 Predictions: [1, 0, 1, 1, 1, 0, 1, 1, 1, 1]


In [62]:
print(test_data[:10])

[{'image': 'VizWiz_test_00000000.jpg', 'question': 'What is this? And what color is it?'}, {'image': 'VizWiz_test_00000001.jpg', 'question': 'What is this?'}, {'image': 'VizWiz_test_00000002.jpg', 'question': 'Has this oven gotten up to four hundred fifty degrees Fahrenheit yet?'}, {'image': 'VizWiz_test_00000003.jpg', 'question': 'What is this?'}, {'image': 'VizWiz_test_00000004.jpg', 'question': 'What is this?'}, {'image': 'VizWiz_test_00000005.jpg', 'question': 'What kind of key is this?'}, {'image': 'VizWiz_test_00000006.jpg', 'question': 'What does it say on here?'}, {'image': 'VizWiz_test_00000007.jpg', 'question': 'What is this? '}, {'image': 'VizWiz_test_00000008.jpg', 'question': 'What is this? What is this? '}, {'image': 'VizWiz_test_00000009.jpg', 'question': 'Do these beans look like black beans or pinto beans?'}]


In [223]:
import torch
import torch.nn as nn
import timm
from transformers import BertModel

class CustomViTBERTClassifier1(nn.Module):
    def __init__(self, bert_model="bert-base-uncased", vit_model="vit_base_patch16_224", hidden_size=512, vocab_size=30522, top_k=5):
        super(CustomViTBERTClassifier1, self).__init__()
        self.top_k = top_k
        self.vocab_size = vocab_size

        # ViT Encoder
        self.vit = timm.create_model(vit_model, pretrained=True)
        self.vit.head = nn.Identity()
        self.vit_output_size = self.vit.num_features

        # BERT Encoder
        self.bert = BertModel.from_pretrained(bert_model)
        self.bert_output_size = self.bert.config.hidden_size

        # Project to hidden size
        self.projection = nn.Linear(self.vit_output_size + self.bert_output_size, hidden_size)

        # Decoder
        self.decoder = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True
        )

        # Embedding for tokens
        self.embedding = nn.Embedding(vocab_size, hidden_size)

        # Final vocab prediction
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, image, input_ids, attention_mask, answer_tokens=None):
        img_features = self.vit(image)
        text_features = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        combined = torch.cat((img_features, text_features), dim=1)
        projected = self.projection(combined)

        if answer_tokens is not None:
            # Training Mode
            decoder_input = self.embedding(answer_tokens)
            h0 = projected.unsqueeze(0).repeat(self.decoder.num_layers, 1, 1)
            c0 = torch.zeros_like(h0)
            decoder_outputs, _ = self.decoder(decoder_input, (h0, c0))
            output_logits = self.fc(decoder_outputs)
            return output_logits
        else:
            # Inference Mode with Top-k Sampling
            batch_size = image.size(0)
            current_token = torch.full((batch_size, 1), 101, dtype=torch.long, device=image.device)
            current_input = self.embedding(current_token)
            h0 = projected.unsqueeze(0).repeat(self.decoder.num_layers, 1, 1)
            c0 = torch.zeros_like(h0)
            hidden = (h0, c0)

            generated_tokens = []

            for _ in range(20):
                output, hidden = self.decoder(current_input, hidden)
                logits = self.fc(output[:, -1, :])  # [B, vocab_size]

                if self.top_k > 0:
                    topk_vals, topk_idx = torch.topk(logits, self.top_k, dim=-1)
                    probs = torch.nn.functional.softmax(topk_vals, dim=-1)
                    next_token = topk_idx.gather(-1, torch.multinomial(probs, num_samples=1))
                else:
                    next_token = logits.argmax(dim=-1, keepdim=True)

                generated_tokens.append(next_token)
                current_input = self.embedding(next_token)

            return torch.cat(generated_tokens, dim=1)  # [B, 20]


In [224]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn_textgen(batch):
    images = torch.stack([item["image"] for item in batch])  #  Stack image tensors
    input_ids = pad_sequence([item["input_ids"] for item in batch], batch_first=True, padding_value=0)
    attention_mask = pad_sequence([item["attention_mask"] for item in batch], batch_first=True, padding_value=0)

    #  Fix: Use answer_tokens instead of label
    answer_tokens = pad_sequence([item["answer_tokens"] for item in batch], batch_first=True, padding_value=0)

    return {
        "image": images,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "answer_tokens": answer_tokens  #  Use this instead of "label"
    }


In [225]:
class VizWizTextGenDataset(Dataset):
    def __init__(self, data, transform, tokenizer, max_length=20, base_url="https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/"):
        self.data = data
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.base_url = base_url

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]

        #  Load Image from Online URL
        image_url = self.base_url + sample["image"]
        try:
            response = requests.get(image_url, timeout=5)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content)).convert("RGB")
            image = self.transform(image)
        except Exception as e:
            print(f" Failed to load image: {image_url} ({e})")
            image = torch.zeros(3, 224, 224)

        #  Tokenize Question
        input_ids, attention_mask = tokenize_text(sample["question"], self.max_length)

        #  Tokenize Answer (if available)
        if "answers" in sample:
            answer_text = sample["answers"][0]["answer"]
            answer_tokens = self.tokenizer(answer_text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")["input_ids"].squeeze(0)
        else:
            answer_tokens = torch.zeros(self.max_length, dtype=torch.long)  # Placeholder

        return {
            "image": image,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "answer_tokens": answer_tokens
        }

#  Initialize Dataset
train_dataset = VizWizTextGenDataset(train_subset, transform=image_transform, tokenizer=tokenizer)

#  DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0, collate_fn=collate_fn_textgen)


In [203]:
print(train_dataset[0])

{'image': tensor([[[ 0.7020,  0.4118, -0.0431,  ...,  0.8118,  0.7961,  0.8510],
         [ 0.6471,  0.3255, -0.0745,  ...,  0.8118,  0.8353,  0.8118],
         [ 0.5843,  0.2392, -0.1137,  ...,  0.7804,  0.8353,  0.8275],
         ...,
         [-0.2706, -0.2706, -0.2706,  ..., -0.0667, -0.0745, -0.0431],
         [-0.2863, -0.2863, -0.3098,  ..., -0.0824, -0.0745, -0.0745],
         [-0.3098, -0.2941, -0.3020,  ..., -0.0980, -0.0824, -0.0902]],

        [[ 0.3961,  0.1529, -0.2863,  ...,  0.2863,  0.2627,  0.3098],
         [ 0.3490,  0.0588, -0.3098,  ...,  0.3020,  0.3098,  0.2863],
         [ 0.3098, -0.0196, -0.3412,  ...,  0.2941,  0.3412,  0.3098],
         ...,
         [-0.5922, -0.5765, -0.5608,  ..., -0.2235, -0.2392, -0.2000],
         [-0.6000, -0.5922, -0.6000,  ..., -0.2471, -0.2314, -0.2314],
         [-0.6157, -0.5922, -0.5922,  ..., -0.2549, -0.2314, -0.2314]],

        [[ 0.0902, -0.1137, -0.5137,  ..., -0.3725, -0.3725, -0.2941],
         [ 0.1294, -0.1686, -0.5294

In [227]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from transformers import BertModel
import timm

#  Define Loss & Optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding tokens
optimizer = optim.Adam(model.parameters(), lr=1e-2)
model = CustomViTBERTClassifier1().to(device)

def train_textgen(model, train_loader, criterion, optimizer, num_epochs=5):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0
        total_correct = 0
        total_tokens = 0

        for batch in train_loader:
            images = batch["image"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            answer_tokens = batch["answer_tokens"].to(device)  # Target sequence

            optimizer.zero_grad()
            outputs = model(images, input_ids, attention_mask, answer_tokens)

            #  Compute loss only for non-padding tokens
            loss = criterion(outputs.view(-1, outputs.size(-1)), answer_tokens.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            #  Compute Token Accuracy
            predictions = outputs.argmax(dim=-1)  # Get most probable token for each position
            mask = (answer_tokens != 0)  # Ignore padding tokens
            correct = (predictions == answer_tokens) & mask
            total_correct += correct.sum().item()
            total_tokens += mask.sum().item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}")

#  Train the Model
train_textgen(model, train_loader, criterion, optimizer, num_epochs=10)


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
class VizWizTextGenDataset(Dataset):
    def __init__(self, data, transform, tokenizer, max_length=20, base_url="https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/"):
        self.data = data
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.base_url = base_url

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]

        #  Load Image from URL
        image_url = self.base_url + sample["image"]
        try:
            response = requests.get(image_url, timeout=5)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content)).convert("RGB")
            image = self.transform(image)
        except Exception as e:
            print(f" Failed to load image: {image_url} ({e})")
            image = torch.zeros(3, 224, 224)  # Placeholder if image fails to load

        #  Tokenize Question
        input_ids, attention_mask = tokenize_text(sample["question"], self.max_length)

        #  For training, extract answers (not applicable to test set)
        if "answers" in sample:  
            answer_text = sample["answers"][0] if sample["answers"] else ""
            answer_tokens = self.tokenizer(answer_text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")["input_ids"].squeeze(0)
        else:
            answer_tokens = None  # No ground-truth answer for test set

        return {
            "image": image,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "answer_tokens": answer_tokens  # Will be `None` for test set
        }


In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn_textgen2(batch):
    images = torch.stack([item["image"] for item in batch])
    input_ids = pad_sequence([item["input_ids"] for item in batch], batch_first=True, padding_value=0)
    attention_mask = pad_sequence([item["attention_mask"] for item in batch], batch_first=True, padding_value=0)

    #  Check if 'answer_tokens' exists in the batch (for training data)
    if "answer_tokens" in batch[0]:  
        answer_tokens = pad_sequence(
            [item["answer_tokens"] for item in batch if item["answer_tokens"] is not None], 
            batch_first=True, 
            padding_value=0
        )
    else:
        answer_tokens = None  # Test set does not have answer_tokens

    return {
        "image": images,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "answer_tokens": answer_tokens  # This will be `None` for test set
    }


In [None]:
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=0, collate_fn=collate_fn_textgen2)


In [None]:
def decode_predictions(generated_ids, tokenizer):
    answers = []
    for ids in generated_ids:
        text = tokenizer.decode(ids, skip_special_tokens=True)
        cleaned = " ".join([w for w in text.split() if not w.startswith("[unused")])
        answers.append(cleaned.strip())
    return answers


In [None]:
import json

def generate_answers(model, test_loader, device, tokenizer):
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            images = batch["image"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # ðŸ”¸ Generate token IDs
            generated_ids = model(images, input_ids, attention_mask)

            # ðŸ”¸ Clean decoded answers
            answers = decode_predictions(generated_ids, tokenizer)

            # ðŸ”¸ Build prediction JSON
            for i in range(len(answers)):
                predictions.append({
                    "image": test_data[batch_idx * len(answers) + i]["image"],
                    "answer": answers[i]
                })

    return predictions[:100]

#  Generate Predictions
predictions = generate_answers(model, test_loader, device, tokenizer)

#  Save JSON File
with open("Srinath_Muppala_challenge2.json", "w") as f:
    json.dump(predictions, f, indent=4)

print(" Predictions saved to Srinath_Muppala_challenge2.json")


In [87]:
print(test_data[0])

{'image': 'VizWiz_test_00000000.jpg', 'question': 'What is this? And what color is it?'}


In [214]:
import json

#  Load JSON File
with open("Srinath_Muppala_challenge2.json", "r") as f:
    predictions = json.load(f)

#  Print First 10 Predictions
print(json.dumps(predictions[:5], indent=4))


[
    {
        "image": "VizWiz_test_00000000.jpg",
        "answer": "##ci devote aesthetictos regionalrable working granny knyed cosmetics duane adherents rang quickly drug metaphor auxiliary clinical 70th"
    },
    {
        "image": "VizWiz_test_00000001.jpg",
        "answer": "grams doorbell printed bahamas fated birthday \u66f2 featured uk transitionvite revealing harvest doneitarian done \u66f2 masonry pottery\u2192"
    },
    {
        "image": "VizWiz_test_00000002.jpg",
        "answer": "##rs shrinkingask phosphate butte br august discusrable fragmentation costs candle navigate gael discus hbo kits 1899rable"
    },
    {
        "image": "VizWiz_test_00000003.jpg",
        "answer": "##dina hiv walled hiv facebook ydticaidae seo upwards activation activation rein imply 70thamericana visainationssay astros"
    },
    {
        "image": "VizWiz_test_00000004.jpg",
        "answer": "telugu kung loads history officials \u66f2 surfing\u2192 surfing final 70th are grit \u0

In [165]:
print(tokenizer.convert_ids_to_tokens([1]))  # Should print ['[unused0]']


['[unused0]']


In [1]:
import torch
torch.cuda.empty_cache()
torch.cuda.ipc_collect()


In [2]:
import gc
gc.collect()
torch.cuda.empty_cache()


In [217]:
print(tokenizer.decode(train_dataset[0]['answer_tokens']))


[CLS] basil leaves [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [218]:
sample = train_dataset[0]
sample = {k: v.unsqueeze(0).to(device) for k, v in sample.items()}
output = model(**sample)
predicted = output.argmax(dim=-1)
print("Generated:", tokenizer.decode(predicted[0]))
print("Target:", tokenizer.decode(sample['answer_tokens'][0]))

Generated: fourteenth nipples restrictions springsteen surprising Êƒ Êƒ Êƒ Êƒ Êƒ Êƒ Êƒ Êƒ Êƒ Êƒ Êƒ Êƒ Êƒ Êƒ Êƒ
Target: [CLS] basil leaves [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
