In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import math

import torch.optim as optim
from torch.utils.data import DataLoader,Dataset
from transformers import ViTModel, BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

import json
# import pandas as pd
from collections import Counter
import cv2
import gc

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

/kaggle/input/Final.csv
/kaggle/input/Subset_train2014/Subset_train2014/COCO_train2014_000000381595.jpg
/kaggle/input/Subset_train2014/Subset_train2014/COCO_train2014_000000233539.jpg
/kaggle/input/Subset_train2014/Subset_train2014/COCO_train2014_000000471409.jpg
/kaggle/input/Subset_train2014/Subset_train2014/COCO_train2014_000000069256.jpg
/kaggle/input/Subset_train2014/Subset_train2014/COCO_train2014_000000259616.jpg
/kaggle/input/Subset_train2014/Subset_train2014/COCO_train2014_000000517936.jpg
/kaggle/input/Subset_train2014/Subset_train2014/COCO_train2014_000000368731.jpg
/kaggle/input/Subset_train2014/Subset_train2014/COCO_train2014_000000130908.jpg
/kaggle/input/Subset_train2014/Subset_train2014/COCO_train2014_000000425773.jpg
/kaggle/input/Subset_train2014/Subset_train2014/COCO_train2014_000000296432.jpg
/kaggle/input/Subset_train2014/Subset_train2014/COCO_train2014_000000569838.jpg
/kaggle/input/Subset_train2014/Subset_train2014/COCO_train2014_000000218399.jpg
/kaggle/input/Su

In [4]:
class CoTransformer(nn.Module):
    """Cross-attention block with a residual connection back to the query.

    ``forward`` attends from ``query`` over ``key``/``value`` with 8 heads,
    passes the attended tensor through a ``dim -> dim`` linear layer, adds the
    original ``query`` and applies layer normalisation.

    ``nn.MultiheadAttention`` is built with its default settings (no
    ``batch_first`` argument), so inputs are presumably laid out as
    (sequence, batch, dim) — confirm against the caller.
    """

    def __init__(self, dim):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim=dim, num_heads=8)
        self.linear = nn.Linear(in_features=dim, out_features=dim)
        self.layer_norm = nn.LayerNorm(dim)

    def forward(self, query, key, value):
        # Index 0 is the attended output; the attention weights are not used.
        attended = self.attention(query, key, value)[0]
        residual_sum = self.linear(attended) + query
        return self.layer_norm(residual_sum)

In [5]:
class LoRALayer(nn.Module):
    """Bias-free low-rank linear map ``x -> x @ A @ B``.

    ``A`` has shape (in_features, rank) and ``B`` has shape
    (rank, out_features). The module holds only these two factors: there is no
    base weight and no bias term.
    """

    def __init__(self, in_features, out_features, rank):
        super().__init__()
        self.rank = rank
        down_factor = torch.randn(in_features, rank)
        up_factor = torch.randn(rank, out_features)
        self.A = nn.Parameter(down_factor)
        self.B = nn.Parameter(up_factor)

        # A is re-initialised with Kaiming-uniform values and B with zeros, so
        # the product A @ B (and therefore the output) is exactly zero at start.
        nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))
        nn.init.zeros_(self.B)

    def forward(self, x):
        projected = torch.matmul(x, self.A)
        return torch.matmul(projected, self.B)

In [22]:
from transformers import BertModel

class LoRABertModel(nn.Module):
    """``bert-base-uncased`` with LoRA adapters on its attention projections.

    Every ``nn.Linear`` whose module path contains ``'attention'`` is wrapped so
    that its output becomes ``pretrained_linear(x) + LoRALayer(x)``. The
    wrapped pretrained weights and biases are frozen; the low-rank factors
    ``A``/``B`` are the trainable part. All other BERT parameters are left
    trainable.

    Args:
        rank: Rank of every low-rank adapter.
    """

    class _LoRAAdaptedLinear(nn.Module):
        """A frozen linear layer plus a trainable low-rank correction.

        Substituting the projection with a bare ``LoRALayer`` would throw away
        the pretrained weight and bias. Because ``B`` starts at zero, every
        attention sub-layer would then output zeros and pass zero gradient to
        both factors, so the adapters could never start learning. Keeping the
        base layer in the sum avoids that.
        """

        def __init__(self, base, rank):
            super().__init__()
            self.base = base
            # Only the low-rank factors should receive gradient updates.
            for param in self.base.parameters():
                param.requires_grad_(False)
            self.lora = LoRALayer(base.in_features, base.out_features, rank)

        def forward(self, x):
            return self.base(x) + self.lora(x)

    def __init__(self, rank):
        super(LoRABertModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.rank = rank

        # Collect first, then replace: mutating the module tree while
        # named_modules() is still iterating over it is unsafe.
        targets = [
            (name, module)
            for name, module in self.bert.named_modules()
            if isinstance(module, nn.Linear) and 'attention' in name
        ]

        for name, module in targets:
            self.set_module_by_name(name, self._LoRAAdaptedLinear(module, rank))

    def set_module_by_name(self, name, module):
        """Replace the submodule of ``self.bert`` at dotted path ``name``."""
        parts = name.split('.')
        obj = self.bert
        # Walk down to the parent of the target, then rebind the last attribute.
        for part in parts[:-1]:
            obj = getattr(obj, part)
        setattr(obj, parts[-1], module)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Forward the inputs to the wrapped BERT model and return its output."""
        return self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)


In [28]:
from transformers import ViTModel

class LoRAViTModel(nn.Module):
    """``google/vit-base-patch16-224-in21k`` with LoRA adapters on attention.

    Every ``nn.Linear`` whose module path contains ``'attention'`` is wrapped so
    that its output becomes ``pretrained_linear(x) + LoRALayer(x)``. The
    wrapped pretrained weights and biases are frozen; the low-rank factors
    ``A``/``B`` are the trainable part. All other ViT parameters are left
    trainable.

    Args:
        rank: Rank of every low-rank adapter.
    """

    class _LoRAAdaptedLinear(nn.Module):
        """A frozen linear layer plus a trainable low-rank correction.

        Substituting the projection with a bare ``LoRALayer`` would throw away
        the pretrained weight and bias. Because ``B`` starts at zero, every
        attention sub-layer would then output zeros and pass zero gradient to
        both factors, so the adapters could never start learning. Keeping the
        base layer in the sum avoids that.
        """

        def __init__(self, base, rank):
            super().__init__()
            self.base = base
            # Only the low-rank factors should receive gradient updates.
            for param in self.base.parameters():
                param.requires_grad_(False)
            self.lora = LoRALayer(base.in_features, base.out_features, rank)

        def forward(self, x):
            return self.base(x) + self.lora(x)

    def __init__(self, rank):
        super(LoRAViTModel, self).__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.rank = rank

        # Collect first, then replace: mutating the module tree while
        # named_modules() is still iterating over it is unsafe.
        targets = [
            (name, module)
            for name, module in self.vit.named_modules()
            if isinstance(module, nn.Linear) and 'attention' in name
        ]

        for name, module in targets:
            self.set_module_by_name(name, self._LoRAAdaptedLinear(module, rank))

    def set_module_by_name(self, name, module):
        """Replace the submodule of ``self.vit`` at dotted path ``name``."""
        parts = name.split('.')
        obj = self.vit
        # Walk down to the parent of the target, then rebind the last attribute.
        for part in parts[:-1]:
            obj = getattr(obj, part)
        setattr(obj, parts[-1], module)

    def forward(self, pixel_values):
        """Forward ``pixel_values`` to the wrapped ViT model and return its output."""
        return self.vit(pixel_values)

In [10]:
class MyDataset(Dataset):
    """``Dataset`` view over an indexable collection of 3-field records.

    Each record is unpacked as (image, question, answer) and returned as a
    tuple in that order.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # The unpacking enforces that the record has exactly three fields.
        img, text, label = self.data[index]
        return (img, text, label)

In [11]:
def getImageName(image_path, image_id):
    """Return the path of the COCO train2014 JPEG for ``image_id``.

    The id is converted to text and left-padded with zeros to 12 characters
    (longer ids are left as they are), then placed between the
    ``COCO_train2014_`` prefix and the ``.jpg`` suffix. ``image_path`` is
    prepended verbatim, so it must already end with a path separator.
    """
    padded_id = str(image_id).rjust(12, "0")
    return image_path + "COCO_train2014_" + padded_id + ".jpg"

In [12]:
def filterMajoritySingleWord(answer_list):
    """Return the most frequent answer, preferring single-word answers.

    ``answer_list`` is a sequence of mappings with an ``"answer"`` string.
    Answers made of exactly one whitespace-separated word are counted; if there
    are none, all answers are counted instead. Ties go to the answer that
    appeared first. An empty ``answer_list`` raises ``ValueError``.
    """
    candidates = [item["answer"] for item in answer_list if len(item["answer"].split()) == 1]
    if not candidates:
        candidates = [item["answer"] for item in answer_list]

    tally = Counter(candidates)

    # max() keeps the first maximal key in insertion order, i.e. first seen.
    return max(tally, key=tally.get)

def load_image(image_path):
    """Read an image file and return it as a 3x224x224 RGB array.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The image converted from BGR to RGB, resized to 224x224 and transposed
        from HxWxC to CxHxW. Pixel values are not rescaled or normalised here.

    Raises:
        OSError: If OpenCV cannot read the file (missing or undecodable).
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        raise OSError(f"cv2.imread could not read image (missing or undecodable file): {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, (224, 224))
    image = np.transpose(image, (2, 0, 1))  # Change shape from HxWxC to CxHxW
    return image

In [14]:
# Locations of the annotation table and of the COCO image subset.
image_path = "/kaggle/input/Subset_train2014/Subset_train2014/"
df = pd.read_csv("/kaggle/input/Final.csv")

# One plain tuple per CSV row; each row is unpacked below as three fields with
# the image id first.
dummy_data = list(map(tuple, df.values))

# Distinct image ids, so that every image is decoded only once.
unique_ids = {image_id for image_id, _question, _answer in dummy_data}

# Cache of decoded images keyed by image id.
images_dict = dict(
    (image_id, load_image(getImageName(image_path, image_id)))
    for image_id in unique_ids
)

print(images_dict[36].shape)

(3, 224, 224)


In [15]:
# Re-read the raw (image_id, question, answer) rows from the DataFrame instead
# of from `dummy_data`: this cell overwrites `dummy_data` with encoded samples,
# so reading it back would make a second run of the cell fail.
raw_rows = [tuple(row) for row in df.values]

answer_vocabulary = set([answer for _, _, answer in raw_rows])

# Number the answers in sorted order. Iteration order of a set of strings
# changes between interpreter runs (hash randomisation), which would silently
# change the meaning of every class index and make saved weights undecodable.
# key=str keeps the sort well defined if the column holds mixed types.
answer_to_label = {answer: label for label, answer in enumerate(sorted(answer_vocabulary, key=str))}

# Every answer is in the vocabulary by construction, so index directly; a
# silent fallback label such as -1 would not be a valid class id.
dummy_data_new = [(images_dict[image_id], question, answer_to_label[answer]) for (image_id, question, answer) in raw_rows]
dummy_data = dummy_data_new

In [25]:
# Fixed seed so the train/validation partition is the same on every run.
SPLIT_SEED = 42
train_data, val_data = train_test_split(dummy_data, test_size=0.2, random_state=SPLIT_SEED)

# Define data loaders
train_loader = DataLoader(MyDataset(train_data), batch_size=32, shuffle=True)
# Validation is not shuffled: evaluation only consumes a capped number of
# batches per epoch, so shuffling would score a different random subset each
# epoch and make the per-epoch validation metrics incomparable.
val_loader = DataLoader(MyDataset(val_data), batch_size=32, shuffle=False)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [26]:
# One output class per distinct answer in the vocabulary.
num_classes = len(answer_vocabulary)
# Feature width passed to MyModel; presumably the hidden size shared by the
# BERT and ViT encoders — confirm against MyModel.
dim = 768
# Ranks passed to MyModel, presumably for the LoRA adapters of the BERT and ViT
# branches respectively — confirm against MyModel.
bert_rank = 10
vit_rank = 10

In [29]:
# NOTE: MyModel is not defined in the cells shown in this notebook; it must
# already exist in the kernel before this cell runs.
model = MyModel(num_classes, dim, bert_rank, vit_rank)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()


def _is_lora_param(param_name):
    """Return True when the leaf of a dotted parameter name is ``A`` or ``B``.

    Those are the attribute names LoRALayer gives its two factors. Comparing
    the leaf avoids misclassifying any parameter whose path merely contains an
    uppercase ``A`` or ``B`` somewhere, as a substring test would.
    """
    return param_name.rsplit('.', 1)[-1] in ('A', 'B')


# Separate parameters for LoRA and other layers
lora_params = [p for n, p in model.named_parameters() if _is_lora_param(n)]
other_params = [p for n, p in model.named_parameters() if not _is_lora_param(n)]

# Define separate optimizers so the two groups can use different learning rates
lora_optimizer = optim.Adam(lora_params, lr=1e-4)
other_optimizer = optim.Adam(other_params, lr=1e-3)

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [None]:
num_epochs = 10
num_batches_per_epoch = 30  # upper bound on batches drawn from each loader per epoch


def run_epoch(loader, training, max_batches):
    """Run one capped pass over ``loader`` and return its metrics.

    Uses the notebook-level ``model``, ``tokenizer`` and ``criterion``, and,
    when ``training`` is true, ``lora_optimizer`` and ``other_optimizer``.

    Args:
        loader: DataLoader yielding (images, questions, answers) batches.
        training: True to update the model; False to evaluate without
            gradients and with the model in eval mode.
        max_batches: Stop after this many batches.

    Returns:
        Dict with ``loss`` (mean over the batches actually processed),
        ``accuracy`` (percent), and weighted ``precision``, ``recall``, ``f1``.

    Raises:
        RuntimeError: If the loader yields no batch.
    """
    model.train(training)
    loss_sum = 0.0
    correct = 0
    seen_samples = 0
    seen_batches = 0
    predictions = []
    targets = []

    with torch.set_grad_enabled(training):
        for batch_idx, (images, questions, answers) in enumerate(loader):
            if batch_idx >= max_batches:
                break

            if training:
                lora_optimizer.zero_grad()
                other_optimizer.zero_grad()

            # Only input_ids are passed on; the tokenizer's attention_mask is
            # dropped, which is what triggers the transformers warning about
            # padded inputs. TODO: forward it once MyModel.forward accepts it.
            question_tokens = tokenizer(questions, return_tensors='pt', padding=True, truncation=True)
            ques_ids = question_tokens['input_ids']

            output = model(images, ques_ids)
            loss = criterion(output, answers)

            if training:
                loss.backward()
                lora_optimizer.step()
                other_optimizer.step()

            loss_sum += loss.item()
            _, predicted = torch.max(output, 1)
            correct += (predicted == answers).sum().item()
            seen_samples += images.size(0)
            seen_batches += 1

            predictions.extend(predicted.cpu().numpy())
            targets.extend(answers.cpu().numpy())

    if seen_batches == 0:
        raise RuntimeError("run_epoch: the data loader produced no batches")

    return {
        # Divide by the batches actually processed: the loop stops at
        # max_batches, so len(loader) would under-report the mean loss.
        "loss": loss_sum / seen_batches,
        "accuracy": (correct / seen_samples) * 100,
        "precision": precision_score(targets, predictions, average='weighted'),
        "recall": recall_score(targets, predictions, average='weighted'),
        "f1": f1_score(targets, predictions, average='weighted'),
    }


for epoch in range(num_epochs):
    train_metrics = run_epoch(train_loader, training=True, max_batches=num_batches_per_epoch)
    val_metrics = run_epoch(val_loader, training=False, max_batches=num_batches_per_epoch)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_metrics['loss']}, Train Accuracy: {train_metrics['accuracy']}%, "
          f"Train Precision: {train_metrics['precision']}, Train Recall: {train_metrics['recall']}, Train F1: {train_metrics['f1']}")
    print(f"Validation Loss: {val_metrics['loss']}, Validation Accuracy: {val_metrics['accuracy']}%, "
          f"Validation Precision: {val_metrics['precision']}, Validation Recall: {val_metrics['recall']}, Validation F1: {val_metrics['f1']}")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [None]:
# Save only the model's state_dict (not the module object) to a file in the
# current working directory. NOTE(review): the answer_to_label mapping is not
# saved here, so it has to be persisted separately to decode predictions later.
torch.save(model.state_dict(), "WithCOTRM_Vit_Bert.pth")