In [1]:
import os
import pandas as pd
from transformers import BlipProcessor, BlipForQuestionAnswering
from datasets import load_dataset
import torch
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle
torch.cuda.empty_cache()

# Load model and processor
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()



In [2]:
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
torch.cuda.empty_cache()
torch.manual_seed(42)

True


<torch._C.Generator at 0x294e824abf0>

In [3]:
df = pd.read_csv("dataset/new_train.csv")

In [4]:
class CustomDataset(torch.utils.data.Dataset):
    """Custom dataset for your fine-tuning."""
    
    def __init__(self, df, processor, image_folder):
        self.df = df
        self.processor = processor
        self.image_folder = image_folder

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Get the image and text
        question = self.df.iloc[idx]['entity_name']  # Entity name as the question
        answer = self.df.iloc[idx]['entity_value']  # Entity value as the answer
        image_name = os.path.basename(self.df.iloc[idx]['image_link'])
        image_path = os.path.join(self.image_folder, image_name)
        image = Image.open(image_path).convert("RGB")

        # Process the input
        encoding = self.processor(image, question, padding="max_length", truncation=True, return_tensors="pt")  # Use truncation=True and padding="max_length"
        labels = self.processor.tokenizer.encode(answer, max_length=8, padding="max_length", truncation=True, return_tensors='pt')  # Apply same padding and truncation
        
        encoding["labels"] = labels
        
        # Remove the batch dimension
        for k, v in encoding.items():
            encoding[k] = v.squeeze()
        return encoding


In [5]:
image_folder = "timages/" 

In [6]:
# Create datasets
train_size = int(0.9 * len(df))
train_dataset = CustomDataset(df[:train_size], processor, image_folder)
valid_dataset = CustomDataset(df[train_size:], processor, image_folder)

# Create data loaders
batch_size = 4  # Reduce batch size
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)


# Optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=4e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

num_epochs = 100
patience = 10
min_eval_loss = float("inf")
early_stopping_hook = 0
tracking_information = []
scaler = torch.cuda.amp.GradScaler()

  scaler = torch.cuda.amp.GradScaler()


In [7]:
# Training loop
for epoch in range(num_epochs):
    epoch_loss = 0
    model.train()
    for batch in tqdm(train_dataloader, desc='Training batch: ...'):
        input_ids = batch.pop('input_ids').to(device)
        pixel_values = batch.pop('pixel_values').to(device)
        attention_mask = batch.pop('attention_mask').to(device)
        labels = batch.pop('labels').to(device)

        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(input_ids=input_ids,
                            pixel_values=pixel_values,
                            labels=labels)
            
        loss = outputs.loss
        epoch_loss += loss.item()
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    # Evaluation loop
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in tqdm(valid_dataloader, desc='Validating batch: ...'):
            input_ids = batch.pop('input_ids').to(device)
            pixel_values = batch.pop('pixel_values').to(device)
            attention_mask = batch.pop('attention_mask').to(device)
            labels = batch.pop('labels').to(device)

            with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(input_ids=input_ids,
                                pixel_values=pixel_values,
                                labels=labels)
            
            loss = outputs.loss
            eval_loss += loss.item()

    # Track loss and learning rate
    tracking_information.append((epoch_loss/len(train_dataloader), eval_loss/len(valid_dataloader), optimizer.param_groups[0]["lr"]))
    print(f"Epoch: {epoch+1} - Training loss: {epoch_loss/len(train_dataloader)} - Eval Loss: {eval_loss/len(valid_dataloader)} - LR: {optimizer.param_groups[0]['lr']}")

    scheduler.step()

    # Save the model if eval loss decreases
    if eval_loss < min_eval_loss:
        model.save_pretrained("Model/blip-saved-model", from_pt=True)
        print("Saved model to Model/blip-saved-model")
        min_eval_loss = eval_loss
        early_stopping_hook = 0
    else:
        early_stopping_hook += 1
        if early_stopping_hook > patience:
            break

# Save training progress
with open("tracking_information.pkl", "wb") as f:
    pickle.dump(tracking_information, f)

print("Fine-tuning complete!")

Training batch: ...:   0%|          | 0/4811 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Training batch: ...:   1%|          | 39/4811 [09:53<20:10:21, 15.22s/it]


KeyboardInterrupt: 