In [1]:
import pandas as pd
import os
import random
from datasets import Dataset
import torch
import torch.nn as nn
from transformers import CLIPVisionModel, CLIPProcessor, AutoModelForCausalLM, AutoTokenizer, CLIPImageProcessor , GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
import json


device = 'cuda'

# Paths
csv_path = "/kaggle/input/stanford-image-paragraph-captioning-dataset/stanford_df_rectified.csv"
image_folder = "/kaggle/input/stanford-image-paragraph-captioning-dataset/stanford_img/content/stanford_images"

# Load and prepare dataset
df = pd.read_csv(csv_path)
df = df.rename(columns={"Image_name": "image_name", "Paragraph": "caption"})
df["image_name"] = df["image_name"].astype(str).apply(lambda x: x if x.endswith(".jpg") else x + ".jpg")
df["image_path"] = df["image_name"].apply(lambda name: os.path.join(image_folder, name))
df = df[df["caption"].notna() & df["image_path"].apply(os.path.exists)]
df = df.reset_index(drop=True)

# Initialize processors
clip_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token  # Set pad token to eos token

brief_instructions = [
    "Describe the image concisely.",
    "Provide a brief description of the given image.",
    "Offer a succinct explanation of the picture presented.",
    "Summarize the visual content of the image.",
    "Give a short and clear explanation of the subsequent image.",
]

class VisionTextDataset(Dataset):
    def __init__(self, dataframe, image_processor, tokenizer, max_length=256):
        self.data = dataframe
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        image = Image.open(item["image_path"]).convert("RGB")
        pixel_values = self.image_processor(image, return_tensors="pt")["pixel_values"][0]
        
        instruction = random.choice(brief_instructions)
        text = f"User: {instruction}\nAssistant: {item['caption']}"
        
        tokenized = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding="max_length"
        )
        
        return {
            "pixel_values": pixel_values,
            "input_ids": tokenized["input_ids"][0],
            "attention_mask": tokenized["attention_mask"][0]
        }

def collate_fn(batch):
    pixel_values = torch.stack([item["pixel_values"] for item in batch])
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    labels = input_ids.clone()
    
    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Create dataset and dataloader
dataset = VisionTextDataset(
    dataframe=df,
    image_processor=clip_processor,
    tokenizer=gpt2_tokenizer,
    max_length=256
)

pretrain_dataloader = DataLoader(
    dataset,
    batch_size=8,  # Increased batch size since we're using float32
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=2,
    pin_memory=True
)

class VisionTextModel(nn.Module):
    def __init__(self, vision_encoder, llm, projection_in_dim, projection_out_dim, num_image_tokens=16):
        super().__init__()
        self.vision_encoder = vision_encoder
        self.llm = llm
        self.projection = nn.Sequential(
            nn.Linear(projection_in_dim, num_image_tokens * projection_out_dim),
            nn.LayerNorm(num_image_tokens * projection_out_dim)
        )
        self.num_image_tokens = num_image_tokens
        self.set_trainable_components(vision=False, llm=False, projection=True)

    def set_trainable_components(self, vision=False, llm=False, projection=True):
        for param in self.vision_encoder.parameters():
            param.requires_grad = vision
        for param in self.llm.parameters():
            param.requires_grad = llm
        for param in self.projection.parameters():
            param.requires_grad = projection
        self.vision_trainable = vision
        self.llm_trainable = llm

    def forward(self, images, input_ids, attention_mask=None, labels=None):
        # Process images
        if self.vision_trainable:
            vision_output = self.vision_encoder(images)
        else:
            with torch.no_grad():
                vision_output = self.vision_encoder(images)
        
        # Extract features
        image_features = vision_output.last_hidden_state.mean(dim=1)
        
        # Project features
        projected = self.projection(image_features)
        projected = projected.view(-1, self.num_image_tokens, self.projection[0].out_features // self.num_image_tokens)
        projected = projected * 0.1  # Stabilize training

        # Process text embeddings
        if self.llm_trainable:
            text_embeds = self.llm.get_input_embeddings()(input_ids)
        else:
            with torch.no_grad():
                text_embeds = self.llm.get_input_embeddings()(input_ids)

        # Combine embeddings
        inputs_embeds = torch.cat([projected, text_embeds], dim=1)

        # Adjust attention mask and labels
        if attention_mask is not None:
            image_mask = torch.ones(attention_mask.shape[0], self.num_image_tokens, device=attention_mask.device)
            attention_mask = torch.cat([image_mask, attention_mask], dim=1)
        
        if labels is not None:
            image_labels = torch.full((labels.shape[0], self.num_image_tokens), -100, device=labels.device)
            labels = torch.cat([image_labels, labels], dim=1)

        # Forward through LLM
        outputs = self.llm(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            labels=labels
        )
        return outputs



2025-07-31 09:04:11.574209: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753952651.596350     264 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753952651.602973     264 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Load models in float32
clip_vision_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

In [3]:

# Initialize multimodal model
model = VisionTextModel(
    vision_encoder=clip_vision_model,
    llm=gpt2_model,
    projection_in_dim=768,  # CLIP hidden size
    projection_out_dim=768,  # Match GPT-2 hidden size
    num_image_tokens=32
).to(device)

# Set to pretrain mode
model.set_trainable_components(vision=False, llm=False, projection=True)

# Training configuration
gradient_accumulation_steps = 4
num_epochs = 3
log_interval = 50
optimizer = torch.optim.AdamW(model.projection.parameters(), lr=1e-4)

# Training loop
total_steps = num_epochs * len(pretrain_dataloader) // gradient_accumulation_steps
progress_bar = tqdm(total=total_steps, desc="Pretraining")
global_step = 0
running_loss = 0.0

for epoch in range(num_epochs):
    optimizer.zero_grad()
    
    for batch_idx, batch in enumerate(pretrain_dataloader):
        pixel_values = batch["pixel_values"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        outputs = model(
            images=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss / gradient_accumulation_steps
        loss.backward()
        
        global_step += 1
        running_loss += loss.item() * gradient_accumulation_steps
        
        if (batch_idx + 1) % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
            optimizer.zero_grad()
            
            avg_loss = running_loss / global_step
            progress_bar.set_postfix({
                "loss": f"{loss.item() * gradient_accumulation_steps:.4f}",
                "avg_loss": f"{avg_loss:.4f}"
            })
            progress_bar.update(1)
        
        if global_step % log_interval == 0:
            tqdm.write(f"Step {global_step}/{total_steps}: Loss={loss.item() * gradient_accumulation_steps:.4f}")

progress_bar.close()
print("Pretraining completed!")

Pretraining:   0%|          | 0/1834 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Pretraining:   1%|          | 12/1834 [00:13<30:18,  1.00it/s, loss=1.3543, avg_loss=4.5391]

Step 50/1834: Loss=1.1988


Pretraining:   1%|▏         | 25/1834 [00:25<29:57,  1.01it/s, loss=1.1464, avg_loss=2.8059]

Step 100/1834: Loss=1.1464


Pretraining:   2%|▏         | 37/1834 [00:38<29:45,  1.01it/s, loss=0.7277, avg_loss=2.2304]

Step 150/1834: Loss=0.9491


Pretraining:   3%|▎         | 50/1834 [00:50<29:32,  1.01it/s, loss=0.8466, avg_loss=1.9056]

Step 200/1834: Loss=0.8466


Pretraining:   3%|▎         | 62/1834 [01:02<29:20,  1.01it/s, loss=0.9459, avg_loss=1.7093]

Step 250/1834: Loss=0.8466


Pretraining:   4%|▍         | 75/1834 [01:15<29:07,  1.01it/s, loss=0.9481, avg_loss=1.5635]

Step 300/1834: Loss=0.9481


Pretraining:   5%|▍         | 87/1834 [01:27<28:56,  1.01it/s, loss=0.9049, avg_loss=1.4609]

Step 350/1834: Loss=1.0372


Pretraining:   5%|▌         | 92/1834 [01:32<28:50,  1.01it/s, loss=0.9192, avg_loss=1.4259]

KeyboardInterrupt: 

Pretraining:   5%|▌         | 92/1834 [01:33<29:33,  1.02s/it, loss=0.9192, avg_loss=1.4259]

In [None]:
output_dir = "/kaggle/working/projection_pretrained_model"
os.makedirs(output_dir, exist_ok=True)

torch.save(model.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
gpt2_tokenizer.save_pretrained(output_dir)
clip_processor.save_pretrained(output_dir)


config = {
    "projection_in_dim": 768,
    "projection_out_dim": 768,
    "num_image_tokens": 32
}

with open(os.path.join(output_dir, "model_config.json"), "w") as f:
    json.dump(config, f)