In [None]:
from datasets import load_dataset
from transformers import (
    VisionEncoderDecoderModel,
    AutoTokenizer,
)
from torchvision import transforms
from torch.utils.data import DataLoader
#from transformers import AdamW
import torch
from tqdm import tqdm

In [None]:
# Hub repository that provides the image / findings / impression records.
DATASET_ID = "itsanmolgupta/mimic-cxr-dataset"

# Request the "train" split directly so a single `Dataset` object is returned.
dataset = load_dataset(DATASET_ID, split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/357 [00:00<?, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/396M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/397M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30633 [00:00<?, ? examples/s]

In [None]:
# Show the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['image', 'findings', 'impression'],
    num_rows: 30633
})

In [None]:
# `dataset` was loaded with split="train", so it is already a single `Dataset`
# (not a `DatasetDict`). Indexing it with "train" would be a column lookup and
# fail on a fresh kernel, so convert the dataset itself for tabular inspection.
df_train = dataset.to_pandas()

In [None]:
# Preview the first rows: encoded image bytes plus the two report text columns.
df_train.head()

Unnamed: 0,image,findings,impression
0,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,"The lungs are clear of focal consolidation, pl...",No acute cardiopulmonary process.
1,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,Lung volumes remain low. There are innumerable...,Low lung volumes and mild pulmonary vascular c...
2,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,Lung volumes are low. This results in crowding...,Innumerable pulmonary metastases. Possible mil...
3,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,In comparison to study performed on of there i...,New mild pulmonary edema with persistent small...
4,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,The right costophrenic angle is not imaged. Ot...,An enteric tube courses below the level of the...


In [None]:
# Manually create a 90% train / 10% validation split (fixed seed for repeatability).
# The result is bound to a new name instead of overwriting `dataset`, so this
# cell stays idempotent: re-running it always splits the original full dataset.
splits = dataset.train_test_split(test_size=0.1, seed=42)
train_ds = splits["train"]
val_ds = splits["test"]

In [None]:
# Report the (rows, columns) shape of the train and validation splits on one line.
split_shapes = [split.shape for split in (train_ds, val_ds)]
print(*split_shapes)

(27569, 3) (3064, 3)


In [None]:
# Sanity check: number of examples per split, then one raw training example
# (image object plus its "findings" and "impression" strings).
print(f"Train size: {len(train_ds)}, Validation size: {len(val_ds)}")
first_example = train_ds[0]
print(first_example)

Train size: 27569, Validation size: 3064
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512 at 0x793DD2A97D40>, 'findings': 'Left basal opacity compatible with known pneumonia is increased extending into the left midlung. Accompanying increase in vascular congestion is without overt edema. Cardiac size is stable, though silhouette is obscured by this process. ', 'impression': 'Increase in left-sided opacities, into the left mid lung, concerning for worsening pneumonia. Finings were discussed by phone with , NP, by Dr. at on .'}


In [None]:
# Load the tokenizer of the "gpt2" checkpoint; it is used to turn report text into label ids.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Reuse the end-of-sequence token as the padding token so that fixed-length
# padding (padding="max_length") can be applied when tokenizing.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Per-channel normalization statistics applied to the RGB tensor
# (these are the commonly used ImageNet values).
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

# Pipeline: resize to 224x224 -> convert PIL image to a float tensor -> normalize.
_transform_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
]
image_transform = transforms.Compose(_transform_steps)

In [None]:
def preprocess(batch):
    """Turn one dataset example into model inputs.

    Despite the parameter name, `Dataset.map` is called without `batched=True`,
    so `batch` is a single example (a dict with "image", "findings" and
    "impression").

    Adds two keys and returns the same dict:
      * "pixel_values": the image converted to RGB and passed through
        `image_transform`.
      * "labels": token ids of the impression text (the generation target),
        terminated by EOS, padded/truncated to 128 positions, with every
        padding position set to -100 so it is ignored by the loss.
    """
    image = batch["image"].convert("RGB")
    batch["pixel_values"] = image_transform(image)

    # The impression is the target sequence. Append an explicit EOS so the
    # decoder sees exactly one end-of-report token; if the text is longer than
    # max_length, truncation drops it together with the overflow.
    target_text = batch["impression"] + tokenizer.eos_token
    encoding = tokenizer(
        target_text,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )

    # pad_token == eos_token for this tokenizer, so padding cannot be found by
    # token id without also hiding the real EOS. Use the attention mask instead:
    # positions with mask 0 are padding and are excluded from the loss (-100).
    is_padding = encoding.attention_mask[0] == 0
    batch["labels"] = encoding.input_ids[0].masked_fill(is_padding, -100)
    return batch


train_ds = train_ds.map(preprocess)
val_ds = val_ds.map(preprocess)


Map:   0%|          | 0/27569 [00:00<?, ? examples/s]

Map:   0%|          | 0/3064 [00:00<?, ? examples/s]

In [None]:
# Expose only the model inputs, as PyTorch tensors, when the splits are indexed.
MODEL_COLUMNS = ["pixel_values", "labels"]
for split in (train_ds, val_ds):
    split.set_format(type="torch", columns=MODEL_COLUMNS)

In [None]:
import torch
import torch.nn as nn
from torchvision.models import resnet50
from transformers import GPT2LMHeadModel, VisionEncoderDecoderModel

class ResNetEncoder(nn.Module):
    """ResNet-50 image encoder that emits a single 768-dimensional token per image.

    The ResNet classification head is dropped; the remaining backbone output is
    flattened and linearly projected from 2048 to 768 features.
    """

    def __init__(self):
        super().__init__()
        # `pretrained=True` is deprecated in torchvision; "IMAGENET1K_V1" selects
        # the same weights that `pretrained=True` used to load.
        resnet = resnet50(weights="IMAGENET1K_V1")
        self.backbone = nn.Sequential(*list(resnet.children())[:-1])  # remove classification head
        self.projection = nn.Linear(2048, 768)  # project to GPT2 hidden size (768)

    def forward(self, pixel_values):
        """Encode a batch of images.

        Args:
            pixel_values: image batch accepted by the backbone.

        Returns:
            Tensor of shape (B, 1, 768): one projected feature vector per image,
            with a length-1 sequence axis inserted at position 1.
        """
        feats = self.backbone(pixel_values)  # (B, 2048, 1, 1)
        feats = feats.flatten(1)             # (B, 2048)
        feats = self.projection(feats)       # (B, 768)
        return feats.unsqueeze(1)            # (B, 1, 768)

In [None]:
# Instantiate models
encoder = ResNetEncoder()
decoder = GPT2LMHeadModel.from_pretrained("gpt2")

# Combine them manually.
# This attempt is kept as a record of why the custom encoder was abandoned:
# the wrapper reads `encoder.config`, which a plain `nn.Module` does not have.
# The error is caught and reported so the notebook still runs top to bottom.
try:
    model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
except AttributeError as err:
    print(f"Custom ResNetEncoder cannot be used with VisionEncoderDecoderModel: {err}")

AttributeError: 'ResNetEncoder' object has no attribute 'config'

In [None]:
from transformers import VisionEncoderDecoderModel, AutoConfig

# Second attempt: pair the Hub ResNet-50 checkpoint with GPT-2.
# This fails because the ResNet config exposes no `hidden_size` attribute.
# The error is caught and reported so the notebook still runs top to bottom;
# the working model is built in a later cell.
try:
    model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
        "microsoft/resnet-50",  # encoder
        "gpt2" # Set to gpt2
    )
except AttributeError as err:
    print(f"microsoft/resnet-50 cannot be used as the encoder here: {err}")


Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.bias', 'transformer.h.10.crossattention.c_attn.weight', 'transformer.h.10.crossattention.c_proj.bias', 'transformer.h.10.cros

AttributeError: 'ResNetConfig' object has no attribute 'hidden_size'

In [None]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

# Build the image-to-text model: ViT encoder + GPT-2 decoder.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "google/vit-base-patch16-224-in21k",  # Encoder
    "gpt2"                                # Decoder
)

processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse EOS as the padding token so fixed-length padding is possible.
tokenizer.pad_token = tokenizer.eos_token

# Special-token ids on the model config; these are used when labels are
# shifted right to build the decoder inputs during training.
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Mirror the same ids onto the generation config so `generate()` and
# `save_pretrained()` see values consistent with the model config.
model.generation_config.decoder_start_token_id = tokenizer.bos_token_id
model.generation_config.eos_token_id = tokenizer.eos_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id


config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.bias', 'transformer.h.10.crossattention.c_attn.weight', 'transformer.h.10.crossattention.c_proj.bias', 'transformer.h.10.cros

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

In [None]:


# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Trailing semicolon suppresses the multi-hundred-line module repr that would
# otherwise be rendered as this cell's output.
model.to(device);

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (inte

In [None]:
from torch.optim import AdamW

In [None]:
# Training hyper-parameters, named so they can be tuned in one place.
BATCH_SIZE = 4
LEARNING_RATE = 5e-5

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

# A single optimizer over every parameter of the encoder-decoder model.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
num_epochs = 3
model.train()  # switch the model to training mode before the first update

for epoch in range(num_epochs):
    step_losses = []  # per-step loss values collected for the epoch average
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress:
        # Forward pass; passing `labels` makes the model return the loss.
        outputs = model(
            pixel_values=batch["pixel_values"].to(device),
            labels=batch["labels"].to(device),
        )
        loss = outputs.loss

        # Backward pass and parameter update, then clear gradients for the next step.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        step_loss = loss.item()
        step_losses.append(step_loss)
        progress.set_postfix(loss=step_loss)

    avg_loss = sum(step_losses) / len(train_loader)
    print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")

Epoch 1/3:   0%|          | 0/6893 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
Epoch 1/3: 100%|██████████| 6893/6893 [51:03<00:00,  2.25it/s, loss=0.125]


Epoch 1 Average Loss: 0.6933


Epoch 2/3: 100%|██████████| 6893/6893 [51:12<00:00,  2.24it/s, loss=0.529]


Epoch 2 Average Loss: 0.5710


Epoch 3/3: 100%|██████████| 6893/6893 [51:10<00:00,  2.24it/s, loss=0.0803]

Epoch 3 Average Loss: 0.5301





In [None]:
# Generate a report for one validation image and compare it with the reference text.
model.eval()
sample = val_ds[0]
pixel_values = sample["pixel_values"].unsqueeze(0).to(device)  # add the batch axis

with torch.no_grad():
    generated_ids = model.generate(pixel_values, max_length=80)
report = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# `sample["labels"]` holds token ids, not text. Drop any negative ids
# (e.g. -100 loss-ignore markers) and decode the rest, so the reference is
# printed as readable text rather than as a tensor of integers.
label_ids = sample["labels"]
label_ids = label_ids[label_ids >= 0].tolist()
reference = tokenizer.decode(label_ids, skip_special_tokens=True)

print("\n=== Generated Report ===")
print(report)

print("\n=== Ground Truth Impression ===")
print(reference)

In [None]:
OUTPUT_DIR = "./xray_vit_gpt2_model"

# The model was trained on images normalized by `image_transform`, not by
# `processor`. The processor ships its own normalization statistics, which need
# not match, so copy the training statistics onto it before saving. Anyone who
# loads the saved processor then preprocesses images the way training did.
train_normalize = next(
    (step for step in image_transform.transforms if isinstance(step, transforms.Normalize)),
    None,
)
if train_normalize is not None:
    processor.image_mean = [float(v) for v in train_normalize.mean]
    processor.image_std = [float(v) for v in train_normalize.std]

# Save model weights, image processor and tokenizer side by side.
model.save_pretrained(OUTPUT_DIR)
processor.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("✅ Model fine-tuned and saved to ./xray_vit_gpt2_model/")

## GitHub Links

https://github.com/prince10arya/

https://github.com/yuneshkumar11/