<a href="https://colab.research.google.com/github/shakthi1731u/BillingSystem/blob/main/classifier%20model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Check and set GPU
import torch
print("GPU Available:", torch.cuda.is_available())
!nvidia-smi

GPU Available: False
/bin/bash: line 1: nvidia-smi: command not found


In [19]:
!pip install -q "transformers[torch]" datasets accelerate torchvision pandas
!pip install -q --upgrade pillow

In [13]:
DATA_FOLDER = "/content/drive/MyDrive/donut_training"

In [14]:
import shutil
import os

os.makedirs("data/images", exist_ok=True)

# Copy all images from Drive to Colab
shutil.copytree(f"{DATA_FOLDER}/images", "data/images", dirs_exist_ok=True)

# Copy JSON file
shutil.copy(f"{DATA_FOLDER}/training_data.json", "data/training_data.json")


'data/training_data.json'

In [16]:
TRAINING_DATA = "data/training_data.json"

In [20]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"  # <- disable TensorFlow support
os.environ["WANDB_DISABLED"] = "true"

from transformers import VisionEncoderDecoderModel, DonutProcessor, Seq2SeqTrainer, Seq2SeqTrainingArguments
from torch.utils.data import Dataset as TorchDataset
from torch.nn.utils.rnn import pad_sequence
from transformers import TrainerCallback
from datasets import Dataset
from PIL import Image
import pandas as pd
import numpy as np
import torch
import json


ValueError: Name tf.linalg.LinearOperatorIdentitySpec has already been registered for class abc.LinearOperatorIdentitySpec.

In [9]:

# Make sure you have these in place
TRAINING_DATA = "data/training_data_fixed.json"

# Load dataset from JSON
df = pd.read_json(TRAINING_DATA)
dataset = Dataset.from_pandas(df)

# Load model and processor
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base", use_fast=True)

# Required for training to avoid ValueError
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id

# Move model to float32 for Colab compatibility
model.to(torch.float32)

# Filter out incomplete entries
dataset = dataset.filter(lambda x: "image" in x and "labels" in x and x["image"] and x["labels"])

print("Total examples after filtering:", len(dataset))

# Custom PyTorch dataset
class DonutTorchDataset(TorchDataset):
    def __init__(self, hf_dataset, processor):
        self.dataset = hf_dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        example = self.dataset[idx]

        # Load and preprocess image
        try:
          image = Image.open(example["image"]).convert("RGB")
        except FileNotFoundError:
          print(f"Missing image: {example['image']}")
          return {"pixel_values": torch.zeros((3, 768, 576)), "labels": torch.zeros(256, dtype=torch.long)}

        image = image.resize((768, 576))
        image = np.array(image)
        pixel_values = self.processor(image, return_tensors="pt").pixel_values[0]

        # Convert label dict/list to string
        label_text = example["labels"]
        if isinstance(label_text, (list, dict)):
            label_text = json.dumps(label_text)

        labels = self.processor.tokenizer(
            label_text,
            max_length=256,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        ).input_ids.squeeze(0)

        return {
            "pixel_values": pixel_values,
            "labels": torch.tensor(labels)
        }

class ProgressPrinterCallback(TrainerCallback):
    def __init__(self, total_steps):
        self.total_steps = total_steps
        self.current_step = 0

    def on_step_end(self, args, state, control, **kwargs):
        self.current_step += 1
        percent = (self.current_step / self.total_steps) * 100
        print(f"[{self.current_step}/{self.total_steps}] → {percent:.2f}% complete")
        return control


# Data collator to batch images and labels
class DonutDataCollator:
    def __call__(self, features):
        pixel_values = torch.stack([f["pixel_values"] for f in features])
        labels = torch.stack([f["labels"] for f in features])  # already same length
        return {
            "pixel_values": pixel_values,
            "labels": labels
        }

# Prepare dataset
train_dataset = DonutTorchDataset(dataset, processor)

# Check sample
sample = train_dataset[0]
print("Sample image shape:", sample["pixel_values"].shape)
print("Sample label length:", len(sample["labels"]))

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="donut_finetuned",
    per_device_train_batch_size=1,
    predict_with_generate=True,
    logging_dir="logs",
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=1,
    fp16=True,
    remove_unused_columns=False
)

total_steps = len(train_dataset) * training_args.num_train_epochs

# Trainer setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=DonutDataCollator(),
    callbacks=[ProgressPrinterCallback(total_steps)],
    tokenizer=processor.tokenizer,
)

# 🔥 Start training
trainer.train()


ValueError: Name tf.linalg.LinearOperatorIdentitySpec has already been registered for class abc.LinearOperatorIdentitySpec.

In [26]:
import json

# Load and fix paths
with open("data/training_data.json", "r") as f:
    data = json.load(f)

# Replace backslashes with forward slashes in all image paths
for item in data:
    item["image"] = item["image"].replace("\\", "/")

# Save the fixed version (overwrite or save as new)
with open("data/training_data_fixed.json", "w") as f:
    json.dump(data, f, indent=2)
