In [1]:
!pip install transformers torch accelerate bitsandbytes jiwer datasets peft loralib tqdm pytesseract opencv-python
!apt-get install tesseract-ocr
!apt-get install tesseract-ocr-eng

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting loralib
  Downloading loralib-0.1.2-py3-none-any.whl.metadata (15 kB)
Collecting click>=8.1.8 (from jiwer)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl (76.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading loralib-0.1.2-py3-none-any.whl (10 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00

In [2]:
import torch
from transformers import ( LlamaForCausalLM, LlamaTokenizer, AutoProcessor, AutoModelForVision2Seq, TrainingArguments, Trainer, BitsAndBytesConfig )
from datasets import Dataset, load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import jiwer
from PIL import Image
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import json
from glob import glob
import random
import pytesseract
import cv2

In [3]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

from huggingface_hub import login

login(token=HF_TOKEN)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda




# Load Images from the folder and Cropping them

In [5]:
def load_images_from_folder(folder):
    images = []
    image_names = []
    for filename in os.listdir(folder):
        if filename.endswith(".jpg"):
            img_path = os.path.join(folder, filename)
            img = Image.open(img_path).convert("RGB")
            images.append(img)
            image_names.append(filename)
    return images, image_names

image_folder = "/kaggle/input/dataset/images"
dataset, image_names = load_images_from_folder(image_folder)
print("Loaded images successfully")

Loaded images successfully


In [6]:
def augment_images(image_names):
    for j,img in enumerate(image_names):
        image_path = f"/kaggle/input/dataset/images/{img}"  # Change this to your image path
        image = cv2.imread(image_path)
        
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        
        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
        
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
        clean = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
        
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10))  # Adjust based on text density
        dilated = cv2.dilate(clean, kernel, iterations=2)
        
        contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        
        bounding_boxes = [cv2.boundingRect(cnt) for cnt in contours]
        bounding_boxes = sorted(bounding_boxes, key=lambda b: b[1])
        
        merged_boxes = []
        i = 0
        
        while i < len(bounding_boxes) - 1:
            x1, y1, w1, h1 = bounding_boxes[i]
            x2, y2, w2, h2 = bounding_boxes[i + 1]
        
            # Check if two bounding boxes are close enough to be considered part of the same section
            if abs(y2 - (y1 + h1)) < 50:  # Merge if next box is within 50 pixels
                x_new = min(x1, x2)
                y_new = min(y1, y2)
                w_new = max(x1 + w1, x2 + w2) - x_new
                h_new = max(y1 + h1, y2 + h2) - y_new
        
                merged_boxes.append((x_new, y_new, w_new, h_new))
                i += 2
            else:
                i += 1
        
        output_dir = "/kaggle/working/cropped_paragraphs"
        os.makedirs(output_dir, exist_ok=True)
        
        min_area = 10000  # Minimum bounding box area (to ignore single words)
        min_height = 60   # Minimum height of a paragraph block
        min_aspect_ratio = 0.5  # To avoid very long but short-height text (headers, footers)
        
        for i, (x, y, w, h) in enumerate(merged_boxes):
            aspect_ratio = w / h 
        
            if h > min_height and w * h > min_area and aspect_ratio > min_aspect_ratio:
                cropped_paragraph = image[y:y+h, x:x+w]  # Crop paragraph
                save_path = os.path.join(output_dir, f"multi_paragraph_{j}_{i+1}.png")
                cv2.imwrite(save_path, cropped_paragraph)
        
                # cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
        half_count = len(contours) // 3
        j=1
        for i, cnt in enumerate(contours):
            x, y, w, h = cv2.boundingRect(cnt)
            
            
            aspect_ratio = w / h 
        
            if h > min_height and w * h > min_area and aspect_ratio > min_aspect_ratio:
                cropped_paragraph = image[y:y+h, x:x+w]  # Crop paragraph
                save_path = os.path.join(output_dir, f"paragraph_{j}_{i+1}.png")
                cv2.imwrite(save_path, cropped_paragraph)
                j=j+1
            if j==half_count:
                break
                # Draw bounding box on original image (for visualization)
                # cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

In [7]:
augment_images(image_names)

# Generate True text from the images using OCR
takes about 12-15 min to generate text

In [8]:
def load_images_from_folder(folder):
    images = []
    image_names = []
    cnt=0
    for filename in os.listdir(folder):
        if filename.endswith(".png"):
            img_path = os.path.join(folder, filename)
            img = Image.open(img_path).convert("RGB")
            images.append(img)
            image_names.append(filename)
            cnt=cnt+1
    return images, image_names,cnt

image_folder = "/kaggle/working/cropped_paragraphs"
dataset, image_names,cnt = load_images_from_folder(image_folder)
print(f"Loaded images successfully: {cnt}")

Loaded images successfully: 1576


In [9]:
def generate_ground_truth(images, image_names):
    ground_truth = {}
    for img, name in tqdm(zip(images, image_names), total=len(image_names), desc="Processing Images"):
        text = pytesseract.image_to_string(img).strip()
        if not text:  # If OCR fails
            text = "N/A"
        ground_truth[name] = text
    return ground_truth

ground_truth_data = generate_ground_truth(dataset, image_names)
print("true_texts generated")

Processing Images: 100%|██████████| 1576/1576 [13:36<00:00,  1.93it/s]

true_texts generated





# Creating and Loading the model "Llama-3.2-11B-Vision"
Loaded a 4-bit quantized llama-3.2-11B-vision model. *Note: Loading the model can take upto 4 minutes as the model is 22GB big to download*

In [9]:
def load_model():
    model_name = "meta-llama/Llama-3.2-11B-Vision"

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True
    )

    model = AutoModelForVision2Seq.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.float16
    )

    processor = AutoProcessor.from_pretrained(model_name)

    return model, processor

In [10]:
model, processor = load_model()

config.json:   0%|          | 0.00/5.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/89.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs-us-1.hf.co/repos/f2/7e/f27e26d7824ae3c888e292f98ea8166ad1843be96bc0fb64235bda4c0030da7b/806d7a1d87d0a2d45b2f6c42dddc8b8f2b77bcaf45ac085181d0af74f7492909?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model-00004-of-00005.safetensors%3B+filename%3D%22model-00004-of-00005.safetensors%22%3B&Expires=1742929185&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MjkyOTE4NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zL2YyLzdlL2YyN2UyNmQ3ODI0YWUzYzg4OGUyOTJmOThlYTgxNjZhZDE4NDNiZTk2YmMwZmI2NDIzNWJkYTRjMDAzMGRhN2IvODA2ZDdhMWQ4N2QwYTJkNDViMmY2YzQyZGRkYzhiOGYyYjc3YmNhZjQ1YWMwODUxODFkMGFmNzRmNzQ5MjkwOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=FRnEI71GPYBvuvvOOlChZHdxS42otFryRj4wR76AzW03zN0a%7EpXPAxRYt6d65M5PuEZAsnqVc4LdB2X%7Eec5ecRD0h7D%7EMHiUiQ1qBOWx35zdyeZNvN5LxgC82fN1xUsyDHGY5dYt8ppiyupfEPiAloHj0Q7wLYJ-3pdTA9pAqZW9%7EZMwTsEewMvGDzKEvqOvl5yuBXH6ktD41aBNWkUSy9GqNIgUnu

model-00004-of-00005.safetensors:  75%|#######5  | 3.76G/5.00G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

#  Extract baseline texts using the model
* Took about 1.5 hours to extract text using the model if the max_length put to be 256.
* And would take 3 hours for max_length=512.

In [11]:
from torch.amp import autocast

In [12]:
torch.cuda.empty_cache()

In [13]:
def extract_text(images, model, processor, batch_size=6):
    texts = []
    
    device = next(model.parameters()).device
    
    for i in tqdm(range(0, len(images), batch_size)):
        batch = images[i:i+batch_size]
        
        inputs = processor(images=batch, return_tensors="pt", padding=True).to(device)
        
        with autocast('cuda'):
            outputs = model.generate(**inputs, max_length=128, num_beams=2, early_stopping=True)
        
        batch_texts = processor.batch_decode(outputs, skip_special_tokens=True)
        texts.extend(batch_texts)
    
    return texts

baseline_texts = extract_text(dataset, model, processor, batch_size=6)

  0%|          | 0/263 [00:13<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.41 GiB. GPU 0 has a total capacity of 14.74 GiB of which 862.12 MiB is free. Process 3855 has 13.90 GiB memory in use. Of the allocated memory 12.38 GiB is allocated by PyTorch, and 1.39 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Text Organization
Cleaning the texts using regular expression for removing some unnecessary characters

In [None]:
import re
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt_tab')

def clean_text(text):
    text = text.strip()  # Remove leading and trailing spaces
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    text = re.sub(r'\n+', ' ', text)  # Remove excessive newlines
    text = text.replace("  ", " ")  # Remove double spaces
    return text

def structure_text(text):
    sentences = sent_tokenize(text)  # Tokenize into sentences
    structured_text = "\n".join(sentences)  # Join sentences with newline
    return structured_text

def process_extracted_text(extracted_texts):
    organized_texts = []
    for text in extracted_texts:
        cleaned = clean_text(text)
        structured = structure_text(cleaned)
        organized_texts.append(structured)
    
    return organized_texts

In [None]:
def batch_list(data, batch_size):
    return ["".join(data[i:i + batch_size]) for i in range(0, len(data), batch_size)]
    
batch_true_text=batch_list(true_texts,4)

In [None]:
processed_true_texts = process_extracted_text(batch_true_text)

In [None]:
processed_predicted_texts=process_extracted_text(baseline_texts)

# Evaluating texts i.e. calculating words error and character error
* The error comes to be quite high because the length of the text generated by the model is much less than the length of actual text as extrated by OCR.

In [None]:
!pip install jiwer
from jiwer import wer, cer
def evaluate_texts(true_texts, predicted_texts):
    word_error = wer(true_texts, predicted_texts)
    char_error = cer(true_texts, predicted_texts)
    return word_error, char_error

In [None]:
baseline_wer,baseline_cer = evaluate_texts(true_texts, predicted_texts)
print(f"Word Error Rate: {baseline_wer}")
print(f"Character Error Rate: {baseline_cer}")

# Fine tune the model using LoRA (Low Rank Adaptation)
* training the model and obtaining the fine_tuned model took quite longer approax. 6-7 hours 

In [None]:
# def prepare_dataset(images, true_texts, processor):
#     pixel_values_list = []
#     input_ids_list = []
    
#     for img, text in zip(images, true_texts):
#         image_features = processor(images=img, return_tensors="pt")
#         pixel_values_list.append(image_features["pixel_values"][0])
        
#         text_features = processor(text=text, return_tensors="pt", padding="max_length", max_length=256)
#         input_ids_list.append(text_features["input_ids"][0])
    
#     dataset_dict = {
#         "pixel_values": pixel_values_list,
#         "labels": input_ids_list
#     }
    
#     dataset = Dataset.from_dict(dataset_dict)
    
#     dataset = dataset.train_test_split(test_size=0.1, seed=42)
#     return dataset

In [None]:
def prepare_dataset_batch(images, true_texts, processor, batch_size=4):
    pixel_values = []
    input_ids = []
    
    # Process images in batches
    for i in range(0, len(images), batch_size):
        batch_images = images[i:i+batch_size]
        batch_texts = ["".join(true_texts[i:i + batch_size])]
        
        # Process image batch
        image_features = processor(
            images=batch_images, 
            return_tensors="pt",
            padding=True
        )
        pixel_values.extend(image_features["pixel_values"])
        
        # Process text batch
        text_features = processor(
            text=batch_texts,
            return_tensors="pt",
            padding="max_length",
            max_length=512,
            truncation=True
        )
        input_ids.extend(text_features["input_ids"])
        
    print(len(pixel_values))
    print(len(input_ids))
    return Dataset.from_dict({
        "pixel_values": pixel_values,
        "labels": input_ids
    }).train_test_split(test_size=0.1, seed=42)


In [None]:
# def collate_fn(batch):
#     pixel_values = torch.stack([item["pixel_values"] for item in batch])
#     labels = torch.stack([item["labels"] for item in batch])
    
#     return {
#         "pixel_values": pixel_values,
#         "labels": labels
#     }

In [None]:
def configure_lora(model):
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    
    return model

In [None]:
# def train_model(model, dataset, processor):
#     training_args = TrainingArguments(
#         output_dir="./llama-vision-finetuned",
#         num_train_epochs=3,
#         per_device_train_batch_size=8,
#         per_device_eval_batch_size=8,
#         gradient_accumulation_steps=2,
#         dataloader_num_workers=2,  # parallel loading
#         learning_rate=2e-4,
#         weight_decay=0.01,
#         logging_steps=10,
#         eval_strategy="epoch",
#         save_strategy="epoch",
#         load_best_model_at_end=True,
#         push_to_hub=False,
#         remove_unused_columns=False
#     )
    
#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=dataset["train"],
#         eval_dataset=dataset["test"],
#         data_collator=collate_fn,
#     )
    
#     trainer.train()
#     return trainer, model

In [None]:
from transformers import TrainerCallback, TrainingArguments, Trainer
from tqdm import tqdm

class ProgressBarCallback(TrainerCallback):
    def __init__(self, total_steps):
        self.pbar = tqdm(total=total_steps, desc="Training Progress")

    def on_step_end(self, args, state, control, **kwargs):
        self.pbar.update(1)

    def on_train_end(self, args, state, control, **kwargs):
        self.pbar.close()

def train_model(model, dataset, processor):
    training_args = TrainingArguments(
        output_dir="./llama-vision-finetuned",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,
        dataloader_num_workers=2,
        learning_rate=2e-4,
        weight_decay=0.01,
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        remove_unused_columns=False
    )

    total_steps = (len(dataset["train"]) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)) * training_args.num_train_epochs
    progress_callback = ProgressBarCallback(total_steps)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        data_collator=collate_fn,
        callbacks=[progress_callback]  # Attach the progress bar
    )

    trainer.train()
    return trainer, model


In [None]:
# Prepare dataset
dataset = prepare_dataset_batch(cropped_dataset, true_texts, processor)

In [None]:
# Configure and fine-tune model
model_finetuned = configure_lora(model)


In [None]:
trainer, model_finetuned = train_model(model_finetuned, dataset, processor)

In [None]:
finetuned_texts=extract_text(cropped_dataset,model_finetuned,processor)

In [None]:
organised_finetuned_texts=process_extracted_text(finetuned_texts)

In [None]:
finetuned_wer, finetuned_cer = evaluate_texts(true_texts, organized_finetuned_texts)

In [None]:
print("Baseline Model Performance:")
print(f"Word Error Rate: {baseline_wer:.4f}")
print(f"Character Error Rate: {baseline_cer:.4f}")

print("\nFine-tuned Model Performance:")
print(f"Word Error Rate: {finetuned_wer:.4f}")
print(f"Character Error Rate: {finetuned_cer:.4f}")

print("\nImprovement:")
print(f"Word Error Rate Improvement: {(baseline_wer - finetuned_wer) * 100:.2f}%")
print(f"Character Error Rate Improvement: {(baseline_cer - finetuned_cer) * 100:.2f}%")