In [None]:
# %pip installs into the environment of the running kernel.
# nltk is included because a later cell imports it.
%pip install -q transformers torch accelerate bitsandbytes jiwer datasets peft loralib tqdm pytesseract nltk
# -y answers apt's confirmation prompt, which cannot be answered from a notebook cell.
!apt-get install -y tesseract-ocr tesseract-ocr-eng

In [8]:
import torch
from transformers import ( LlamaForCausalLM, LlamaTokenizer, AutoProcessor, AutoModelForVision2Seq, TrainingArguments, Trainer, BitsAndBytesConfig )
from datasets import Dataset, load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import jiwer
from PIL import Image
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import json
from glob import glob
import random
import pytesseract

In [9]:
# Read the Hugging Face access token from the Kaggle secrets store and use it
# to log in to the Hugging Face Hub.
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
login(token=HF_TOKEN)

In [10]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda




# Load images from the folder and crop them

In [11]:
def load_images_from_folder(folder):
    """Load every ``.jpg`` file in ``folder`` as an RGB image.

    Files are visited in sorted filename order so the returned lists are
    reproducible across runs (``os.listdir`` returns entries in arbitrary
    order).

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        A tuple ``(images, image_names)`` of two parallel lists: the images
        converted to RGB and the filenames they were read from.
    """
    images = []
    image_names = []
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".jpg"):
            continue
        img_path = os.path.join(folder, filename)
        # convert() returns a new image, so the handle opened by Image.open
        # can be released as soon as the conversion is done.
        with Image.open(img_path) as raw_img:
            images.append(raw_img.convert("RGB"))
        image_names.append(filename)
    return images, image_names

# Directory containing the raw input images.
image_folder = "/kaggle/input/dataset/images"
# `dataset` holds the RGB images and `image_names` the matching filenames (parallel lists).
dataset, image_names = load_images_from_folder(image_folder)
print("Loaded images successfully")

Cropped dataset created successfully!


In [None]:
def crop_and_save_images(images, image_names, output_folder,
                         left_divisor=22, bottom_divisor=12.2):
    """Crop a margin off the left and bottom of each image and save the result.

    For every image the leftmost ``width // left_divisor`` pixels and the
    bottom ``height // bottom_divisor`` pixels are removed; the top and right
    edges are kept. Each cropped image is written to ``output_folder`` under
    its original filename.

    Args:
        images: Iterable of images providing ``size``, ``crop`` and ``save``.
        image_names: Filenames parallel to ``images``.
        output_folder: Directory to write into; created if it is missing.
        left_divisor: The left margin removed is ``width // left_divisor``.
        bottom_divisor: The bottom margin removed is
            ``height // bottom_divisor``.

    Returns:
        A tuple ``(cropped_images, cropped_image_names)`` of parallel lists.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_folder, exist_ok=True)

    cropped_images = []
    cropped_image_names = []

    for img, filename in zip(images, image_names):
        width, height = img.size
        # Floor division by a float (the default 12.2) returns a float, so
        # cast to int to keep the crop box in whole pixels.
        left = int(width // left_divisor)
        lower = height - int(height // bottom_divisor)
        cropped_img = img.crop((left, 0, width, lower))

        cropped_img.save(os.path.join(output_folder, filename))

        cropped_images.append(cropped_img)
        cropped_image_names.append(filename)

    return cropped_images, cropped_image_names

# Cropped copies are written here, one file per input image under the same filename.
cropped_image_folder = "/kaggle/working/cropped_images"

# `cropped_dataset` and `cropped_image_names` are parallel lists in the same order as `dataset`.
cropped_dataset, cropped_image_names = crop_and_save_images(dataset, image_names, cropped_image_folder)

print("Cropped dataset created successfully!")

# Generate true text from the images using OCR
Takes about 12-15 minutes to generate the text.

In [12]:
def generate_ground_truth(images, image_names):
    """Map each image name to the text ``pytesseract`` reads from its image.

    The OCR result is stripped of surrounding whitespace; when nothing is
    left, the placeholder ``"N/A"`` is stored instead.

    Args:
        images: Images to run OCR on.
        image_names: Names parallel to ``images``; used as dictionary keys.

    Returns:
        Dict of ``name -> recognised text`` (or ``"N/A"``).
    """
    return {
        name: pytesseract.image_to_string(img).strip() or "N/A"
        for img, name in zip(images, image_names)
    }

# OCR every cropped image, then build the reference list in the order of the
# cropped images. The lookup uses the same name list that produced the dict
# keys, so the two cannot drift apart.
ground_truth_data = generate_ground_truth(cropped_dataset, cropped_image_names)
true_texts = [ground_truth_data[img_name] for img_name in cropped_image_names]
print("true_texts generated")



true_texts generated


# Creating and loading the model "Llama-3.2-11B-Vision"
Loads a 4-bit quantized Llama-3.2-11B-Vision model. *Note: Loading the model can take up to 10 minutes.*

In [22]:
def load_model(model_name="meta-llama/Llama-3.2-11B-Vision"):
    """Load a 4-bit quantized vision-to-sequence model and its processor.

    The model is loaded with a bitsandbytes configuration using NF4
    quantization, double quantization and float16 compute, with
    ``device_map="auto"`` and ``torch_dtype=torch.float16``.

    Args:
        model_name: Hub identifier (or path) passed to ``from_pretrained``
            for both the model and the processor. Defaults to the checkpoint
            this notebook was written for.

    Returns:
        A tuple ``(model, processor)``.
    """
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",  # normalized float 4
        bnb_4bit_use_double_quant=True
    )

    model = AutoModelForVision2Seq.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.float16
    )

    processor = AutoProcessor.from_pretrained(model_name)

    return model, processor

In [None]:
# Load the 4-bit quantized model and its processor.
model, processor = load_model()

# Extract baseline texts using the model
* Extracting text with the model took about 5 hours with max_length set to 256.
* It would take about 10 hours with max_length=512.

In [None]:
def extract_text(images, model, processor, max_length=256, num_beams=5):
    """Generate text for each image with ``model`` and return the decoded strings.

    Args:
        images: Iterable of images accepted by ``processor``.
        model: Model exposing ``generate``; ``eval`` and ``device`` are used
            when present.
        processor: Callable that turns an image into model inputs and that
            provides ``batch_decode``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.

    Returns:
        A list with one decoded string per input image, in input order.
    """
    # Make sure dropout (e.g. LoRA dropout left on after training) is
    # disabled while generating.
    if hasattr(model, "eval"):
        model.eval()

    # Send inputs to the device the model reports; fall back to the
    # notebook-level `device` when the model does not expose one.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = device

    texts = []
    for img in tqdm(images):
        # NOTE(review): only the image is passed to the processor (no text
        # prompt) — confirm this is what the model expects.
        inputs = processor(images=img, return_tensors="pt").to(target_device)
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
        )
        texts.append(processor.batch_decode(outputs, skip_special_tokens=True)[0])
    return texts

# Baseline: generate text for every cropped image with the model as loaded (before any fine-tuning).
baseline_texts = extract_text(cropped_dataset,model,processor)

# Text Organization
Clean the texts with regular expressions, removing '\n' and other unnecessary characters.

In [25]:
import re
import nltk
from nltk.tokenize import sent_tokenize

# Download the 'punkt_tab' NLTK resource (presumably the data sent_tokenize relies on — confirm).
nltk.download('punkt_tab')

def clean_text(text):
    """Normalize ``text`` to single-spaced ASCII with no surrounding whitespace.

    Steps, in order:
      1. Replace every run of non-ASCII characters with a space.
      2. Collapse every run of whitespace (spaces, tabs, newlines) to one space.
      3. Strip leading and trailing whitespace.

    Non-ASCII removal must run before collapsing and stripping: each removed
    run leaves a space behind, which would otherwise survive as a double
    space inside the text or as a stray space at its ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Drop non-ASCII characters
    text = re.sub(r'\s+', ' ', text)  # Collapse all whitespace, newlines included
    return text.strip()

def structure_text(text):
    """Return ``text`` with each sentence found by ``sent_tokenize`` on its own line."""
    return "\n".join(sent_tokenize(text))

def process_extracted_text(extracted_texts):
    """Pass each text through ``clean_text`` and then ``structure_text``.

    Args:
        extracted_texts: Iterable of raw text strings.

    Returns:
        A new list with the processed texts, in input order.
    """
    return [structure_text(clean_text(raw)) for raw in extracted_texts]

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [16]:
# Clean and sentence-split the OCR reference texts.
# NOTE: this rebinds `true_texts`, so re-running the cell processes already-processed text.
true_texts = process_extracted_text(true_texts)

In [None]:
# Apply the same cleaning and sentence splitting to the baseline model outputs.
predicted_texts=process_extracted_text(baseline_texts)

# Evaluating texts, i.e. calculating word error and character error
* The error is quite high because the text generated by the model is much shorter than the actual text extracted by OCR.

In [None]:
from jiwer import wer, cer
def evaluate_texts(true_texts, predicted_texts):
    """Return ``(word_error, char_error)`` computed with jiwer's ``wer`` and ``cer``.

    ``true_texts`` is passed as the first argument and ``predicted_texts``
    as the second argument of both metric functions.
    """
    return wer(true_texts, predicted_texts), cer(true_texts, predicted_texts)

# Score the processed baseline predictions against the processed OCR references.
baseline_wer, baseline_cer = evaluate_texts(true_texts, predicted_texts)
print(f"Word Error Rate: {baseline_wer}")
print(f"Character Error Rate: {baseline_cer}")

# Fine-tune the model using LoRA (Low-Rank Adaptation)
* Training the model and obtaining the fine-tuned model took quite long, approx. 6-7 hours.

In [41]:
def prepare_dataset(images, true_texts, processor, max_length=256,
                    test_size=0.1, seed=42):
    """Build a train/test ``Dataset`` of image features and tokenized target texts.

    Each image is run through ``processor`` to obtain ``pixel_values``; each
    text is tokenized, padded and truncated to exactly ``max_length`` tokens
    and stored as ``labels``.

    Args:
        images: Images, parallel to ``true_texts``.
        true_texts: Target text for each image.
        processor: Processor called with ``images=...`` and with ``text=...``.
        max_length: Fixed token length of every label sequence.
        test_size: Forwarded to ``train_test_split``.
        seed: Forwarded to ``train_test_split``.

    Returns:
        The result of ``Dataset.train_test_split`` on the assembled dataset.
    """
    pixel_values_list = []
    input_ids_list = []

    for img, text in zip(images, true_texts):
        image_features = processor(images=img, return_tensors="pt")
        pixel_values_list.append(image_features["pixel_values"][0])

        # truncation=True keeps every label sequence at max_length. Padding
        # alone leaves longer texts untruncated, and the resulting ragged
        # lengths cannot be stacked into a batch.
        text_features = processor(
            text=text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )
        input_ids_list.append(text_features["input_ids"][0])

    # NOTE(review): only pixel_values and labels are kept — confirm the
    # model's forward pass needs no other processor outputs.
    dataset = Dataset.from_dict({
        "pixel_values": pixel_values_list,
        "labels": input_ids_list,
    })

    return dataset.train_test_split(test_size=test_size, seed=seed)

In [42]:
def collate_fn(batch):
    """Stack the per-example fields of ``batch`` into batched tensors.

    Args:
        batch: List of examples, each a mapping with ``"pixel_values"`` and
            ``"labels"``. Values may be tensors or nested lists/arrays.

    Returns:
        Dict with ``"pixel_values"`` and ``"labels"``, each a tensor with a
        new leading batch dimension.
    """
    # torch.stack only accepts tensors. Examples may arrive as plain Python
    # lists (e.g. rows of a dataset that has no torch format set), so
    # convert first; as_tensor leaves existing tensors untouched.
    pixel_values = torch.stack([torch.as_tensor(item["pixel_values"]) for item in batch])
    labels = torch.stack([torch.as_tensor(item["labels"]) for item in batch])

    return {
        "pixel_values": pixel_values,
        "labels": labels
    }

In [43]:
def configure_lora(model):
    """Wrap ``model`` with LoRA adapters and print the trainable-parameter summary.

    The LoRA configuration uses rank 16, alpha 32, dropout 0.05, no bias
    training, targets the ``q_proj`` and ``v_proj`` modules and sets the task
    type to ``"CAUSAL_LM"``. The model is first passed through
    ``prepare_model_for_kbit_training`` and then through ``get_peft_model``.

    Args:
        model: The base model to adapt.

    Returns:
        The model returned by ``get_peft_model``.
    """
    adapter_settings = dict(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    lora_config = LoraConfig(**adapter_settings)

    peft_model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)
    peft_model.print_trainable_parameters()
    return peft_model

In [44]:
def train_model(model, dataset, processor):
    """Train ``model`` with a ``Trainer`` and return ``(trainer, model)``.

    Training uses ``dataset["train"]``, evaluation uses ``dataset["test"]``,
    and batches are assembled by ``collate_fn``. Evaluation and checkpoint
    saving both happen once per epoch.

    Args:
        model: The model to train.
        dataset: Mapping with ``"train"`` and ``"test"`` splits.
        processor: Accepted for interface compatibility; not used here.

    Returns:
        The ``Trainer`` instance and the model that was passed in.
    """
    hyperparameters = dict(
        output_dir="./llama-vision-finetuned",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        weight_decay=0.01,
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )
    training_args = TrainingArguments(**hyperparameters)

    train_split, eval_split = dataset["train"], dataset["test"]
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=eval_split,
        data_collator=collate_fn,
    )

    trainer.train()
    return trainer, model

In [45]:
# Build the train/test splits from the cropped images and their processed OCR texts.
# NOTE: this rebinds `dataset`, which an earlier cell used for the list of raw images.
dataset = prepare_dataset(cropped_dataset, true_texts, processor)

In [46]:
# Attach LoRA adapters to the loaded model (training itself happens in a later cell).
# NOTE(review): the base `model` object is passed in directly — confirm whether it is modified in place.
model_finetuned = configure_lora(model)


trainable params: 11,796,480 || all params: 10,654,737,955 || trainable%: 0.1107


In [None]:
# Fine-tune the LoRA-wrapped model; `train_model` returns the trainer and the model it was given.
trainer, model_finetuned = train_model(model_finetuned, dataset, processor)

In [None]:
# Re-run text extraction on the same cropped images, now with the fine-tuned model.
finetuned_texts=extract_text(cropped_dataset,model_finetuned,processor)

In [None]:
# Apply the same cleaning and sentence splitting that was used for the baseline predictions.
organised_finetuned_texts=process_extracted_text(finetuned_texts)

In [None]:
# Score the fine-tuned predictions against the same references. The variable
# holding them is spelled `organised_finetuned_texts` where it is defined.
finetuned_wer, finetuned_cer = evaluate_texts(true_texts, organised_finetuned_texts)

In [None]:
# Side-by-side report of the error rates before and after fine-tuning.
print("Baseline Model Performance:")
print(f"Word Error Rate: {baseline_wer:.4f}")
print(f"Character Error Rate: {baseline_cer:.4f}")

print("\nFine-tuned Model Performance:")
print(f"Word Error Rate: {finetuned_wer:.4f}")
print(f"Character Error Rate: {finetuned_cer:.4f}")

# Improvement is the absolute difference (baseline - fine-tuned) scaled by 100;
# a positive value means the fine-tuned model has the lower error rate.
print("\nImprovement:")
print(f"Word Error Rate Improvement: {(baseline_wer - finetuned_wer) * 100:.2f}%")
print(f"Character Error Rate Improvement: {(baseline_cer - finetuned_cer) * 100:.2f}%")