In [None]:
!pip install transformers torch accelerate bitsandbytes jiwer datasets peft loralib tqdm pytesseract
!apt-get install tesseract-ocr
!apt-get install tesseract-ocr-eng

In [2]:
import torch
from transformers import ( LlamaForCausalLM, LlamaTokenizer, AutoProcessor, AutoModelForVision2Seq, TrainingArguments, Trainer, BitsAndBytesConfig )
from datasets import Dataset, load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import jiwer
from PIL import Image
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import json
from glob import glob
import random
import pytesseract

In [3]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Read the Hugging Face access token from the Kaggle secret store
# and use it to authenticate this session with the Hub.
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
login(token=HF_TOKEN)

In [4]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda




# Load Images from the folder

In [5]:
import os
from PIL import Image

def load_images_from_folder(folder):
    """Load every ``.jpg`` file in ``folder`` as an RGB image.

    Files are visited in sorted filename order so that the returned lists are
    reproducible between runs (``os.listdir`` returns entries in arbitrary
    order).

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        A tuple ``(images, image_names)`` of two parallel lists: the RGB
        images and the file names they were read from.
    """
    images = []
    image_names = []
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".jpg"):
            continue
        img_path = os.path.join(folder, filename)
        # Keep only the converted image and close the source file right away
        # instead of leaving the handle open until garbage collection.
        with Image.open(img_path) as img:
            images.append(img.convert("RGB"))
        image_names.append(filename)
    return images, image_names

def crop_and_save_images(images, image_names, output_folder,
                         left_divisor=22, bottom_divisor=12.2):
    """Crop a strip off the left and bottom of each image and save the result.

    For an image of size ``(width, height)`` the kept region starts at
    ``x = width // left_divisor`` and ends at
    ``y = height - height // bottom_divisor``; the top and right edges are
    left untouched.

    Args:
        images: Iterable of image objects exposing ``size``, ``crop`` and ``save``.
        image_names: File names, parallel to ``images``; each cropped image is
            written to ``output_folder`` under its name.
        output_folder: Destination directory; created if it does not exist.
        left_divisor: The left margin removed is ``width // left_divisor``.
        bottom_divisor: The bottom margin removed is ``height // bottom_divisor``.

    Returns:
        A tuple ``(cropped_images, cropped_image_names)`` of parallel lists.
    """
    # exist_ok avoids the separate "exists?" check and its race window.
    os.makedirs(output_folder, exist_ok=True)

    cropped_images = []
    cropped_image_names = []

    for img, filename in zip(images, image_names):
        width, height = img.size
        # Floor division by a float yields a float; cast so the crop box
        # holds integer pixel coordinates only.
        left = int(width // left_divisor)
        lower = height - int(height // bottom_divisor)
        crop_box = (left, 0, width, lower)

        cropped_img = img.crop(crop_box)
        cropped_img.save(os.path.join(output_folder, filename))

        cropped_images.append(cropped_img)
        cropped_image_names.append(filename)

    return cropped_images, cropped_image_names

# Source images and the destination for their cropped copies
image_folder = "/kaggle/input/dataset/images"
cropped_image_folder = "/kaggle/working/cropped_images"

# Read every source image into memory ...
dataset, image_names = load_images_from_folder(image_folder)

# ... then crop each one, writing the result to disk and keeping it in memory
cropped_dataset, cropped_image_names = crop_and_save_images(
    dataset,
    image_names,
    cropped_image_folder,
)

print("Cropped dataset created successfully!")


Cropped dataset created successfully!


# Generate True text from the images using OCR
Takes about 12-15 minutes to generate the text.

In [6]:
def generate_ground_truth(images, image_names):
    """Run Tesseract OCR on each image and map image name -> recognized text.

    The OCR result is stripped of surrounding whitespace; when nothing is
    left, the placeholder ``"N/A"`` is stored instead.

    Args:
        images: Iterable of images accepted by ``pytesseract.image_to_string``.
        image_names: Names parallel to ``images``, used as dictionary keys.

    Returns:
        dict mapping each name to its OCR text (or ``"N/A"``).
    """
    return {
        label: pytesseract.image_to_string(picture).strip() or "N/A"
        for picture, label in zip(images, image_names)
    }

# OCR every cropped image once, then order the reference strings by image_names
ground_truth_data = generate_ground_truth(cropped_dataset, cropped_image_names)
true_texts = list(map(ground_truth_data.__getitem__, image_names))
print("true_texts generated")



true_texts generated


In [None]:
# Spot-check: display the OCR reference text stored for one image
ground_truth_data['india_news_p000060.jpg']

# Creating and Loading the model "Llama-3.2-11B-Vision"
Loads a 4-bit quantized Llama-3.2-11B-Vision model. *Note: Loading the model can take up to 10 minutes.*

In [8]:
def load_model(model_name="meta-llama/Llama-3.2-11B-Vision"):
    """Load a 4-bit quantized vision-to-sequence model and its processor.

    The weights are loaded with a bitsandbytes NF4 4-bit configuration
    (double quantization enabled, float16 compute dtype) and placed with
    ``device_map="auto"``.

    Args:
        model_name: Hugging Face Hub id (or local path) of the checkpoint.
            Defaults to the Llama 3.2 11B Vision base model.

    Returns:
        A tuple ``(model, processor)``.
    """
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",  # normalized float 4
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForVision2Seq.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.float16,
    )

    # The processor is loaded from the same checkpoint as the model.
    processor = AutoProcessor.from_pretrained(model_name)

    return model, processor

In [None]:
# Instantiate the quantized model and its processor (slow; see the note above this section)
model, processor = load_model()

# Extract baseline texts using the model
Took about 5 hours to extract text using the model.

In [None]:
def extract_text(images, model, processor, max_length=256, num_beams=5):
    """Generate one text string per image with the vision-language model.

    Each image is processed on its own: the processor builds the model
    inputs, the model generates with beam search, and the first decoded
    sequence is kept.

    Args:
        images: Iterable of images accepted by ``processor``.
        model: Model exposing ``generate`` and ``device``.
        processor: Processor used to build the inputs and decode the outputs.
        max_length: ``max_length`` forwarded to ``model.generate``.
        num_beams: Number of beams forwarded to ``model.generate``.

    Returns:
        list of decoded strings, one per input image, in input order.
    """
    # NOTE(review): no text prompt is passed to the processor; confirm that
    # image-only inputs are what this model expects for transcription.
    texts = []
    for img in tqdm(images):
        # Move the inputs to the device the model reports rather than
        # depending on a notebook-global `device` variable.
        inputs = processor(images=img, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
        )
        texts.append(processor.batch_decode(outputs, skip_special_tokens=True)[0])
    return texts

# Run the un-tuned model over every cropped image to get the baseline transcriptions
baseline_texts = extract_text(cropped_dataset,model,processor)

# Text Organization
Clean the texts with regular expressions to remove newlines (`\n`) and other unnecessary characters.

In [None]:
import re
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the 'punkt_tab' NLTK resource (presumably the data sent_tokenize relies on; confirm for the installed NLTK version)
nltk.download('punkt_tab')

def clean_text(text):
    """Normalize a text to single-spaced ASCII.

    Runs of non-ASCII characters are replaced by a space, every run of
    whitespace (spaces, tabs, newlines) is collapsed to one space, and
    leading and trailing whitespace is removed.

    Args:
        text: Input string.

    Returns:
        The cleaned string; never contains consecutive spaces or
        leading/trailing whitespace.
    """
    # Replace non-ASCII characters first: the spaces this inserts must be
    # collapsed and trimmed by the steps below, not left behind.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # \s+ also matches newlines, so no separate newline pass is needed.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def structure_text(text):
    """Return ``text`` with each sentence on its own line.

    Sentences are found with ``sent_tokenize`` and re-joined using a single
    newline as the separator.
    """
    return "\n".join(sent_tokenize(text))

def process_extracted_text(extracted_texts):
    """Clean and sentence-structure every text in ``extracted_texts``.

    Each item is passed through ``clean_text`` and then ``structure_text``.

    Returns:
        list of processed strings, in the same order as the input.
    """
    return [structure_text(clean_text(raw)) for raw in extracted_texts]

# Clean the baseline model outputs and put one sentence per line
organized_texts = process_extracted_text(baseline_texts)

# Evaluating texts, i.e. calculating the word error rate and character error rate

In [None]:
from jiwer import wer, cer
def evaluate_texts(true_texts, predicted_texts):
    """Score predictions against references with jiwer.

    Args:
        true_texts: Reference texts.
        predicted_texts: Predicted texts, parallel to ``true_texts``.

    Returns:
        A tuple ``(word_error, char_error)`` holding the results of
        ``wer`` and ``cer`` respectively.
    """
    return (
        wer(true_texts, predicted_texts),
        cer(true_texts, predicted_texts),
    )

# Compare the OCR references with the cleaned baseline model outputs.
# NOTE(review): true_texts holds the OCR strings as produced (only stripped), while
# organized_texts was cleaned and re-joined with newlines; confirm whether both sides
# should be normalized the same way before scoring.
word_error, char_error = evaluate_texts(true_texts, organized_texts)
print(f"Word Error Rate: {word_error}")
print(f"Character Error Rate: {char_error}")

# Fine tune the model using LoRA (Low Rank Adaptation)