# LLM OCR Correction



## Imports

In [None]:
from pathlib import Path

import os
import dotenv
import json
import openai
import random
import litellm
from litellm import completion
from tqdm.notebook import tqdm # Import tqdm for notebooks

import dotenv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from pydantic import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser
from joblib import Parallel, delayed
import time

from joblib import Parallel, delayed

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

from logging import getLogger
logger = getLogger(__name__)
import logging
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                    level=logging.INFO,
                    datefmt='%Y-%m-%d %H:%M:%S',
                    )

dotenv.load_dotenv(Path.home() / '.env')

# Start from a known state: only the key-less vLLM branch below picks a model,
# so without this the `if not DEFAULT_MODEL` check further down raised
# NameError on every other path.
DEFAULT_MODEL = None

try:
    # Point the OpenAI client at the vLLM API server.
    # A missing, empty or "EMPTY" NSK_VLLM_KEY means "no key configured".
    vllm_key = os.environ.get("NSK_VLLM_KEY")
    if not vllm_key or vllm_key == "EMPTY":
        client = openai.OpenAI(
            base_url=os.environ["NSK_VLLM_BASE_URL"]
        )
        DEFAULT_MODEL = "google/gemma-3-27b-it"
    else:
        client = openai.OpenAI(
            api_key=vllm_key,
            base_url=os.environ["NSK_VLLM_BASE_URL"]
        )
except Exception as exc:
    # Best-effort fallback to the LLM proxy when the vLLM settings are missing
    # or the client cannot be constructed. `Exception` (not a bare `except`)
    # keeps KeyboardInterrupt/SystemExit propagating.
    logger.warning(f"vLLM client setup failed ({exc!r}); falling back to the LLM proxy")
    api_key = os.environ["LLM_PROXY_ILSP_EVAL_API_KEY"]
    base_url = os.environ["LLM_PROXY_ILSP_BASE_URL"]
    client = openai.OpenAI(
        api_key=api_key,
        base_url=base_url,
    )

# No model chosen above: use the first of the candidate model names.
if not DEFAULT_MODEL:
    models_to_test = ["gemma3-27b-it", 'krikri-dpo-1560', "llama-3.1-8b"]
    DEFAULT_MODEL = models_to_test[0]

print(DEFAULT_MODEL)


## Defaults, template and functions

In [None]:
dotenv.load_dotenv(Path.home() / '.env')

# Reset before choosing: only the key-less vLLM branch below assigns a model,
# so without this the `if not DEFAULT_MODEL` check raised NameError in a fresh
# kernel (or silently reused a stale value when the notebook was re-run).
DEFAULT_MODEL = None

try:
    # Point the OpenAI client at the vLLM API server.
    # A missing, empty or "EMPTY" NSK_VLLM_KEY means "no key configured".
    vllm_key = os.environ.get("NSK_VLLM_KEY")
    if not vllm_key or vllm_key == "EMPTY":
        client = openai.OpenAI(
            base_url=os.environ["NSK_VLLM_BASE_URL"]
        )
        DEFAULT_MODEL = "google/gemma-3-27b-it"
    else:
        client = openai.OpenAI(
            api_key=vllm_key,
            base_url=os.environ["NSK_VLLM_BASE_URL"]
        )
except Exception as exc:
    # Best-effort fallback to the LLM proxy when the vLLM settings are missing
    # or the client cannot be constructed. `Exception` (not a bare `except`)
    # keeps KeyboardInterrupt/SystemExit propagating.
    logger.warning(f"vLLM client setup failed ({exc!r}); falling back to the LLM proxy")
    api_key = os.environ["LLM_PROXY_ILSP_EVAL_API_KEY"]
    base_url = os.environ["LLM_PROXY_ILSP_BASE_URL"]
    client = openai.OpenAI(
        api_key=api_key,
        base_url=base_url,
    )

models_to_test = ["gemma3-27b-it", 'krikri-dpo-1560', "llama-3.1-8b"]
# No model chosen above: this cell falls back to the second candidate.
if not DEFAULT_MODEL:
    DEFAULT_MODEL = models_to_test[1]

# System message sent with every request (default `system_prompt` of generate_text).
DEFAULT_SYSTEM_PROMPT = "You are an AI assistant specialized in correcting polytonic Greek text created via OCR. Always provide your output in the specified JSON format."
# Task instructions for the model. This text is also the prefix of `template`,
# so any edit here changes the prompt sent for every chunk.
DEFAULT_USER_PROMPT = """Your input text is polytonic Greek text created via OCR. You have to read the input text and generate a corrected version without the errors created by the OCR process.  

You should return polytonic Greek text. You should NOT convert the input text to monotonic Greek. For example, you should not change  "ΌΡΟΙ" to "ΟΡΟΙ" if  "ΌΡΟΙ" is correct polytonic Greek.

You should not change markdown headers or tables in the input text.  For example, you should not change  "##" to "#" in a Markdown header.

You also have to provide a corrections list in Markdown explaining your corrections. Apart from this list, in your answer do NOT explain what you plan to do; simply return the output in the specified JSON format."""

# Structured answer expected from the LLM for one OCR chunk; used as the
# `pydantic_object` of the output parser defined below.
# NOTE(review): documented with comments rather than a class docstring on
# purpose - pydantic presumably copies a model docstring into the JSON schema,
# which would alter the format instructions sent to the model; confirm before
# adding one.
class PolytonicSpeechCorrection(BaseModel):
    # Markdown list describing each correction that was applied.
    corrections_list: str = Field(description="Markdown list with the corrections applied in the polytonic input text.")
    # The chunk text after correction.
    corrected_text: str = Field(description="The corrected polytonic text.")

# Parser that turns the raw LLM reply into a PolytonicSpeechCorrection and
# supplies the text substituted for {format_instructions} below.
parser = PydanticOutputParser(pydantic_object=PolytonicSpeechCorrection)

# Per-chunk user prompt: the task instructions followed by the OCR text and the
# output-format instructions. Filled in with str.format(ocr_text=...,
# format_instructions=...) in process_markdown_file.
template = DEFAULT_USER_PROMPT + "\n" + """

---
Polytonic Greek text created via OCR:
{ocr_text}
---

{format_instructions}
"""


def generate_text(
    client,
    model=DEFAULT_MODEL,
    system_prompt=DEFAULT_SYSTEM_PROMPT,
    user_prompt=DEFAULT_USER_PROMPT,
    temperature=0.1, 
    max_tokens=5000,
    top_p=0.9,
    frequency_penalty=0.0,
    presence_penalty=0.0,
):
    """
    Send one system + user message pair to a chat-completions endpoint and
    return the text of the first choice.

    The request is made through ``client.chat.completions.create``, so any
    client object exposing that OpenAI-style interface can be passed.

    Note: the defaults for ``model``, ``system_prompt`` and ``user_prompt`` are
    bound when this function is defined; reassigning the module-level
    constants afterwards does not change them.

    Args:
        client: Client exposing ``chat.completions.create`` (e.g. ``openai.OpenAI``).
        model (str): Name of the model to query.
        system_prompt (str): Content of the system message.
        user_prompt (str): Content of the user message.
        temperature (float): Sampling temperature.
        max_tokens (int): Upper bound on the number of generated tokens.
        top_p (float): Nucleus-sampling probability mass.
        frequency_penalty (float): Penalizes tokens in proportion to how often
            they have already appeared.
        presence_penalty (float): Penalizes tokens that have already appeared
            at all, nudging the model towards new content.

    Returns:
        The message content of the first choice, or None if the request (or
        reading the response) raised an exception.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
        )
        return response.choices[0].message.content
    except Exception:
        # Best-effort by design: callers treat None as "no output". Report
        # through the module logger (with traceback) instead of print so the
        # failure shows up alongside the per-chunk log lines.
        logger.exception("Error generating text")
        return None



def split_markdown_file_into_chunks(file_path: str, chunk_size: int = 2500) -> list[str]:
    """
    Cut a Markdown file into line-aligned chunks of roughly 'chunk_size' words.

    Lines are never split: whole lines are accumulated, and a chunk is closed
    just before the line that would push its word count above 'chunk_size'.
    A single line longer than the limit therefore forms a chunk of its own.

    Args:
        file_path (str): Location of the Markdown file.
        chunk_size (int): Target number of whitespace-separated words per
                          chunk. Defaults to 2500.

    Returns:
        list[str]: The chunks in file order. Empty when the path does not
                   exist, is not a regular file, cannot be read, or the file
                   has no content.
    """
    # Guard clauses: report and bail out on unusable paths.
    if not os.path.exists(file_path):
        print(f"Error: File not found at '{file_path}'")
        return []
    if not os.path.isfile(file_path):
        print(f"Error: '{file_path}' is not a file.")
        return []

    pieces = []
    pending_lines = []
    pending_words = 0

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                # str.split() with no argument counts whitespace-separated tokens.
                words_in_line = len(raw_line.split())
                over_limit = pending_words + words_in_line > chunk_size

                # Flush only when something is already buffered, so an
                # oversized first line never yields an empty chunk.
                if over_limit and pending_lines:
                    pieces.append("".join(pending_lines))
                    pending_lines = []
                    pending_words = 0

                pending_lines.append(raw_line)
                pending_words += words_in_line
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return []

    # Whatever is still buffered becomes the final chunk.
    if pending_lines:
        pieces.append("".join(pending_lines))

    return pieces


def process_markdown_file(md_path, n=30, model_name=DEFAULT_MODEL, temperature=0.1, chunk_size=250):
    """
    Run LLM-based OCR correction over one Markdown file, chunk by chunk.

    The file is split with split_markdown_file_into_chunks; each stripped chunk
    is inserted into `template`, sent through generate_text and the reply is
    parsed with `parser`. A failure on one chunk is recorded in its result and
    does not stop the remaining chunks.

    Args:
        md_path: Path of the Markdown file (str or Path).
        n: Unused; kept so existing calls that pass it keep working.
        model_name: Model passed to generate_text. The default is bound when
            this function is defined.
        temperature: Sampling temperature passed to generate_text.
        chunk_size: Target number of words per chunk (previously fixed at 250).

    Returns:
        list[dict]: One dict per chunk with the keys "corrected_text",
        "corrections_list", "status", "filename", "chunk_index" and
        "original_text". On failure "corrected_text" is None,
        "corrections_list" is an empty list and "status" starts with "failure:".

    Raises:
        OSError, UnicodeDecodeError: if the file cannot be opened or decoded
        as UTF-8.
    """
    # Fail fast on a missing or undecodable file. The splitter below swallows
    # read errors and returns [], which would otherwise yield an empty result
    # that the caller would save and treat as "already processed".
    with open(md_path, encoding="utf-8") as f:
        f.read()

    chunks = split_markdown_file_into_chunks(str(md_path), chunk_size=chunk_size)
    total_chunks = len(chunks)
    # Identical for every chunk, so compute it once.
    format_instructions = parser.get_format_instructions()

    correction_results = []
    for idx, chunk in enumerate(chunks):
        chunk_stripped = chunk.strip()
        logger.info(f"Processing chunk #{idx+1} of {total_chunks} of {md_path}")
        try:
            formatted_user_prompt = template.format(
                ocr_text=chunk_stripped,
                format_instructions=format_instructions,
            )
            raw_llm_output = generate_text(
                client=client,
                model=model_name,
                system_prompt=DEFAULT_SYSTEM_PROMPT,
                user_prompt=formatted_user_prompt,
                temperature=temperature,
            )
            # generate_text returns None when the request failed; say so
            # explicitly instead of letting the parser choke on None.
            if raw_llm_output is None:
                raise ValueError("LLM returned no output")
            # Parse the raw LLM output into our Pydantic model
            parsed_output = parser.parse(raw_llm_output)
            correction_result = {
                "corrected_text": parsed_output.corrected_text,
                "corrections_list": parsed_output.corrections_list,
                "status": "success"
            }
            logger.info(f"Success in processing chunk #{idx+1}")
        except Exception as e:
            # Per-chunk best effort: record the failure and carry on.
            # NOTE(review): "corrections_list" is a str on success but a list
            # here; left unchanged so the saved JSON keeps its current shape.
            correction_result = {
                "corrected_text" : None,
                "corrections_list" : [],
                "status" :  f"failure:  General error during analysis: {e}"
            }
            logger.warning(f"Failure in processing chunk #{idx+1}: {e}")
        correction_result.update({
            "filename": str(md_path),
            "chunk_index": idx,
            "original_text": chunk_stripped
        })
        correction_results.append(correction_result)
    return correction_results



## Processing

In [None]:
# Every sub-directory of the newspapers folder is scanned for docling output.
data_folder = Path("..") / "data/Newspapers/"
subdirs = [entry for entry in data_folder.iterdir() if entry.is_dir()]

# Restrict the run to these files; leave the list empty to process everything.
filenames_to_check = ["124_6_-1.docling.md",  "123_449_-1.docling.md",  "65_1_-1.docling.md", "108_2378_-1.docling.md", "5009_51.docling.md", 
                      "arc-2005-8632.docling.md", "Athinai_E_1907-1913_ar 17.docling.md"
                      ]

# Collect the Markdown files to correct, keeping directory and glob order.
markdown_paths = []
for subdir in subdirs:
    md_paths = list(subdir.glob("*.docling.md"))
    markdown_paths.extend(
        candidate for candidate in md_paths
        if not filenames_to_check or candidate.name in filenames_to_check
    )

# Correct each file once; the presence of the JSON output marks it as done.
for md_path in markdown_paths:
    corrected_json_path = md_path.with_name(md_path.stem + ".corrected.json")
    if corrected_json_path.exists():
        logger.info(f"Skipping processing already processed {md_path}")
        continue

    logger.info(f"Processing {md_path} with {client.base_url}/{DEFAULT_MODEL}")
    file_results = process_markdown_file(md_path)
    try:
        with open(corrected_json_path, 'w', encoding='utf-8') as f:
            json.dump(file_results, f, indent=4, ensure_ascii=False)
            print(f"Exported results to {corrected_json_path} ")
    except IOError as e:
        print(f"Error saving file: {e}")
    


In [None]:
# df = pd.DataFrame(all_results)
# df.to_excel(Path.home() / "llm_correction_results.xlsx")