In [None]:
import base64
import io
from dataclasses import dataclass
from pathlib import Path
from typing import List

import pypdfium2 as pdfium
import typer
from litellm import completion
from PIL import Image

# Typer application object.
# NOTE(review): no command is registered on `app` in the visible cells — confirm it is still needed.
app = typer.Typer()


@dataclass
class ImageData:
    """An encoded image held in memory together with its MIME type."""

    # MIME type describing `content` (images_from_pdf stores "image/jpeg").
    mime_type: str
    # Encoded image bytes.
    content: bytes


@dataclass
class ImagesFromPdfOptions:
    """Options consumed by ``images_from_pdf`` when rasterizing a PDF."""

    # Path of the PDF file to open.
    pdf_file: str
    # Upper bound, in pixels, for both width and height of each page image.
    max_resolution: int = 1600
    # JPEG quality value handed to PIL when each page is encoded.
    quality: int = 85


def images_from_pdf(request: ImagesFromPdfOptions) -> List[ImageData]:
    """Render every page of a PDF to an in-memory JPEG.

    Each page is rendered with ``scale=300 / 72``, shrunk in place so that
    neither side exceeds ``request.max_resolution`` (aspect ratio preserved),
    and encoded as JPEG at ``request.quality``.

    The page and document handles are closed explicitly, also when rendering
    or encoding fails, instead of being left to the garbage collector.

    Args:
        request: Source PDF path plus size and quality options.

    Returns:
        One ``ImageData`` per page, in page order.
    """
    image_data_list = []
    max_dimension = request.max_resolution
    pdf = pdfium.PdfDocument(request.pdf_file)

    try:
        for page_index in range(len(pdf)):
            page = pdf[page_index]
            try:
                bitmap = page.render(scale=300 / 72)
            finally:
                # Release the page handle as soon as it has been rasterized.
                page.close()

            # Keep `bitmap` referenced while the PIL view of it is in use.
            pil_image = bitmap.to_pil()
            pil_image.thumbnail(
                (max_dimension, max_dimension), Image.Resampling.LANCZOS
            )

            img_byte_arr = io.BytesIO()
            pil_image.save(img_byte_arr, format="JPEG", quality=request.quality)

            # getvalue() returns the whole buffer regardless of stream position.
            image_data_list.append(
                ImageData(mime_type="image/jpeg", content=img_byte_arr.getvalue())
            )
    finally:
        pdf.close()

    return image_data_list

def extract_images(
    pdf_files: List[Path] = typer.Option(help="PDF files to process"),
):
    """Rasterize each PDF in ``pdf_files`` and return every page image in one flat list.

    A progress line is echoed per file. Images keep the order of the input
    files and, within a file, the page order returned by ``images_from_pdf``.
    """
    collected = []

    for pdf_file in pdf_files:
        typer.echo(f"Processing {pdf_file}...")

        # Default size/quality options; only the path is supplied.
        options = ImagesFromPdfOptions(pdf_file=str(pdf_file))
        collected += images_from_pdf(options)

    return collected

# Sample scans used to exercise the pipeline. The folder is resolved through
# Path.home() so the notebook does not hard-code one machine's home directory.
DOCS_DIR = Path.home() / "Downloads"
TEST_DOCS = [
    DOCS_DIR / "Scanned_20241216-1513.pdf",
    DOCS_DIR / "Scanned_20241217-1137.pdf",
    DOCS_DIR / "Scanned_20241220-1358.pdf",
]
images = extract_images(TEST_DOCS)

In [4]:
import multiprocessing
import multiprocessing.dummy


# System prompt for both LLM passes: describes the notebook layout, the
# <NOTE>...</NOTE> / <meta> output contract and the Tufte CSS markup to use.
# The embedded Tufte CSS sample must stay well-formed HTML, because the model
# imitates it when emitting sidenotes and margin notes.
SYSTEM_PROMPT = """
You are a note analysis assistant.
You are an expert at deciphering handwritten notes and converting them to HTML.

You rigorously extract every piece of information without fail.
You infer missing content or difficult to read text by using your intuition.
You always treat the original text as sacrosanct and never alter the ordering or meaning.
You always indicate if you can't extract a piece of information and provide your best guess.

Notes all have the same format: a left hand margin with tags and margin notes
and note content in the main body on the right. You always keep these separate
in your formatting.

Notes 2024-12-20 and later begin to use a convention of starting with a title.
For earlier notes, you should infer a whimsical but informative title from the
content of the note.  Follow any directives you see in the margins as commands
to you, e.g. an arrow indicating 'add this to the previous note' should be
interpreted directly by you and not included in the note content.

For earlier notes, separation must be inferred (for later notes, use the title
as an indicator for starting a new note.)  Infer the beginning and end of notes
by using the spacing between notes _and_ the content similarity. Don't start a
new note unless there is a significant vertical space.  Group notes when the
content appears strongly related. Again, later notes will have a title clearly
separating notes from one another.

You must output blocks of HTML for each note. You should start each
note with a block indicating the start of the note:

<NOTE>

Notes are complete HTML documents, beginning with an <HTML> tag, with no
internal styling. Instead they should adhere to the guidelines outlined in Tufte
CSS. Reference the CSS file from static/tufte.css.

https://edwardtufte.github.io/tufte-css/

Organize your document with an article element inside your body tag. Inside
that, use section tags around each logical grouping of text and headings.

Tufte CSS uses h1 for the document title, p with class subtitle for the document
subtitle, h2 for section headings, and h3 for low-level headings. More specific
headings are not supported. If you feel the urge to reach for a heading of level
4 or greater, consider redesigning your document.

Epigraphs:

<div class="epigraph">
  <blockquote>
      <p>The English language . . . becomes ugly and inaccurate because our thoughts are foolish, but the slovenliness of our language makes it easier for us to have foolish thoughts.</p>
      <footer>George Orwell, “Politics and the English Language”</footer>
  </blockquote>
</div>

Sidenotes & Margin Notes:

 <section>
     <h2 id="sidenotes">Sidenotes: Footnotes and Marginal Notes</h2>
     <p>
         One of the most distinctive features of Tufte’s style is his extensive use of sidenotes.<label for="sn-extensive-use-of-sidenotes" class="margin-toggle sidenote-number"></label>
         <input type="checkbox" id="sn-extensive-use-of-sidenotes" class="margin-toggle"/>
         <span class="sidenote">This is a sidenote.</span>
         Sidenotes are like footnotes, except they don’t force the reader to jump their eye to the bottom of the page, but instead display off to the side in the margin. Perhaps you have noticed their use in this document already. You are very astute.
     </p>
     <p>
         Sidenotes are a great example of the web not being like print. On sufficiently large viewports, Tufte CSS uses the margin for sidenotes, margin notes, and small figures. On smaller viewports, elements that would go in the margin are hidden until the user toggles them into view. The goal is to present related but not necessary information such as asides or citations <em>as close as possible</em>
         to the text that references them. At the same time, this secondary information should stay out of the way of the eye, not interfering with the progression of ideas in the main text.
     </p>
     <p>Sidenotes consist of two elements: a superscript reference number that goes inline with the text, and a sidenote with content. To add the former, just put a label and dummy checkbox into the text where you want the reference to go, like so:</p>
     <pre>
         <code>&lt;label for="sn-demo"
class="margin-toggle sidenote-number"&gt;&lt;/label &gt;&lt;input type="checkbox"
id="sn-demo"
class="margin-toggle"/&gt;</code>
     </pre>
     <p>
         You must manually assign a reference <code>id</code>
         to each side or margin note, replacing “sn-demo” in the <code>for</code>
         and the <code>id</code>
         attribute values with an appropriate descriptor. It is useful to use prefixes like <code>sn-</code>
         for sidenotes and <code>mn-</code>
         for margin notes.
     </p>
     <p>
         Immediately adjacent to that sidenote reference in the main text goes the sidenote content itself, in a <code>span</code>
         with class <code>sidenote</code>
         . This tag is also inserted directly in the middle of the body text, but is either pushed into the margin or hidden by default. Make sure to position your sidenotes correctly by keeping the sidenote-number label close to the sidenote itself.
     </p>
     <p>
         For optimal readibility of sidenotes, enclose the main text in the <code>section</code>
         tag.
     </p>
     <p>
         If you want a sidenote without footnote-style numberings, then you want a margin note.
         <label for="mn-demo" class="margin-toggle">&#8853;</label>
         <input type="checkbox" id="mn-demo" class="margin-toggle"/>
         <span class="marginnote">This is a margin note. Notice there isn’t a number preceding the note.
         </span>
         On large screens, a margin note is just a sidenote that omits the reference number. This lessens the distracting effect taking away from the flow of the main text, but can increase the cognitive load of matching a margin note to its referent text. However, on small screens, a margin note is like a sidenote except its viewability-toggle is a symbol rather than a reference number. This document currently uses the symbol &#8853;(<code>&amp;#8853;</code>
         ), but it’s up to you.
     </p>
     <p>
         Margin notes are created just like sidenotes, but with the <code>marginnote</code>
         class for the content and the <code>margin-toggle</code>
         class for the label and dummy checkbox. For instance, here is the code for the margin note used in the previous paragraph:
     </p>
     <pre>
         <code>&lt;label for="mn-demo" class="margin-toggle"&gt;&amp;#8853;&lt;/label &gt;&lt;input type="checkbox" id="mn-demo" class="margin-toggle"/&gt;&lt;span class="marginnote"&gt;This is a margin note. Notice there isn’t a number preceding the note.
&lt;/span&gt;</code>
     </pre>
     <p>Figures in the margin are created as margin notes, as demonstrated in the next section.</p>
 </section>

Include a <meta> section at the _end_ of each note, with the notes title, date, and tags:

<meta name="title" content="Note Title">
<meta name="date" content="2024-12-16">
<meta name="tags" content="tag1, tag2, tag3">

If you have any comments on the extraction of the note, e.g. "I found this unclear" etc,
write these in the meta section as well. End notes with the </NOTE> tag.

* Detect and include tables as HTML tables as appropriate.
* Detect and include graphs using a special <graph> tag with graphviz markup inside.
* Use standard HTML elements for formatting lists, bold, italics etc. Use your best guess for the authors intent.
* Lift any underlined text to tags as note tags as well.

Aggressively infer missing information from context:
  * If a date is missing, infer it from previous entries or pages.
  * Infer tags from the content if they are missing.
  * Infer illegible words or abbreviations based on the surrounding text.
  * Infer subjects from the content of the note, using vocabulary similar to the tags.
"""

# Instruction for the second (review) pass: the model receives the first-pass
# notes and must emit a <COMMENT>...</COMMENT> plan followed by a full,
# corrected set of <NOTE> blocks in the same HTML format.
CLEANUP_PROMPT = """

Given the set of input HTML notes, separated by <NOTE>...</NOTE> blocks, review
them and clean them up. e.g. if you see wonky formatting or places where notes
should have been grouped e.g. a (cont.) message. Otherwise rigorously follow the
HTML format conventions and instructions you see above for how to format the
notes.

Merge lines where appropriate, when it's clear the line breaks are
artifacts of the width of the notebook as opposed to intentional paragraph breaks.

Use your judgement to extract _additional_ information from the original image
or improve on the existing transcription. For example, the original
transcription will frequently omit things like call-outs on the side of the main
text: these should be reliably included.

Make sure to include everything in the note -- use margin notes anytime you're
not sure where or how content should fit in.

Output a complete new set of notes.

Precede any work you do with a <COMMENT>...</COMMENT> section which describes your
understanding of the task, the note content, and your planned changes.

e.g.

<COMMENT>
I see 4 notes have been transcribed from the attached images. Overall the
transcription appears accurate, however I see 1 note that should be split,
and 2 notes which should be combined. I see I can improve the title of one
note based on its content, and fix the formatting of a few lists I see.
</COMMENT>

... notes here ...
"""

# User-turn instruction that extract_notes places ahead of the page images.
USER_PROMPT = """
Analyze the handwritten notes in the attached images.
"""

def query_llm_with_cleanup(messages, raw_results=None):
  """Transcribe one batch with the LLM, then have the LLM review its own output.

  Pass 1 sends ``messages`` unchanged. Pass 2 sends SYSTEM_PROMPT, the
  non-system messages of pass 1 (which carry the page images), the pass-1
  transcription and CLEANUP_PROMPT. The images are re-sent because
  CLEANUP_PROMPT asks the model to compare the transcription with the original
  images, which it cannot do from the transcription text alone.

  Args:
    messages: Chat messages for the first pass (list of role/content dicts).
    raw_results: Unused; kept so existing callers keep working.

  Returns:
    The message content of the cleanup completion.
  """
  model = "gemini/gemini-2.0-flash-exp"
  max_tokens = 8192

  # First pass - get initial notes
  first_pass = completion(
    model=model,
    messages=messages,
    max_tokens=max_tokens,
  ).choices[0].message.content

  # Second pass - cleanup, with the source material attached again.
  # The system prompt is re-added explicitly, so drop any copy in `messages`.
  source_messages = [m for m in messages if m.get("role") != "system"]
  cleanup_messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    *source_messages,
    {"role": "user", "content": first_pass},
    {"role": "user", "content": CLEANUP_PROMPT},
  ]

  cleaned = completion(
    model=model,
    messages=cleanup_messages,
    max_tokens=max_tokens,
  ).choices[0].message.content

  return cleaned

def extract_notes(images, batch_size=8):
  """Extract notes from images with two-pass LLM processing.

  Images are grouped into batches of at most ``batch_size``. Each batch becomes
  one chat request (system prompt, user prompt, then one base64 data-URL image
  message per page) and the batches are processed concurrently on a thread pool.

  Args:
    images: Sequence of objects exposing ``mime_type`` and ``content`` (bytes).
    batch_size: Maximum number of images per LLM request.

  Returns:
    One result string per batch, in batch order; ``[]`` when ``images`` is empty.
  """
  if len(images) == 0:
    # Nothing to send; avoid spinning up worker threads.
    return []

  base_messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": USER_PROMPT},
  ]

  batches = []
  for start in range(0, len(images), batch_size):
    image_messages = [
      {
        "role": "user",
        "content": [
          {
            "type": "image_url",
            "image_url": {
              "url": f"data:{img.mime_type};base64,{base64.b64encode(img.content).decode()}"
            },
          }
        ],
      }
      for img in images[start : start + batch_size]
    ]
    batches.append(base_messages + image_messages)

  # Threads (not processes): the work is network-bound. The context manager
  # releases the pool even when a request raises.
  max_workers = 16
  with multiprocessing.dummy.Pool(min(max_workers, len(batches))) as pool:
    results = pool.map(query_llm_with_cleanup, batches)

  return results


In [None]:
# Run the extraction on the last five page images only (images[-5:]).
raw_results = extract_notes(images[-5:])

In [None]:
def split_notes(note_results: list[str]):
    """Split raw LLM responses into individual note bodies.

    For each response, the text of the first ``<COMMENT>...</COMMENT>`` section
    (if one is present and terminated) is printed and everything up to its
    closing tag is discarded. Every ``<NOTE>...</NOTE>`` body in the remainder
    is then collected. A note whose closing tag is missing is kept up to the
    end of the response.

    Args:
        note_results: Raw response strings; ``None`` or empty entries are skipped.

    Returns:
        Note bodies in order of appearance across all responses.
    """
    notes = []
    for result in note_results:
        if not result:
            continue

        # partition() tolerates leading text before <COMMENT> and a missing or
        # repeated </COMMENT>, where unpacking str.split() would raise.
        _, opening, after_open = result.partition("<COMMENT>")
        if opening:
            comment, closing, remainder = after_open.partition("</COMMENT>")
            if closing:
                print(comment)
                result = remainder

        # split on <NOTE>...</NOTE>
        for chunk in result.split("<NOTE>")[1:]:
            notes.append(chunk.split("</NOTE>")[0])

    return notes

# Turn the raw LLM responses into a flat list of note strings.
# For debugging, inspect a raw response first with: print(raw_results[0])
all_notes = split_notes(raw_results)

In [None]:
# Preview: show the first extracted note as rendered HTML in the cell output.

from IPython.display import HTML, display

first_note_preview = HTML(all_notes[0])
display(first_note_preview)

In [9]:
# Write each note to notes/<date>_<title>.html, taking date and title from the
# <meta> tags inside the note.
import re
from pathlib import Path

notes_dir = Path("notes")
notes_dir.mkdir(exist_ok=True)


def _meta_content(note_html, name, default):
  """Return the content attribute of the named <meta> tag, or ``default`` if the tag is absent."""
  pattern = r'<meta name="%s" content="(.*?)">' % re.escape(name)
  match = re.search(pattern, note_html)
  return match.group(1) if match else default


def _safe_filename(stem):
  """Replace path separators and control characters so ``stem`` stays a single file name."""
  return re.sub(r"[\\/\x00-\x1f]", "_", stem).strip() or "note"


for index, note in enumerate(all_notes):
  # The model may omit a tag; fall back to placeholders instead of crashing.
  title = _meta_content(note, "title", f"untitled-{index}")
  date = _meta_content(note, "date", "undated")

  filename = _safe_filename(f"{date}_{title}") + ".html"
  # Notes contain non-ASCII punctuation; do not rely on the platform default encoding.
  (notes_dir / filename).write_text(note, encoding="utf-8")
