### Read PDF File

In [2]:
from pypdf import PdfReader

pdf_path = "../data/raw/taleoftwocities.pdf"
reader = PdfReader(pdf_path)

all_text = ""
for page in reader.pages:
    all_text += page.extract_text() + "\n"

print(all_text[:1000])

A Tale of Two Cities
Charles Dickens
Book the First
Recalled to Life
1 The Period . . . . . . . . . . . . . . . . 3
2 The Mail . . . . . . . . . . . . . . . . 5
3 The Night Shadows . . . . . . . . . . . . . 10
4 The Preparation . . . . . . . . . . . . . . 14
5 The Wine-shop . . . . . . . . . . . . . . 24
6 The Shoemaker . . . . . . . . . . . . . . 33
Book the Second
The Golden Thread
1 Five Years Later . . . . . . . . . . . . . . 45
2 A Sight . . . . . . . . . . . . . . . . . 50
3 A Disappointment . . . . . . . . . . . . . 56
4 Congratulatory . . . . . . . . . . . . . . 68
5 The Jackal . . . . . . . . . . . . . . . . 73
6 Hundreds of People . . . . . . . . . . . . . 78
7 Monseigneur in Town . . . . . . . . . . . . 89
8 Monseigneur in the Country . . . . . . . . . . 97
9 The Gorgon's Head . . . . . . . . . . . . . 102
10 Two Promises . . . . . . . . . . . . . . . 112
11 A Companion Picture . . . . . . . . . . . . 119
12 The Fellow of Delicacy . . . . . . . . . . . . 122
13 The Fellow of

### Basic Cleaning Function

In [6]:
import re

def clean_text(text: str) -> str:
    """
    - Remove excessive newlines
    - Remove repeated book title/author lines
    - Remove page numbers (lines with only digits)
    - Remove spaced-out book title lines
    - Remove 'Book the <X>' from TOC, but keep sub-book headers if immediately followed by 'Chapter 1'
    - Strip leading/trailing whitespace
    """
    # remove multiple newlines
    text = re.sub(r'\n+', '\n', text)
    
    # remove repeated title/author lines
    text = re.sub(r'^(A Tale of Two Cities|Charles Dickens)$', '', text, flags=re.MULTILINE)
    
    # remove spaced-out book title like 'A T A L E O F T W O C I T I E S'
    text = re.sub(r'^A(\s+T){1}\s+A(\s+L){1}\s+E(\s+O){1}\s+F(\s+T){1}\s+W(\s+O){1}\s+C(\s+I){1}\s+T(\s+I){1}\s+E(\s+S){1}$', '', text, flags=re.MULTILINE)
    
    # remove table of contents lines like "1 The Period . . . . 3"
    text = re.sub(r'^\d+\s+.*\.+\s*\d+', '', text, flags=re.MULTILINE)
    
    # remove lines with only numbers (page numbers)
    text = re.sub(r'^\d+\s*$', '', text, flags=re.MULTILINE)

    # remove TOC-like 'Book the X' lines if NOT immediately followed by 'Chapter 1'
    def keep_book_if_next_chapter1(match):
        start = match.end()
        remaining = match.string[start:]
        # look ahead for first non-empty line
        next_line_match = re.search(r'^\s*(.+)$', remaining, flags=re.MULTILINE)
        if next_line_match and next_line_match.group(1).startswith('Chapter 1'):
            return match.group(0)  # keep this line
        return ''  # remove TOC entry

    text = re.sub(r'^Book the (First|Second|Third)\n.*', keep_book_if_next_chapter1, text, flags=re.MULTILINE)
    
    # strip leading/trailing whitespace
    text = text.strip()
    
    return text

cleaned_text = clean_text(all_text)
print(cleaned_text[:1000])

Book the First
Recalled to Life

Chapter 1
The Period
It was the best of times, it was the worst of times, it was the age of
wisdom, it was the age of foolishness, it was the epoch of belief, it was
the epoch of incredulity, it was the season of Light, it was the season of
Darkness, it was the spring of hope, it was the winter of despair, we had
everything before us, we had nothing before us, we were all going direct
to Heaven, we were all going direct the other wayÂ—in short, the period
was so far like the present period, that some of its noisiest authorities
insisted on its being received, for good or for evil, in the superlative
degree of comparison only.
There were a king with a large jaw and a queen with a plain face, on
the throne of England; there were a king with a large jaw and a queen
with a fair face, on the throne of France. In both countries it was clearer
than crystal to the lords of the State preserves of loaves and shes, that
things in general were settled for ever.
It

### Chunking Function

In [8]:
import re

def chunk_by_chapter_with_metadata(text: str) -> list:
    """
    Splits text into chapters and keeps metadata:
    - sub_book_number (First, Second, Third)
    - sub_book_title (e.g., 'Recalled to Life')
    - chapter_number (int)
    - chapter_title (string)
    - chapter_text
    - index
    """
    chunks = []
    sub_book_number = None
    sub_book_title = None
    chapter_pattern = re.compile(r'^Chapter (\d+)\s*(.*)')

    lines = text.split("\n")
    current_chunk = {}
    chapter_text_lines = []
    chunk_index = 0
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if not line:
            i += 1
            continue

        # detect sub-book line
        sub_book_match = re.match(r'^Book the (First|Second|Third)$', line)
        if sub_book_match:
            sub_book_number = sub_book_match.group(1)
            # next non-empty line is sub_book_title
            j = i + 1
            while j < len(lines) and not lines[j].strip():
                j += 1
            if j < len(lines):
                sub_book_title = lines[j].strip()
                i = j  # move to sub_book_title line
            i += 1
            continue

        # detect chapter start
        chapter_match = chapter_pattern.match(line)
        if chapter_match:
            # save previous chapter
            if current_chunk:
                current_chunk['chapter_text'] = "\n".join(chapter_text_lines).strip()
                current_chunk['index'] = chunk_index
                chunks.append(current_chunk)
                chunk_index += 1
                chapter_text_lines = []

            chapter_number = int(chapter_match.group(1))
            chapter_title = chapter_match.group(2).strip()

            # sometimes chapter title is missing, try to get from next line
            if not chapter_title and i+1 < len(lines):
                next_line = lines[i+1].strip()
                if next_line:
                    chapter_title = next_line
                    i += 1  # skip this line in chapter text

            current_chunk = {
                'sub_book_number': sub_book_number,
                'sub_book_title': sub_book_title,
                'chapter_number': chapter_number,
                'chapter_title': chapter_title,
                'chapter_text': ""  # placeholder
            }
        else:
            chapter_text_lines.append(line)

        i += 1

    # append last chapter
    if current_chunk:
        current_chunk['chapter_text'] = "\n".join(chapter_text_lines).strip()
        current_chunk['index'] = chunk_index
        chunks.append(current_chunk)

    return chunks

# usage
chapter_chunks = chunk_by_chapter_with_metadata(cleaned_text)
print(f"Total chapters: {len(chapter_chunks)}")
print(chapter_chunks[0])

Total chapters: 45
{'sub_book_number': 'First', 'sub_book_title': 'Recalled to Life', 'chapter_number': 1, 'chapter_title': 'The Period', 'chapter_text': "It was the best of times, it was the worst of times, it was the age of\nwisdom, it was the age of foolishness, it was the epoch of belief, it was\nthe epoch of incredulity, it was the season of Light, it was the season of\nDarkness, it was the spring of hope, it was the winter of despair, we had\neverything before us, we had nothing before us, we were all going direct\nto Heaven, we were all going direct the other way\x97in short, the period\nwas so far like the present period, that some of its noisiest authorities\ninsisted on its being received, for good or for evil, in the superlative\ndegree of comparison only.\nThere were a king with a large jaw and a queen with a plain face, on\nthe throne of England; there were a king with a large jaw and a queen\nwith a fair face, on the throne of France. In both countries it was clearer\ntha