In [1]:
import pdfplumber
import pandas as pd
import regex as re
import json

# Path to the source PDF, written relative to the notebook's working directory.
# The extraction cell below opens it with pdfplumber.open(pdf_path).
pdf_path = "../data/julius-caesar.pdf"

## Extracting PDF

In [2]:
# Pull the text (and any tables) out of every page of the PDF.
# Page texts are gathered in a list and joined once at the end rather than
# growing a single string with += on every page.
page_texts = []
all_tables = []

print(f"Starting extraction from {pdf_path}...")

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # Skip pages for which nothing was extracted (falsy result).
        page_text = page.extract_text()
        if page_text:
            page_texts.append(page_text + "\n")

        # Collect any tables detected on the page as well.
        # NOTE(review): the recorded run of this cell reports 0 tables, so the
        # later cells work from all_text only.
        tables = page.extract_tables()
        if tables:
            all_tables.extend(tables)

# Single string holding every extracted page, each terminated by a newline.
all_text = "".join(page_texts)

print("Extraction complete.")
print(f"Total characters extracted: {len(all_text)}")
print(f"Total tables found: {len(all_tables)}")

Starting extraction from ../data/julius-caesar.pdf...
Extraction complete.
Total characters extracted: 151632
Total tables found: 0


## Cleaning the extracted text

In [3]:
# Sanity check of the extraction step: show the head of the raw text and a
# handful of the extracted tables before any cleaning is applied.

# Head of the raw page text
raw_text_preview = all_text[:2000]
print("--- RAW TEXT (First 2000 chars) ---")
print(raw_text_preview)

# At most five of the extracted tables, one per print call
first_tables = all_tables[:5]
print("\n\n--- RAW TABLES (First 5 tables) ---")
for raw_table in first_tables:
    print(raw_table)

--- RAW TEXT (First 2000 chars) ---
Folger Shakespeare Library
https://shakespeare.folger.edu/
Get even more from the Folger
You can get your own copy of this text to keep. Purchase a full copy
to get the text, plus explanatory notes, illustrations, and more.
Buy a copy
Contents
From the Director of the Folger Shakespeare
Library
Front
Textual Introduction
Matter
Synopsis
Characters in the Play
Scene 1
ACT 1 Scene 2
Scene 3
Scene 1
Scene 2
ACT 2
Scene 3
Scene 4
Scene 1
ACT 3 Scene 2
Scene 3
Scene 1
ACT 4 Scene 2
Scene 3
Scene 1
Scene 2
ACT 5 Scene 3
Scene 4
Scene 5
From the Director of the Folger Shakespeare
Library
It is hard to imagine a world without Shakespeare. Since their
composition four hundred years ago, Shakespeare’s plays and poems
have traveled the globe, inviting those who see and read his works to
make them their own.
Readers of the New Folger Editions are part of this ongoing process
of “taking up Shakespeare,” finding our own thoughts and feelings
in language that strik

In [4]:
# --- Start with the raw text from Cell 2 ---
# Always derive cleaned_text from all_text so this cell can be re-run safely.
cleaned_text = all_text

# 1. Discard the "Front Matter"
#    The play proper starts at the first "ACT 1" heading that is immediately
#    followed by "Scene 1". Try the single-newline form first, then a variant
#    with a blank line between the two headings.
play_start_marker = "ACT 1\nScene 1"
play_start_marker_alt = "ACT 1\n\nScene 1"

start_index = -1
for marker in (play_start_marker, play_start_marker_alt):
    start_index = cleaned_text.find(marker)
    if start_index != -1:
        break

if start_index != -1:
    cleaned_text = cleaned_text[start_index:]
else:
    # repr() keeps the newlines inside the markers visible as "\n" instead of
    # breaking the warning across several output lines.
    print(
        "WARNING: Could not find play start marker "
        f"{play_start_marker!r} or {play_start_marker_alt!r}."
    )

# 2. Remove FTLN line numbers
#    Pattern: "FTLN " followed by one or more digits and an optional space
#    Example: "FTLN 0001 "
#    The trailing space is optional so a tag sitting at the very end of a line
#    (directly before the newline) is removed too.
ftln_pattern = r"FTLN \d+ ?"
cleaned_text = re.sub(ftln_pattern, "", cleaned_text)

# 3. Remove footers (More precise)
#    Pattern: A newline, (optional number), "Julius Caesar ACT ", and scene info
#    Example: "\n9 Julius Caesar ACT 1. SC. 1\n" or "\nJulius Caesar ACT 1. SC. 1\n"
#    The whole match is replaced by a single newline so neighbouring lines stay separate.
#    NOTE(review): the recorded preview still shows a bare "7" on its own line,
#    presumably a page number that is not attached to the footer line - verify
#    against the PDF before adding a rule for it.
footer_pattern = r"\n\d* ?Julius Caesar ACT \d+\. SC\. \d*\n"
cleaned_text = re.sub(footer_pattern, "\n", cleaned_text, flags=re.IGNORECASE)

# 4. NOTE: We are intentionally NOT removing the simple line numbers (5, 10, 15)
#    The previous rule for this was deleting the "1" from "ACT 1" and "Scene 1".

print("--- CLEANED TEXT (First 2000 chars) ---")
print(cleaned_text[:2000])

--- CLEANED TEXT (First 2000 chars) ---
ACT 1
Scene 1
Enter Flavius, Marullus, and certain Commoners,
including a Carpenter and a Cobbler, over the stage.
FLAVIUS
Hence! Home, you idle creatures, get you home!
Is this a holiday? What, know you not,
Being mechanical, you ought not walk
Upon a laboring day without the sign
Of your profession?—Speak, what trade art thou? 5
CARPENTER Why, sir, a carpenter.
MARULLUS
Where is thy leather apron and thy rule?
What dost thou with thy best apparel on?—
You, sir, what trade are you?
COBBLER Truly, sir, in respect of a fine workman, I am 10
but, as you would say, a cobbler.
MARULLUS
But what trade art thou? Answer me directly.
COBBLER A trade, sir, that I hope I may use with a safe
conscience, which is indeed, sir, a mender of bad
soles. 15
FLAVIUS
What trade, thou knave? Thou naughty knave, what
trade?
7
COBBLER Nay, I beseech you, sir, be not out with me.
Yet if you be out, sir, I can mend you.
MARULLUS
What mean’st thou by that? Mend me, thou s

## Implementing chunking

In [5]:
# Start with the cleaned_text from Cell 4

# Headings are matched only when they occupy a whole line. An unanchored,
# case-insensitive "ACT\s+\d+" would also fire inside dialogue or on a leftover
# running header such as "Julius Caesar ACT 1. SC. 1"; that would open a bogus
# act with no scene heading in it and silently drop the text that follows.
# Group 1 is the heading text, group 2 is its number.
ACT_HEADING_PATTERN = r"^[ \t]*(ACT[ \t]+(\d+))[ \t]*$"
SCENE_HEADING_PATTERN = r"^[ \t]*(Scene[ \t]+(\d+))[ \t]*$"
HEADING_FLAGS = re.IGNORECASE | re.MULTILINE

# Same preview size as the earlier inspection cells.
PREVIEW_CHARS = 2000


def split_into_scene_chunks(play_text):
    """Split play text into one chunk per scene.

    Args:
        play_text: Text in which every act starts with an "ACT <n>" line and
            every scene with a "Scene <n>" line.

    Returns:
        A list of dicts, in document order, with the keys
        "act" (int), "scene" (int) and "text" (str). "text" is the upper-cased
        act heading, the upper-cased scene heading and the stripped scene body,
        joined by newlines. Returns an empty list when no act heading followed
        by a scene heading is found.
    """
    chunks = []

    # With two capturing groups, split() returns
    # [preamble, heading, number, content, heading, number, content, ...].
    act_parts = re.split(ACT_HEADING_PATTERN, play_text, flags=HEADING_FLAGS)
    for act_idx in range(1, len(act_parts), 3):
        act_marker, act_number, act_content = act_parts[act_idx:act_idx + 3]

        scene_parts = re.split(SCENE_HEADING_PATTERN, act_content, flags=HEADING_FLAGS)
        # scene_parts[0] is whatever sits between the act heading and the first
        # scene heading; it is not part of any scene and is skipped.
        for scene_idx in range(1, len(scene_parts), 3):
            scene_marker, scene_number, scene_content = scene_parts[scene_idx:scene_idx + 3]

            # Put the Act and Scene headings back in front of the scene body.
            full_scene_text = (
                f"{act_marker.upper()}\n{scene_marker.upper()}\n{scene_content.strip()}"
            )
            chunks.append({
                "act": int(act_number),
                "scene": int(scene_number),
                "text": full_scene_text,
            })

    return chunks


scene_chunks = split_into_scene_chunks(cleaned_text)

print(f"Successfully chunked the play into {len(scene_chunks)} scenes.")

# --- Inspect the first TWO chunks to verify ---
# Only the head of each chunk is printed; a full scene runs to many thousands
# of characters and would flood the cell output.
preview_titles = ("FIRST CHUNK (ACT 1, SCENE 1)", "SECOND CHUNK (ACT 1, SCENE 2)")
for title, chunk in zip(preview_titles, scene_chunks):
    print(f"\n--- {title} ---")
    print(json.dumps(chunk, indent=2)[:PREVIEW_CHARS])

Successfully chunked the play into 18 scenes.

--- FIRST CHUNK (ACT 1, SCENE 1) ---
{
  "act": 1,
  "scene": 1,
  "text": "ACT 1\nSCENE 1\nEnter Flavius, Marullus, and certain Commoners,\nincluding a Carpenter and a Cobbler, over the stage.\nFLAVIUS\nHence! Home, you idle creatures, get you home!\nIs this a holiday? What, know you not,\nBeing mechanical, you ought not walk\nUpon a laboring day without the sign\nOf your profession?\u2014Speak, what trade art thou? 5\nCARPENTER Why, sir, a carpenter.\nMARULLUS\nWhere is thy leather apron and thy rule?\nWhat dost thou with thy best apparel on?\u2014\nYou, sir, what trade are you?\nCOBBLER Truly, sir, in respect of a fine workman, I am 10\nbut, as you would say, a cobbler.\nMARULLUS\nBut what trade art thou? Answer me directly.\nCOBBLER A trade, sir, that I hope I may use with a safe\nconscience, which is indeed, sir, a mender of bad\nsoles. 15\nFLAVIUS\nWhat trade, thou knave? Thou naughty knave, what\ntrade?\n7\nCOBBLER Nay, I beseech yo

In [6]:
import json

# Persist the scene chunks as JSON Lines: one JSON object per line, UTF-8 encoded.
output_filename = "../chunks.jsonl"

with open(output_filename, 'w', encoding='utf-8') as jsonl_file:
    # Serialise lazily so each chunk is written as soon as it is encoded.
    jsonl_file.writelines(
        json.dumps(scene_chunk) + '\n' for scene_chunk in scene_chunks
    )

chunk_count = len(scene_chunks)
print(f"Successfully saved {chunk_count} chunks to {output_filename}.")

Successfully saved 18 chunks to ../chunks.jsonl.
