# Biology Textbook Units Extraction with LangExtract

This notebook demonstrates extracting all units/chapters from a biology textbook using LangExtract.

## Setup

In [None]:
# Install LangExtract
%pip install -q langextract

In [1]:
# Load environment variables (including GEMINI_API_KEY)
import os
from dotenv import load_dotenv

# Copies key=value pairs from a .env file into os.environ. With the default
# arguments, variables already set in the process environment are kept as-is.
load_dotenv()

# Verify API key is loaded.
# A key that is present but empty (e.g. a bare `GEMINI_API_KEY=` line) is as
# unusable as a missing one, so test the value rather than mere membership.
if os.environ.get('GEMINI_API_KEY', '').strip():
    print("✓ Gemini API key loaded from .env")
else:
    print("❌ Gemini API key not found in .env file")

✓ Gemini API key loaded from .env


## Define Extraction Task for Biology Units

In [2]:
import langextract as lx
import textwrap

# Instructions handed to the model: what to pull out of the textbook and how.
prompt = textwrap.dedent(
    """\
    Extract all units, chapters, and major sections from this biology textbook.
    Look for headings, unit titles, chapter titles, and section headers.
    Use exact text for extractions. Do not paraphrase.
    Provide meaningful attributes like unit number, chapter number, or topic area."""
)

# One few-shot example: a miniature table of contents with a unit heading,
# a chapter heading and a numbered section heading.
_SAMPLE_TEXT = (
    "Unit 1: Introduction to Biology\n\n"
    "Chapter 1: The Study of Life\n\n"
    "1.1 What is Biology?"
)

# (extraction class, exact span copied from _SAMPLE_TEXT, attributes) per heading.
_SAMPLE_LABELS = (
    (
        "unit",
        "Unit 1: Introduction to Biology",
        {"unit_number": "1", "topic": "Introduction to Biology"},
    ),
    (
        "chapter",
        "Chapter 1: The Study of Life",
        {"chapter_number": "1", "title": "The Study of Life", "unit": "1"},
    ),
    (
        "section",
        "1.1 What is Biology?",
        {"section_number": "1.1", "title": "What is Biology?", "chapter": "1"},
    ),
)

# Example data in the structure lx.extract expects for its `examples` argument.
examples = [
    lx.data.ExampleData(
        text=_SAMPLE_TEXT,
        extractions=[
            lx.data.Extraction(
                extraction_class=label,
                extraction_text=span,
                attributes=attrs,
            )
            for label, span, attrs in _SAMPLE_LABELS
        ],
    )
]

## Load Biology Textbook and Extract Units

In [4]:
import os

# Set default file path
default_path = "output/BIOLOGY GRADE 9 - REVIEW 2023 3__formula__20250820_223232.md"

# Check if default file exists, otherwise prompt for path
if os.path.exists(default_path):
    file_path = default_path
    print(f"✓ Using default file: {file_path}")
else:
    print(f"❌ Default file not found: {default_path}")
    entered_path = input("Enter the path to the biology markdown file: ")
    # Typed or pasted paths often carry stray whitespace, wrapping quotes
    # (drag-and-drop from a file manager) or a leading "~". open() does not
    # interpret any of these, so normalize before the path is used.
    file_path = os.path.expanduser(entered_path.strip().strip("\"'"))
    print(f"Using file: {file_path}")

✓ Using default file: output/BIOLOGY GRADE 9 - REVIEW 2023 3__formula__20250820_223232.md


In [None]:
# Read the textbook markdown into `biology_text`.
# The variable ends up as None whenever the file is missing, empty, or
# unreadable, so later cells only need a single `is None` check.
biology_text = None
try:
    print(f"Attempting to load: {file_path}")

    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()

        if contents:
            biology_text = contents
            print(f"✓ Loaded biology textbook ({len(biology_text)} characters)")
            print(f"First 200 characters preview:\n{biology_text[:200]}...")
        else:
            print("❌ File is empty")
    else:
        print(f"❌ File does not exist: {file_path}")

except Exception as err:
    # Best-effort load: report the problem and leave biology_text unset
    # instead of aborting the notebook run.
    print(f"❌ Error loading file: {err}")
    biology_text = None

In [15]:
# Extract units from the biology textbook
import traceback

# Number of leading characters sent to the model in this trial run. Only this
# prefix of the textbook is processed by this cell, not the whole document.
SAMPLE_CHARS = 5000
# Inputs shorter than this are rejected before the model is called.
MIN_TEXT_CHARS = 50

# Reset up front: if validation or extraction fails below, later cells must
# see None rather than a stale `result` left in the kernel by an earlier run.
result = None

if biology_text is None:
    print("❌ Cannot extract units - biology_text is None")
elif not biology_text.strip():
    print("❌ Cannot extract units - biology_text is empty")
else:
    print("🔍 Extracting units from biology textbook...")
    print(f"Text length: {len(biology_text)} characters")

    # Clean and validate the text
    biology_text_clean = biology_text.strip()
    print(f"Cleaned text length: {len(biology_text_clean)} characters")

    # Show first few lines for debugging (each truncated to 100 characters)
    print("First 10 lines preview:")
    for line_no, line in enumerate(biology_text_clean.split('\n')[:10], start=1):
        print(f"  {line_no}: {line[:100]}...")

    # Whitespace-only input was already rejected by the `elif` above, so the
    # cleaned text is non-empty here and only the length check is needed.
    if len(biology_text_clean) < MIN_TEXT_CHARS:
        print(f"❌ Text too short ({len(biology_text_clean)} chars), might cause issues")
    else:
        print("✓ Text is valid, proceeding with extraction...")

        try:
            # Use a smaller chunk of text first to test
            test_text = biology_text_clean[:SAMPLE_CHARS]
            print(f"Testing with first {len(test_text)} characters...")

            result = lx.extract(
                text_or_documents=test_text,
                prompt_description=prompt,
                examples=examples,
                model_id="gemini-2.5-flash",
            )

            # Display entities with positions
            print(f"Input: Biology textbook sample ({len(test_text)} characters)\n")
            print("Extracted entities:")
            if result.extractions:
                for entity in result.extractions:
                    position_info = ""
                    # char_interval may be unset for an extraction; only
                    # report a position when one is available.
                    if entity.char_interval:
                        start, end = entity.char_interval.start_pos, entity.char_interval.end_pos
                        position_info = f" (pos: {start}-{end})"
                    print(f"• {entity.extraction_class.capitalize()}: {entity.extraction_text}{position_info}")
            else:
                print("No entities extracted")

        except Exception as e:
            # Keep the notebook running, but surface the full traceback so
            # the failure can be diagnosed from the cell output.
            print(f"❌ Error during extraction: {e}")
            print(f"Error type: {type(e).__name__}")
            traceback.print_exc()
            result = None

🔍 Extracting units from biology textbook...
Text length: 151101 characters
Cleaned text length: 151101 characters
First 10 lines preview:
  1: ## BIOLOGY GRADE 9...
  2: ...
  3: In this image we can see a poster with some text and images....
  4: ...
  5: <!-- image -->...
  6: ...
  7: ## BIOLOGY GRADE 9...
  8: ...
  9: In this image, we can see some images and text....
  10: ...
✓ Text is valid, proceeding with extraction...
Testing with first 5000 characters...


2025-08-24 23:54:56,658 - langextract.debug - DEBUG - [langextract.inference] CALL: BaseLanguageModel.__init__(self=<GeminiLanguageModel>, constraint=Constraint(co...NONE: 'none'>), kwargs={})
2025-08-24 23:54:56,660 - langextract.debug - DEBUG - [langextract.inference] RETURN: BaseLanguageModel.__init__ -> None (0.0 ms)
2025-08-24 23:54:56,662 - langextract.debug - DEBUG - [langextract.inference] CALL: BaseLanguageModel.apply_schema(self=<GeminiLanguageModel>, schema_instance=GeminiSchema(...xtractions']}))
2025-08-24 23:54:56,663 - langextract.debug - DEBUG - [langextract.inference] RETURN: BaseLanguageModel.apply_schema -> None (0.0 ms)
DEBUG:absl:Initialized Annotator with prompt:
Extract all units, chapters, and major sections from this biology textbook.
Look for headings, unit titles, chapter titles, and section headers.
Use exact text for extractions. Do not paraphrase.
Provide meaningful attributes like unit number, chapter number, or topic area.

Examples
Q: Unit 1: Introducti

[92m✓[0m Extraction processing complete



INFO:absl:Finalizing annotation for document ID doc_f3e57dda.
INFO:absl:Document annotation completed.


Input: Biology textbook sample (5000 characters)

Extracted entities:
No entities extracted


## Save Results

In [14]:
# Save and visualize the results
OUTPUT_DIR = "entities"
JSONL_NAME = "biology_units_extraction.jsonl"
HTML_PATH = "biology_units_visualization.html"

# The JSONL file is written inside OUTPUT_DIR, so the visualizer must be
# pointed at that same location; the bare file name would be looked up in
# the current working directory instead.
jsonl_path = os.path.join(OUTPUT_DIR, JSONL_NAME)

# `result` may not exist if the extraction cell was never run; treat that
# the same as a failed extraction instead of raising NameError.
extraction_result = globals().get("result")

if extraction_result and extraction_result.extractions:
    # Save results to JSONL for further analysis
    lx.io.save_annotated_documents(
        [extraction_result], output_name=JSONL_NAME, output_dir=OUTPUT_DIR
    )

    # Generate the interactive visualization
    html_content = lx.visualize(jsonl_path)
    # Explicit UTF-8 so non-ASCII characters in the rendered HTML do not
    # depend on the platform's default encoding.
    with open(HTML_PATH, "w", encoding="utf-8") as f:
        if hasattr(html_content, 'data'):
            f.write(html_content.data)  # For Jupyter/Colab
        else:
            f.write(html_content)

    print(f"Interactive visualization saved to {HTML_PATH}")
else:
    print("❌ No results to save")

❌ No results to save
