# Biology Textbook Units Extraction with LangExtract

This notebook demonstrates how to extract the units, chapters, and major sections of a biology textbook using LangExtract.

## Setup

In [None]:
# Install LangExtract with the %pip magic (-q keeps pip's output quiet)
%pip install -q langextract

In [1]:
# Read the local .env file so that GEMINI_API_KEY (among others) lands in os.environ
import os
from dotenv import load_dotenv

load_dotenv()

# Tell the reader up front whether the key is available in this process
print(
    "✓ Gemini API key loaded from .env"
    if 'GEMINI_API_KEY' in os.environ
    else "❌ Gemini API key not found in .env file"
)

✓ Gemini API key loaded from .env


## Define Extraction Task for Biology Units

In [2]:
import langextract as lx
import textwrap

# Task description passed to the extractor: what to pull out and how to label it
prompt = textwrap.dedent("""\
    Extract all units, chapters, and major sections from this biology textbook.
    Look for headings, unit titles, chapter titles, and section headers.
    Use exact text for extractions. Do not paraphrase.
    Provide meaningful attributes like unit number, chapter number, or topic area.""")

# A single worked example showing one extraction per class (unit, chapter, section).
# Each row below is (extraction_class, extraction_text, attributes).
examples = [
    lx.data.ExampleData(
        text="Unit 1: Introduction to Biology\n\nChapter 1: The Study of Life\n\n1.1 What is Biology?",
        extractions=[
            lx.data.Extraction(
                extraction_class=kind,
                extraction_text=exact_text,
                attributes=attrs,
            )
            for kind, exact_text, attrs in (
                (
                    "unit",
                    "Unit 1: Introduction to Biology",
                    {"unit_number": "1", "topic": "Introduction to Biology"},
                ),
                (
                    "chapter",
                    "Chapter 1: The Study of Life",
                    {"chapter_number": "1", "title": "The Study of Life", "unit": "1"},
                ),
                (
                    "section",
                    "1.1 What is Biology?",
                    {"section_number": "1.1", "title": "What is Biology?", "chapter": "1"},
                ),
            )
        ],
    )
]

## Load Biology Textbook and Extract Units

In [4]:
import os

# Location the notebook reads from unless the reader supplies another one
default_path = "output/BIOLOGY GRADE 9 - REVIEW 2023 3__formula__20250820_223232.md"

# Only ask the reader for a path when the default one is missing
if not os.path.exists(default_path):
    print(f"❌ Default file not found: {default_path}")
    file_path = input("Enter the path to the biology markdown file: ")
    print(f"Using file: {file_path}")
else:
    file_path = default_path
    print(f"✓ Using default file: {file_path}")

✓ Using default file: output/BIOLOGY GRADE 9 - REVIEW 2023 3__formula__20250820_223232.md


In [5]:
# Load the biology textbook content.
# On any read failure `biology_text` is set to None so the extraction cell can
# detect it, instead of silently reusing text left over from an earlier run.
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        biology_text = file.read()

    print(f"✓ Loaded biology textbook ({len(biology_text)} characters)")
    print(f"First 500 characters preview:\n{biology_text[:500]}...")

except FileNotFoundError:
    print(f"❌ File not found: {file_path}")
    biology_text = None
except (OSError, UnicodeDecodeError) as err:
    # Covers e.g. a directory path, missing read permission, or a file that is not UTF-8
    print(f"❌ Could not read {file_path}: {err}")
    biology_text = None

✓ Loaded biology textbook (151101 characters)
First 500 characters preview:
## BIOLOGY GRADE 9

In this image we can see a poster with some text and images.

<!-- image -->

## BIOLOGY GRADE 9

In this image, we can see some images and text.

<!-- image -->

i

-Head Curriculum Implementation,Textbook Development and Evaluation

## BIOLOGY TEXTBOOK DEVELOPMENT PANEL

## MAURITIUS INSTITUTE OF EDUCATION

Dr Sarojiny Saddul-Hauzaree

- Coordinator, Associate Professor, MIE

Dr Shakeel M C Atchia

- Lecturer, MIE

Karuna Baguant

- Educator

Maya Mohabeer

- Educator

Nand...


In [6]:
# Extract units from the biology textbook
def _print_group(heading, items, show_class=False, trailing_blank=True):
    """Print one group of extractions followed by their attributes.

    Args:
        heading: Label shown before the item count, e.g. "📚 UNITS".
        items: Extractions to list; nothing at all is printed when this is empty.
        show_class: When True, prefix each entry with its ``extraction_class``.
        trailing_blank: When True, print an empty line after the group.
    """
    if not items:
        return
    print(f"{heading} ({len(items)}):")
    for item in items:
        prefix = f"{item.extraction_class}: " if show_class else ""
        print(f"  • {prefix}{item.extraction_text}")
        # `attributes` may be missing/empty; treat that as "no attributes"
        for key, value in (item.attributes or {}).items():
            print(f"    - {key}: {value}")
    if trailing_blank:
        print()


if biology_text:
    print("🔍 Extracting units from biology textbook...")

    # NOTE(review): the output saved with this notebook shows this call ending in
    # "ResolverParsingError: Failed to parse content." — the cause is not visible
    # from the notebook itself; TODO investigate before relying on the cells below.
    result = lx.extract(
        text_or_documents=biology_text,
        prompt_description=prompt,
        examples=examples,
        model_id="gemini-2.5-flash",
    )

    # Display results organized by type
    print(f"\n✓ Extracted {len(result.extractions)} items from the biology textbook:\n")

    # Group by extraction class; any class outside the three expected ones goes to `other`.
    # The four list names are kept because the save cell reads them.
    units, chapters, sections, other = [], [], [], []
    groups_by_class = {"unit": units, "chapter": chapters, "section": sections}
    for extraction in result.extractions:
        groups_by_class.get(extraction.extraction_class, other).append(extraction)

    # Display organized results
    _print_group("📚 UNITS", units)
    _print_group("📖 CHAPTERS", chapters)
    _print_group("📄 SECTIONS", sections)
    _print_group("📋 OTHER", other, show_class=True, trailing_blank=False)
else:
    print("❌ Cannot extract units - file not loaded")

DEBUG:absl:Registered GeminiLanguageModel with patterns ['^gemini'] at priority 10
DEBUG:absl:Registered OllamaLanguageModel with patterns ['^gemma', '^llama', '^mistral', '^mixtral', '^phi', '^qwen', '^deepseek', '^command-r', '^starcoder', '^codellama', '^codegemma', '^tinyllama', '^wizardcoder', '^gpt-oss', '^meta-llama/[Ll]lama', '^google/gemma', '^mistralai/[Mm]istral', '^mistralai/[Mm]ixtral', '^microsoft/phi', '^Qwen/', '^deepseek-ai/', '^bigcode/starcoder', '^codellama/', '^TinyLlama/', '^WizardLM/'] at priority 10
DEBUG:absl:Registered OpenAILanguageModel with patterns ['^gpt-4', '^gpt4\\.', '^gpt-5', '^gpt5\\.'] at priority 10


🔍 Extracting units from biology textbook...


2025-08-24 22:22:45,828 - langextract.debug - DEBUG - [langextract.inference] CALL: BaseLanguageModel.__init__(self=<GeminiLanguageModel>, constraint=Constraint(co...NONE: 'none'>), kwargs={})
2025-08-24 22:22:45,830 - langextract.debug - DEBUG - [langextract.inference] RETURN: BaseLanguageModel.__init__ -> None (0.0 ms)
2025-08-24 22:22:45,832 - langextract.debug - DEBUG - [langextract.inference] CALL: BaseLanguageModel.apply_schema(self=<GeminiLanguageModel>, schema_instance=GeminiSchema(...xtractions']}))
2025-08-24 22:22:45,833 - langextract.debug - DEBUG - [langextract.inference] RETURN: BaseLanguageModel.apply_schema -> None (0.0 ms)
DEBUG:absl:Initialized Annotator with prompt:
Extract all units, chapters, and major sections from this biology textbook.
Look for headings, unit titles, chapter titles, and section headers.
Use exact text for extractions. Do not paraphrase.
Provide meaningful attributes like unit number, chapter number, or topic area.

Examples
Q: Unit 1: Introducti

ResolverParsingError: Failed to parse content.

## Save Results

In [None]:
# Save results to JSONL for further analysis.
# Runs only when the extraction cell produced a non-empty `result`; it also reads
# the `units`, `chapters`, `sections` and `other` lists built by that cell.
if 'result' in locals() and result.extractions:
    output_dir = "entities"
    output_name = "biology_units_extraction.jsonl"
    lx.io.save_annotated_documents([result], output_name=output_name, output_dir=output_dir)
    # Include the directory so the message points at where the file was requested to go
    print(f"✓ Results saved to {output_dir}/{output_name}")

    # Also save a summary text file.
    # Each row is (label, items, show_class); OTHER is listed too so the per-group
    # counts add up to the reported total.
    summary_groups = [
        ("UNITS", units, False),
        ("CHAPTERS", chapters, False),
        ("SECTIONS", sections, False),
        ("OTHER", other, True),
    ]
    with open("biology_units_summary.txt", "w", encoding="utf-8") as f:
        f.write("BIOLOGY TEXTBOOK UNITS EXTRACTION SUMMARY\n")
        f.write("=" * 50 + "\n\n")

        f.write(f"Total extractions: {len(result.extractions)}\n\n")

        for label, items, show_class in summary_groups:
            if not items:
                continue
            f.write(f"{label} ({len(items)}):\n")
            for i, extraction in enumerate(items, 1):
                # Items outside the three known classes carry their class name for context
                prefix = f"{extraction.extraction_class}: " if show_class else ""
                f.write(f"{i}. {prefix}{extraction.extraction_text}\n")
            f.write("\n")

    print("✓ Summary saved to biology_units_summary.txt")
else:
    print("❌ No results to save")