Skip to content

isolate_run() #980

@scanny

Description

@scanny

This is some code I developed to answer this SO question.

You give it a character-position range in a paragraph and it does the needful to isolate that range of characters into its own single run having the same character formatting as the original. If you don't change the text of the paragraph between calls, it can be called repeatedly with different ranges to isolate multiple ranges, like multiple matches to re.Pattern.findall().

I'm not sure what will become of it but it was more work than I originally guessed so I want to keep it around for future reference.

import itertools

def isolate_run(paragraph, start, end):
    """Return docx.text.Run object containing only `paragraph.text[start:end]`.

    Runs are split as required to produce a new run at the `start` that ends at `end`.
    Runs are unchanged if the indicated range of text already occupies its own run. The
    resulting run object is returned.

    `start` and `end` are as in Python slice notation. For example, the first three
    characters of the paragraph have (start, end) of (0, 3). `end` is *not* the index of
    the last character. These correspond to `match.start()` and `match.end()` of a regex
    match object and `s[start:end]` in Python slice notation.
    """
    rs = tuple(paragraph._p.r_lst)

    def advance_to_run_containing_start(start, end):
        """Return (r_idx, start, end) triple indicating start run and adjusted offsets.

        The start run is the run the `start` offset occurs in. The returned `start` and
        `end` values are adjusted to be relative to the start of `r_idx`.
        """
        # --- add 0 at end so `r_ends[-1] == 0` ---
        r_ends = tuple(itertools.accumulate(len(r.text) for r in rs)) + (0,)
        r_idx = 0
        while start >= r_ends[r_idx]:
            r_idx += 1
        skipped_rs_offset = r_ends[r_idx - 1]
        return rs[r_idx], r_idx, start - skipped_rs_offset, end - skipped_rs_offset

    def split_off_prefix(r, start, end):
        """Return adjusted `end` after splitting prefix off into separate run.

        Does nothing if `r` is already the start of the isolated run.
        """
        if start > 0:
            prefix_r = copy.deepcopy(r)
            r.addprevious(prefix_r)
            r.text = r.text[start:]
            prefix_r.text = prefix_r.text[:start]
        return end - start

    def split_off_suffix(r, end):
        """Split `r` at `end` such that suffix is in separate following run."""
        suffix_r = copy.deepcopy(r)
        r.addnext(suffix_r)
        r.text = r.text[:end]
        suffix_r.text = suffix_r.text[end:]

    def lengthen_run(r, r_idx, end):
        """Add prefixes of following runs to `r` until `end` is reached."""
        while len(r.text) < end:
            suffix_len_reqd = end - len(r.text)
            r_idx += 1
            next_r = rs[r_idx]
            if len(next_r.text) <= suffix_len_reqd:
                # --- subsume next run ---
                r.text = r.text + next_r.text
                next_r.getparent().remove(next_r)
                continue
            if len(next_r.text) > suffix_len_reqd:
                # --- take prefix from next run ---
                r.text = r.text + next_r.text[:suffix_len_reqd]
                next_r.text = next_r.text[suffix_len_reqd:]

    # --- 1. skip over any runs before the one containing the start of our range ---
    r, r_idx, start, end = advance_to_run_containing_start(start, end)

    # --- 2. split first run where our range starts, placing "prefix" to our range
    # ---    in a new run inserted just before this run. After this, our run will begin
    # ---    at the right point and the left-hand side of our work is done.
    end = split_off_prefix(r, start, end)

    # --- 3. if run is longer than isolation-range we need to split-off a suffix run ---
    if len(r.text) > end:
        split_off_suffix(r, end)

    # --- 4. But if our run is shorter than the desired isolation-range we need to
    # ---    lengthen it by taking text from subsequent runs
    elif len(r.text) < end:
        lengthen_run(r, r_idx, end)

    # --- if neither 3 nor 4 apply it's because the run already ends in the right place
    # --- and there's no further work to be done.

    return Run(r, paragraph)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions