-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Open
Description
This is some code I developed to answer this SO question.
You give it a character-position range in a paragraph and it does the needful to isolate that range of characters into its own single run having the same character formatting as the original. If you don't change the text of the paragraph between calls, it can be called repeatedly with different ranges to isolate multiple ranges, like multiple matches to re.Pattern.findall()
.
I'm not sure what will become of it but it was more work than I originally guessed so I want to keep it around for future reference.
import itertools
def isolate_run(paragraph, start, end):
"""Return docx.text.Run object containing only `paragraph.text[start:end]`.
Runs are split as required to produce a new run at the `start` that ends at `end`.
Runs are unchanged if the indicated range of text already occupies its own run. The
resulting run object is returned.
`start` and `end` are as in Python slice notation. For example, the first three
characters of the paragraph have (start, end) of (0, 3). `end` is *not* the index of
the last character. These correspond to `match.start()` and `match.end()` of a regex
match object and `s[start:end]` in Python slice notation.
"""
rs = tuple(paragraph._p.r_lst)
def advance_to_run_containing_start(start, end):
"""Return (r_idx, start, end) triple indicating start run and adjusted offsets.
The start run is the run the `start` offset occurs in. The returned `start` and
`end` values are adjusted to be relative to the start of `r_idx`.
"""
# --- add 0 at end so `r_ends[-1] == 0` ---
r_ends = tuple(itertools.accumulate(len(r.text) for r in rs)) + (0,)
r_idx = 0
while start >= r_ends[r_idx]:
r_idx += 1
skipped_rs_offset = r_ends[r_idx - 1]
return rs[r_idx], r_idx, start - skipped_rs_offset, end - skipped_rs_offset
def split_off_prefix(r, start, end):
"""Return adjusted `end` after splitting prefix off into separate run.
Does nothing if `r` is already the start of the isolated run.
"""
if start > 0:
prefix_r = copy.deepcopy(r)
r.addprevious(prefix_r)
r.text = r.text[start:]
prefix_r.text = prefix_r.text[:start]
return end - start
def split_off_suffix(r, end):
"""Split `r` at `end` such that suffix is in separate following run."""
suffix_r = copy.deepcopy(r)
r.addnext(suffix_r)
r.text = r.text[:end]
suffix_r.text = suffix_r.text[end:]
def lengthen_run(r, r_idx, end):
"""Add prefixes of following runs to `r` until `end` is reached."""
while len(r.text) < end:
suffix_len_reqd = end - len(r.text)
r_idx += 1
next_r = rs[r_idx]
if len(next_r.text) <= suffix_len_reqd:
# --- subsume next run ---
r.text = r.text + next_r.text
next_r.getparent().remove(next_r)
continue
if len(next_r.text) > suffix_len_reqd:
# --- take prefix from next run ---
r.text = r.text + next_r.text[:suffix_len_reqd]
next_r.text = next_r.text[suffix_len_reqd:]
# --- 1. skip over any runs before the one containing the start of our range ---
r, r_idx, start, end = advance_to_run_containing_start(start, end)
# --- 2. split first run where our range starts, placing "prefix" to our range
# --- in a new run inserted just before this run. After this, our run will begin
# --- at the right point and the left-hand side of our work is done.
end = split_off_prefix(r, start, end)
# --- 3. if run is longer than isolation-range we need to split-off a suffix run ---
if len(r.text) > end:
split_off_suffix(r, end)
# --- 4. But if our run is shorter than the desired isolation-range we need to
# --- lengthen it by taking text from subsequent runs
elif len(r.text) < end:
lengthen_run(r, r_idx, end)
# --- if neither 3 nor 4 apply it's because the run already ends in the right place
# --- and there's no further work to be done.
return Run(r, paragraph)
JSv4, ymmeng, mariusleu, tranduchuy682, 4ka0 and 2 moretranduchuy682JSv4, tranduchuy682 and voroninmanvoroninman
Metadata
Metadata
Assignees
Labels
No labels