/
document.py
87 lines (70 loc) · 2.99 KB
/
document.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Collection of functions for preparing a LaTeX file for processing.
The main purpose is to identify the relevant part
of a reledmac-encoded LaTeX file and serve up the content of each paragraph
between `\\beginnumbering` and `\\endnumbering` for sameword processing.
"""
import regex
import unicodedata
from typing import List
def doc_content(filename: str) -> str:
    """Read a UTF-8 text file and return its content in NFC normal form.

    :param filename: Path of the file to read.
    :return: The whole file content, Unicode-normalized to NFC.
    :raises ValueError: If the file cannot be decoded as UTF-8.
    """
    with open(filename, mode="r", encoding="utf-8") as handle:
        try:
            raw_text = handle.read()
        except UnicodeDecodeError as decode_error:
            # Surface a decoding problem as an input error, keeping the cause
            raise ValueError(
                "The input file must be in utf-8 unicode encoding."
            ) from decode_error
    return unicodedata.normalize("NFC", raw_text)
def chunk_doc(content: str) -> List[str]:
    """
    Split document into a list of chunks alternating between unnumbered and
    numbered text.

    When the document contains numbered sections, the list starts with the
    (possibly empty) text before the first section and ends with the
    (possibly empty) text after the last one. All odd indices therefore hold
    numbered text, including the `\\beginnumbering` and `\\endnumbering`
    commands themselves. Joining the chunks gives back the original content.

    A section opens at a `\\beginnumbering` followed by a newline and closes
    at the first `\\endnumbering` preceded by a newline that comes after it.
    An `\\endnumbering` placed before any opening command is ignored, and an
    opening command that is never closed is left in the trailing chunk.

    :param content: The content of the document as a string.
    :return: The list of chunks. If the document has no `\\beginnumbering`
        line, the list contains the whole content as its only item.
    :raises ValueError: If the document has a `\\beginnumbering` line but no
        complete numbered section.
    """
    begin_marker = "\\beginnumbering\n"
    end_marker = "\n\\endnumbering"

    if begin_marker not in content:
        return [content]

    chunks = []
    cursor = 0  # End position of the last numbered section collected so far
    while True:
        begin = content.find(begin_marker, cursor)
        if begin == -1:
            break
        # The newline ending the begin marker may also be the newline that
        # starts the end marker (an empty section), so search from there.
        end = content.find(end_marker, begin + len(begin_marker) - 1)
        if end == -1:
            break
        end += len(end_marker)
        # First the text since the previous section, then the section itself
        chunks.append(content[cursor:begin])
        chunks.append(content[begin:end])
        cursor = end

    if not chunks:
        raise ValueError(
            r"Your document did not contain one or both of "
            r"\beginnumbering and \endnumbering"
        )

    # Add the tail from last numbered section to the end of the document
    chunks.append(content[cursor:])
    return chunks
def chunk_pars(content: str) -> List[str]:
    """Given the context contained between `\\beginnumbering` and
    `\\endnumbering`, return list of paragraphs.

    This is able to handle paragraphs demarcated by `\\pstart` and `\\pend` as
    well as when `\\autopar` is used (see §5.2.2 of the reledmac
    documentation). The use of `\\autopar` assumes that the `\\autopar` command
    is given right after the `\\beginnumbering` as in the documentation.

    If `\\autopar` occurs anywhere in the content, the content is cut at every
    blank line (two consecutive newlines); otherwise it is cut at every
    `\\pstart`. The first item is whatever precedes the first cut (possibly an
    empty string) and every following item starts at a cut and runs up to the
    next one, so joining the items gives back the original content.

    :param content: The text of one numbered section.
    :return: The list of paragraph strings. If no cut point is found, the
        list contains the whole content as its only item.
    """
    if r"\autopar" in content:
        boundary = "\n\n"
    else:
        boundary = r"\\pstart"
    positions = [hit.start() for hit in regex.finditer(boundary, content)]

    if not positions:
        # Nothing to split on: hand the content back as a single chunk
        return [content]

    # Slice between each pair of neighbouring cut points, from the very
    # beginning of the content to its very end.
    edges = [0, *positions, len(content)]
    return [content[lo:hi] for lo, hi in zip(edges, edges[1:])]