Skip to content

Commit

Permalink
Add USX3 Import and Export, added chapter/verse end handling for USFM…
Browse files Browse the repository at this point in the history
… 2, USX 2 and USFX

 #38
  • Loading branch information
Rolf-Smit committed Jan 6, 2021
1 parent a1b60dc commit 891a996
Show file tree
Hide file tree
Showing 13 changed files with 5,998 additions and 81 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ public void doExport(Bible bible, String... exportArgs) throws Exception {
VerseIdentifier location = new VerseIdentifier(pid, cnum, v.getNumber());
v.accept(new ParatextExportVisitor("in verse", bk.getId().isNT(), ctx, null, ParagraphKind.PARAGRAPH_P, location));
}
ctx.endChapter(cnum);
}
}
doExportBooks(books, exportArgs);
Expand Down Expand Up @@ -526,6 +527,10 @@ public void startChapter(int cnum) {
charContent = null;
}

/**
 * Appends a chapter-end marker for the given chapter to the current book's content.
 *
 * @param cnum number of the chapter that ends here
 */
public void endChapter(int cnum) {
    ChapterIdentifier chapterId = new ChapterIdentifier(book.getId(), cnum);
    ParatextBook.ChapterEnd endMarker = new ParatextBook.ChapterEnd(chapterId);
    book.getContent().add(endMarker);
}

public void closeParagraph() {
currentParagraph = null;
charContent = null;
Expand Down Expand Up @@ -777,6 +782,9 @@ public Visitor<RuntimeException> visitExtraAttribute(ExtraAttributePriority prio

/**
 * Finishes this visitor. When it was created with a verse location, a matching
 * verse-end marker is appended to the current character content.
 *
 * @return always {@code false}
 */
@Override
public boolean visitEnd() {
    if (verseLocation == null) {
        // Not visiting a verse, so there is no verse-end marker to emit.
        return false;
    }
    getCharContent().getContent().add(new ParatextCharacterContent.VerseEnd(verseLocation));
    return false;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ protected List<ParatextBook> doImportAllBooks(File inputFile) throws Exception {
break;
case "CHAPTER":
    // parts[1] holds the bare chapter number; the book id comes from the current book.
    currentBook.getContent().add(new ChapterStart(new ChapterIdentifier(currentBook.getId(), Integer.parseInt(parts[1]))));
    // This break is required: without it every CHAPTER line fell through into the
    // CHAPTER-END case below, which parsed the chapter number as a location string.
    break;
case "CHAPTER-END":
    // parts[1] holds the full chapter location string written by visitChapterEnd.
    currentBook.getContent().add(new ParatextBook.ChapterEnd(ChapterIdentifier.fromLocationString(parts[1])));
    break;
case "PARAGRAPH":
currentBook.getContent().add(new ParagraphStart(Objects.requireNonNull(allParagraphKinds.get(parts[1]))));
break;
Expand Down Expand Up @@ -90,6 +93,9 @@ private void importCharContent(List<ParatextCharacterContentPart> target, Buffer
case "VERSE":
target.add(new VerseStart(VerseIdentifier.fromStringOrThrow(parts[2]), parts[3]));
break;
case "VERSE-END":
target.add(new ParatextCharacterContent.VerseEnd(VerseIdentifier.fromStringOrThrow(parts[2])));
break;
case "FOOTNOTE":
FootnoteXref fx = new FootnoteXref(Objects.requireNonNull(FootnoteXrefKind.allTags().get(parts[1])), parts[2]);
target.add(fx);
Expand Down Expand Up @@ -155,7 +161,7 @@ public void visitChapterStart(ChapterIdentifier location) throws IOException {

/**
 * Writes one {@code CHAPTER-END} line: the keyword, a tab, the chapter location and a
 * line feed. The importer's {@code CHAPTER-END} case reads the location back from the
 * field after the tab.
 *
 * @param location chapter that ends here; written using its string form
 * @throws IOException if writing to the underlying writer fails
 */
@Override
public void visitChapterEnd(ChapterIdentifier location) throws IOException {
    final String line = "CHAPTER-END\t" + location + "\n";
    bw.write(line);
}

@Override
Expand Down Expand Up @@ -223,7 +229,7 @@ public void visitEnd() throws IOException {

/**
 * Writes one {@code VERSE-END} line: the keyword, a tab, the verse location and a
 * line feed.
 *
 * @param location verse that ends here; written using its string form
 * @throws IOException if writing to the underlying writer fails
 */
@Override
public void visitVerseEnd(VerseIdentifier location) throws IOException {
    // NOTE(review): the importer's VERSE-END case reads the location from parts[2], while
    // this line places it directly after the first tab - confirm how character-content
    // lines are prefixed/split so that the value round-trips.
    final String line = "VERSE-END\t" + location + "\n";
    bw.write(line);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
import biblemulticonverter.format.paratext.ParatextCharacterContent.VerseStart;
import biblemulticonverter.format.paratext.model.ChapterIdentifier;
import biblemulticonverter.format.paratext.model.VerseIdentifier;
import biblemulticonverter.format.paratext.model.Version;
import biblemulticonverter.format.paratext.utilities.ImportUtilities;
import biblemulticonverter.format.paratext.utilities.StandardExportLogMessages;

import java.io.BufferedWriter;
import java.io.File;
Expand Down Expand Up @@ -52,11 +55,15 @@ public class USFM extends AbstractParatextFormat {

// Tags treated as character-level content. Seeded with "f", "fe" and "x"; the set is
// mutable and doImportBook adds every key of AUTO_CLOSING_TAGS to it on each import.
public static final Set<String> KNOWN_CHARACTER_TAGS = new HashSet<>(Arrays.asList("f", "fe", "x"));

// Paragraph kinds returned by ParagraphKind.allForVersion(Version.V2_2). The exporter writes
// these tags unchanged and routes every other kind through its downgrade fallback.
private static final Set<ParagraphKind> USFM_2_PARAGRAPH_KINDS = ParagraphKind.allForVersion(Version.V2_2);
// Same idea for auto-closing (character) formatting kinds.
private static final Set<AutoClosingFormattingKind> USFM_2_AUTO_CLOSING_FORMATTING_KINDS = AutoClosingFormattingKind.allForVersion(Version.V2_2);

// Tag-name lookups used by the importer to classify a marker; these cover all tags
// returned by allTags(), not only the V2_2 subsets above.
public static final Map<String, ParagraphKind> PARAGRAPH_TAGS = ParagraphKind.allTags();
public static final Map<String, FootnoteXrefKind> FOOTNOTE_XREF_TAGS = FootnoteXrefKind.allTags();
public static final Map<String, AutoClosingFormattingKind> AUTO_CLOSING_TAGS = AutoClosingFormattingKind.allTags();

// Receives the replace/skip/remove warnings raised while downgrading content during export.
// Presumably "USFM 2" is the target-format label shown in those messages - confirm.
private final StandardExportLogMessages logger = new StandardExportLogMessages("USFM 2");

@Override
protected ParatextBook doImportBook(File inputFile) throws Exception {
return doImportBook(inputFile, StandardCharsets.UTF_8);
Expand All @@ -66,7 +73,7 @@ private ParatextBook doImportBook(File inputFile, Charset charset) throws Except
KNOWN_CHARACTER_TAGS.addAll(AUTO_CLOSING_TAGS.keySet());
if (!inputFile.getName().toLowerCase().endsWith(".usfm") && !inputFile.getName().toLowerCase().endsWith(".sfm"))
return null;
String data = new String(Files.readAllBytes(inputFile.toPath()), charset).replaceAll("[\\p{Cc}\\p{Z}]+", " ").trim() + "\\$EOF$";
String data = new String(Files.readAllBytes(inputFile.toPath()), charset).replaceAll("[\\p{Cc}]+", " ").trim() + "\\$EOF$";
if (!data.startsWith("\\id ")) {
System.out.println("WARNING: Skipping malformed file " + inputFile);
return null;
Expand All @@ -82,6 +89,10 @@ private ParatextBook doImportBook(File inputFile, Charset charset) throws Except
ParatextBook result = new ParatextBook(id, idParts.length == 1 ? "" : idParts[1]);
List<ParatextCharacterContentContainer> containerStack = new ArrayList<>();
boolean ignoreAutoClosingTags = Boolean.getBoolean("biblemulticonverter.usfm.ignoreautoclosingtags");

VerseStart openVerse = null;
ChapterStart openChapter = null;

while (startPos < finalPos) {
if (data.charAt(startPos) != '\\')
throw new IllegalStateException();
Expand All @@ -106,7 +117,13 @@ private ParatextBook doImportBook(File inputFile, Charset charset) throws Except
}
boolean closeCharacterAttributes = false;
if (PARAGRAPH_TAGS.containsKey(tag)) {
result.getContent().add(new ParagraphStart(PARAGRAPH_TAGS.get(tag)));

ParagraphKind kind = PARAGRAPH_TAGS.get(tag);
//if (kind.getCategory() != ParatextBook.ParagraphKindCategory.TEXT) {
// Close any open verse
// openVerse = closeOpenVerse(result, openVerse, false);
//}
result.getContent().add(new ParagraphStart(kind));
closeCharacterAttributes = true;
} else if (tag.endsWith("*")) {
String rawTag = tag.substring(0, tag.length() - 1);
Expand Down Expand Up @@ -173,24 +190,39 @@ private ParatextBook doImportBook(File inputFile, Charset charset) throws Except
}
}
} else if (tag.equals("v")) {
String[] parts = textPart.split(" ", 2);
ImportUtilities.closeOpenVerse(result, openVerse);

String[] parts = textPart.split(" ", 2);
ChapterStart chapter = result.findLastBookContent(ChapterStart.class);
if (chapter == null) {
throw new IllegalStateException("Verse \\v found before chapter start milestone");
}

// A verse number in USFM 2 may be in the format 6-7, 6a or even 6-7a.
// Attempt to parse these numbers by first adding the book and chapter and then parsing it as a whole.
VerseIdentifier location = VerseIdentifier.fromStringOrThrow(chapter.getLocation() + ":" + parts[0]);
VerseIdentifier location = VerseIdentifier.fromStringOrThrow(openChapter.getLocation() + ":" + parts[0]);

openVerse = new VerseStart(location, parts[0]);

containerStack.get(containerStack.size() - 1).getContent().add(new VerseStart(location, parts[0]));
containerStack.get(containerStack.size() - 1).getContent().add(openVerse);
textPart = parts.length == 1 ? "" : parts[1];
} else if (tag.equals("c")) {

ImportUtilities.closeOpenVerse(result, openVerse);
openVerse = null;

// There is not really a good way to accurately determine where the end of a chapter should be placed
// based on USFM 2 content. Maybe a title above this chapter marker was already intended to be part of
// this chapter. This is basically a best guess. This should not really matter when converting from
// USFM 2 to USX 2 or USFX (which is based on USFM 2), however when up-converting to USX 3 this might
// lead to unexpected results.
ImportUtilities.closeOpenChapter(result, openChapter);

String[] parts = textPart.split(" ", 2);
if (!parts[0].matches("[0-9]+"))
throw new NumberFormatException("Invalid chapter number in \\c " + textPart);
result.getContent().add(new ChapterStart(new ChapterIdentifier(id, Integer.parseInt(parts[0]))));
openChapter = new ChapterStart(new ChapterIdentifier(id, Integer.parseInt(parts[0])));
result.getContent().add(openChapter);
closeCharacterAttributes = true;
textPart = parts.length == 1 ? "" : parts[1];
} else if (tag.matches("t[hc]r?[0-9]+")) {
Expand Down Expand Up @@ -223,6 +255,8 @@ private ParatextBook doImportBook(File inputFile, Charset charset) throws Except
throw new IOException("Two charsets specified: " + charset + " and " + correctCharset);
}
return doImportBook(inputFile, correctCharset);
} else {
result.getAttributes().put(tag, textPart);
}
textPart = "";
} else if (BOOK_HEADER_ATTRIBUTE_TAGS.contains(tag)) {
Expand All @@ -244,6 +278,8 @@ private ParatextBook doImportBook(File inputFile, Charset charset) throws Except
containerStack.get(containerStack.size() - 1).getContent().add(new ParatextCharacterContent.Text(textPart));
}
}
ImportUtilities.closeOpenVerse(result, openVerse);
ImportUtilities.closeOpenChapter(result, openChapter);
return result;
}

Expand All @@ -256,7 +292,7 @@ protected void doExportBook(ParatextBook book, File outFile) throws IOException
}
book.accept(new ParatextBookContentVisitor<IOException>() {

private USFMExportContext context = new USFMExportContext();
private USFMExportContext context = new USFMExportContext(logger);

@Override
public void visitChapterStart(ChapterIdentifier location) throws IOException {
Expand All @@ -271,8 +307,36 @@ public void visitChapterEnd(ChapterIdentifier location) throws IOException {

/**
 * Starts a new paragraph in the USFM output.
 *
 * Kinds contained in {@code USFM_2_PARAGRAPH_KINDS} are written as-is on a new line.
 * Any other kind is handed to {@link #visitUnsupportedParagraphStart(ParagraphKind)},
 * which substitutes a USFM 2 kind or fails.
 *
 * @param kind paragraph kind to start
 * @throws IOException if writing to the output fails
 */
@Override
public void visitParagraphStart(ParagraphKind kind) throws IOException {
    // The tag must only be written after the version check. Writing it up front emitted
    // supported tags twice and leaked unsupported tags into the output ahead of their
    // replacement.
    if (USFM_2_PARAGRAPH_KINDS.contains(kind)) {
        bw.write("\n\\" + kind.getTag());
        // A separator is required before whatever follows the marker.
        context.needSpace = true;
    } else {
        visitUnsupportedParagraphStart(kind);
    }
}

/**
 * Downgrades a paragraph kind that is not part of the USFM 2 set: picks a replacement
 * kind, logs the substitution and starts the replacement paragraph instead.
 *
 * @param kind paragraph kind that cannot be written as-is
 * @throws IOException if writing the replacement paragraph fails
 * @throws RuntimeException if no replacement is known for {@code kind}
 */
private void visitUnsupportedParagraphStart(ParagraphKind kind) throws IOException {
    final ParagraphKind substitute;
    if (kind == ParagraphKind.HEBREW_NOTE) {
        // According to documentation this is very similar to `d` (ParagraphKind.DESCRIPTIVE_TITLE)
        substitute = ParagraphKind.DESCRIPTIVE_TITLE;
    } else if (kind.isSameBase(ParagraphKind.SEMANTIC_DIVISION)) {
        // TODO maybe add more than 1 blank line?
        substitute = ParagraphKind.BLANK_LINE;
    } else if (kind == ParagraphKind.PARAGRAPH_PO || kind == ParagraphKind.PARAGRAPH_LH || kind == ParagraphKind.PARAGRAPH_LF) {
        substitute = ParagraphKind.PARAGRAPH_P;
    } else if (kind.getTag().startsWith(ParagraphKind.PARAGRAPH_LIM.getTag())) {
        // Documentation is not entirely clear on what the exact difference is between `lim#` and `li#`
        // one is "embedded" the other is not: https://ubsicap.github.io/usfm/lists/index.html#lim
        // The assumption is made here that `lim#` is directly replaceable with `li#`
        substitute = ParagraphKind.PARAGRAPH_LI.getWithNumber(kind.getNumber());
    } else {
        throw new RuntimeException("Could not export to USFM 2 because an unhandled paragraph type `" + kind + "` from a newer USFM/USX version was found.");
    }
    // Shared tail of every handled branch: report the substitution, then emit it.
    logger.logReplaceWarning(kind, substitute);
    visitParagraphStart(substitute);
}

@Override
Expand All @@ -299,7 +363,12 @@ private static String escape(String text, boolean escapePipe) {
}

/**
 * Mutable state shared by the visitors that write one USFM book: the logger used for
 * downgrade warnings and the pending-separator flag.
 */
private static class USFMExportContext {
    /** Set after a marker has been written; the next piece of output must start with a space. */
    boolean needSpace = false;
    /** Logger that receives replace/skip/remove warnings raised during export. */
    StandardExportLogMessages logger;

    /**
     * @param exportLogger logger to use for warnings raised during export
     */
    public USFMExportContext(StandardExportLogMessages exportLogger) {
        this.logger = exportLogger;
    }
}

private static class USFMCharacterContentVisitor implements ParatextCharacterContentVisitor<IOException> {
Expand Down Expand Up @@ -336,26 +405,81 @@ public ParatextCharacterContentVisitor<IOException> visitFootnoteXref(FootnoteXr

@Override
public ParatextCharacterContentVisitor<IOException> visitAutoClosingFormatting(AutoClosingFormattingKind kind, Map<String, String> attributes) throws IOException {
if (context.needSpace)
bw.write(" ");
AutoClosingFormattingKind lastTag = getLastTag();
String thisTag = (lastTag != null ? "+" : "") + kind.getTag();
bw.write("\\" + thisTag);
context.needSpace = true;
if (attributes.isEmpty()) {
pushSuffix(thisTag);
} else {
StringBuilder attrs = new StringBuilder("");
for (Map.Entry<String, String> entry : attributes.entrySet()) {
if (attrs.length() > 0)
attrs.append(" ");
attrs.append(entry.getKey() + "=\"" + entry.getValue() + "\"");
if (USFM_2_AUTO_CLOSING_FORMATTING_KINDS.contains(kind)) {
if (context.needSpace)
bw.write(" ");
AutoClosingFormattingKind lastTag = getLastTag();
String thisTag = (lastTag != null ? "+" : "") + kind.getTag();
bw.write("\\" + thisTag);
context.needSpace = true;
if (attributes.isEmpty()) {
pushSuffix(thisTag);
} else {
// TODO it can happen that newer attributes are unintentionally exported
StringBuilder attrs = new StringBuilder("");
for (Map.Entry<String, String> entry : attributes.entrySet()) {
if (attrs.length() > 0)
attrs.append(" ");
attrs.append(entry.getKey() + "=\"" + entry.getValue() + "\"");
}
pushSuffix(thisTag + "\t|" + attrs.toString());
}
pushSuffix(thisTag + "\t|" + attrs.toString());
} else {
return visitUnsupportedAutoClosingFormatting(kind, attributes);
}
return this;
}

/**
 * Downgrades a character-level marker that is not part of the USFM 2 set.
 *
 * Depending on the kind, the marker is skipped (its content is still written, through a
 * fresh visitor on the same writer), replaced by a USFM 2 marker, or removed.
 *
 * @param kind       formatting kind that cannot be written as-is
 * @param attributes attributes of the original marker; only used when the marker is replaced
 * @return the visitor that receives the marker's content, or {@code null} when the marker
 *         is removed (presumably the caller then does not visit the content - confirm)
 * @throws IOException if writing a replacement marker fails
 * @throws RuntimeException if no downgrade is known for {@code kind}
 */
private ParatextCharacterContentVisitor<IOException> visitUnsupportedAutoClosingFormatting(AutoClosingFormattingKind kind, Map<String, String> attributes) throws IOException {
    if (kind == AutoClosingFormattingKind.LIST_TOTAL || kind == AutoClosingFormattingKind.LIST_KEY || kind.isSameBase(AutoClosingFormattingKind.LIST_VALUE)) {
        // It should not be too much of an issue to just skip these list tags
        // E.g.
        // \li1 \lik Reuben\lik* \liv1 Eliezer son of Zichri\liv1*
        // Will become:
        // \li1 Reuben Eliezer son of Zichri
        context.logger.logSkippedWarning(kind);
        return new USFMCharacterContentVisitor(bw, context);
    } else if (kind == AutoClosingFormattingKind.FOOTNOTE_WITNESS_LIST) {
        // The Footnote witness list is just extra markup found within a footnote, however according to
        // documentation found here: https://ubsicap.github.io/usfm/v3.0.rc1/notes_basic/fnotes.html
        // Each element within a footnote must start with its appropriate tag. So we can't just skip this tag
        // since it could contain text. It would be better to turn this into a text entry `ft`.
        context.logger.logReplaceWarning(kind, AutoClosingFormattingKind.FOOTNOTE_TEXT);
        return visitAutoClosingFormatting(AutoClosingFormattingKind.FOOTNOTE_TEXT, attributes);
    } else if (kind == AutoClosingFormattingKind.XREF_PUBLISHED_ORIGIN) {
        // Published cross reference origin texts do not exist in USFM 2.x
        // There is not really a nice way to downgrade these, we cannot put the `xop` tag into `xo` because it
        // might not follow the usual `<chapter><separator><verse>` pattern.
        // TODO, maybe we can just write the contents to the parent target, just like FOOTNOTE_WITNESS_LIST?
        context.logger.logRemovedWarning(kind);
        return null;
    } else if (kind == AutoClosingFormattingKind.XREF_TARGET_REFERENCES_TEXT) {
        // "Target reference(s) extra / added text" does not exist in USFM 2.x
        // We should be able to get away with just writing the raw content directly to the output.
        context.logger.logSkippedWarning(kind);
        return new USFMCharacterContentVisitor(bw, context);
    } else if (kind == AutoClosingFormattingKind.SUPERSCRIPT) {
        // There is not really a good way to represent superscript in USFM 2.x
        // To avoid losing data, we skip the tag and just write the content directly to the output.
        // TODO, maybe we can use `sc` (Small caps) instead?
        // The two message halves need a separating space; previously they were glued
        // together as "whitespace,since".
        context.logger.logSkippedWarning(kind, "This might lead to text that is not separated by whitespace, " +
                "since the previous text and superscript text may not have been separated by whitespace.");
        return new USFMCharacterContentVisitor(bw, context);
    } else if (kind == AutoClosingFormattingKind.ARAMAIC_WORD) {
        // There is not really a good way to represent Aramaic words in USFM 2.x
        // To avoid losing data, we skip the tag and just write the content directly to the output.
        context.logger.logSkippedWarning(kind);
        return new USFMCharacterContentVisitor(bw, context);
    } else if (kind == AutoClosingFormattingKind.PROPER_NAME_GEOGRAPHIC) {
        // This marker just gives geographic names a different presentation, thus can easily be skipped without
        // too much loss.
        context.logger.logSkippedWarning(kind);
        return new USFMCharacterContentVisitor(bw, context);
    } else {
        throw new RuntimeException("Could not export to USFM 2 because an unhandled char type `" + kind + "` from a newer USFM/USX version was found.");
    }
}

@Override
public void visitReference(Reference reference) throws IOException {
visitText(reference.getContent());
Expand All @@ -365,12 +489,13 @@ public void visitReference(Reference reference) throws IOException {
/**
 * Writes a run of text, preceded by a single space when the previous output (a marker)
 * still needs a separator.
 *
 * The text is written in full, including any trailing space. Stripping a trailing space
 * here and then clearing {@code needSpace} would silently drop it from the output.
 *
 * @param text text to write; escaped before writing
 * @throws IOException if writing to the output fails
 */
public void visitText(String text) throws IOException {
    if (context.needSpace) {
        bw.write(" ");
    }
    AutoClosingFormattingKind lastTag = getLastTag();
    // The pipe is only escaped while inside a tag that defines default attributes.
    boolean escapePipe = lastTag != null && lastTag.getDefaultAttributes() != null;
    bw.write(escape(text, escapePipe));
    // Plain text never requires a separator before what follows.
    context.needSpace = false;
}

@Override
Expand Down
Loading

0 comments on commit 891a996

Please sign in to comment.