Skip to content

Commit

Permalink
Refactor RTF file parsing to fix nested group handling; fixes #1, #2
Browse files Browse the repository at this point in the history
  • Loading branch information
sammdot committed Jan 1, 2021
1 parent 57e4075 commit acb8b14
Showing 1 changed file with 112 additions and 37 deletions.
149 changes: 112 additions & 37 deletions src/rtf.rs
Original file line number Diff line number Diff line change
@@ -1,66 +1,141 @@
use nom::IResult;
use nom::branch::alt;
use nom::bytes::complete::{is_not, tag, take_until};
use nom::combinator::opt;
use nom::multi::many0;
use nom::sequence::{delimited, tuple};
use nom::bytes::complete::{is_not, tag};
use nom::character::complete::{alpha1, digit1, one_of};
use nom::combinator::{opt, recognize};
use nom::multi::{many0, many1, many_till};
use nom::sequence::tuple;

use crate::dict::Dictionary;
use crate::translation_parse::format_rtf_to_plover;

fn header(input: &str) -> IResult<&str, &str> {
let (input, (_, _, _, system_name, _, _)) = tuple((tag("{\\rtf1"), take_until("{\\*\\cxsystem "),
tag("{\\*\\cxsystem "), take_until("}"), tag("}"), alt((take_until("{\\*\\cxs "), take_until("}")))))(input)?;
Ok((input, system_name))
use std::char::from_u32;

fn unsigned(input: &str) -> IResult<&str, u32> {
let (input, num) = digit1(input)?;
Ok((input, num.parse::<u32>().unwrap()))
}

fn integer(input: &str) -> IResult<&str, i32> {
let (input, num) = recognize(tuple((tag("-"), digit1)))(input)?;
Ok((input, num.parse::<i32>().unwrap()))
}

fn unicode(input: &str) -> IResult<&str, String> {
let (input, (_, code, _)) = tuple((
tag("\\u"), unsigned, opt(tag(" "))))(input)?;
Ok((input, from_u32(code).unwrap().to_string()))
}

/// Recognizes a control word: a backslash, one or more ASCII letters, an
/// optional numeric parameter (as accepted by `integer`) and an optional
/// single trailing space.
///
/// The matched source text is returned verbatim, including the backslash and
/// the trailing space when present.
fn control_word(input: &str) -> IResult<&str, String> {
    recognize(tuple((tag("\\"), alpha1, opt(integer), opt(tag(" ")))))(input)
        .map(|(rest, matched)| (rest, String::from(matched)))
}

fn steno_group(input: &str) -> IResult<&str, &str> {
Ok(delimited(tag("{\\*\\cxs "), is_not("}"), tag("}"))(input)?)
/// Recognizes a control symbol: a backslash followed by exactly one of
/// `\`, `{`, `}`, `*`, `_` or `~`.
///
/// The two-character sequence is returned unchanged (it is not unescaped).
fn control_symbol(input: &str) -> IResult<&str, String> {
    let (rest, symbol) = recognize(tuple((tag("\\"), one_of("\\{}*_~"))))(input)?;
    Ok((rest, symbol.to_owned()))
}

fn comment(input: &str) -> IResult<&str, &str> {
let (input, (_, comment, _)) = tuple((tag("{\\*\\cxcomment "), take_until("}"), tag("}")))(input)?;
Ok((input, comment))
/// Consumes the longest non-empty run of characters that contains no
/// backslash and no brace, and returns it as an owned `String`.
///
/// Fails when the input is empty or starts with `\`, `{` or `}`.
fn text(input: &str) -> IResult<&str, String> {
    // `is_not` is already greedy and rejects an empty match, so wrapping it in
    // `recognize(many1(..))` only added a throwaway `Vec` allocation.
    let (input, run) = is_not("\\{}")(input)?;
    Ok((input, run.to_string()))
}

fn non_steno(input: &str) -> IResult<&str, (String, Option<&str>)> {
let (input, (left, comment, right)) = tuple((
take_until("{\\*\\cx"), opt(comment), take_until("{\\*\\cxs ")))(input)?;
Ok((input, (format!("{}{}", left, right), comment)))
fn group(input: &str) -> IResult<&str, String> {
let (input, (l, grp, r)) = tuple((
tag("{"),
many1(alt((group, unicode, control_word, control_symbol, text))),
tag("}")))(input)?;
Ok((input, format!("{}{}{}", l, grp.join(""), r)))
}

fn last_non_steno_with_comment(input: &str) -> IResult<&str, (String, Option<&str>)> {
let (input, (left, _, comment, _, right)) = tuple((
take_until("{\\*\\cxcomment "), tag("{\\*\\cxcomment "), take_until("}"),
tag("}"), take_until("}")))(input)?;
Ok((input, (format!("{}{}", left, right), Some(comment))))
/// Parses the body of a steno group up to and including its closing `}` and
/// returns the body's pieces concatenated.
///
/// No opening delimiter is matched here; the body must contain at least one
/// item and may not contain nested groups.
fn steno_group(input: &str) -> IResult<&str, String> {
    let body = many1(alt((unicode, control_word, control_symbol, text)));
    tuple((body, tag("}")))(input).map(|(rest, (pieces, _close))| (rest, pieces.concat()))
}

fn last_non_steno_without_comment(input: &str) -> IResult<&str, (String, Option<&str>)> {
let (input, tl) = take_until("}")(input)?;
Ok((input, (tl.to_string(), None)))
/// One parsed piece of the content that follows a steno group.
#[derive(Debug)]
enum TranslationItem {
/// Contents of a `{\*\cxcomment ...}` group, as produced by `cxcomment`.
Comment(String),
/// Any other piece (group, escape, control word/symbol or text), as produced by `non_comment`.
NotComment(String),
}

fn steno_entry(input: &str) -> IResult<&str, (&str, (String, Option<&str>))> {
Ok(tuple((steno_group, non_steno))(input)?)
/// Parses a `{\*\cxcomment ...}` group and wraps its (possibly empty) body,
/// concatenated into one string, in `TranslationItem::Comment`.
///
/// The opening tag includes the space after `\cxcomment`, so that space is
/// required and is not part of the returned text.
fn cxcomment(input: &str) -> IResult<&str, TranslationItem> {
    let body = many0(alt((group, unicode, control_word, control_symbol, text)));
    tuple((tag(r"{\*\cxcomment "), body, tag("}")))(input)
        .map(|(rest, (_, pieces, _))| (rest, TranslationItem::Comment(pieces.concat())))
}

fn last_steno_entry(input: &str) -> IResult<&str, (&str, (String, Option<&str>))> {
Ok(tuple((steno_group,
alt((last_non_steno_with_comment, last_non_steno_without_comment))))(input)?)
/// Parses a single item — nested group, `\uN` escape, control word, control
/// symbol or text run — and tags it as `TranslationItem::NotComment`.
fn non_comment(input: &str) -> IResult<&str, TranslationItem> {
    alt((group, unicode, control_word, control_symbol, text))(input)
        .map(|(rest, piece)| (rest, TranslationItem::NotComment(piece)))
}

fn steno_entry(input: &str) -> IResult<&str, (String, String, Option<String>)> {
let (input, (steno_group, (contents, _))) = tuple((
steno_group,
many_till(
alt((cxcomment, non_comment)),
tag(r"{\*\cxs "))))(input)?;
let translation = contents.iter()
.map(|obj| match obj { TranslationItem::NotComment(s) => s.as_str(), _ => "" })
.collect::<Vec<&str>>().join("").trim().to_string();
let comment = match contents.iter()
.map(|obj| match obj { TranslationItem::Comment(s) => s.as_str(), _ => "" })
.collect::<Vec<&str>>().join("").trim() { "" => None, s => Some(s.to_string()) };
Ok((input, (steno_group, translation, comment)))
}

fn last_steno_entry(input: &str) -> IResult<&str, (String, String, Option<String>)> {
let (input, (steno_group, (contents, _))) = tuple((
steno_group,
many_till(
alt((cxcomment, non_comment)),
tag(r"}"))))(input)?;
let translation = contents.iter()
.map(|obj| match obj { TranslationItem::NotComment(s) => s.as_str(), _ => "" })
.collect::<Vec<&str>>().join("").trim().to_string();
let comment = match contents.iter()
.map(|obj| match obj { TranslationItem::Comment(s) => s.as_str(), _ => "" })
.collect::<Vec<&str>>().join("").trim() { "" => None, s => Some(s.to_string()) };
Ok((input, (steno_group, translation, comment)))
}

/// Parses a `{\*\cxsystem ...}` group and returns its (possibly empty) body
/// concatenated into one string.
///
/// The opening tag includes the space after `\cxsystem`, so that space is
/// required and is not part of the returned text.
fn cxsystem(input: &str) -> IResult<&str, String> {
    let body = many0(alt((group, unicode, control_word, control_symbol, text)));
    tuple((tag(r"{\*\cxsystem "), body, tag("}")))(input)
        .map(|(rest, (_, pieces, _))| (rest, pieces.concat()))
}

/// Parses a whole RTF dictionary from `input`, adds every parsed entry to a
/// `Dictionary`, and returns it together with the unparsed remainder.
///
/// NOTE(review): this listing comes from a rendered commit diff, so lines of
/// the previous body (built on `header`) and of the refactored body (built on
/// `cxsystem`) appear interleaved below — confirm against the real file.
pub fn parse_file(input: &str) -> IResult<&str, Dictionary> {
// Previous implementation: header, zero or more entries, last entry, closing brace.
let mut file = tuple((header, many0(steno_entry), last_steno_entry, tag("}")));
let (input, (header, mut entries, last_entry, _)) = file(input)?;
// Refactored implementation: requires this exact preamble, then the system
// group, an optional stylesheet group, anything up to (and including) the
// first `{\*\cxs `, the regular entries and finally the last entry.
let (input, (_, cxsystem, _, _, mut entries, last_entry)) = tuple((
tag(r"{\rtf1\ansi{\*\cxrev100}\cxdict"),
cxsystem,
opt(tuple((
tag(r"{\stylesheet"),
many1(alt((group, unicode, control_word, control_symbol, text))),
tag("}")))),
recognize(many_till(alt((group, unicode, control_word, control_symbol, text)), tag(r"{\*\cxs "))),
// NOTE(review): `many1` plus `last_steno_entry` means a file needs at least
// two entries to parse — confirm whether `many0` was intended.
many1(steno_entry),
last_steno_entry
))(input)?;
entries.push(last_entry);

// Previous implementation of the dictionary construction loop.
let mut dict = Dictionary::new(header);
for (steno, (translation, comment)) in entries {
dict.add_entry(steno.to_string(), format_rtf_to_plover(&translation.trim()),
match comment { Some(c) => Some(c.to_string()), None => None });
// Refactored construction loop: entries are already owned
// `(steno, translation, comment)` triples.
let mut dict = Dictionary::new(&cxsystem);
for (steno, translation, comment) in entries {
dict.add_entry(steno, format_rtf_to_plover(translation.trim()), comment);
}

Ok((input, dict))
}

Expand Down

0 comments on commit acb8b14

Please sign in to comment.