From 66780b8b1bc708f970de497e6e0c73d6f4a1a377 Mon Sep 17 00:00:00 2001 From: Seamus Tuohy Date: Thu, 27 Jul 2023 07:48:04 -0400 Subject: [PATCH] Release 0.1.0 (#18) The major change here is that I updated RTFDE to only work with bytes and to leave all encoding up to the downstream library. Please update your code as necessary! Updated to only work with bytes. Added far greater Unicode support. Fixed various whitespace issues. Added proper htmlrtf support. Added support for extracting (but not parsing) binary data. Updated the grammar to be far more explicit when extracting objects. Fixed deencapsulation to ensure it handles whitespace properly. Added a grammar composer to make grammar modifications more clearly defined. Added support for surrogates which use raw Unicode instead of 16-bit signed encoding. Added better handling when parsing Unicode HH replacement chars. --- .gitattributes | 5 + .gitignore | 5 + .pylintrc | 570 +++++ CONTRIBUTING.md | 84 + README.md | 29 +- RTFDE/__init__.py | 17 +- RTFDE/deencapsulate.py | 469 ++-- RTFDE/grammar.py | 323 +++ RTFDE/text_extraction.py | 750 ++++++ RTFDE/transformers.py | 504 ++-- RTFDE/utils.py | 305 +++ docs/RTFDE/deencapsulate.html | 1366 +++++++++++ docs/RTFDE/deencapsulate.md | 148 ++ docs/RTFDE/exceptions.html | 203 ++ docs/RTFDE/exceptions.md | 46 + docs/RTFDE/grammar.html | 598 +++++ docs/RTFDE/grammar.md | 16 + docs/RTFDE/index.html | 127 + docs/RTFDE/index.md | 14 + docs/RTFDE/text_extraction.html | 2154 +++++++++++++++++ docs/RTFDE/text_extraction.md | 279 +++ docs/RTFDE/transformers.html | 1582 ++++++++++++ docs/RTFDE/transformers.md | 224 ++ docs/RTFDE/utils.html | 886 +++++++ docs/RTFDE/utils.md | 132 + scripts/extract_rtf_from_msg.py | 60 +- scripts/make_docs.sh | 47 + scripts/prep_private_rtf_test_folder.sh | 80 + scripts/run_tests.sh | 59 + setup.py | 10 +- tests/deencapsulate/test_de_encapsulate.py | 456 +++- tests/parse_rtf/test_parse_rtf.py | 319 ++- tests/parse_rtf/test_validate_file.py | 19 +- tests/test_data/html/multiple-encodings.rtf | 10 +- .../personal_rtf_test_files/README.md | 63 + .../personal_rtf_test_output_files/README.md | 29 + tests/test_data/plain_text/test_data.rtf | 8 + tests/test_data/rtf_parsing/control_chars.rtf | 185 ++ .../rtf_parsing/encapsulated_example.html | 23 + .../rtf_parsing/encapsulated_example.rtf | 38 + tests/test_data/rtf_parsing/five_spaces.rtf | 11 + .../rtf_parsing/font_table_template.rtf | 27 + .../test_data/rtf_parsing/small_template.rtf | 7 + .../test_data/rtf_parsing/surrogate_pairs.rtf | 11 + .../rtf_parsing/surrogate_pairs_02.rtf | 10 + .../rtf_parsing/surrogate_pairs_03.rtf | 10 + .../rtf_parsing/surrogate_pairs_04.rtf | 10 + tests/test_data/rtf_parsing/surrogates.rtf | 31 + tests/test_data/rtf_parsing/theta.rtf | 1 + tests/test_data/rtf_parsing/translated_by.rtf | 5 + .../rtf_parsing/unicode_HH_replacement.rtf | 10 + .../rtf_parsing/unicode_HH_replacement_01.rtf | 10 + tests/test_data/rtf_parsing/windows_950.rtf | 14 + tests/test_utils/test_main_utils.py | 172 ++ 54 files changed, 12040 insertions(+), 531 deletions(-) create mode 100644 .gitattributes create mode 100644 .pylintrc create mode 100644 RTFDE/grammar.py create mode 100644 RTFDE/text_extraction.py create mode 100644 RTFDE/utils.py create mode 100644 docs/RTFDE/deencapsulate.html create mode 100644 docs/RTFDE/deencapsulate.md create mode 100644 docs/RTFDE/exceptions.html create mode 100644 docs/RTFDE/exceptions.md create mode 100644 docs/RTFDE/grammar.html create mode 100644 docs/RTFDE/grammar.md create mode 100644 
docs/RTFDE/index.html create mode 100644 docs/RTFDE/index.md create mode 100644 docs/RTFDE/text_extraction.html create mode 100644 docs/RTFDE/text_extraction.md create mode 100644 docs/RTFDE/transformers.html create mode 100644 docs/RTFDE/transformers.md create mode 100644 docs/RTFDE/utils.html create mode 100644 docs/RTFDE/utils.md create mode 100755 scripts/make_docs.sh create mode 100755 scripts/prep_private_rtf_test_folder.sh create mode 100755 scripts/run_tests.sh create mode 100644 tests/test_data/personal_rtf_test_files/README.md create mode 100644 tests/test_data/personal_rtf_test_output_files/README.md create mode 100644 tests/test_data/plain_text/test_data.rtf create mode 100644 tests/test_data/rtf_parsing/control_chars.rtf create mode 100644 tests/test_data/rtf_parsing/encapsulated_example.html create mode 100644 tests/test_data/rtf_parsing/encapsulated_example.rtf create mode 100644 tests/test_data/rtf_parsing/five_spaces.rtf create mode 100644 tests/test_data/rtf_parsing/font_table_template.rtf create mode 100644 tests/test_data/rtf_parsing/small_template.rtf create mode 100644 tests/test_data/rtf_parsing/surrogate_pairs.rtf create mode 100644 tests/test_data/rtf_parsing/surrogate_pairs_02.rtf create mode 100644 tests/test_data/rtf_parsing/surrogate_pairs_03.rtf create mode 100644 tests/test_data/rtf_parsing/surrogate_pairs_04.rtf create mode 100644 tests/test_data/rtf_parsing/surrogates.rtf create mode 100644 tests/test_data/rtf_parsing/theta.rtf create mode 100644 tests/test_data/rtf_parsing/translated_by.rtf create mode 100644 tests/test_data/rtf_parsing/unicode_HH_replacement.rtf create mode 100644 tests/test_data/rtf_parsing/unicode_HH_replacement_01.rtf create mode 100644 tests/test_data/rtf_parsing/windows_950.rtf create mode 100644 tests/test_utils/test_main_utils.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..d56aea8 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,5 @@ +tests/test_data/html/multiple-encodings.rtf text eol=crlf +tests/test_data/rtf_parsing/surrogate_pairs.rtf text eol=crlf +tests/test_data/rtf_parsing/encapsulated_example.html text eol=crlf +tests/test_data/rtf_parsing/surrogates.rtf text eol=crlf +tests/test_data/rtf_parsing/small_template.rtf text eol=crlf \ No newline at end of file diff --git a/.gitignore b/.gitignore index b60fa3d..917dffe 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,8 @@ dmypy.json # Pyre type checker .pyre/ + + +# Testing +tests/test_data/personal_rtf_test_files/*.rtf +tests/test_data/personal_rtf_test_output_files/*.html \ No newline at end of file diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..e241638 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,570 @@ +[MASTER] + +disable=line-too-long + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +extension-pkg-allow-list= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. (This is an alternative name to extension-pkg-allow-list +# for backward compatibility.) +extension-pkg-whitelist= + +# Return non-zero exit code if any of these messages/categories are detected, +# even if score is above --fail-under value. Syntax same as enable. Messages +# specified are enabled, while categories only check already-enabled messages. 
+fail-on= + +# Specify a score threshold to be exceeded before program exits with error. +fail-under=10.0 + +# Files or directories to be skipped. They should be base names, not paths. +ignore=CVS + +# Add files or directories matching the regex patterns to the ignore-list. The +# regex matches against paths and can be in Posix or Windows format. +ignore-paths= + +# Files or directories matching the regex patterns are skipped. The regex +# matches against base names, not paths. +ignore-patterns= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use. +jobs=1 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# List of plugins (as comma separated values of python module names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. +persistent=yes + +# Minimum Python version to use for version dependent checks. Will default to +# the version used to run pylint. +py-version=3.10 + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. +confidence= + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then reenable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". +disable=raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable=c-extension-no-member + + +[REPORTS] + +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'error', 'warning', 'refactor', and 'convention' +# which contain the number of messages in each category, as well as 'statement' +# which is the total number of statements analyzed. This score is used by the +# global evaluation report (RP0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. 
This is a python new-style format string +# used to format the message information. See doc for all details. +#msg-template= + +# Set the output format. Available formats are text, parseable, colorized, json +# and msvs (visual studio). You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. +output-format=text + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. +score=yes + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit,argparse.parse_error + + +[SIMILARITIES] + +# Comments are removed from the similarity computation +ignore-comments=yes + +# Docstrings are removed from the similarity computation +ignore-docstrings=yes + +# Imports are removed from the similarity computation +ignore-imports=no + +# Signatures are removed from the similarity computation +ignore-signatures=no + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=no + +# This flag controls whether the implicit-str-concat should generate a warning +# on implicit string concatenation in sequences defined over several lines. +check-str-concat-over-line-jumps=no + + +[LOGGING] + +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. +logging-format-style=old + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. Available dictionaries: none. To make it work, +# install the 'python-enchant' package. +spelling-dict= + +# List of comma separated words that should be considered directives if they +# appear and the beginning of a comment and should not be checked. +spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains the private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. +spelling-store-unknown-words=no + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + +# Regular expression of note tags to take in consideration. +#notes-rgx= + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. 
+max-line-length=100 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. +#argument-rgx= + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. +#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma. +bad-names=foo, + bar, + baz, + toto, + tutu, + tata + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +bad-names-rgxs= + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. +#class-attribute-rgx= + +# Naming style matching correct class constant names. +class-const-naming-style=UPPER_CASE + +# Regular expression matching correct class constant names. Overrides class- +# const-naming-style. +#class-const-rgx= + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. Overrides class-naming- +# style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names. Overrides const-naming- +# style. +#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. +good-names=i, + j, + k, + ex, + Run, + _ + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +good-names-rgxs= + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. +#method-rgx= + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names. Overrides module-naming- +# style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. 
+property-classes=abc.abstractproperty + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. +#variable-rgx= + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether missing members accessed in mixin class should be ignored. A +# class is considered mixin if its name matches the mixin-class-rgx option. +ignore-mixin-members=yes + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + +# Regex pattern to define which classes are considered mixins ignore-mixin- +# members is set to 'yes' +mixin-class-rgx=.*[Mm]ixin + +# List of decorators that change the signature of a decorated function. +signature-mutators= + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables=yes + +# List of names allowed to shadow builtins +allowed-redefined-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_, + _cb + +# A regular expression matching the name of dummy variables (i.e. expected to +# not be used). +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# Argument names that match this expression will be ignored. Default to name +# with leading underscore. 
+ignored-argument-names=_.*|^ignored_|^unused_ + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io + + +[IMPORTS] + +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules= + +# Output a graph (.gv or any supported image format) of external dependencies +# to the given file (report RP0402 must not be disabled). +ext-import-graph= + +# Output a graph (.gv or any supported image format) of all (i.e. internal and +# external) dependencies to the given file (report RP0402 must not be +# disabled). +import-graph= + +# Output a graph (.gv or any supported image format) of internal dependencies +# to the given file (report RP0402 must not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Couples of modules and preferred modules, separated by a comma. +preferred-modules= + + +[CLASSES] + +# Warn about protected attribute access inside special methods +check-protected-access-in-special-methods=no + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp, + __post_init__ + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=cls + + +[DESIGN] + +# List of regular expressions of class ancestor names to ignore when counting +# public methods (see R0903) +exclude-too-few-public-methods= + +# List of qualified class names to ignore when counting class parents (see +# R0901) +ignored-parents= + +# Maximum number of arguments for function / method. +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=12 + +# Maximum number of locals for function / method body. +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body. +max-returns=6 + +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught. Defaults to +# "BaseException, Exception". 
+overgeneral-exceptions=BaseException, + Exception diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 34e3144..f54636d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -40,3 +40,87 @@ pip3 install -e .[dev] ``` python3 -m unittest discover -v ``` + +# Documentation + +This project uses [Google style docstrings](https://google.github.io/styleguide/pyguide.html) which allows us to auto-generate the pdoc documentation. + + +# Advanced Logging + +Any logging (including how verbose the logging is) can be handled by configuring logging. + +You can enable RTFDE's primary logging by getting and setting the "RTFDE" logger. Any logging that is needed for basic triage will be turned on by this logger. + +``` +log = logging.getLogger("RTFDE") +log.setLevel(logging.DEBUG) +``` + +**Here is an example run on one of the test cases.** + +``` +import logging +import RTFDE + +log = logging.getLogger("RTFDE") +log.setLevel(logging.DEBUG) + + +path = 'tests/test_data/plain_text/quoted_printable_01.rtf' +with open(path, 'rb') as fp: + raw = fp.read() +output = RTFDE.DeEncapsulator(raw) +output.deencapsulate() +``` + +## Developer Debugging + +You should read this section if the normal logging set to DEBUG didn't give you enough information to understand an error or weird behavior of RTFDE. I've added a variety of levels of debug logging to help you dig in and understand these problems. + +### RTF Validation Errors + +You have, what you believe to be, a valid RTF file which RTFDE is rejecting and telling you is invalid. You can see what each of the validators is parsing by setting the `RTFDE.validation_logger`. If set to DEBUG it will output the data being validated so you can evaluate it yourself to track down the issue. + +``` +log = logging.getLogger("RTFDE.validation_logger") +log.setLevel(logging.DEBUG) +``` + +### HTMLRTF Stripping Logging + +If you want to log all text and RTF control words that are suppressed by HTMLRTF control words you can use the `RTFDE.HTMLRTF_Stripping_logger` logger. If set to DEBUG it will output the tokens which have been removed, along with the line the token starts on, the line it ends on, the starting position of that token in the line, and the end position of that token. The log uses the following format `HTMLRTF Removed: {value}, {line}, {end_line}, {start_pos}, {end_pos}` + +Here is how you enable this log. +``` +log = logging.getLogger("RTFDE.HTMLRTF_Stripping_logger") +log.setLevel(logging.DEBUG) +``` + +### Text Extraction Logging + +If you are having difficulty tracking down some sort of text-transformation/decoding issue then you can use the text_extraction logging to show you FAR more information about what is occurring during text extraction. WARNING: This log is a flood of information! + +Here is how you enable this log. +``` +log = logging.getLogger("RTFDE.text_extraction") +log.setLevel(logging.DEBUG) +``` + + + + +### Grammar Debugging + +RTFDE + + + +### Lark Debug Logs +If you want to see the underlying Lark parsing toolkit's logging you can activate its logger like this.
+ +``` +log = logging.getLogger("lark") +log.setLevel(logging.DEBUG) +``` diff --git a/README.md b/README.md index ed45aa1..75bb105 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ pip3 install RTFDE ```python from RTFDE.deencapsulate import DeEncapsulator -with open('rtf_file', 'r') as fp: +with open('rtf_file', 'rb') as fp: raw_rtf = fp.read() rtf_obj = DeEncapsulator(raw_rtf) rtf_obj.deencapsulate() @@ -44,6 +44,33 @@ with open('rtf_file', 'r') as fp: print(rtf_obj.text) ``` + + +# Enabling Logging + +Any logging (including how verbose the logging is) can be handled by configuring logging. You can enable RTFDE's logging at the highest level by getting and setting the "RTFDE" logger. + +``` +import logging + +log = logging.getLogger("RTFDE") +log.setLevel(logging.INFO) +``` + + + + + + +To see how to enable more in-depth logging for debugging check out the CONTRIBUTING.md file. + +``` +# Now, get the log that you want +# The main logger is simply called RTFDE. That will get you all the *normal* logs. +rtfde_log = logging.getLogger("RTFDE") +rtfde_log.setLevel(logging.DEBUG) +rtfde_log.propagate = True +``` + + # Contribute Please check the [contributing guidelines](./CONTRIBUTING.md) diff --git a/RTFDE/__init__.py b/RTFDE/__init__.py index d152822..6719a30 100644 --- a/RTFDE/__init__.py +++ b/RTFDE/__init__.py @@ -17,22 +17,19 @@ """ RTFDE: A python3 library for extracting HTML content from RTF encapsulated HTML. -https://github.com/seamustuohy/RTF_De-Encapsulator +https://github.com/seamustuohy/RTFDE """ __author__ = 'seamus tuohy' -__date__ = '2020-12-05' -__version__ = '0.00.1' +__date__ = '2023-06-18' +__version__ = '0.1.0' import logging +from logging import NullHandler + +logging.getLogger(__name__).addHandler(NullHandler()) +logging.getLogger(__name__ + ".tree_logger").addHandler(NullHandler()) -FORMAT = "%(levelname)s [%(filename)s:%(lineno)s - %(funcName)s() ] %(message)s" -formatter = logging.Formatter(FORMAT) -default_handler = logging.StreamHandler() -default_handler.setFormatter(formatter) -logger = logging.getLogger(__name__) -logger.addHandler(default_handler) -logger.setLevel(logging.WARNING) from RTFDE.deencapsulate import DeEncapsulator diff --git a/RTFDE/deencapsulate.py b/RTFDE/deencapsulate.py index e7c01fc..5694f26 100644 --- a/RTFDE/deencapsulate.py +++ b/RTFDE/deencapsulate.py @@ -13,186 +13,255 @@ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.
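+# NOTE: As of 0.1.0 this module operates on bytes end-to-end; decoding the
+# de-encapsulated output is left up to the downstream caller.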
-import re +from typing import Union, AnyStr, Tuple, Dict, Any +from io import BufferedReader + from lark import Lark from lark.tree import Tree from lark.lexer import Token -from oletools.common import codepages - -from RTFDE.transformers import RTFUnicodeDecoder, StripNonVisibleRTFGroups, RTFCleaner +from lark.exceptions import UnexpectedInput + +from RTFDE.transformers import RTFCleaner, StripControlWords +from RTFDE.transformers import StripNonVisibleRTFGroups +from RTFDE.transformers import StripUnusedSpecialCharacters +from RTFDE.utils import encode_escaped_control_chars +from RTFDE.utils import log_validators, log_transformations, is_logger_on +from RTFDE.transformers import get_stripped_HTMLRTF_values, DeleteTokensFromTree, strip_binary_objects +from RTFDE.grammar import make_concise_grammar +from RTFDE.text_extraction import TextDecoder +from RTFDE.text_extraction import validate_ansi_cpg # For catching exceptions from RTFDE.exceptions import NotEncapsulatedRtf, MalformedEncapsulatedRtf, MalformedRtf -from io import BufferedReader - import logging -log = logging.getLogger('RTFDE') +log = logging.getLogger("RTFDE") class DeEncapsulator(): """De-Encapsulating RTF converter of HTML/TEXT found in .msg files. - De-encapsulation enables previously encapsulated HTML and plain text content to be extracted and rendered as HTML and plain text instead of the encapsulating RTF content. After de-encapsulation, the HTML and plain text should differ only minimally from the original HTML or plain text content. - - - Parameters: - raw_rtf: (str): It's the raw RTF string. - - grammar: (raw str): OPTIONAL - Lark (https://github.com/lark-parser/lark) parsing grammar which defines the RTF language. If you think my grammar is shoddy this is your chance to test out a better one and make a pull request. :D +De-encapsulation enables previously encapsulated HTML and plain text content to be extracted and rendered as HTML and plain text instead of the encapsulating RTF content. After de-encapsulation, the HTML and plain text should differ only minimally from the original HTML or plain text content. + + +Parameters: + raw_rtf: (bytes): The raw RTF file as bytes. + grammar: (str): OPTIONAL - Lark parsing grammar which defines the RTF language (see https://github.com/lark-parser/lark). If you think my grammar is shoddy this is your chance to test out a better one and make a pull request. + +Attributes: + content: (bytes) The deencapsulated content no matter what format it is in. Populated by the `deencapsulate` function. + html: (bytes) The deencapsulated content IF it is HTML content. Populated by the `set_content` function. + text: (bytes) The deencapsulated content IF it is plain text content. Populated by the `set_content` function. + found_binary: List of dictionaries containing binary data extracted from the rtf file. + content_type: The type of content encapsulated in .rtf data (html or text). Populated by the `get_content_type` function. + full_tree: The full .rtf object parsed into an object Tree using the grammar. Populated by the `parse_rtf` function. + doc_tree: The `document` portion of the .rtf full_tree object. + raw_rtf: The raw encapsulated .rtf data in byte format. + grammar: The Lark parsing grammar used to parse the .rtf data. + content_type_token: The .rtf header token identifying the content type. (\\fromhtml1 OR \\fromtext) + parser: The lark parser. It should not need to be manipulated directly, but it is useful for debugging and saving the parsed object.
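+
+Example:
+    A minimal usage sketch (the file path here is illustrative)::
+
+        with open('encapsulated.rtf', 'rb') as fp:
+            rtf_obj = DeEncapsulator(fp.read())
+        rtf_obj.deencapsulate()
+        print(rtf_obj.content)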
""" - def __init__(self, raw_rtf:str, grammar:str = None): + def __init__(self, raw_rtf:bytes, grammar: Union[str,None] = None): """Load in the Encapsulated test and setup the grammar used to parse the encapsulated RTF. - NOTE: This does not do the parsing in the init so that you can initiate the object and do the parsing step by step. - +NOTE: This does not do the parsing in the init so that you can initiate the object and do the parsing step by step. - Parameters: - raw_rtf: (str): It's the raw RTF string. +Parameters: + raw_rtf: (bytes): It's the raw RTF string. + grammar: (str): OPTIONAL - Lark parsing grammar which defines the RTF language. https://github.com/lark-parser/lark If you think my grammar is shoddy this is your chance to test out a better one and make a pull request. - grammar: (raw str): OPTIONAL - Lark (https://github.com/lark-parser/lark) parsing grammar which defines the RTF language. If you think my grammar is shoddy this is your chance to test out a better one and make a pull request. :D - - """ - self.content = None - self.content_type = None - self._content_type_token = None - self.html = None - self.plain_text = None - self.stripped_rtf = None - self.simplified_rtf = None - self.full_tree = None - self.doc_tree = None - self.charset = None - self.text_codec = None - self._catch_common_validation_issues(raw_rtf) +Raises: + TypeError: The raw_rtf data passed is not the correct type of data (string/byte string). +""" + self.content: str + self.content_type: str + self.content_type_token: str + self.parser: Any + + self.html: str + self.text: str + self.found_binary: list + self.full_tree: Tree + self.doc_tree: Tree + self.catch_common_validation_issues(raw_rtf) if isinstance(raw_rtf, bytes): - self.raw_rtf = raw_rtf.decode() - elif isinstance(raw_rtf, str): - self.raw_rtf = raw_rtf + raw_rtf_bytes = raw_rtf else: raise TypeError("DeEncapssulator only accepts RTF files in string or byte-string formats") + raw_rtf_bytes = raw_rtf_bytes.rstrip(b'\x00') + raw_rtf_bytes = raw_rtf_bytes.replace(b'\r\n',b'\n') + raw_rtf_bytes = raw_rtf_bytes.replace(b'\r',b'\n') + self.raw_rtf: bytes = raw_rtf_bytes if grammar is not None: - self._grammar = grammar + self.grammar: str = grammar else: - self._grammar = r""" -start : OPENPAREN document CLOSEPAREN + self.grammar = make_concise_grammar() -document: (CONTROLWORD | CONTROLSYMBOL | TEXT | group | " " | RTFESCAPE)+ -group: OPENPAREN (CONTROLWORD | CONTROLSYMBOL | TEXT | group | RTFESCAPE)* CLOSEPAREN + def deencapsulate(self): + """De-encapsulate the RTF content loaded into the De-Encapsulator. + +Once you have loaded in the raw rtf this function will set the properties containing the encapsulated content. The `content` property will store the content no matter what format it is in. The `html` and `text` properties will be populated based on the type of content that is extracted. (self.html will be populated if it is html and self.text if it is plain text.) 
+ """ + stripped_data = strip_binary_objects(self.raw_rtf) + non_binary_rtf = stripped_data[0] + found_binary = stripped_data[1] + if len(found_binary) > 0: + self.found_binary = found_binary + log.info("Binary data found and extracted from rtf file.") + escaped_rtf = encode_escaped_control_chars(non_binary_rtf) + if is_logger_on("RTFDE.transform_logger") is True: + log_transformations(escaped_rtf) + try: + self.parse_rtf(escaped_rtf) + except UnexpectedInput as _e: + raise MalformedEncapsulatedRtf(f"Malformed encapsulated RTF discovered:") from _e + Decoder = TextDecoder() + Decoder.update_children(self.full_tree) + self.get_doc_tree() + self.validate_encapsulation() + + # remove htmlrtf escaped values + htmlrtf_stripped = self.strip_htmlrtf_tokens() + # Strips whitespace from control words + control_stripped = StripControlWords().transform(htmlrtf_stripped) + # Strip unused control chars + special_stripper = StripUnusedSpecialCharacters() + non_special_tree = special_stripper.transform(control_stripped) + # Strip out non-visible RTF groups + stripper = StripNonVisibleRTFGroups() + stripped_tree = stripper.transform(non_special_tree) + # Converts any remaining tokens + cleaner = RTFCleaner(visit_tokens=True) + cleaned_text = cleaner.transform(stripped_tree) -// Text is given priority over control terms with TERM.PRIORITY = 2 -// This is used to ensure that escaped \ AND { AND } are not matched in others -TEXT.2: /\\\\/ | /\\[{}]/+ | /[^\\{}]/+ -CONTROLWORD: /(? bytes: + """Validate and return the RTF charset keyword from the RTF streams header. -OPENPAREN: "{" -CLOSEPAREN: "}" +Args: + fallback_to_default (bool): Allows you to force the use of the default charset "\\ansi" if one is not found. -%import common.ESCAPED_STRING -%import common.SIGNED_NUMBER +Raises: + MalformedRtf: RTF stream does not include charset control word. -%import common.WS -%ignore WS +Returns: + The RTF charset keyword from the RTF streams header. """ - @staticmethod - def _catch_common_validation_issues(raw_rtf): - """Checks for likely common valid input mistakes that may occur when folks try to use this library and raises exceptions to try and help identify them.""" - if isinstance(raw_rtf, BufferedReader): - raise TypeError("Data passed as file pointer. DeEncapsulator only accepts strings and byte-strings.") - if raw_rtf == None: - raise TypeError("Data passed as raw RTF file is a null object `None` keyword.") - if raw_rtf[:8] == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1": - raise TypeError("Data passed is a full MSG object. You must extract the encapsulated RTF body first.") - if (raw_rtf == b"") or (raw_rtf == ""): - raise MalformedRtf("Data passed as raw RTF file is an empty string.") + main_headers = self.get_header_control_words_before_first_group() - def _simplify_text_for_parsing(self): - """Replaces control chars within the text with their RTF encoded versions \\'HH. - """ - cleaned = self.stripped_rtf.replace('\\\\', "\\'5c") - cleaned = cleaned.replace('\\{', "\\'7b") - cleaned = cleaned.replace('\\}', "\\'7d") - return cleaned + for token in main_headers: + if token.value in [b'\\ansi', b'\\mac', b'\\pc', b'\\pca']: + return token + log.debug("Acceptable charset not found as the second token in the RTF stream. The control word for the character set must precede any plain text or any table control words. 
So, if this stream doesn't have one it is malformed or corrupted.") + if fallback_to_default is False: + raise MalformedRtf("RTF stream does not include charset control word.") - def deencapsulate(self): - """De-encapsulate the RTF content loaded into the De-Encapsulator. + log.warning("The fallback_to_default option on validate_charset is considered DANGEROUS if used on possibly malicious samples. Make sure you know what you are doing before using it.") + log.info("Attempting to decode RTF using the default charset ansi. This is not recommended and could have unforeseen consequences for the resulting file and your system's security.") + log.debug("You have a malformed RTF stream. Are you sure you really want to be parsing it? It might not just be corrupted. It could be maliciously constructed.") + return b"\\ansi" - Once you have loaded in the raw rtf this function will set the properties containing the encapsulated content. The `content` property will store the content no matter what format it is in. The `html` and `text` properties will be populated based on the type of content that is extracted. (self.html will be populated if it is html and self.text if it is plain text.) - """ - self.stripped_rtf = self._strip_htmlrtf_sections() - self.simplified_rtf = self._simplify_text_for_parsing() - self.doc_tree = self._parse_rtf() - self._validate_encapsulation() - self.charset = self._get_charset() - self.text_codec = self._get_python_codec() - self.content = self._deencapsulate_from_tree() + def set_content(self): + """Populate the html or text content based on the content type. Populates self.html and/or self.text variables.""" self.content_type = self.get_content_type() if self.content_type == 'html': self.html = self.content else: self.text = self.content + def get_doc_tree(self): + """Extract the document portion of the .rtf full_tree object. Populates the class's doc_tree attribute. + +Raises: + ValueError: The .rtf document object is missing or mis-located in the .rtf's full_tree object. +""" + if self.full_tree.children[1].data == "document": + self.doc_tree = self.full_tree.children[1] + else: + raise ValueError("Document object in the wrong place after parsing.") def get_content_type(self): """Provide the type of content encapsulated in RTF. - NOTE: This function will only work after the header validation has completed. Header validation also extracts the content type of the encapsulated data. - """ - if self._content_type_token is None: - self._validate_FROM_in_doc_header() - elif self._content_type_token == '\\fromhtml1': +NOTE: This function will only work after the header validation has completed. Header validation also extracts the content type of the encapsulated data. + +Raises: + NotEncapsulatedRtf: The .rtf object is missing an encapsulated content type header. Which means that it is likely just a regular .rtf file. +""" + if self.content_type_token is None: + self.validate_FROM_in_doc_header() + elif self.content_type_token == b'\\fromhtml1': return 'html' - elif self._content_type_token == '\\fromtext': + elif self.content_type_token == b'\\fromtext': return "text" - else: - raise NotEncapsulatedRtf("Data is missing encapsulated content type header (the FROM header).") - def _validate_encapsulation(self): - """Runs simple tests to validate that the file in question is an rtf document which contains encapsulation.
- """ - self._validate_rtf_doc_header() - self._validate_FROM_in_doc_header() + raise NotEncapsulatedRtf("Data is missing encapsulated content type header (the FROM header).") - def _parse_rtf(self) -> Tree: - """Parse RTF file's header and document and extract the objects within the RTF into a Tree.""" - parser = Lark(self._grammar, parser='lalr') - self.full_tree = parser.parse(self.simplified_rtf) - # An RTF file has the following syntax: '{'
'}' - # We only need the header and document so we only extract the 1st obj. - return self.full_tree.children[1] + def validate_encapsulation(self): + """Runs simple tests to validate that the file in question is an rtf document which contains encapsulation.""" + self.validate_rtf_doc_header(self.doc_tree) + self.validate_charset() + self.validate_FROM_in_doc_header() + ansicpg = self.get_ansicpg_header() + if ansicpg is not None: # ansicpg is not mandatory + validate_ansi_cpg(ansicpg.value) - def _strip_htmlrtf_sections(self) -> Tree: - """Strip out \\htmlrtf tagged sections which need to be ignored in the de-encapsulation and are difficult to extract after it has been converted into a tree. - - The \\htmlrtf keyword toggles pieces of RTF to be ignored during reverse RTF->HTML conversion. Lack of a parameter turns it on, parameter 0 turns it off. But, these are not always included in a consistent way. They can appear withing and across groups in the stream. So, they need to be extracted before the stream is tokenized and placed into a tree. - """ - htmlrtf = re.compile(r'[\s]*\\htmlrtf[^0].*?\\htmlrtf0[\n]*', flags=re.MULTILINE|re.DOTALL) - return htmlrtf.sub("", self.raw_rtf) + def get_ansicpg_header(self) -> Union[Token,None]: + """Extract the ansicpg control word from the .rtf header. - def _deencapsulate_from_tree(self) -> str: - """De-encapsulates HTML from document tree into final content. - """ - decoded_tree = RTFUnicodeDecoder().visit_topdown(self.doc_tree) +Returns: + A lark CONTROLWORD Token with the `\\ansicpg` value. Returns None if the `\\ansicpg` control word is not included as this is only required if there is Unicode which needs to be converted to ANSI within a .rtf file. +""" + headers = self.get_header_control_words_before_first_group() + for item in headers: + if item.value.startswith(b'\\ansicpg'): + return item + return None - stripper = StripNonVisibleRTFGroups() - stripped_tree = stripper.transform(decoded_tree) + def parse_rtf(self, rtf: str): + """Parse RTF file's header and document and extract the objects within the RTF into a Tree. Populates the self.full_tree attribute. - cleaner = RTFCleaner(rtf_codec=self.text_codec) - cleaned_text = cleaner.transform(stripped_tree) - # The conversion process inserts spaces on newlines where there were none - cleaned_text = re.sub(r'[\r\n][\s\r\n]{2,}', '\n', cleaned_text) - return cleaned_text +Args: + rtf: The .rtf string to parse with the project's Lark grammar. +""" + # Uncomment the Lark debug argument if you want to enable logging. + # Note, this does not enable ALL lark debug logging. + # To do that we would not be able to use the Lark convenience class which we are using here. + self.parser = Lark(self.grammar, + parser='lalr', + keep_all_tokens=True, + use_bytes=True, + # debug=True, + propagate_positions=True) + self.full_tree = self.parser.parse(rtf) + if is_logger_on("RTFDE.transform_logger") is True: + log_transformations(self.full_tree) + + + def strip_htmlrtf_tokens(self) -> Tree: + """Strip tokens from within htmlrtf regions of the doc_tree as they were not part of the original HTML content. + +Returns: + .rtf doc_tree stripped of all non-original tokens.
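+
+Example:
+    A rough sketch; this is called internally by `deencapsulate`::
+
+        htmlrtf_cleaned_tree = rtf_obj.strip_htmlrtf_tokens()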
+""" + # remove htmlrtf escaped values + delete_generator = get_stripped_HTMLRTF_values(self.doc_tree) + tokens_to_delete = list(delete_generator) + deleter = DeleteTokensFromTree(tokens_to_delete) + htmlrtf_cleaned_tree = deleter.transform(self.doc_tree) + return htmlrtf_cleaned_tree - def _get_header_control_words_before_first_group(self) -> list: + def get_header_control_words_before_first_group(self) -> list: """Extracts all the control words in the first 20 tokens of the document or all the tokens which occur before the first group (whichever comes first.) - This is used to extract initial header values for validation functions. +This is used to extract initial header values for validation functions. + +Returns: + A list containing the header tokens in the .rtf data. """ initial_control_words = [] for token in self.doc_tree.children[:20]: @@ -203,113 +272,67 @@ def _get_header_control_words_before_first_group(self) -> list: return initial_control_words - def _get_charset(self, fallback_to_default:bool =False) -> str: - """Extracts the RTF charset keyword from the RTF streams header. - - Parameters: - fallback_to_default (bool): Allows you to force the use of the default charset "\ansi" if one is not found. - """ - main_headers = self._get_header_control_words_before_first_group() - charset = None - for token in main_headers: - if token in ["\\ansi", "\\mac", "\\pc", "\\pac"]: - return token - - if charset is None: - log.debug("Acceptable charset not found as the second token in the RTF stream. The control word for the character set must precede any plain text or any table control words. So, if this stream doesn't have one it is malformed or corrupted.") - if fallback_to_default is False: - raise MalformedRtf("RTF stream does not include charset control word.") - else: - log.warning("The fallback_to_default option on _get_charset is considered DANGEROUS if used on possibly malicious samples. Make sure you know what you are doing before using it.") - log.info("Attempting to decode RTF using the defulat charset ansi. This is not recommended and could have unforeseen consequences for the resulting file and your systems security.") - log.debug("You have a malformed RTF stream. Are you sure you really want to be parsing it? It might not just be corrupted. It could be maliciously constructed.") - return "\\ansi" - - def _get_codepage_num(self) -> int: - """Extracts the unicode codepage number from the RTF streams header. - """ - # This keyword should be emitted in the RTF header section right after the \ansi, \mac, \pc or \pca keyword. But, various document tags like \fbids often are thrown all over the header so we have to check the first group of headers for it. 
- # Code page names from https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers - # Retrieved on 2020-12-18 - allowed_codepage_nums = set([37, 437, 500, 708, 709, 710, 720, 737, 775, 850, 852, 855, 857, 858, 860, 861, 862, 863, 864, 865, 866, 869, 870, 874, 875, 932, 936, 949, 950, 1026, 1047, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1200, 1201, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1361, 10000, 10001, 10002, 10003, 10004, 10005, 10006, 10007, 10008, 10010, 10017, 10021, 10029, 10079, 10081, 10082, 12000, 12001, 20000, 20001, 20002, 20003, 20004, 20005, 20105, 20106, 20107, 20108, 20127, 20261, 20269, 20273, 20277, 20278, 20280, 20284, 20285, 20290, 20297, 20420, 20423, 20424, 20833, 20838, 20866, 20871, 20880, 20905, 20924, 20932, 20936, 20949, 21025, 21027, 21866, 28591, 28592, 28593, 28594, 28595, 28596, 28597, 28598, 28599, 28603, 28605, 29001, 38598, 50220, 50221, 50222, 50225, 50227, 50229, 50930, 50931, 50933, 50935, 50936, 50937, 50939, 51932, 51936, 51949, 51950, 52936, 54936, 57002, 57003, 57004, 57005, 57006, 57007, 57008, 57009, 57010, 57011, 65000, 65001]) - charset_check = re.compile(r'\\ansicpg([0-9]+)') - main_headers = self._get_header_control_words_before_first_group() - for unicode_charset in main_headers: - cmatch = charset_check.match(unicode_charset.strip()) - if cmatch is not None: - codepage_num = int(cmatch.groups()[0]) - if codepage_num in allowed_codepage_nums: - return codepage_num - else: - raise MalformedRtf("Unsupported unicode codepage number `{}` found in the header".format(codepage_num)) - - log.debug("No unicode codepage number found in the header. The following headers were checked: {0}".format(main_headers)) - raise MalformedRtf("No unicode codepage number found in the header") - - def _get_python_codec(self) -> str: - """Returns the python codec needed to decode bytes to unicode. - """ - _codepage_num = self._get_codepage_num() - text_codec = codepages.codepage2codec(_codepage_num) - log.debug('Found python codec corresponding to code page {0}: {1}'.format(_codepage_num, text_codec)) - return text_codec - - def _validate_FROM_in_doc_header(self): + def validate_FROM_in_doc_header(self): """Inspect the header to identify what type of content (html/plain text) is encapsulated within the document. - NOTE: The de-encapsulating RTF reader inspects no more than the first 10 RTF tokens (that is, begin group marks and control words) in the input RTF document, in sequence, starting from the beginning of the RTF document. If one of the control words is the FROMHTML control word, the de-encapsulating RTF reader will conclude that the RTF document contains an encapsulated HTML document and stop further inspection. If one of the control words is the FROMTEXT control word, the de-encapsulating RTF reader concludes that the RTF document was produced from a plain text document and stops further inspection. - MS-OXRTFEX +NOTE: The de-encapsulating RTF reader inspects no more than the first 10 RTF tokens (that is, begin group marks and control words) in the input RTF document, in sequence, starting from the beginning of the RTF document. If one of the control words is the FROMHTML control word, the de-encapsulating RTF reader will conclude that the RTF document contains an encapsulated HTML document and stop further inspection. 
If one of the control words is the FROMTEXT control word, the de-encapsulating RTF reader concludes that the RTF document was produced from a plain text document and stops further inspection. - MS-OXRTFEX + +Raises: + MalformedEncapsulatedRtf: The .rtf headers are malformed. + NotEncapsulatedRtf: The .rtf object is missing an encapsulated content type header. Which means that it is likely just a regular .rtf file. """ cw_found = {"rtf1":False, "from":False, "fonttbl":False, "malformed":False} # The de-encapsulating RTF reader SHOULD inspect no more than the first 10 RTF tokens (that is, begin group marks and control words) in the input RTF document, in sequence, starting from the beginning of the RTF document. This means more than just control words. - first_ten_tokens = self.doc_tree.children[:10] + decoded_tree = StripControlWords().transform(self.doc_tree) + first_ten_tokens = decoded_tree.children[:10] operating_tokens = [] found_token = None for token in first_ten_tokens: if isinstance(token, Token): operating_tokens.append(token) else: - operating_tokens += [i for i in token.scan_values(lambda t: t.type in ('CONTROLWORD'))] - log.debug("Header tokens being evaluated: {0}".format(operating_tokens)) + operating_tokens += list(token.scan_values(lambda t: t.type == 'CONTROLWORD')) + if is_logger_on("RTFDE.validation_logger") is True: + log_validators(f"Header tokens being evaluated: {operating_tokens}") for token in operating_tokens: - cw_found,found_token = self._check_from_token(token=token, cw_found=cw_found) + cw_found,found_token = self.check_from_token(token=token, cw_found=cw_found) if cw_found['from'] is True and cw_found["malformed"] is True: raise MalformedEncapsulatedRtf("RTF file looks like is was supposed to be encapsulated HTML/TEXT but the headers are malformed. Turn on debugging to see specific information") # Save content type token available for id-ing type of content later if found_token is not None: - self._content_type_token = found_token + self.content_type_token = found_token if cw_found['from'] is False: log.debug("FROMHTML/TEXT control word not found in first 10 RTF tokens. This is not an HTML/TEXT encapsulated RTF document.") raise NotEncapsulatedRtf("FROMHTML/TEXT control word not found.") - def _get_font_table(self) -> Tree: - """Extract the font table group from the document""" - for token in self.doc_tree.children[:20]: - if isinstance(token, Tree): - table_type = token.children[1].value - if table_type == "\\fonttbl": - return token - @staticmethod - def _check_from_token(token, cw_found:dict) -> dict: - """Checks if fromhtml1 or fromtext tokens are in the proper place in the header based on the state passed to it by the _validate_FROM_in_doc_header function. + def check_from_token(token:Token, cw_found:dict) -> Tuple[Dict,Union[None,str]] : + """Checks if fromhtml1 or fromtext tokens are in the proper place in the header based on the state passed to it by the validate_FROM_in_doc_header function. + +Args: + token: The token to check for in the cw_found state dictionary. + cw_found: The state dictionary which is used to track the position of the from token within the header. + + `cw_found = {"rtf1":, "from":, "fonttbl":, "malformed":}` + + +Returns: + cw_found: Updated state dictionary + found_token: The content_type_token found in the header. 
- Parameters: - cw_found: (dict): The state dictionary which is used to track the position of the from token within the header - `cw_found = {"rtf1":, "from":, "fonttbl":, "malformed":}` """ - from_cws = ['\\fromhtml1', '\\fromtext'] + from_cws = [b'\\fromhtml1', b'\\fromtext'] # This control word MUST appear before the \fonttbl control word and after the \rtf1 control word, as specified in [MSFT-RTF]. - rtf1_cw = "\\rtf1" + rtf1_cw = b"\\rtf1" found_token = None - fonttbl_cw = "\\fonttbl" - maltype = [] + fonttbl_cw = b"\\fonttbl" if token.type == "CONTROLWORD": - if token.value in from_cws: + if token.value.strip() in from_cws: if cw_found['from'] is True: cw_found["malformed"] = True log.debug("Multiple FROM HTML/TXT tokens found in the header. This encapsulated RTF is malformed.") @@ -320,22 +343,48 @@ def _check_from_token(token, cw_found:dict) -> dict: log.debug("FROMHTML/TEXT control word found before rtf1 control word. That's not allowed in the RTF spec.") cw_found['from'] = True cw_found["malformed"] = True - elif token.value == rtf1_cw: + elif token.value.strip() == rtf1_cw: cw_found['rtf1'] = True - elif token.value == fonttbl_cw: + elif token.value.strip() == fonttbl_cw: cw_found['fonttbl'] = True - if cw_found['from'] != True: + if cw_found['from'] is not True: log.debug("\\fonttbl code word found before FROMTML/TEXT was defined. This is not allowed for encapsulated HTML/TEXT. So... this is not encapsulated HTML/TEXT or it was badly encapsulated.") cw_found["malformed"] = True return cw_found, found_token - def _validate_rtf_doc_header(self): + @staticmethod + def validate_rtf_doc_header(doc_tree: Tree): """Check if doc starts with a valid RTF header `\\rtf1`. - "Before the de-encapsulating RTF reader tries to recognize the encapsulation, the reader SHOULD ensure that the document has a valid RTF document heading according to [MSFT-RTF] (that is, it starts with the character sequence "{\rtf1")." - MS-OXRTFEX - """ - first_token = self.doc_tree.children[0].value - if first_token != "\\rtf1": + "Before the de-encapsulating RTF reader tries to recognize the encapsulation, the reader SHOULD ensure that the document has a valid RTF document heading according to [MSFT-RTF] (that is, it starts with the character sequence "{\\rtf1")." - MS-OXRTFEX + +Raises: + MalformedRtf: The .rtf headers do not include \\rtf1. +""" + first_token = doc_tree.children[0].value + if first_token != b"\\rtf1": log.debug("RTF stream does not contain valid valid RTF document heading. The file must start with \"{\\rtf1\"") - raise MalformedRtf("RTF stream does not start with {rtf1") + if is_logger_on("RTFDE.validation_logger") is True: + log_validators(f"First child object in document tree is: {first_token!r}") + raise MalformedRtf("RTF stream does not start with {\\rtf1") + + @staticmethod + def catch_common_validation_issues(raw_rtf: AnyStr): + """Checks for likely common valid input mistakes that may occur when folks try to use this library and raises exceptions to try and help identify them. + +Args: + raw_rtf: A raw .rtf string or byte-string. + +Raises: + TypeError: The data passed is the wrong type of data. + MalformedRtf: The data passed is not a correctly formatted .rtf string. +""" + if isinstance(raw_rtf, BufferedReader): + raise TypeError("Data passed as file pointer. 
DeEncapsulator only accepts byte objects.") + if raw_rtf is None: + raise TypeError("Data passed as raw RTF file is a null object `None` keyword.") + if raw_rtf[:8] == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1": + raise TypeError("Data passed is a full MSG object. You must extract the encapsulated RTF body first.") + if raw_rtf in (b'', ''): + raise MalformedRtf("Data passed as raw RTF file is an empty string.") diff --git a/RTFDE/grammar.py b/RTFDE/grammar.py new file mode 100644 index 0000000..1ef267b --- /dev/null +++ b/RTFDE/grammar.py @@ -0,0 +1,323 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# This file is part of RTFDE, a RTF De-Encapsulator. +# Copyright © 2020 seamus tuohy, +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details. + + +# TODO: Remove +HTMLRTF_GRAMMAR = """ +start : obj+ +obj : HTMLRTF | OTHER | WS +%import common.DIGIT +%import common.LETTER +%import common.WS +_SPACE_DELETE : " " +HTMLRTF : "\\htmlrtf" DIGIT~0..3 +OTHER : /((?!\\\\htmlrtf).)+/s +""" + + +GRAMMAR = { + "imports": r""" +%import common.ESCAPED_STRING +%import common.SIGNED_NUMBER +%import common.DIGIT +%import common.NEWLINE +%import common.LETTER""", + "ignore": r"""%ignore NEWLINE""", + "_LBRACE": r'"{"', + "_RBRACE": r'"}"', + "BACKSLASH": r'"\\"', + "start": r"_LBRACE document _RBRACE", + "document": r"""(CONTROLWORD + | control_symbol + | string + | group + | HTMLRTF + | hexarray + | _SPACE_DELETE + | SPACE_SAVE + | UNICODE)+""", + "group": r"""_LBRACE (CONTROLWORD + | control_symbol + | string + | htmltag_group + | mhtmltag_group + | group + | SPACE_SAVE + | _SPACE_DELETE + | HTMLRTF + | UNICODE + | hexarray + | NEWLINE )* _RBRACE""", + "htmltag_group": r"STAR_ESCAPE HTMLTAG ( string | group )*", + "HTMLTAG": r'"\\htmltag" DIGIT~0..3 _SPACE_DELETE?', + "MHTMLTAG": r'"\\mhtmltag" DIGIT~0..3 _SPACE_DELETE?', + "mhtmltag_group": r"STAR_ESCAPE MHTMLTAG ( string | group )*", + "NUMERICALDEL": r"SIGNED_NUMBER*", + "_SPACE_DELETE": r'" "', + "SPACE_SAVE": r'" "', + "DELIMITER": r"NUMERICALDEL _SPACE_DELETE?", + "ASCIILETTERSEQUENCE" : r"LETTER+", + "CONTROLWORD": "BACKSLASH ASCIILETTERSEQUENCE~1..32 DELIMITER", + "STAR_ESCAPE": r'BACKSLASH "*"', + "NONBREAKING_HYPHEN": r'BACKSLASH "_"', + "OPTIONAL_HYPHEN": r'BACKSLASH "-"', + "NONBREAKING_SPACE": r'BACKSLASH "~"', + "FORMULA_CHARACTER": r'BACKSLASH "|"', + "INDEX_SUBENTRY": r'BACKSLASH ":"', + "control_symbol": r"(STAR_ESCAPE | INDEX_SUBENTRY | FORMULA_CHARACTER | NONBREAKING_SPACE | OPTIONAL_HYPHEN | NONBREAKING_HYPHEN )", + "STRING": r'/.+?/', + "?string": r"STRING+ SPACE_SAVE?", + "_QUESTION_MARK": r'"?"', + "UNICODE" : r"""("\\u" /[-]*[0-9]+/)+""", + "HEXENCODED": """("\\'" /[0-9A-Fa-f]/~2)""", + "hexarray": "HEXENCODED+", + "HTMLRTF": r'"\\htmlrtf" DIGIT~0..3 _SPACE_DELETE?', + } + + +# // == Priority Levels == +# This dictionary sets the priority level for each type of object in the lexer. +# Higher numbers give a greater priority. 
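+# For example (an illustrative pairing, see PRIORITY_LEVELS below): CONTROLWORD is ".1"
+# while HTMLTAG is ".2", so make_concise_grammar() emits `CONTROLWORD.1 : ...` and
+# `HTMLTAG.2 : ...` and the lexer prefers the more specific HTMLTAG match when both could apply.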
+# All must start with a period +# EXPLICIT IS BETTER THAN RELYING ON DEFAULTS +# // 0 = Raw String Matching // text should defer to everything else if conflicting +# // 1 = Generic undefined object (i.e. group, CONTROL_WORD, CONTROL_SYMBOL, etc.) +# // 2 = Specific instances of objects (i.e. HTMLTAG, MHTMLTAG, etc.) + +PRIORITY_LEVELS = { + "_LBRACE": ".2", + "_RBRACE": ".2", + "BACKSLASH" : ".1", + "start" : ".1", + "document": ".1", + "group": ".1", + "htmltag_group" : ".2", + "HTMLRTF" : ".2", + "HTMLTAG" : ".2", + "MHTMLTAG" : ".2", + "mhtmltag_group" : ".2", + "NUMERICALDEL" : ".1", + "_SPACE_DELETE" : ".1", + "SPACE_SAVE" : ".1", + "DELIMITER" : ".1", + "ASCIILETTERSEQUENCE" : ".1", + "CONTROLWORD": ".1", + "STAR_ESCAPE": ".1", + "NONBREAKING_HYPHEN": ".1", + "OPTIONAL_HYPHEN": ".1", + "NONBREAKING_SPACE": ".1", + "FORMULA_CHARACTER": ".1", + "INDEX_SUBENTRY": ".1", + "control_symbol": ".1", + "STRING" : ".0", + "_QUESTION_MARK": ".1", + "?string" : ".0", + "UNICODE" : ".2", + "HEXENCODED" : ".1", + "hexarray" : ".2", +} + +def make_concise_grammar(): + """Make a grammar string to use with the lexer. + """ + grammar = r"""""" + for key, priority in PRIORITY_LEVELS.items(): + grammar += "{0}{1} : {2}\n".format(key,priority,GRAMMAR[key]) + grammar += GRAMMAR['imports'] + "\n" + grammar += GRAMMAR['ignore'] + "\n" + return (grammar) + + + +def make_literate_grammar(): + """Create a VERBOSE grammar string which can be used to understand the grammar. + + This SHOULD be updated to include any changes to the grammar. + This is valuable when debugging and/or otherwise trying to understand the grammar. + """ + grammar = r""" + +// ===== Precedence ========= +// Literals are matched according to the following precedence: +// 1. Highest priority first (priority is specified as: TERM.number: …) +// 2. Length of match (for regexps, the longest theoretical match is used) +// 3. Length of literal / pattern definition +// 4. Name +// +// == Priority Levels == +// WARNING: Priority Levels are not shown in this literate grammar. +// NOTE: Look at PRIORITY_LEVELS for the prioritized levels used in production. +// 0 = Raw String Matching // text should defer to everything else if conflicting +// 1 = Generic undefined object (i.e. group, CONTROL_WORD, CONTROL_SYMBOL, etc.) +// 2 = Specific instances of objects (i.e. HTMLTAG, MHTMLTAG, etc.) + +// ====== GRAMMAR OBJECT IMPORTS FROM LARK COMMONS ====== +// https://github.com/lark-parser/lark/blob/master/lark/grammars/common.lark +{imports} + + +// ====== Ignore Newlines ====== +// The real carriage returns are stored in \par or \line tags. +{ignore} + +// ====== SIMPLE GRAMMAR OBJECTS USED THROUGHOUT ====== +// RTF is braces all the way down +// We don't have to worry about escaped braces since we are pre-processing out escaped braces already +_LBRACE: {_LBRACE} +_RBRACE: {_RBRACE} + +// We don't have to worry about escaped backslashes since we are pre-processing out escaped backslashes already +BACKSLASH: {BACKSLASH} + +// RTF control words are made up of ASCII alphabetical characters (a through z and A through Z) +ASCIILETTERSEQUENCE: {ASCIILETTERSEQUENCE} + +// A space that should be deleted (See Delimiters below) +_SPACE_DELETE: {_SPACE_DELETE} + +// But, we want to save spaces within strings. So, we have a special space for that. +SPACE_SAVE : {SPACE_SAVE} + +// ====== UNMATCHED RAW TEXT ====== +// In order to split out everything that is simply plain text and not a special RTF object I've had to match all raw text characters individually.
This allows us to store them all in their own rule branch (string) for transformation later on. + +STRING : {STRING} + +// We use the ? char to inline this rule to remove the branch and replace it with its children if it has one match. This will make it easier to parse later and remove unnecessary matches of it. +?string: {?string} + + + + +// ====== HIGH LEVEL DOCUMENT PARSING ====== + +// The start object is the top level object in the tree +// An RTF file has the following syntax: '{{'
<header> <document> '}}' +start: {start} + +// Parse <document>
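+// For example (an illustrative sketch, not from the spec): in `{{\rtf1\ansi Hello}}`
+// the outer brace pair is matched by `start` and everything inside it is the <document>.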
+document: {document} + +// A group consists of text and control words or control symbols enclosed in braces ({{}}). +// The opening brace ({{ ) indicates the start of the group and the closing brace ( }}) indicates the end of the group. +group: {group} + + +// ====== CONTROL WORD(s) ====== + +// A control word is defined by: \<ASCII Letter Sequence><Delimiter> +// A control word’s name cannot be longer than 32 letters. +CONTROLWORD: {CONTROLWORD} + +// === Delimiter == + +DELIMITER: {DELIMITER} + +// The <Delimiter> can be one of the following: +// 1. A numeric digit or an ASCII minus sign (-), which indicates that a numeric parameter is associated with the control word. +NUMERICALDEL: {NUMERICALDEL} +// 2. A space: When a space is used as the delimiter, it is discarded. This means that it’s not included in subsequent processing. So, we are using a discarded terminal (by putting an underscore in front of the name) to ensure it is tossed. +// See: "_SPACE_DELETE" under SIMPLE GRAMMAR OBJECTS + +// 3. Any character other than a letter or a digit. In this case, the delimiting character terminates the control word and is not part of the control word. So, it's not included in the grammar here. + + +// ====== CONTROL SYMBOLS(s) ====== + +// A control symbol consists of a backslash followed by a single, nonalphabetic character. +// For example, \~ represents a nonbreaking space. + +// The STAR_ESCAPE special construct means that if the program understands the \command, it takes this to mean {{\command ...}}, but if it doesn’t understand \command, the program ignores not just \command (as it would anyway) but everything in this group. +STAR_ESCAPE: {STAR_ESCAPE} +NONBREAKING_HYPHEN: {NONBREAKING_HYPHEN} +OPTIONAL_HYPHEN: {OPTIONAL_HYPHEN} +NONBREAKING_SPACE: {NONBREAKING_SPACE} +FORMULA_CHARACTER: {FORMULA_CHARACTER} +INDEX_SUBENTRY: {INDEX_SUBENTRY} + +// Control symbols take no delimiters. +control_symbol: {control_symbol} + + + + + +// ====== SPECIAL CONTROL WORD(s) ====== + +// ====== HEADER OBJECTS ====== + +// The FROMTEXT control word specifies that the RTF document was produced from plain text. +// This control word MUST be \fromtext. Any other form will not be considered encapsulated. +// FROMTEXT: {{FROMTEXT}} + +// The FROMHTML control word specifies that the RTF document contains encapsulated HTML text. +// This control word MUST be \fromhtml1. Any other form, such as \fromhtml or \fromhtml0, will not be considered encapsulated. +//FROMHTML : {{FROMHTML}} + + +// ====== SPECIFIC CONTROL WORD OBJECTS ====== + +// HTMLRTF Toggle Control Word +// The HTMLRTF control word identifies fragments of RTF that were not in the original HTML content +// If the flag is "\htmlrtf" or "\htmlrtf1" then do not process anything else until you encounter "\htmlrtf0" which will toggle this off again. +// A de-encapsulating RTF reader MUST support the HTMLRTF control word within nested groups. The state of the HTMLRTF control word MUST transfer when entering groups and be restored when exiting groups. +// This means that you can only turn this off on its own level (turning it off in an object nested within it does nothing). And, if the object it's in ends then it doesn't transfer up the tree to objects that contain it. So, if you don't find a closing "\htmlrtf0" you can delete from the opening "\htmlrtf" all the way until the end of the current object, but not above.
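+// An illustrative sketch (not from the spec text): in the fragment
+//   \htmlrtf \par \htmlrtf0
+// the \par sits between the toggles, so it is suppressed during de-encapsulation.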
+HTMLRTF : {HTMLRTF} + +// The HTMLTAG destination group encapsulates HTML fragments that cannot be directly represented in RTF +htmltag_group: {htmltag_group} + +// The "DIGIT~0..3" in the following definition is the HTMLTagParameter from the spec. + // A space MUST be used to separate the CONTENT HTML fragment from the HTMLTagParameter HTML fragment if the text starts with a DIGIT, or if the HTMLTagParameter HTML fragment is omitted. As such, we throw away this space by using _SPACE_DELETE if we encounter one. +HTMLTAG: {HTMLTAG} + + + +// content : {{content}} + +// \*\mhtmltag[HTMLTagParameter] [CONTENT] +// The values and format of the numeric parameter are identical to the numeric parameter in the HTMLTAG destination group. +// This RTF control word SHOULD be skipped on de-encapsulation and SHOULD NOT be written when encapsulating. +// TODO: https://datatracker.ietf.org/doc/html/draft-ietf-mhtml-cid-00#section-1 +// NOTE: mhtmltag groups contain the original URL which has been replaced in the corresponding htmltag with the CID of an object. As such, they contain possibly useful URI data that, while not useful for the direct output, should be saved. +MHTMLTAG : {MHTMLTAG} +mhtmltag_group: {mhtmltag_group} + + +// TODO: Check if really needed +// Increased priority of escape chars to make unescaping easier +// Multiple char acceptance is important here because if you just catch one escape at a time you mess up multi-byte values. +_QUESTION_MARK: {_QUESTION_MARK} + +// TODO Define these objects + +// RTFESCAPE no longer used +// RTFESCAPE : {{RTFESCAPE}} + +// UNICODE unicode chars +UNICODE : {UNICODE} + +// Hex chars [HEXENCODED] are stored in an array [hexarray] +// We often need to parse hex chars as a set so this is the easiest way +HEXENCODED : {HEXENCODED} +hexarray : {hexarray} + + """.format(**GRAMMAR) + return grammar + + +if __name__ == '__main__': + # print(make_literate_grammar()) + print(make_concise_grammar()) diff --git a/RTFDE/text_extraction.py new file mode 100644 index 0000000..73cbed4 --- /dev/null +++ b/RTFDE/text_extraction.py @@ -0,0 +1,750 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# This file is part of RTFDE, a RTF De-Encapsulator. +# Copyright © 2022 seamus tuohy, +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details. + +import codecs +import re +from collections import namedtuple +from typing import Union, Any, List, Tuple, Dict + +from oletools.common import codepages + +from lark.lexer import Token +from lark.tree import Tree + +from RTFDE.exceptions import MalformedRtf +from RTFDE.utils import is_codeword_with_numeric_arg +from RTFDE.utils import flatten_tree_to_string_array +from RTFDE.utils import log_text_extraction, is_logger_on + +import logging +log = logging.getLogger("RTFDE") + +fontdef = namedtuple("fontdef", ["fnum", "codepage", "codec", "fontdef_tree"]) + + +def get_font_table(tree: Tree) -> Tree: + """Extract the font table group from the first 20 tokens of a .rtf document.
+ +Args: + tree (Tree): A .rtf document object parsed into a Tree object + +Raises: + ValueError: If no group with a `\\fonttbl` token as its first controlword is found. + +Returns: + The lark Tree object for the font table group (the group whose first control word is `\\fonttbl`). + """ + for item in tree.children[:20]: + if isinstance(item, Tree): + try: + ctrl_value = item.children[1] + except IndexError as _e: + continue + if isinstance(ctrl_value, Token): + table_type = ctrl_value.value.strip() + if table_type == b"\\fonttbl": + return item + raise ValueError("No font table found in tree") + + +def is_font_number(token: Token) -> bool: + """Checks if an object is a "font number". + +Returns: + True if an object is a "font number" controlword `\\fN`. False if not. + +""" + try: + if is_codeword_with_numeric_arg(token, b'\\f'): + return True + except AttributeError: # pragma: no cover + return False + return False + +def get_codepage_num_from_fcharset(fcharsetN: int) -> Union[int,None]: + """Return the codepage to use with a specific fcharsetN. + +Args: + fcharsetN (int): The numeric argument N for a \\fcharsetN control word. + +Returns: + (int OR None) Returns the int for a codepage if known. Returns None for unknown charsets or charsets with no corresponding codepage (such as OEM or DEFAULT).
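+
+Example (an illustrative sketch; values taken from the charset table in the function body):
+
+    >>> get_codepage_num_from_fcharset(128)  # SHIFTJIS_CHARSET
+    932
+    >>> get_codepage_num_from_fcharset(255) is None  # OEM_CHARSET has no codepage
+    True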
+ + """ + # Charset table retrieved on 2022-08-19 + # https://web.archive.org/web/20220819215334/https://docs.microsoft.com/en-us/previous-versions/cc194829%28v=msdn.10%29?redirectedfrom=MSDN + charsets: dict[int,dict[str,Any]] = { + 0:{"name":"ANSI_CHARSET","hex":"0x00","decimal":0,"id":1252}, + 1:{"name":"DEFAULT_CHARSET","hex":"0x01","decimal":1,"id":None}, + 2:{"name":"SYMBOL_CHARSET","hex":"0x02","decimal":2,"id":None}, + 128:{"name":"SHIFTJIS_CHARSET","hex":"0x80","decimal":128,"id":932}, + 129:{"name":"HANGUL_CHARSET","hex":"0x81","decimal":129,"id":949}, + 134:{"name":"GB2312_CHARSET","hex":"0x86","decimal":134,"id":936}, + 136:{"name":"CHINESEBIG5_CHARSET","hex":"0x88","decimal":136,"id":950}, + 161:{"name":"GREEK_CHARSET","hex":"0xA1","decimal":161,"id":1253}, + 162:{"name":"TURKISH_CHARSET","hex":"0xA2","decimal":162,"id":1254}, + 177:{"name":"HEBREW_CHARSET","hex":"0xB1","decimal":177,"id":1255}, + 178:{"name":"ARABIC_CHARSET","hex":"0xB2","decimal":178,"id":1256}, + 186:{"name":"BALTIC_CHARSET","hex":"0xBA","decimal":186,"id":1257}, + 204:{"name":"RUSSIAN_CHARSET","hex":"0xCC","decimal":204,"id":1251}, + 222:{"name":"THAI_CHARSET","hex":"0xDE","decimal":222,"id":874}, + 238:{"name":"EE_CHARSET","hex":"0xEE","decimal":238,"id":1250}, + 255:{"name":"OEM_CHARSET","hex":"0xFF","decimal":255,"id":None}, +} + if is_logger_on("RTFDE.text_extraction") is True: + log_text_extraction(f"Getting charset for {fcharsetN}") + charset = charsets.get(fcharsetN, None) + if charset is not None: + charset_id = charset.get('id', None) + return charset_id + return None + + +def get_default_font(tree: Tree) -> Union[str,None]: + """Extract the font number controlword default font if it exists. + +If an RTF file uses a default font, the default font number is specified with the \\deffN control word, which must precede the font-table group. + +Args: + tree (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree object. + +Returns: + The default font control number if it exists from the first `\\deffN`. None if not found. +""" + deff_gen = tree.scan_values( + lambda v: is_codeword_with_numeric_arg(v, b'\\deff') + ) + deff_options = list(deff_gen) + try: + # We just want the first \\deffN. It shouldn't be set multiple times. + deff = deff_options[0] + deff_num = deff.value[5:] + return b'\\f' + deff_num + except IndexError: + return None + +def parse_font_tree(font_tree: Tree) -> dict: + """Create a font tree dictionary with appropriate codeces to decode text. + +Args: + font_tree (Tree): The .rtf font table object decoded as a tree. + +Returns: + A dictionary which maps font numbers to appropriate python codeces needed to decode text. +""" + parsed_font_tree = {} + for tree in font_tree.children: + if isinstance(tree, Tree): + fnum = None + fcharset = None + cpg_num = None + for tok in tree.children: + if is_codeword_with_numeric_arg(tok, b'\\f'): + fnum = tok.value + elif is_codeword_with_numeric_arg(tok, b'\\fcharset'): + fchar_num = int(tok.value[9:]) + fcharset = get_codepage_num_from_fcharset(fchar_num) + elif is_codeword_with_numeric_arg(tok, b'\\cpg'): + cpg_num = int(tok.value[4:]) + if fnum is not None: + # get the codepage + codepage_num = None + + if fcharset is not None: + try: + codepage_num = check_codepage_num(fcharset) + except ValueError: # pragma: no cover + codepage_num = None + # if both \\fcharset and \\cpg appear in the font table, \\cpg is ignored. 
+ if ((codepage_num is None) and (cpg_num is not None)): + try: + codepage_num = check_codepage_num(cpg_num) + except ValueError: # pragma: no cover + codepage_num = None + # Get the appropriate codec + if codepage_num is not None: + codec = get_python_codec(codepage_num) + else: + codec = None + # Only add if there is a font definition + tree_str = b"".join(list(flatten_tree_to_string_array(tree))) + parsed_font_tree[fnum] = fontdef(fnum, codepage_num, codec, tree_str) + return parsed_font_tree + + +def get_python_codec(codepage_num: int) -> str: + """Returns the python codec needed to decode bytes to unicode. + +Args: + codepage_num (int): A codepage number. + +Returns: + The name of the codec in the Python codec registry. Used as the name for encoding/decoding. +""" + text_codec = codepages.codepage2codec(codepage_num) + log.debug('Found python codec corresponding to code page {0}: {1}'.format(codepage_num, text_codec)) + return text_codec + +def check_codepage_num(codepage_num: int) -> int: + """Provide the codepage number back to you if it is valid. + +Args: + codepage_num (int): A possible codepage number. + +Returns: + The codepage number IF it is a valid codepage number + +Raises: + ValueError: The codepage_num provided isn't a valid codepage number. + +""" + # This keyword should be emitted in the RTF header section right after the \ansi, \mac, \pc or \pca keyword. But, various document tags like \fbids often are thrown all over the header so we have to check the first group of headers for it. + # Code page names from https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers + # Retrieved on 2020-12-18 + allowed_codepage_nums = set([37, 437, 500, 708, 709, 710, 720, 737, 775, 850, 852, 855, 857, 858, 860, 861, 862, 863, 864, 865, 866, 869, 870, 874, 875, 932, 936, 949, 950, 1026, 1047, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1200, 1201, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1361, 10000, 10001, 10002, 10003, 10004, 10005, 10006, 10007, 10008, 10010, 10017, 10021, 10029, 10079, 10081, 10082, 12000, 12001, 20000, 20001, 20002, 20003, 20004, 20005, 20105, 20106, 20107, 20108, 20127, 20261, 20269, 20273, 20277, 20278, 20280, 20284, 20285, 20290, 20297, 20420, 20423, 20424, 20833, 20838, 20866, 20871, 20880, 20905, 20924, 20932, 20936, 20949, 21025, 21027, 21866, 28591, 28592, 28593, 28594, 28595, 28596, 28597, 28598, 28599, 28603, 28605, 29001, 38598, 50220, 50221, 50222, 50225, 50227, 50229, 50930, 50931, 50933, 50935, 50936, 50937, 50939, 51932, 51936, 51949, 51950, 52936, 54936, 57002, 57003, 57004, 57005, 57006, 57007, 57008, 57009, 57010, 57011, 65000, 65001]) + if codepage_num in allowed_codepage_nums: + return codepage_num + # Note: If support for a specific codepage ever becomes an issue we can look at adding support using the actual code pages. + # Conversion tables for codepages can be retrieved from here: https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/ + raise ValueError(f"Unsupported unicode codepage number `{codepage_num}` found in the header") + + +def validate_ansi_cpg(header: str) -> None: + """Check an '\\ansicpgNNNN' string to see if the number NNNN is an actual codepage. + +Args: + header (str): The value from the lark `\\ansicpg` CONTROLWORD Token. + +Raises: + MalformedRtf: If the value passed is not a valid ansi codepage.
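+
+Example (an illustrative sketch; 1252 is in the allowed codepage list above):
+
+    >>> validate_ansi_cpg("\\ansicpg1252")   # valid codepage, returns None
+    >>> validate_ansi_cpg("\\ansicpg12345")  # raises MalformedRtf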
+""" + try: + possible_cpg_num = int(header.strip()[8:]) + check_codepage_num(possible_cpg_num) + except ValueError as _e: + raise MalformedRtf(f"Unsupported unicode codepage number `{header}` found in the header") from _e + + +# UNICODE CHARS +def unicode_escape_to_chr(item: bytes) -> str: + """Convert unicode char from it's decimal to its unicode character representation. From "\\u[-]NNNNN" to the string representing the character whose Unicode code point that decimal represents. + +Args: + item (str): A RTF Escape in the format \\u[-]NNNNN. + +Returns: + The unicode character representation of the identified character + +Raises: + ValueError: The escaped unicode character is not valid. +""" + try: + nnnn = int(item.removeprefix(b'\\u')) # raises ValueError if not int. + except ValueError as _e: + raise ValueError(f"`{item}` is not a valid escaped unicode character.") from _e + if nnnn < 0: # § -NNNNN is a negative integer expressed in decimal digits + ncr = 65536 + nnnn + else: # § NNNNN is a positive integer expressed in decimal digits + ncr = nnnn + # § HHHH is the hexadecimal equivalent of NNNNN or -NNNNN + return chr(ncr) + +def is_hex_encoded(item: Token) -> bool: + """Identify if a token contains a HEXENCODED token. +Args: + item (token): A token to check if it is HEXENCODED. + +Return: + True if HEXENCODED. False if not. + """ + if isinstance(item, Token): + if item.type == "HEXENCODED": + return True + return False + +def is_valid_ANSI_representation_char(item: Token) -> bool: + """Is token contain a valid ANSI representation string for a Unicode char. + +Args: + item (token): A token to check if it is a valid ANSI representation. + +Return: + True if token is an ansi representation of a unicode char. False if not. +""" + if isinstance(item, Token): + # print(f"found TOKEN posssible ansi {repr(item)}") + if is_hex_encoded(item): + # print(f"found hex posssible ansi {repr(item)}") + return True + if item.type == 'STRING': + # print(f"found STRING posssible ansi {repr(item)}") + if not item.value.isspace(): # whitespace doesn't count. + # print(f"found posssible ansi {repr(item)}") + return True + # else: + # print(f"found SPACE posssible ansi {repr(item)}") + # print(f"found NON TOKEN posssible ansi {repr(item)}") + return False + +def is_unicode_encoded(item: Token) -> bool: + """Is token contain a unicode char. + +Args: + item (token): A token to check if contains a unicode char. + +Return: + True if token contains a unicode char. False if not. +""" + if isinstance(item, Token): + if item.type == "UNICODE": + return True + return False + +def includes_unicode_chars(children: List[Token]) -> bool: + """Does a list include Tokens which contain unicode characters. Not recursive. + +Args: + children (list): A Tree.children list to check to see if it includes unicode characters. + +Returns: + True if list includes tokens which contain unicode chars. False if not. +""" + for child in children: + if is_unicode_encoded(child): + return True + return False + + +def remove_unicode_replacements(children: List[Token], + return_ascii_map: bool = True, + byte_count: int = 1) -> Union[ + Tuple[List[Token], Dict[Token,List[Token]]], + List[Token]]: + """Remove all unicode replacement characters from a list of Tokens. + +Args: + children (list): A Tree.children list to remove unicode replacement characters from. + return_ascii_map (bool): On True, have this function return a map of the ASCII token that were removed. 
+ byte_count (int): The number of bytes corresponding to a given \\uN Unicode character. A default of 1 should be assumed if no \\uc keyword has been seen in the current or outer scopes. + +Returns: + new_children (list): The list of Tokens with all unicode replacement characters removed. + ascii_map (dict): All the Tokens which were removed from the provided children list, keyed by the unicode Token they correspond to. + +""" + if byte_count is None: + # A default of 1 is assumed if no \\uc keyword has been seen. + byte_count = 1 + ascii_map: Dict[Token,List[Token]] = {} + new_children = [] + removal_map: List[Token] = [] + if is_logger_on("RTFDE.text_extraction") is True: + log_text_extraction(f"Removing unicode replacements on {repr(children)}") + for child in children: + if len(removal_map) > 0: + if isinstance(child, Token): + # Delete all spaces between a unicode char and the last ANSI representation + # print(f"FOUND SPACE STRING with RM: {removal_map}") + if child.value.isspace(): + ascii_map.setdefault(removal_map[0], []).append(child) + continue + if is_valid_ANSI_representation_char(child): + # Found an ansi representation; removing unicode char from removal map. + # print(f"FOUND ASCII STRING {child} to RM with RM: {removal_map}") + ascii_map.setdefault(removal_map.pop(), []).append(child) + continue + elif isinstance(child, Tree) and ( + (child.data == "string") or (child.data == "hexarray")): + # print(f"FOUND ASCII STRING {child} with RM: {removal_map}") + ansi_children = child.children + new_ansi_children = [] + for aci,ac in enumerate(ansi_children): + # print(f"AC CHILD {repr(ac)}") + if is_valid_ANSI_representation_char(ac): + # print(f"AC CHILD VALID {repr(ac)}") + if len(removal_map) > 0: + # print(f"AC CHILD MAP >0 {repr(ac)}") + # print(f"Popping removal for {repr(ac)}") + ascii_map.setdefault(removal_map.pop(), []).append(ac) + else: + # print(f"AC CHILD MAP < 0 {repr(ac)}") + new_ansi_children.append(ac) + else: + # print(f"AC CHILD NOT VALID {repr(ac)}") + new_ansi_children.append(ac) + # print(f"NEW Children = {new_ansi_children}") + if new_ansi_children == []: + from RTFDE.utils import make_token_replacement + # from RTFDE.utils import embed + # embed() + child = make_token_replacement("STRING", b"", child) + else: + child.children = new_ansi_children + # print(f"NEW Tree = {child}") + # else: + # print(f"FOUND ASCII STRING {child} with RM: {removal_map}") + # print(f"{repr(child)} not a valid ANSI representation? with RM: {removal_map}") + # Modify char byte count if we encounter it. + if is_unicode_char_byte_count(child): + byte_count = get_unicode_char_byte_count(child) + # print(f"Changing byte count because {child} to {byte_count}") + if is_unicode_encoded(child): + # print(f"Found unicode {child}") + for j in range(byte_count): + # Add the unicode key to the removal map once per byte + # This ensures we remove the right number of ANSI representation chars + removal_map.append(child) + new_children.append(child) + if return_ascii_map is True: + return new_children, ascii_map + return new_children + + +# UNICODE SURROGATE CHARACTERS +def is_surrogate_high_char(item: bytes) -> bool: + """Checks if a char is in the high-surrogate code point range. "High-surrogate code point: A Unicode code point in the range U+D800 to U+DBFF." The high-surrogate is also sometimes known as the leading surrogate. + + item (bytes): A bytes representation of a string representing a unicode character.
"\\u-10179" + """ + if item.startswith(b"\\u"): + item = item[2:] + if 0xD800 <= ord(chr(65536+int(item))) <= 0xDBFF: + return True + # In case unicode is NOT using the 16 bit signed integer + elif 0xD800 <= int(item) <= 0xDBFF: + return True + return False + +def is_surrogate_low_char(item: bytes) -> bool: + """Check's if chr is a is in the low-surrogate code point rage. "Low-surrogate code point: A Unicode code point in the range U+DC00 to U+DFFF." Low-surrogate also sometimes known as following surrogates. + + item (bytes): A bytes representation of a string representing a unicode character. + """ + if item.startswith(b"\\u"): + item = item[2:] + if 0xDC00 <= ord(chr(65536+int(item))) <= 0xDFFF: + return True + # In case unicode is NOT using the 16 bit signed integer + elif 0xDC00 <= int(item) <= 0xDFFF: + return True + return False + +def is_surrogate_16bit(item: bytes, cp_range) -> bool: + """Checks if a unicode char is 16 bit signed integer or the raw unicode char. This should first check if it is a surrogate code using the is_surrogate_XXXX_char functions. + +Args: + item (bytes): A bytes representation of a string representing a unicode character. + cp_range (str): ['low' OR 'high'] The code point range (low-surrogate or high-surrogate). + """ + if cp_range == 'low': + if 0xDC00 <= ord(chr(65536+int(item))) <= 0xDFFF: + return True + elif cp_range == 'high': + if 0xD800 <= ord(chr(65536+int(item))) <= 0xDBFF: + return True + else: + raise ValueError("cp_range must be either 'low' or 'high'") + return False + + +def is_surrogate_pair(first: bytes, second: bytes) -> bool: + """Check if a pair of unicode characters are a surrogate pair. Must be passed in the correct order. + +Args: + first (bytes): A bytes representation of a string representing the high-order byte in a surrogate char. + second (bytes): A bytes representation of a string representing the low-order byte in a surrogate char. + """ + if is_surrogate_high_char(first): + if is_surrogate_low_char(second): + return True + else: + log.info("RTFDE encountered a standalone high-surrogate point without a corresponding low-surrogate. Standalone surrogate code points have either a high surrogate without an adjacent low surrogate, or vice versa. These code points are invalid and are not supported. Their behavior is undefined. Codepoints encountered: {0}, {1}".format(first, second)) + return False + +def decode_surrogate_pair(high: bytes, low: bytes, encoding: str ='utf-16-le') -> bytes: + """ Convert a pair of surrogate chars into the corresponding utf-16 encoded text string they should represent. + +Args: + high (bytes): the high-surrogate code point + low (bytes): the low-surrogate code point + encoding (str): The encoding to apply to the final value. Defaults to 'utf-16-le' because: Microsoft uses UTF-16, little endian byte order. ( https://learn.microsoft.com/en-us/windows/win32/intl/using-byte-order-marks ) The Msg format is a Microsoft standard. Therefore, man is mortal. + """ + # Equation for turning surrogate pairs into a unicode scalar value which be used with utl-16 can ONLY found in Unicode 3.0.0 standard. 
+ # Unicode scalar value means the same thing as "code position" or "code point" + # https://www.unicode.org/versions/Unicode3.0.0/ + # section 3.7 https://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#page=9 + if high.startswith(b"\\u"): + high = high[2:] + if low.startswith(b"\\u"): + low = low[2:] + if is_surrogate_16bit(high, "high"): + char_high = chr(65536+int(high)) + else: + char_high = chr(int(high)) + if is_surrogate_16bit(low, "low"): + char_low = chr(65536+int(low)) + else: + char_low = chr(int(low)) + unicode_scalar_value = ((ord(char_high) - 0xD800) * 0x400) + (ord(char_low) - 0xDC00) + 0x10000 + unicode_bytes = chr(unicode_scalar_value).encode(encoding) + return unicode_bytes.decode(encoding).encode() + +def merge_surrogate_chars(children, + ascii_map, + use_ASCII_alternatives_on_unicode_decode_failure = False): + """Merge any surrogate pairs found in a list of children into the unicode characters they represent. + +Raises: + ValueError: A standalone high-surrogate was found. High surrogate followed by an illegal low-surrogate character. + """ + surrogate_start = None + surrogate_high = None + for i,c in enumerate(children): + if isinstance(c, Tree): + continue + if is_unicode_encoded(c): + if is_surrogate_high_char(c.value): + surrogate_start = i + surrogate_high = c + elif surrogate_start is not None: + if is_surrogate_low_char(c.value): + surrogate_low = c + try: + surrogate_value = decode_surrogate_pair(surrogate_high.value, + surrogate_low.value) + # Convert into STRING token + surrogate_tok = Token('STRING', + surrogate_value, + start_pos=surrogate_high.start_pos, + end_pos=surrogate_low.end_pos, + line=surrogate_high.line, + end_line=surrogate_low.end_line, + column=surrogate_high.column, + end_column=surrogate_low.end_column) + children[surrogate_start] = surrogate_tok + blank_tok = Token('STRING', + b"", + start_pos=surrogate_high.start_pos+1, + end_pos=surrogate_low.end_pos+1, + line=surrogate_high.line, + end_line=surrogate_low.end_line, + column=surrogate_high.column, + end_column=surrogate_low.end_column) + children[i] = blank_tok + surrogate_start = None + surrogate_high = None + except UnicodeDecodeError as _e: + if use_ASCII_alternatives_on_unicode_decode_failure is True: + children[surrogate_start] = b"".join([i.value for i in ascii_map[surrogate_high]]) + children[i] = b"".join([i.value for i in ascii_map[surrogate_low]]) + else: + raise _e + else: + log.info("RTFDE encountered a standalone high-surrogate point without a corresponding low-surrogate. Standalone surrogate code points have either a high surrogate without an adjacent low surrogate, or vice versa. These code points are invalid and are not supported. Their behavior is undefined. Codepoints encountered: {0}, {1}".format(surrogate_high, c)) + if use_ASCII_alternatives_on_unicode_decode_failure is True: + children[surrogate_start] = b"".join([i.value for i in ascii_map[surrogate_high]]) + else: + raise ValueError("Standalone high-surrogate found. High surrogate followed by an illegal low-surrogate character.") + return children + + + +def is_unicode_char_byte_count(item: Token) -> bool: + """Checks if a token is a `\\ucN` Unicode Character Byte Count control word.""" + if isinstance(item, Token): + if item.type == "CONTROLWORD": + if item.value.startswith(b'\\uc'): + return True + return False + +def get_unicode_char_byte_count(item: Token) -> int: + """Extract the byte count N from a `\\ucN` control word token.""" + item = item.value.decode() + cur_uc = int(item[3:]) + return cur_uc + + +# Hex Encoded Chars +def has_hexarray(children: List[Union[Token, Tree]]) -> bool: + """Checks if a tree's children include a hexarray tree. + + children (array): the children object from a tree.
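+
+ Example (an illustrative sketch): the children of a group containing the
+ run `\'e3\'81\'82` include a hexarray Tree, so this returns True.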
+ """ + for item in children: + if is_hexarray(item): + return True + return False + +def is_hexarray(item): + """Checks if an item is a hexarray tree. + + item (Tree or Token): an item to check to see if its a hex array + """ + if isinstance(item, Tree): + if item.data.value == 'hexarray': + return True + return False + +def get_bytes_from_hex_encoded(item): + """Convert hex encoded string to bytes. + + item (str): a hex encoded string in format \\'XX + """ + hexstring = item.replace(b"\\'", b"") + hex_bytes = bytes.fromhex(hexstring.decode()) + return hex_bytes + +def decode_hex_char(item, codec): + """Decode a bytes object using a specified codec. + + item (bytes): A bytes object. + codec (str): The name of the codec to use to decode the bytes + """ + if is_logger_on("RTFDE.text_extraction") is True: + log_text_extraction("decoding char {0} with font {1}".format(item, codec)) + if codec is None: + # Default to U.S. Windows default codepage + codec = 'CP1252' + decoded = item.decode(codec) + decoded = decoded.encode() + if is_logger_on("RTFDE.text_extraction") is True: + log_text_extraction("char {0} decoded into {1} using codec {2}".format(item, decoded, codec)) + return decoded + + +class TextDecoder: + + def __init__(self, keep_fontdef=False, + initial_byte_count=None, use_ASCII_alternatives_on_unicode_decode_failure=False): + """ + keep_fontdef: (bool) If False (default), will remove fontdef's from object tree once they are processed. + initial_byte_count: (int) The initial Unicode Character Byte Count. Does not need to be set unless you are only providing a RTF snippet which does not contain the RTF header which sets the information. + use_ASCII_alternatives_on_unicode_decode_failure: (bool) If we encounter errors when decoding unicode chars we will use the ASCII alternative since that's what they are included for. + + """ + self.keep_fontdef = keep_fontdef + self.ucbc = initial_byte_count + self.use_ASCII_alternatives_on_unicode_decode_failure = use_ASCII_alternatives_on_unicode_decode_failure + + # Font table values set set_font_info + self.default_font = None + self.font_stack = [] + self.font_table = {} + + + def set_font_info(self, obj: Tree): + """ + + obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree. + """ + self.default_font = get_default_font(obj) + self.font_stack = [self.default_font] + raw_fonttbl = get_font_table(obj.children[1]) + self.font_table = parse_font_tree(raw_fonttbl) + if is_logger_on("RTFDE.text_extraction") is True: + log_text_extraction(f"FONT TABLE FOUND: {raw_fonttbl}") + + + def update_children(self, obj: Tree): + """ + + obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree. 
+ """ + # Reset font info + self.set_font_info(obj) + children = obj.children + obj.children = [i for i in self.iterate_on_children(children)] + + def prep_unicode(self, children: List[Token]): + if includes_unicode_chars(children): + # Clean out all replacement chars + # log_text_extraction("Prepping Unicode Chars:" + repr(children)) + children, ascii_map = remove_unicode_replacements(children, + byte_count=self.ucbc) + # print("===\nCHILD:" + repr(children)) + # print("===\nASCII:" + repr(ascii_map)) + # Merge all surrogate pairs + children = merge_surrogate_chars(children, + ascii_map, + self.use_ASCII_alternatives_on_unicode_decode_failure) + # print("FINAL CHILDREN") + # log_text_extraction("Replaced Unicode Chars With: " + repr(children)) + return children + + def iterate_on_children(self, children): # Children should be 'List[Union[Token,Tree]]' but lark's Tree typing is defined badly. + set_fonts = [] + if is_logger_on("RTFDE.text_extraction") is True: + log_text_extraction("Starting to iterate on text extraction children...") + log_text_extraction("PREP-BEFORE: "+repr(children)) + children = self.prep_unicode(children) + if is_logger_on("RTFDE.text_extraction") is True: + log_text_extraction("PREP-AFTER: "+repr(children)) + + for item in children: + if is_font_number(item): # Font Definitions + self.font_stack.append(item.value.strip()) + set_fonts.append(item.value) + if self.keep_fontdef is True: + yield item + elif is_unicode_char_byte_count(item): + bc = get_unicode_char_byte_count(item) + elif is_unicode_encoded(item): # Unicode Chars + decoded = unicode_escape_to_chr(item.value).encode() + # Convert into STRING token + decoded_tok = Token('STRING', + decoded, + start_pos=item.start_pos, + end_pos=item.end_pos, + line=item.line, + end_line=item.end_line, + column=item.column, + end_column=item.end_column) + if is_logger_on("RTFDE.text_extraction") is True: + log_text_extraction(f"UNICODE TOKEN {item}: {decoded_tok}") + yield decoded_tok + # Decode a hex array + elif is_hexarray(item): + # print("IS Hex?? {0}".format(item)) + base_bytes = None + for hexchild in item.children: + if base_bytes is None: + base_bytes = get_bytes_from_hex_encoded(hexchild.value) + else: + base_bytes += get_bytes_from_hex_encoded(hexchild.value) + current_fontdef = self.font_table[self.font_stack[-1]] + current_codec = current_fontdef.codec + decoded_hex = decode_hex_char(base_bytes, current_codec) + # We are replacing a Tree. So, need item.data to access it's info token + decoded_hex_tok = Token('STRING', + decoded_hex, + start_pos=item.data.start_pos, + end_pos=item.data.end_pos, + line=item.data.line, + end_line=item.data.end_line, + column=item.data.column, + end_column=item.data.end_column) + yield decoded_hex_tok + elif isinstance(item, Tree): + # Run this same function recursively on nested trees + item.children = [i for i in self.iterate_on_children(item.children)] + yield item + else: + yield item + for i in set_fonts: + # Remove all fonts defined while in this group + self.font_stack.pop() diff --git a/RTFDE/transformers.py b/RTFDE/transformers.py index 2b35e4c..04d1df0 100644 --- a/RTFDE/transformers.py +++ b/RTFDE/transformers.py @@ -13,96 +13,90 @@ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details. 
-import re -from lark.visitors import Transformer, Visitor_Recursive, v_args -from lark.tree import Tree -from lark.lexer import Token +from typing import Union, List, Tuple +from typing import TypedDict +# from Python 3.9 typing.Generator is deprecated in favour of collections.abc.Generator +from collections.abc import Generator -class RTFUnicodeDecoder(Visitor_Recursive): - """Visits each Token in provided RTF Trees and decodes any/all unicode characters which it finds. - """ - - def __init__(self): - """Create the initial \\ucN keyword stack with the default scope included.""" - # A default of 1 should be assumed if no \uc keyword has been seen in the current or outer scopes. - RTF Spec - self.cur_uc = [1] +from lark.visitors import Transformer +from lark.visitors import v_args, Discard +from lark.tree import Tree +from lark.lexer import Token +import re - def visit_topdown(self, tree:Tree) -> Tree: - """Visit each Token in the RTF Tree to decode any/all unicode characters. - - This decoder starts at the root of the tree, and ends at the leaves (top-down) so it can track the state of the '\\ucN' keyword. The \\ucN keyword identifies when the count of bytes for how Unicode characters translate into ANSI character streams differ from the current Unicode Character Byte Count. When unicode encodings (\\uNNNN) are encountered, the code has to ignore the first N bytes, where N corresponds to the last \\ucN value encountered. If that sounds overly complex to you then then we are in agreement. - - Parameters: - tree: (Tree): An RTF Tree object which needs its values decoded. - """ - self._call_userfunc(tree) - cur_uc = None - for child in tree.children: - # The bytelen values (\ucN) are scoped like character properties. That is, a \ucN keyword applies only to text following the keyword, and within the same (or deeper) nested braces. - # This is covered in more detail in the RTF spec. No matter how much detail it goes into doesn't make up for how much I hate this kludge they implemented. - if isinstance(child, Token) and child.value.startswith('\\uc'): - strip_int = re.compile('^[^0-9]+([0-9]+)$') - cur_uc = int(strip_int.search(child.value).groups()[0]) - self.cur_uc.append(cur_uc) - elif isinstance(child, Tree): - self.visit_topdown(child) - else: - strip_u = re.compile(r'\\u[-]?[0-9]+[\s]?\??') - rtfencoded = strip_u.findall(child.value) - for enc_str in rtfencoded: - char_num = int(enc_str[2:].strip("?").strip()) - if char_num < 0: - # RTF control words generally accept signed 16-bit numbers as arguments. For this reason, Unicode decimal values greater than 32767 must be expressed as negative numbers. For example, if Hexdecimal value is 1D703 then the decimal value (120579) is greater than 32767 so the negative Hex-decimal value (03B8) is used and its decimal code is 952 - # The value of \ucN which appears before a \u-NNNN (a negative decimal unicode char) will tell us how many bytes the -NNNN value occupies when converted to a Unicode character. - # For instance, \uc1 would be one byte, thus while \u-3913 would convert to 0xF0B7 (U+F0B7 PRIVATE USE CHARACTER) if it were simply converted into unicode once only one byte (\uc1) is extracted it becomes 0xB7 (U+00B7 MIDDLE DOT) which is the correct character. - char = chr(bytearray(chr(65536+char_num).encode())[-self.cur_uc[-1]]) - else: - char = chr(char_num) - child.value = child.value.replace(enc_str, char) - # On exiting the group, the previous \uc value is restored. 
When leaving an RTF group which specified a \uc value, the reader must revert to the previous value. - # If we captured a unicode bytelen in this scope we get rid of it when we exit the scope. - if cur_uc is not None: - self.cur_uc.pop() - return tree +from RTFDE.utils import log_htmlrtf_stripping, is_logger_on +import logging +log = logging.getLogger("RTFDE") class StripNonVisibleRTFGroups(Transformer): """Visits each Token in provided RTF Trees and strips out any RTF groups which are non-visible when de-encapsulated into HTML. """ @v_args(tree=True) - def group(self, tree): + def group(self, tree: Tree): """Transformer which aggressively seeks out possible non-visible RTF groups and replaces them with empty strings. - NOTE: Currently deleting all groups that don't have an htmltag. Please file an issue if you find one that should be included in de-encapsulated HTML. I will refine what gets deleted and what is converted based on identified needs for greater functionality or specific issues which need to be addressed. +NOTE: Currently deleting all groups that don't have an htmltag. Please file an issue if you find one that should be included in de-encapsulated HTML. I will refine what gets deleted and what is converted based on identified needs for greater functionality or specific issues which need to be addressed. - Parameters: - tree: (Tree): An RTF Tree object which needs its values decoded. - """ - args = tree.children - first_control = self._first_controlword(args) - non_visible_control_words = ["\\context", "\\colortbl", "\\fonttbl"] - if args == []: - return '' - # "Ignore all groups with the RTF ignore control symbol that are not used in RTF<->HTML conversions" - # See: https://docs.microsoft.com/en-us/openspecs/exchange_server_protocols/ms-oxrtfex/752835a4-ad5e-49e3-acce-6b654b828de5 - if isinstance(args[1], Token) and args[1].value == "\\*": - if not first_control.startswith("\\htmltag"): - return "" - # Currently deleting all groups that don't have an htmltag. - elif args[1].type == "CONTROLWORD": - if not first_control.startswith("\\htmltag"): - return "" - # Removing known non-visible objects - # TODO: Add more based on research you haven't done yet - elif first_control in non_visible_control_words: - return "" +Args: + tree: A .rtf group (Tree object) which needs its contents decoded. +""" + children = tree.children + if len(children) == 0: + return b"" + first_child = children[0] + + known_control_groups = ["htmltag_group"] + if isinstance(first_child, Tree): + if first_child.data in known_control_groups: + return tree + known_non_visible_control_groups = ["mhtmltag_group"] + if isinstance(first_child, Tree): + if first_child.data in known_non_visible_control_groups: + # print(f"DELETING: {first_child} : because mhtmltag") + return b"" + + # process known non-visible groups + non_visible_control_words = [b"\\context", b"\\colortbl", b"\\fonttbl"] + first_control = self.get_first_controlword(children) + # print(f"FIRST: {first_control}") + if first_control in non_visible_control_words: + return b"" + + # Process star escaped groups + # NOTE: `understood_commands` is where we can include commands we decide to actively process during deencapsulation in the future. 
+ # For example, if we added support for `destination text` we would need to add '\\bkmkstart' and '\\ud' so our processor doesn't delete those groups + understood_commands: List[str] = [] + is_star_escaped = None + if (isinstance(first_child, Tree) and + len(first_child.children) != 0 ): + first_item = first_child.children[0] + if isinstance(first_item, Token): + if first_item.type == "STAR_ESCAPE": + is_star_escaped = True + control_word = None + if is_star_escaped is True: + # print(f"STAR: {children}") + first_token = children[1] + if isinstance(first_token, Token): + if first_token.type == "CONTROLWORD": + control_word = first_token + if control_word is not None and control_word.value in understood_commands: + return tree + return b"" return tree @staticmethod - def _first_controlword(children) -> str: - """Extracts the first control word from a group to make identifying groups easier. + def get_first_controlword(children: List) -> Union[str,None]: + """Extracts the first control word from a .rtf group. + +Args: + children: A list of child objects within a .rtf group + +Returns: + The first controlword found in a group. Returns None if no control words are found. """ for i in children: try: @@ -110,111 +104,321 @@ def _first_controlword(children) -> str: return i.value except AttributeError: continue + return None class RTFCleaner(Transformer): """Visits each Token in provided RTF Trees. Converts all tokens that need converting. Deletes all tokens that shouldn't be visible. And, joins all strings that are left into one final string. """ - def __init__(self, rtf_codec=None): - """Setup the RTF codec. + def start(self, args: List) -> bytes: + """Joins the .rtf object's string representations together at highest level object `start`. - Parameters: - rtf_codec: (str): The python codec to use when decoding strings. - """ - if rtf_codec is None: - self.rtf_codec = 'CP1252' - else: - self.rtf_codec = rtf_codec +This is the final string combination. """ + return b"".join(args) - def group(self, args): - """Join the strings in all groups.""" - return "".join(args) + def STRING(self, string: Token) -> bytes: + """Convert a string object into a raw string.""" + if string.value is not None: + return string.value + return b"" - def document(self, args): - """Join the final set of strings to make the final html string.""" - return "".join(args) + def SPACE_SAVE(self, string: Token) -> bytes: + return string.value - def OPENPAREN(self, args): + def string(self, strings: List) -> bytes: + """Convert all string objects within a string group into a single string.""" + # print(strings) + return b"".join(strings) + + def group(self, grp: List) -> bytes: + """Join the strings in all group objects.""" + _new_children = [] + for i in grp: + if isinstance(i, type(Discard)): + pass + else: + _new_children.append(i) + return b"".join(_new_children) + + def document(self, args: List) -> bytes: + """Join all the strings in an .rtf object into a single string representation of the document.""" + args = [i for i in args if i is not None] + return b"".join(args) + + def OPENPAREN(self, args: Token) -> bytes: """Delete all open parens.""" - return "" + return b"" - def CLOSEPAREN(self, args): + def CLOSEPAREN(self, args: Token) -> bytes: """Delete all closed parens.""" - return "" + return b"" + + def mhtmltag_group(self, args: List): + """Process MHTMLTAG groups + + Currently discarding because they don't need to be processed.
- def CONTROLSYMBOL(self, args): - """Convert encoded chars which are mis-categorized as control symbols into their respective chars. Delete all the other ones. +Returns: + Always returns a discard object.""" + return Discard + + def htmltag_group(self, strings: List) -> bytes: + """HTMLTAG processing. + +Takes any string values within an HTMLTAG and returns them. """ + return b"".join(strings) + + def HTMLTAG(self, htmltag: Token) -> bytes: + """Delete all HTMLTAG objects""" + return b"" + + def STAR_ESCAPE(self, char: Token) -> bytes: + """Delete all star escape objects""" + # '\\*': '' + return b"" + + def control_symbol(self, symbols: List) -> bytes: + """Join all visible symbols from control symbol groups.""" + return b"".join(symbols) + + def NONBREAKING_SPACE(self, args: Token) -> bytes: + """Convert non-breaking spaces into visible representation.""" + # '\\~': '\u00A0', + return u'\u00A0'.encode() + + def NONBREAKING_HYPHEN(self, args: Token) -> bytes: + """Convert non-breaking hyphens into visible representation.""" + # '\\_': '\u00AD' + return u'\u00AD'.encode() + + def OPTIONAL_HYPHEN(self, args: Token) -> bytes: + """Convert hyphen control char into visible representation.""" + # '\\-': '\u2027' + return u'\u2027'.encode() + + def FORMULA_CHARACTER(self, args: Token) -> bytes: + """Convert a formula character into an empty string. + +If we are attempting to represent formula characters the scope for this library has grown too inclusive. This was only used by Word 5.1 for the Macintosh as the beginning delimiter for a string of formula typesetting commands.""" + return b"" + + def INDEX_SUBENTRY(self, args: Token) -> bytes: + """Process index subentry items + +Discard index sub-entries. Because, we don't care about indexes when de-encapsulating at this time.""" + return b"" + + def CONTROLSYMBOL(self, args: Token) -> bytes: + """Convert encoded chars which are mis-categorized as control symbols into their respective chars. Delete all the other ones.""" symbols = { - '\\{': '\x7B', - '\\}': '\x7D', - '\\\\': '\x5C', - '\\~': '\u00A0', - '\\_': '\u00AD' + b'\\{': b'\x7B', + b'\\}': b'\x7D', + b'\\\\': b'\x5C', } replacement = symbols.get(args.value, None) # If this is simply a character to replace then return the value if replacement is not None: return replacement - else: - return "" + return b"" - def CONTROLWORD(self, args): + def CONTROLWORD(self, args: Token) -> bytes: """Convert encoded chars which are mis-categorized as control words into their respective chars. Delete all the other ones. """ + # NOTE: bytes literals do not support \\u escapes, so the unicode replacements are encoded explicitly. words = { - '\\par': '\n', - '\\tab': '\t', - '\\line': '\n', - '\\lquote': '\u2018', - '\\rquote': '\u2019', - '\\ldblquote': '\u201C', - '\\rdblquote': '\u201D', - '\\bullet': '\u2022', - '\\endash': '\u2013', - '\\emdash': '\u2014' + b'\\par': b'\n', + b'\\tab': b'\t', + b'\\line': b'\n', + b'\\lquote': '\u2018'.encode(), + b'\\rquote': '\u2019'.encode(), + b'\\ldblquote': '\u201C'.encode(), + b'\\rdblquote': '\u201D'.encode(), + b'\\bullet': '\u2022'.encode(), + b'\\endash': '\u2013'.encode(), + b'\\emdash': '\u2014'.encode() } replacement = words.get(args.value, None) # If this is simply a character to replace then return the value as a string if replacement is not None: return replacement - return "" + return b"" - def RTFESCAPE(self, args): - """Decode hex encoded chars using the codec provided. Insert unicode chars directly since we already decoded those earlier.
- """ - if args.value.startswith("\\'"): - hexstring = args.value.replace("\\'", "") - hex_bytes = bytes.fromhex(hexstring) - decoded = hex_bytes.decode(self.rtf_codec) +def get_stripped_HTMLRTF_values(tree: Tree, current_state: Union[bool,None] = None) -> Generator: + """Get a list of Tokens which should be suppressed by HTMLRTF control words. - return decoded - # \\u RTFEscapes need to have the \ucN value identified in order to do byte manipulation before being converted. So, we converted those previously in RTFUnicodeDecoder. + + NOTE: This de-encapsulation supports the HTMLRTF control word within nested groups. The state of the HTMLRTF control word transfers when entering groups and is restored when exiting groups, as specified in [MSFT-RTF]. + +Returns: + A list of Tokens which should be suppressed by HTMLRTF control words. + """ + if current_state is None: + htmlrtf_stack = [False] + else: + htmlrtf_stack = [current_state] + for child in tree.children: + is_htmlrtf = None + if isinstance(child, Tree): + # A de-encapsulating RTF reader MUST support the HTMLRTF control word within nested groups. The state of the HTMLRTF control word MUST transfer when entering groups and be restored when exiting groups, as specified in [MSFT-RTF]. + for toggle in get_stripped_HTMLRTF_values(child, htmlrtf_stack[-1]): + yield toggle else: - # We should have handled all escaped chars by here - # We can simply insert the values from \u RTF Escapes now - return args.value - - def TEXT(self, args): - """Converts escaped values in text and then return the text as a raw string.""" - escapes = { - '\\par': '\n', - '\\tab': '\t', - '\\line': '\n', - '\\lquote': '\u2018;', - '\\rquote': '\u2019;', - '\\ldblquote': '\u201C;', - '\\rdblquote': '\u201D;', - '\\bullet': '\u2022;', - '\\endash': '\u2013;', - '\\emdash': '\u2014;', - '\\{': '\x7B', - '\\}': '\x7D', - '\\\\': '\x5C', - '\\~': '\u00A0', - '\\_': '\u00AD' - } - text = args.value - for match,rep in escapes.items(): - text = text.replace(match, rep) - return text + is_htmlrtf = toggle_htmlrtf(child) + if is_htmlrtf is not None: + htmlrtf_stack.append(is_htmlrtf) + yield child + elif htmlrtf_stack[-1] is True: + yield child + +def toggle_htmlrtf(child: Union[Token,str]) -> Union[bool,None]: + """Identify if htmlrtf is being turned on or off. + +Returns: + Bool representing if htmlrtf is being enabled or disabled. None if object is not an HTMLRTF token. +""" + if isinstance(child, Token): + if child.type == "HTMLRTF": + htmlrtfstr = child.value.decode().strip() + if (len(htmlrtfstr) > 0 and htmlrtfstr[-1] == "0"): + return False + return True + return None + +class DeleteTokensFromTree(Transformer): + """Removes a series of tokens from a Tree. + +Parameters: + tokens_to_delete: A list of tokens to delete from the Tree object. (sets self.to_delete) + +Attributes: + to_delete: A list of tokens to delete from the Tree object. + delete_start_pos: The starting position for all the identified tokens. Used to identify which tokens to delete. +""" + + def __init__(self, tokens_to_delete: List[Token]): + """Setup attributes including token start_pos tracking. + +Args: + tokens_to_delete: A list of tokens to delete from the Tree object. (sets self.to_delete) +""" + super().__init__() + self.to_delete = tokens_to_delete + self.delete_start_pos = {i.start_pos for i in self.to_delete} + + def __default_token__(self, token: Token): + """Discard any identified tokens. + +Args: + token: All tokens within the transformed tree. + +Returns: + Returns all non-identified tokens. 
+    Returns Discard objects for any identified tokens.
+"""
+        # print("Evaluating token {0} at {1} to consider deleting".format(child.value, child.end_pos))
+        if isinstance(token, Token):
+            if token.start_pos in self.delete_start_pos:
+                for i in self.to_delete:
+                    if (i.start_pos == token.start_pos and
+                        i.end_pos == token.end_pos and
+                        i.value == token.value):
+                        if is_logger_on("RTFDE.HTMLRTF_Stripping_logger") is True:
+                            log_htmlrtf_stripping(i)
+                        # print(f"DELETING: {i}")
+                        return Discard
+        return token
+
+class StripUnusedSpecialCharacters(Transformer):
+    """Strip all unused tokens which lark has extracted from the RTF.
+
+These tokens are largely artifacts of the RTF format.
+
+We have to do this because we use the "keep_all_tokens" option in our lark parser. It's better to be explicit than to allow for ambiguity because of the grammar.
+    """
+
+    def _LBRACE(self, token: Token):
+        """Remove RTF braces.
+
+Returns:
+    Always returns a discard object."""
+        return Discard
+
+    def _RBRACE(self, token: Token):
+        """Remove RTF braces.
+
+Returns:
+    Always returns a discard object."""
+        return Discard
+
+    def _SPACE_DELETE(self, token: Token):
+        """Remove spaces which are not a part of the content.
+
+These are mostly spaces used to separate control words from the content they precede.
+
+Returns:
+    Always returns a discard object.
+        """
+        return Discard
+
+
+class StripControlWords(Transformer):
+    """Visits each control word and strips the whitespace from around it.
+    """
+
+    def CONTROLWORD(self, token: Token):
+        """Strips the whitespace from around a provided control word.
+
+Args:
+    token: A CONTROLWORD token to strip whitespace from.
+        """
+        tok = token.update(value=token.value.strip())
+        return tok
+
+
+def strip_binary_objects(raw_rtf: bytes) -> tuple:
+    """Extracts binary objects from an rtf file.
+
+Parameters:
+    raw_rtf: (bytes): It's the raw RTF file as bytes.
+
+Returns:
+    A tuple containing (new_raw, found_bytes)
+        new_raw: (bytes) A bytes object where any binary data has been removed.
+        found_bytes: (list) List of dictionaries containing binary data extracted from the rtf file. Each dictionary includes the data extracted, where it was extracted from in the original rtf file and where it can be inserted back into the stripped output.
+
+        Description of found_bytes dictionaries:
+
+            "bytes": (bytes) The binary data contained which was extracted.
+            "ctrl_char": (tuple) Tuple containing the binary control word and its numeric parameter
+            "start_pos": (int) The position (in the original raw rtf data) where the binary control word started.
+            "bin_start_pos": (int) The position (in the original raw rtf data) where the binary data starts.
+            "end_pos": (int) The position (in the original raw rtf data) where the binary data ends.
+
+        Here is an example of what this looks like (by displaying the printable representation so you can see the bytes and then splitting the dict keys on new lines to make it readable.)
+            >> print(repr(found_bytes))
+
+            "{'bytes': b'\\xf4UP\\x13\\xdb\\xe4\\xe6CO\\xa8\\x16\\x10\\x8b\\n\\xfbA\\x9d\\xc5\\xd1C',
+            'ctrl_char': (b'\\\\bin', b'20'),
+            'start_pos': 56,
+            'end_pos': 83,
+            'bin_start_pos': 63}"
+    """
+    found_bytes = []
+    byte_finder = rb'(\\bin)([0-9]+)[ ]?'
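+    # A match like b"\\bin20 " yields groups (b"\\bin", b"20"); the optional
+    # trailing space is part of the match, so span()[-1] is the offset where
+    # the N raw binary bytes begin.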
+    for matchitem in re.finditer(byte_finder, raw_rtf):
+        param = int(matchitem[2])
+        bin_start_pos = matchitem.span()[-1]
+        byte_obj = {"bytes": raw_rtf[bin_start_pos:bin_start_pos+param],
+                    "ctrl_char": matchitem.groups(),
+                    "start_pos": matchitem.span()[0],
+                    "end_pos": bin_start_pos+param,
+                    "bin_start_pos": bin_start_pos
+                    }
+        # byte_obj : dict[str, Union[bytes, int, Tuple[bytes, bytes]]]
+        found_bytes.append(byte_obj)
+    new_raw = b''
+    start_buffer = 0
+    for new_bytes in found_bytes:
+        new_raw += raw_rtf[start_buffer:new_bytes["start_pos"]]
+        start_buffer = new_bytes["end_pos"]
+    new_raw += raw_rtf[start_buffer:]
+    return (new_raw, found_bytes)
diff --git a/RTFDE/utils.py b/RTFDE/utils.py
new file mode 100644
index 0000000..f10b0a1
--- /dev/null
+++ b/RTFDE/utils.py
@@ -0,0 +1,305 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# This file is part of RTFDE, a RTF De-Encapsulator.
+# Copyright © 2022 seamus tuohy,
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.
+
+import difflib
+import sys
+import re
+from typing import Union, AnyStr, Any
+# from Python 3.9 typing.Generator is deprecated in favour of collections.abc.Generator
+from collections.abc import Generator
+from lark.lexer import Token
+from lark.tree import Tree
+from lark import Lark
+
+
+import logging
+log = logging.getLogger("RTFDE")
+
+def get_control_parameter_as_hex_strings(control_parameter: Union[str,int]) -> str:
+    """Returns the hex encoded value of a .rtf control parameter.
+
+Args:
+    control_parameter: (int/str) Int or a string which represents an int.
+
+Returns:
+    A zero-padded, six-character hexadecimal string.
+"""
+    try:
+        return f"{control_parameter:#06x}"
+    except ValueError:
+        # If passed as string convert first
+        control_parameter = int(control_parameter)
+        return f"{control_parameter:#06x}"
+
+def print_to_tmp_file(data: Union[AnyStr,bytes,bytearray], path: str):
+    """Prints binary object to a dump file for quick debugging.
+
+Warning: Not for normal use. Only use when debugging.
+
+Args:
+    data (bytes|str): Data to write to path
+    path (str): The file path to write data to
+    """
+    # Be able to print binary objects easily
+    if isinstance(data, (bytes, bytearray)) is True:
+        open_as = 'wb+'
+    else:
+        open_as = 'w+'
+    with open(path, open_as) as fp:
+        original_stdout = sys.stdout
+        sys.stdout = fp
+        print(data)
+        sys.stdout = original_stdout
+
+def encode_escaped_control_chars(raw_text: bytes) -> bytes:
+    """Replaces escaped control chars within the text with their RTF encoded versions \\'HH.
+
+Args:
+    raw_text (bytes): Bytes which need escape characters encoded
+
+Returns:
+    Bytes with the escaped control chars encoded as \\'HH values.
+    """
+    cleaned = raw_text.replace(b'\\\\', b"\\'5c")
+    cleaned = cleaned.replace(b'\\{', b"\\'7b")
+    cleaned = cleaned.replace(b'\\}', b"\\'7d")
+    return cleaned
+
+def is_codeword_with_numeric_arg(token: Union[Token,Any], codeword: bytes) -> bool:
+    """Checks if a Token is a codeword with a numeric argument.
+
+Returns:
+    True if a Token is a codeword with a numeric argument. False if not.
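+
+Example (an illustrative sketch):
+    is_codeword_with_numeric_arg(Token('CONTROLWORD', b'\\bin1024'), b'\\bin')  # True
+    is_codeword_with_numeric_arg(Token('CONTROLWORD', b'\\par'), b'\\bin')  # False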
+""" + try: + val = token.value.strip() + # print(val, codeword) + if (val.startswith(codeword) and + val[len(codeword):].isdigit()): + return True + except AttributeError: + return False + return False + +def print_lark_parser_evaluated_grammar(parser): + """Prints the final evaluated grammar. + +Can be useful for debugging possible errors in grammar evaluation. + +Args: + parser (Lark obj): Lark object to extract grammar from. + """ + if not isinstance(parser, Lark): + raise ValueError("Requires a Lark object.") + eq = "="*15 + eq = " " + eq + " " + print(eq + "RULES" + eq + "\n") + for i in parser.rules: + print(" " + i) + print(eq + "TERMINALS" + eq + "\n") + for i in parser.terminals: + print(" " + i) + print(eq + "IGNORED TOKENS" + eq + "\n") + for i in parser.ignore_tokens: + print(" " + i) + +def log_validators(data): + """Log validator logging only if RTFDE.validation_logger set to debug. + """ + logger = logging.getLogger("RTFDE.validation_logger") + if logger.level == logging.DEBUG: + logger.debug(data) + +def log_transformations(data): + """Log transform logging only if RTFDE.transform_logger set to debug. + """ + logger = logging.getLogger("RTFDE.transform_logger") + if logger.level == logging.DEBUG: + logger.debug(data) + +def is_logger_on(logger_name, level=logging.DEBUG): + """Check if a logger is enabled and on debug. + """ + logger = logging.getLogger(logger_name) + if logger.level == level: + return True + return False + +def log_text_extraction(data): + """Log additional text decoding/encoding logging only if RTFDE.text_extraction set to debug. + """ + logger = logging.getLogger("RTFDE.text_extraction") + if logger.level == logging.DEBUG: + logger.debug(data) + +def log_htmlrtf_stripping(data: Token): + """Log HTMLRTF Stripping logging only if RTFDE.HTMLRTF_Stripping_logger set to debug. + +Raises: + AttributeError: Will occur if you pass this something that is not a token. +""" + logger = logging.getLogger("RTFDE.HTMLRTF_Stripping_logger") + if logger.level == logging.DEBUG: + if not isinstance(data, Token): + raise AttributeError("HTMLRTF Stripping logger only logs Tokens") + tok_desc = "HTMLRTF Removed: {value}, {line}, {end_line}, {start_pos}, {end_pos}" + log_msg = tok_desc.format(value=data.value, + line=data.line, + end_line=data.end_line, + start_pos=data.start_pos, + end_pos = data.end_pos) + logger.debug(log_msg) + +def log_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None): + """Log diff of two strings. Defaults to splitting by newlines and keeping the ends. + +Logs the result in the main RTFDE logger as a debug log. Warning: Only use when debugging as this is too verbose to be used in regular logging. + +Args: + original: The original string + revised: The changed version of the string + sep (string): A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise. +""" + log.debug(get_string_diff(original, revised, sep)) + +def get_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None): + """Get the diff of two strings. Defaults to splitting by newlines and keeping the ends. + +Args: + original: The original string + revised: The changed version of the string + sep (string): A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise. + +Returns: + A string object representing the diff of the two strings provided. 
+""" + if sep is None: + orig_split = original.decode().splitlines(keepends=True) + revised_split = revised.decode().splitlines(keepends=True) + else: + original = original.replace(b'\n',b'') + revised = revised.replace(b'\n',b'') + orig_split = [i.decode() for i in re.split(sep, original) if i != b''] + revised_split = [i.decode() for i in re.split(sep, revised) if i != b''] + return "\n".join(list(difflib.context_diff(orig_split, + revised_split))) + +def get_tree_diff(original: Tree, revised: Tree): + """Get the diff of two trees. + +Args: + original (lark Tree): A lark tree before transformation + revised (lark Tree): A lark tree after transformation + +Returns: + A string object representing the diff of the two Trees provided. + +Example: + rtf_obj = DeEncapsulator(raw_rtf) + rtf_obj.deencapsulate() + transformed_tree = SomeTransformer.transform(rtf_obj.full_tree) + get_tree_diff(rtf_obj.full_tree, transformed_tree) + + """ + log = logging.getLogger("RTFDE") + flat_original = list(flatten_tree(original)) + flat_revised = list(flatten_tree(revised)) + return "\n".join(list(difflib.context_diff(flat_original, + flat_revised))) +def flatten_tree(tree: Tree) -> Generator: + """Flatten a lark Tree into a list of repr's of tree objects. + +Args: + tree (lark Tree): A lark tree +""" + yield f"Tree('{tree.data}')" + for child in tree.children: + if isinstance(child, Token): + yield repr(child) + elif isinstance(child, Tree): + for i in flatten_tree(child): + yield i + else: + yield repr(child) + +def flatten_tree_to_string_array(tree: Tree) -> Generator: + """Flatten a lark Tree into a list of repr's of tree objects. + +Args: + tree (lark Tree): A lark tree +""" + for child in tree.children: + if isinstance(child, Tree): + for i in flatten_tree_to_string_array(child): + yield i + elif isinstance(child, Token): + yield child.value + else: + yield child + +def make_token_replacement(ttype, value, example): + if isinstance(example, Token): + fake_tok = Token(ttype, + value, + start_pos=example.start_pos, + end_pos=example.end_pos, + line=example.line, + end_line=example.end_line, + column=example.column, + end_column=example.end_column) + elif isinstance(example, Tree): + fake_tok = Token(ttype, + value, + start_pos=example.meta.start_pos, + end_pos=example.meta.end_pos, + line=example.meta.line, + end_line=example.meta.end_line, + column=example.meta.column, + end_column=example.meta.end_column) + + return fake_tok + + +def embed(): + import os + import readline + import rlcompleter + import code + import inspect + import traceback + + history = os.path.join(os.path.expanduser('~'), '.python_history') + if os.path.isfile(history): + readline.read_history_file(history) + + frame = inspect.currentframe().f_back + namespace = frame.f_locals.copy() + namespace.update(frame.f_globals) + + readline.set_completer(rlcompleter.Completer(namespace).complete) + readline.parse_and_bind("tab: complete") + + file = frame.f_code.co_filename + line = frame.f_lineno + function = frame.f_code.co_name + + stack = ''.join(traceback.format_stack()[:-1]) + print(stack) + banner = f" [ {os.path.basename(file)}:{line} in {function}() ]" + banner += "\n Entering interactive mode (Ctrl-D to exit) ..." 
+    try:
+        code.interact(banner=banner, local=namespace)
+    finally:
+        readline.write_history_file(history)
diff --git a/docs/RTFDE/deencapsulate.html b/docs/RTFDE/deencapsulate.html
new file mode 100644
index 0000000..82077c1
--- /dev/null
+++ b/docs/RTFDE/deencapsulate.html
@@ -0,0 +1,1366 @@
+
+
+
+
+
+RTFDE.deencapsulate API documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+

Module RTFDE.deencapsulate

+
+
+
#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# This file is part of RTFDE, a RTF De-Encapsulator.
+# Copyright © 2020 seamus tuohy, <code@seamustuohy.com>
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.
+
+from typing import Union, AnyStr, Tuple, Dict, Any
+from io import BufferedReader
+
+from lark import Lark
+from lark.tree import Tree
+from lark.lexer import Token
+
+from RTFDE.transformers import RTFCleaner, StripControlWords
+from RTFDE.transformers import StripNonVisibleRTFGroups
+from RTFDE.transformers import StripUnusedSpecialCharacters
+from RTFDE.utils import encode_escaped_control_chars
+from RTFDE.utils import log_validators, log_transformations
+from RTFDE.transformers import get_stripped_HTMLRTF_values, DeleteTokensFromTree, strip_binary_objects
+from RTFDE.grammar import make_concise_grammar
+from RTFDE.text_extraction import TextDecoder
+from RTFDE.text_extraction import validate_ansi_cpg
+
+# For catching exceptions
+from RTFDE.exceptions import NotEncapsulatedRtf, MalformedEncapsulatedRtf, MalformedRtf
+
+import logging
+log = logging.getLogger("RTFDE")
+
+class DeEncapsulator():
+    """De-Encapsulating RTF converter of HTML/TEXT found in .msg files.
+
+De-encapsulation enables previously encapsulated HTML and plain text content to be extracted and rendered as HTML and plain text instead of the encapsulating RTF content. After de-encapsulation, the HTML and plain text should differ only minimally from the original HTML or plain text content.
+
+
+Parameters:
+    raw_rtf: (bytes): It's the raw RTF file as bytes.
+    grammar: (str): OPTIONAL - Lark parsing grammar which defines the RTF language. https://github.com/lark-parser/lark If you think my grammar is shoddy this is your chance to test out a better one and make a pull request.
+
+Attributes:
+    content: (bytes) The deencapsulated content no matter what format it is in. Populated by the `deencapsulate` function.
+    html: (bytes) The deencapsulated content IF it is HTML content. Populated by the `set_content` function.
+    text: (bytes) The deencapsulated content IF it is plain text content. Populated by the `set_content` function.
+    found_binary: List of dictionaries containing binary data extracted from the rtf file.
+    content_type: The type of content encapsulated in .rtf data (html or text). Populated by the `get_content_type` function.
+    full_tree: The full .rtf object parsed into an object Tree using the grammar. Populated by the `parse_rtf` function.
+    doc_tree: The `document` portion of the .rtf full_tree object.
+    raw_rtf: The raw encapsulated .rtf data in byte format.
+    grammar: The Lark parsing grammar used to parse the .rtf data.
+    content_type_token: The .rtf header token identifying the content type. (\\fromhtml1 OR \\fromtext)
+    parser: The lark parser. Should not need to be manipulated directly. But, useful for debugging and saving the parsed object.
+    """
+
+    def __init__(self, raw_rtf:bytes, grammar: Union[str,None] = None):
+        """Load in the Encapsulated test and setup the grammar used to parse the encapsulated RTF.
+
+NOTE: This does not do the parsing in the init so that you can instantiate the object and do the parsing step by step.
+
+Parameters:
+        raw_rtf: (bytes): The raw RTF file as bytes.
+        grammar: (str): OPTIONAL - Lark parsing grammar which defines the RTF language. https://github.com/lark-parser/lark If you think my grammar is shoddy this is your chance to test out a better one and make a pull request.
+
+Raises:
+        TypeError: The raw_rtf data passed is not the correct type of data (it must be a byte string).
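+
+Example (an illustrative sketch; the file path is hypothetical):
+        with open("encapsulated_sample.rtf", "rb") as fp:
+            rtf_obj = DeEncapsulator(fp.read())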
+"""
+        self.content: str
+        self.content_type: str
+        self.content_type_token: str
+        self.parser: Any
+
+        self.html: str
+        self.text: str
+        self.found_binary: list
+        self.full_tree: Tree
+        self.doc_tree: Tree
+        self.catch_common_validation_issues(raw_rtf)
+        if isinstance(raw_rtf, bytes):
+            raw_rtf_bytes = raw_rtf
+        else:
+            raise TypeError("DeEncapssulator only accepts RTF files in string or byte-string formats")
+        raw_rtf_bytes = raw_rtf_bytes.rstrip(b'\x00')
+        raw_rtf_bytes = raw_rtf_bytes.replace(b'\r\n',b'\n')
+        raw_rtf_bytes = raw_rtf_bytes.replace(b'\r',b'\n')
+        self.raw_rtf: bytes = raw_rtf_bytes
+        if grammar is not None:
+            self.grammar: str = grammar
+        else:
+            self.grammar = make_concise_grammar()
+
+    def deencapsulate(self):
+        """De-encapsulate the RTF content loaded into the De-Encapsulator.
+
+Once you have loaded in the raw rtf this function will set the properties containing the encapsulated content. The `content` property will store the content no matter what format it is in. The `html` and `text` properties will be populated based on the type of content that is extracted. (self.html will be populated if it is html and self.text if it is plain text.)
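+
+Example (an illustrative sketch; assumes raw_rtf holds encapsulated RTF bytes):
+        rtf_obj = DeEncapsulator(raw_rtf)
+        rtf_obj.deencapsulate()
+        if rtf_obj.content_type == 'html':
+            html = rtf_obj.html
+        else:
+            text = rtf_obj.text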
+        """
+        stripped_data = strip_binary_objects(self.raw_rtf)
+        non_binary_rtf = stripped_data[0]
+        found_binary = stripped_data[1]
+        if len(found_binary) > 0:
+            self.found_binary = found_binary
+            log.info("Binary data found and extracted from rtf file.")
+        escaped_rtf = encode_escaped_control_chars(non_binary_rtf)
+        log_transformations(escaped_rtf)
+        self.parse_rtf(escaped_rtf)
+        Decoder = TextDecoder()
+        Decoder.update_children(self.full_tree)
+        self.get_doc_tree()
+        self.validate_encapsulation()
+
+        # remove htmlrtf escaped values
+        htmlrtf_stripped = self.strip_htmlrtf_tokens()
+        # Strips whitespace from control words
+        control_stripped = StripControlWords().transform(htmlrtf_stripped)
+        # Strip unused control chars
+        special_stripper = StripUnusedSpecialCharacters()
+        non_special_tree = special_stripper.transform(control_stripped)
+        # Strip out non-visible RTF groups
+        stripper = StripNonVisibleRTFGroups()
+        stripped_tree = stripper.transform(non_special_tree)
+        # Converts any remaining tokens
+        cleaner = RTFCleaner(visit_tokens=True)
+        cleaned_text = cleaner.transform(stripped_tree)
+
+        self.content = cleaned_text
+        self.set_content() # Populates self.html || self.text
+
+    def validate_charset(self, fallback_to_default:bool =False) -> bytes:
+        """Validate and return the RTF charset keyword from the RTF streams header.
+
+Args:
+        fallback_to_default (bool): Allows you to force the use of the default charset "\\ansi" if one is not found.
+
+Raises:
+        MalformedRtf: RTF stream does not include charset control word.
+
+Returns:
+        The RTF charset keyword from the RTF stream's header.
+"""
+        main_headers = self.get_header_control_words_before_first_group()
+
+        for token in main_headers:
+            if token.value in [b'\\ansi', b'\\mac', b'\\pc', b'\\pca']:
+                return token
+
+        log.debug("Acceptable charset not found as the second token in the RTF stream. The control word for the character set must precede any plain text or any table control words. So, if this stream doesn't have one it is malformed or corrupted.")
+        if fallback_to_default is False:
+            raise MalformedRtf("RTF stream does not include charset control word.")
+
+        log.warning("The fallback_to_default option on _get_charset is considered DANGEROUS if used on possibly malicious samples. Make sure you know what you are doing before using it.")
+        log.info("Attempting to decode RTF using the default charset ansi. This is not recommended and could have unforeseen consequences for the resulting file and your systems security.")
+        log.debug("You have a malformed RTF stream. Are you sure you really want to be parsing it? It might not just be corrupted. It could be maliciously constructed.")
+        return b"\\ansi"
+
+    def set_content(self):
+        """Populate the html or text content based on the content type. Populates self.html and/or self.text variables."""
+        self.content_type = self.get_content_type()
+        if self.content_type == 'html':
+            self.html = self.content
+        else:
+            self.text = self.content
+
+    def get_doc_tree(self):
+        """Extract the document portion of the .rtf full_tree object. Populates the classes doc_tree attribute.
+
+Raises:
+        ValueError: The .rtf document object is missing or mis-located in the .rtf's full_tree object.
+"""
+        if self.full_tree.children[1].data == "document":
+            self.doc_tree = self.full_tree.children[1]
+        else:
+            raise ValueError("Document object in the wrong place after parsing.")
+
+    def get_content_type(self):
+        """Provide the type of content encapsulated in RTF.
+
+NOTE: This function will only work after the header validation has completed. Header validation also extracts the content type of the encapsulated data.
+
+Raises:
+        NotEncapsulatedRtf: The .rtf object is missing an encapsulated content type header. Which means that it is likely just a regular .rtf file.
+"""
+        if self.content_type_token is None:
+            self.validate_FROM_in_doc_header()
+        elif self.content_type_token == b'\\fromhtml1':
+            return 'html'
+        elif self.content_type_token == b'\\fromtext':
+            return "text"
+
+        raise NotEncapsulatedRtf("Data is missing encapsulated content type header (the FROM header).")
+
+    def validate_encapsulation(self):
+        """Runs simple tests to validate that the file in question is an rtf document which contains encapsulation."""
+        self.validate_rtf_doc_header(self.doc_tree)
+        self.validate_charset()
+        self.validate_FROM_in_doc_header()
+        ansicpg = self.get_ansicpg_header()
+        if ansicpg is not None: # ansicpg is not mandatory
+            validate_ansi_cpg(ansicpg.value)
+
+    def get_ansicpg_header(self) -> Union[Token,None]:
+        """Extract the ansicpg control word from the .rtf header.
+
+Returns:
+        A lark CONTROLWORD Token with the `\\ansicpg` value. Returns None if the `\\ansicpg` control word is not included as this is only required if there is Unicode which needs to be converted to ANSI within a .rtf file.
+"""
+        headers = self.get_header_control_words_before_first_group()
+        for item in headers:
+            if item.value.startswith(b'\\ansicpg'):
+                return item
+        return None
+
+    def parse_rtf(self, rtf: bytes):
+        """Parse RTF file's header and document and extract the objects within the RTF into a Tree. Populates the self.full_tree attribute.
+
+Args:
+        rtf: The .rtf bytes to parse with the project's lark grammar.
+"""
+        # Uncomment Lark debug argument if you want to enable logging.
+        # Note, this does not enable ALL lark debug logging.
+        # To do that we would have to stop using the Lark convenience class which we are using here.
+        self.parser = Lark(self.grammar,
+                           parser='lalr',
+                           keep_all_tokens=True,
+                           use_bytes=True,
+                           # debug=True,
+                           propagate_positions=True)
+        self.full_tree = self.parser.parse(rtf)
+        log_transformations(self.full_tree)
+
+
+    def strip_htmlrtf_tokens(self) -> Tree:
+        """Strip tokens from with htmlrtf regions of the doc_tree as they were not part of the original HTML content.
+
+Returns:
+        .rtf doc_tree stripped of all non-original tokens.
+"""
+        # remove htmlrtf escaped values
+        delete_generator = get_stripped_HTMLRTF_values(self.doc_tree)
+        tokens_to_delete = list(delete_generator)
+        deleter = DeleteTokensFromTree(tokens_to_delete)
+        htmlrtf_cleaned_tree = deleter.transform(self.doc_tree)
+        return htmlrtf_cleaned_tree
+
+
+    def get_header_control_words_before_first_group(self) -> list:
+        """Extracts all the control words in the first 20 tokens of the document or all the tokens which occur before the first group (whichever comes first.)
+
+This is used to extract initial header values for validation functions.
+
+Returns:
+        A list containing the header tokens in the .rtf data.
+        """
+        initial_control_words = []
+        for token in self.doc_tree.children[:20]:
+            if isinstance(token, Token):
+                initial_control_words.append(token)
+            else:
+                return initial_control_words
+        return initial_control_words
+
+
+    def validate_FROM_in_doc_header(self):
+        """Inspect the header to identify what type of content (html/plain text) is encapsulated within the document.
+
+NOTE: The de-encapsulating RTF reader inspects no more than the first 10 RTF tokens (that is, begin group marks and control words) in the input RTF document, in sequence, starting from the beginning of the RTF document. If one of the control words is the FROMHTML control word, the de-encapsulating RTF reader will conclude that the RTF document contains an encapsulated HTML document and stop further inspection. If one of the control words is the FROMTEXT control word, the de-encapsulating RTF reader concludes that the RTF document was produced from a plain text document and stops further inspection. - MS-OXRTFEX
+
+Raises:
+        MalformedEncapsulatedRtf: The .rtf headers are malformed.
+        NotEncapsulatedRtf: The .rtf object is missing an encapsulated content type header. Which means that it is likely just a regular .rtf file.
+        """
+        cw_found = {"rtf1":False,
+                    "from":False,
+                    "fonttbl":False,
+                    "malformed":False}
+        # The de-encapsulating RTF reader SHOULD inspect no more than the first 10 RTF tokens (that is, begin group marks and control words) in the input RTF document, in sequence, starting from the beginning of the RTF document. This means more than just control words.
+        decoded_tree = StripControlWords().transform(self.doc_tree)
+        first_ten_tokens = decoded_tree.children[:10]
+        operating_tokens = []
+        found_token = None
+        for token in first_ten_tokens:
+            if isinstance(token, Token):
+                operating_tokens.append(token)
+            else:
+                operating_tokens += list(token.scan_values(lambda t: t.type == 'CONTROLWORD'))
+        log_validators(f"Header tokens being evaluated: {operating_tokens}")
+
+        for token in operating_tokens:
+            cw_found,found_token = self.check_from_token(token=token, cw_found=cw_found)
+            if cw_found['from'] is True and cw_found["malformed"] is True:
+                raise MalformedEncapsulatedRtf("RTF file looks like is was supposed to be encapsulated HTML/TEXT but the headers are malformed. Turn on debugging to see specific information")
+            # Save content type token available for id-ing type of content later
+            if found_token is not None:
+                self.content_type_token = found_token
+
+        if cw_found['from'] is False:
+            log.debug("FROMHTML/TEXT control word not found in first 10 RTF tokens. This is not an HTML/TEXT encapsulated RTF document.")
+            raise NotEncapsulatedRtf("FROMHTML/TEXT control word not found.")
+
+    @staticmethod
+    def check_from_token(token:Token, cw_found:dict) -> Tuple[Dict,Union[None,str]] :
+        """Checks if fromhtml1 or fromtext tokens are in the proper place in the header based on the state passed to it by the validate_FROM_in_doc_header function.
+
+Args:
+        token: The token to check for in the cw_found state dictionary.
+        cw_found: The state dictionary which is used to track the position of the from token within the header.
+
+        `cw_found = {"rtf1":<BOOL>, "from":<BOOL>, "fonttbl":<BOOL>, "malformed":<BOOL>}`
+
+
+Returns:
+        cw_found: Updated state dictionary
+        found_token: The content_type_token found in the header.
+
+        """
+        from_cws = [b'\\fromhtml1', b'\\fromtext']
+        # This control word MUST appear before the \fonttbl control word and after the \rtf1 control word, as specified in [MSFT-RTF].
+        rtf1_cw = b"\\rtf1"
+        found_token = None
+        fonttbl_cw = b"\\fonttbl"
+        if token.type == "CONTROLWORD":
+            if token.value.strip() in from_cws:
+                if cw_found['from'] is True:
+                    cw_found["malformed"] = True
+                    log.debug("Multiple FROM HTML/TXT tokens found in the header. This encapsulated RTF is malformed.")
+                if cw_found['rtf1'] is True:
+                    cw_found['from'] = True
+                    found_token = token.value
+                else:
+                    log.debug("FROMHTML/TEXT control word found before rtf1 control word. That's not allowed in the RTF spec.")
+                    cw_found['from'] = True
+                    cw_found["malformed"] = True
+            elif token.value.strip() == rtf1_cw:
+                cw_found['rtf1'] = True
+            elif token.value.strip() == fonttbl_cw:
+                cw_found['fonttbl'] = True
+                if cw_found['from'] is not True:
+                    log.debug("\\fonttbl code word found before FROMTML/TEXT was defined. This is not allowed for encapsulated HTML/TEXT. So... this is not encapsulated HTML/TEXT or it was badly encapsulated.")
+                    cw_found["malformed"] = True
+        return cw_found, found_token
+
+
+    @staticmethod
+    def validate_rtf_doc_header(doc_tree: Tree):
+        """Check if doc starts with a valid RTF header `\\rtf1`.
+
+        "Before the de-encapsulating RTF reader tries to recognize the encapsulation, the reader SHOULD ensure that the document has a valid RTF document heading according to [MSFT-RTF] (that is, it starts with the character sequence "{\\rtf1")." - MS-OXRTFEX
+
+Raises:
+        MalformedRtf: The .rtf headers do not include \\rtf1.
+"""
+        first_token = doc_tree.children[0].value
+        if first_token != b"\\rtf1":
+            log.debug("RTF stream does not contain valid valid RTF document heading. The file must start with \"{\\rtf1\"")
+            log_validators(f"First child object in document tree is: {first_token!r}")
+            raise MalformedRtf("RTF stream does not start with {\\rtf1")
+
+    @staticmethod
+    def catch_common_validation_issues(raw_rtf: AnyStr):
+        """Checks for likely common valid input mistakes that may occur when folks try to use this library and raises exceptions to try and help identify them.
+
+Args:
+        raw_rtf: A raw .rtf string or byte-string.
+
+Raises:
+        TypeError: The data passed is the wrong type of data.
+        MalformedRtf: The data passed is not a correctly formatted .rtf string.
+"""
+        if isinstance(raw_rtf, BufferedReader):
+            raise TypeError("Data passed as file pointer. DeEncapsulator only accepts byte objects.")
+        if raw_rtf is None:
+            raise TypeError("Data passed as raw RTF file is a null object `None` keyword.")
+        if raw_rtf[:8] == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1":
+            raise TypeError("Data passed is a full MSG object. You must extract the encapsulated RTF body first.")
+        if raw_rtf in (b'', ''):
+            raise MalformedRtf("Data passed as raw RTF file is an empty string.")
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class DeEncapsulator +(raw_rtf: bytes, grammar: Optional[str] = None) +
+
+

De-Encapsulating RTF converter of HTML/TEXT found in .msg files.

+

De-encapsulation enables previously encapsulated HTML and plain text content to be extracted and rendered as HTML and plain text instead of the encapsulating RTF content. After de-encapsulation, the HTML and plain text should differ only minimally from the original HTML or plain text content.

+

Parameters

+

raw_rtf: (bytes): It's the raw RTF file as bytes. +grammar: (str): OPTIONAL - Lark parsing grammar which defines the RTF language. https://github.com/lark-parser/lark If you think my grammar is shoddy this is your chance to test out a better one and make a pull request.

+

Attributes

+
+
content
+
(bytes) The deencapsulated content no matter what format it is in. Populated by the deencapsulate function.
+
html
+
(bytes) The deencapsulated content IF it is HTML content. Populated by the set_content function.
+
text
+
(bytes) The deencapsulated content IF it is plain text content. Populated by the set_content function.
+
found_binary
+
List of dictionaries containing binary data extracted from the rtf file.
+
content_type
+
The type of content encapsulated in .rtf data (html or text). Populated by the get_content_type function.
+
full_tree
+
The full .rtf object parsed into an object Tree using the grammar. Populated by the parse_rtf function.
+
doc_tree
+
The document portion of the .rtf full_tree object.
+
raw_rtf
+
The raw encapsulated .rtf data in byte format.
+
grammar
+
The Lark parsing grammar used to parse the .rtf data.
+
content_type_token
+
The .rtf header token identifying the content type. (\fromhtml1 OR \fromtext)
+
parser
+
The lark parser. Should not need to be manipulated directly. But, useful for debugging and saving the parsed object.
+
+

Load in the encapsulated text and set up the grammar used to parse the encapsulated RTF.

+

NOTE: This does not do the parsing in the init so that you can instantiate the object and do the parsing step by step.

+

Parameters

+

raw_rtf: (bytes): The raw RTF file as bytes. +grammar: (str): OPTIONAL - Lark parsing grammar which defines the RTF language. https://github.com/lark-parser/lark If you think my grammar is shoddy this is your chance to test out a better one and make a pull request.

+

Raises

+
+
TypeError
+
The raw_rtf data passed is not the correct type of data (it must be a byte string).
+
+
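A minimal usage sketch (illustrative; assumes raw_rtf already holds the encapsulated RTF bytes, and the error handling shown is only one reasonable way a caller might react):

    from RTFDE.deencapsulate import DeEncapsulator
    from RTFDE.exceptions import NotEncapsulatedRtf, MalformedRtf

    rtf_obj = DeEncapsulator(raw_rtf)
    try:
        rtf_obj.deencapsulate()
    except NotEncapsulatedRtf:
        pass  # a plain .rtf file with no encapsulated HTML/TEXT
    except MalformedRtf:
        pass  # corrupted, or possibly maliciously constructed
    else:
        body = rtf_obj.html if rtf_obj.content_type == 'html' else rtf_obj.text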
class DeEncapsulator():
+    """De-Encapsulating RTF converter of HTML/TEXT found in .msg files.
+
+De-encapsulation enables previously encapsulated HTML and plain text content to be extracted and rendered as HTML and plain text instead of the encapsulating RTF content. After de-encapsulation, the HTML and plain text should differ only minimally from the original HTML or plain text content.
+
+
+Parameters:
+    raw_rtf: (bytes): It's the raw RTF file as bytes.
+    grammar: (str): OPTIONAL - Lark parsing grammar which defines the RTF language. https://github.com/lark-parser/lark If you think my grammar is shoddy this is your chance to test out a better one and make a pull request.
+
+Attributes:
+    content: (bytes) The deencapsulated content no matter what format it is in. Populated by the `deencapsulate` function.
+    html: (bytes) The deencapsulated content IF it is HTML content. Populated by the `set_content` function.
+    text: (bytes) The deencapsulated content IF it is plain text content. Populated by the `set_content` function.
+    found_binary: List of dictionaries containing binary data extracted from the rtf file.
+    content_type: The type of content encapsulated in .rtf data (html or text). Populated by the `get_content_type` function.
+    full_tree: The full .rtf object parsed into an object Tree using the grammar. Populated by the `parse_rtf` function.
+    doc_tree: The `document` portion of the .rtf full_tree object.
+    raw_rtf: The raw encapsulated .rtf data in byte format.
+    grammar: The Lark parsing grammar used to parse the .rtf data.
+    content_type_token: The .rtf header token identifying the content type. (\\fromhtml1 OR \\fromtext)
+    parser: The lark parser. Should not need to be manipulated directly. But, useful for debugging and saving the parsed object.
+    """
+
+    def __init__(self, raw_rtf:bytes, grammar: Union[str,None] = None):
+        """Load in the Encapsulated test and setup the grammar used to parse the encapsulated RTF.
+
+NOTE: This does not do the parsing in the init so that you can instantiate the object and do the parsing step by step.
+
+Parameters:
+        raw_rtf: (bytes): The raw RTF file as bytes.
+        grammar: (str): OPTIONAL - Lark parsing grammar which defines the RTF language. https://github.com/lark-parser/lark If you think my grammar is shoddy this is your chance to test out a better one and make a pull request.
+
+Raises:
+        TypeError: The raw_rtf data passed is not the correct type of data (it must be a byte string).
+"""
+        self.content: str
+        self.content_type: str
+        self.content_type_token: str
+        self.parser: Any
+
+        self.html: str
+        self.text: str
+        self.found_binary: list
+        self.full_tree: Tree
+        self.doc_tree: Tree
+        self.catch_common_validation_issues(raw_rtf)
+        if isinstance(raw_rtf, bytes):
+            raw_rtf_bytes = raw_rtf
+        else:
+            raise TypeError("DeEncapssulator only accepts RTF files in string or byte-string formats")
+        raw_rtf_bytes = raw_rtf_bytes.rstrip(b'\x00')
+        raw_rtf_bytes = raw_rtf_bytes.replace(b'\r\n',b'\n')
+        raw_rtf_bytes = raw_rtf_bytes.replace(b'\r',b'\n')
+        self.raw_rtf: bytes = raw_rtf_bytes
+        if grammar is not None:
+            self.grammar: str = grammar
+        else:
+            self.grammar = make_concise_grammar()
+
+    def deencapsulate(self):
+        """De-encapsulate the RTF content loaded into the De-Encapsulator.
+
+Once you have loaded in the raw rtf this function will set the properties containing the encapsulated content. The `content` property will store the content no matter what format it is in. The `html` and `text` properties will be populated based on the type of content that is extracted. (self.html will be populated if it is html and self.text if it is plain text.)
+        """
+        stripped_data = strip_binary_objects(self.raw_rtf)
+        non_binary_rtf = stripped_data[0]
+        found_binary = stripped_data[1]
+        if len(found_binary) > 0:
+            self.found_binary = found_binary
+            log.info("Binary data found and extracted from rtf file.")
+        escaped_rtf = encode_escaped_control_chars(non_binary_rtf)
+        log_transformations(escaped_rtf)
+        self.parse_rtf(escaped_rtf)
+        Decoder = TextDecoder()
+        Decoder.update_children(self.full_tree)
+        self.get_doc_tree()
+        self.validate_encapsulation()
+
+        # remove htmlrtf escaped values
+        htmlrtf_stripped = self.strip_htmlrtf_tokens()
+        # Strips whitespace from control words
+        control_stripped = StripControlWords().transform(htmlrtf_stripped)
+        # Strip unused control chars
+        special_stripper = StripUnusedSpecialCharacters()
+        non_special_tree = special_stripper.transform(control_stripped)
+        # Strip out non-visible RTF groups
+        stripper = StripNonVisibleRTFGroups()
+        stripped_tree = stripper.transform(non_special_tree)
+        # Converts any remaining tokens
+        cleaner = RTFCleaner(visit_tokens=True)
+        cleaned_text = cleaner.transform(stripped_tree)
+
+        self.content = cleaned_text
+        self.set_content() # Populates self.html || self.text
+
+    def validate_charset(self, fallback_to_default:bool =False) -> bytes:
+        """Validate and return the RTF charset keyword from the RTF streams header.
+
+Args:
+        fallback_to_default (bool): Allows you to force the use of the default charset "\\ansi" if one is not found.
+
+Raises:
+        MalformedRtf: RTF stream does not include charset control word.
+
+Returns:
+        The RTF charset keyword from the RTF stream's header.
+"""
+        main_headers = self.get_header_control_words_before_first_group()
+
+        for token in main_headers:
+            if token.value in [b'\\ansi', b'\\mac', b'\\pc', b'\\pca']:
+                return token
+
+        log.debug("Acceptable charset not found as the second token in the RTF stream. The control word for the character set must precede any plain text or any table control words. So, if this stream doesn't have one it is malformed or corrupted.")
+        if fallback_to_default is False:
+            raise MalformedRtf("RTF stream does not include charset control word.")
+
+        log.warning("The fallback_to_default option on _get_charset is considered DANGEROUS if used on possibly malicious samples. Make sure you know what you are doing before using it.")
+        log.info("Attempting to decode RTF using the default charset ansi. This is not recommended and could have unforeseen consequences for the resulting file and your systems security.")
+        log.debug("You have a malformed RTF stream. Are you sure you really want to be parsing it? It might not just be corrupted. It could be maliciously constructed.")
+        return b"\\ansi"
+
+    def set_content(self):
+        """Populate the html or text content based on the content type. Populates self.html and/or self.text variables."""
+        self.content_type = self.get_content_type()
+        if self.content_type == 'html':
+            self.html = self.content
+        else:
+            self.text = self.content
+
+    def get_doc_tree(self):
+        """Extract the document portion of the .rtf full_tree object. Populates the classes doc_tree attribute.
+
+Raises:
+        ValueError: The .rtf document object is missing or mis-located in the .rtf's full_tree object.
+"""
+        if self.full_tree.children[1].data == "document":
+            self.doc_tree = self.full_tree.children[1]
+        else:
+            raise ValueError("Document object in the wrong place after parsing.")
+
+    def get_content_type(self):
+        """Provide the type of content encapsulated in RTF.
+
+NOTE: This function will only work after the header validation has completed. Header validation also extracts the content type of the encapsulated data.
+
+Raises:
+        NotEncapsulatedRtf: The .rtf object is missing an encapsulated content type header. Which means that it is likely just a regular .rtf file.
+"""
+        if self.content_type_token is None:
+            self.validate_FROM_in_doc_header()
+        elif self.content_type_token == b'\\fromhtml1':
+            return 'html'
+        elif self.content_type_token == b'\\fromtext':
+            return "text"
+
+        raise NotEncapsulatedRtf("Data is missing encapsulated content type header (the FROM header).")
+
+    def validate_encapsulation(self):
+        """Runs simple tests to validate that the file in question is an rtf document which contains encapsulation."""
+        self.validate_rtf_doc_header(self.doc_tree)
+        self.validate_charset()
+        self.validate_FROM_in_doc_header()
+        ansicpg = self.get_ansicpg_header()
+        if ansicpg is not None: # ansicpg is not mandatory
+            validate_ansi_cpg(ansicpg.value)
+
+    def get_ansicpg_header(self) -> Union[Token,None]:
+        """Extract the ansicpg control word from the .rtf header.
+
+Returns:
+        A lark CONTROLWORD Token with the `\\ansicpg` value. Returns None if the `\\ansicpg` control word is not included as this is only required if there is Unicode which needs to be converted to ANSI within a .rtf file.
+"""
+        headers = self.get_header_control_words_before_first_group()
+        for item in headers:
+            if item.value.startswith(b'\\ansicpg'):
+                return item
+        return None
+
+    def parse_rtf(self, rtf: bytes):
+        """Parse RTF file's header and document and extract the objects within the RTF into a Tree. Populates the self.full_tree attribute.
+
+Args:
+        rtf: The .rtf bytes to parse with the project's lark grammar.
+"""
+        # Uncomment Lark debug argument if you want to enable logging.
+        # Note, this does not enable ALL lark debug logging.
+        # To do that we would have to stop using the Lark convenience class which we are using here.
+        self.parser = Lark(self.grammar,
+                           parser='lalr',
+                           keep_all_tokens=True,
+                           use_bytes=True,
+                           # debug=True,
+                           propagate_positions=True)
+        self.full_tree = self.parser.parse(rtf)
+        log_transformations(self.full_tree)
+
+
+    def strip_htmlrtf_tokens(self) -> Tree:
+        """Strip tokens from with htmlrtf regions of the doc_tree as they were not part of the original HTML content.
+
+Returns:
+        .rtf doc_tree stripped of all non-original tokens.
+"""
+        # remove htmlrtf escaped values
+        delete_generator = get_stripped_HTMLRTF_values(self.doc_tree)
+        tokens_to_delete = list(delete_generator)
+        deleter = DeleteTokensFromTree(tokens_to_delete)
+        htmlrtf_cleaned_tree = deleter.transform(self.doc_tree)
+        return htmlrtf_cleaned_tree
+
+
+    def get_header_control_words_before_first_group(self) -> list:
+        """Extracts all the control words in the first 20 tokens of the document or all the tokens which occur before the first group (whichever comes first.)
+
+This is used to extract initial header values for validation functions.
+
+Returns:
+        A list containing the header tokens in the .rtf data.
+        """
+        initial_control_words = []
+        for token in self.doc_tree.children[:20]:
+            if isinstance(token, Token):
+                initial_control_words.append(token)
+            else:
+                return initial_control_words
+        return initial_control_words
+
+
+    def validate_FROM_in_doc_header(self):
+        """Inspect the header to identify what type of content (html/plain text) is encapsulated within the document.
+
+NOTE: The de-encapsulating RTF reader inspects no more than the first 10 RTF tokens (that is, begin group marks and control words) in the input RTF document, in sequence, starting from the beginning of the RTF document. If one of the control words is the FROMHTML control word, the de-encapsulating RTF reader will conclude that the RTF document contains an encapsulated HTML document and stop further inspection. If one of the control words is the FROMTEXT control word, the de-encapsulating RTF reader concludes that the RTF document was produced from a plain text document and stops further inspection. - MS-OXRTFEX
+
+Raises:
+        MalformedEncapsulatedRtf: The .rtf headers are malformed.
+        NotEncapsulatedRtf: The .rtf object is missing an encapsulated content type header. Which means that it is likely just a regular .rtf file.
+        """
+        cw_found = {"rtf1":False,
+                    "from":False,
+                    "fonttbl":False,
+                    "malformed":False}
+        # The de-encapsulating RTF reader SHOULD inspect no more than the first 10 RTF tokens (that is, begin group marks and control words) in the input RTF document, in sequence, starting from the beginning of the RTF document. This means more than just control words.
+        decoded_tree = StripControlWords().transform(self.doc_tree)
+        first_ten_tokens = decoded_tree.children[:10]
+        operating_tokens = []
+        found_token = None
+        for token in first_ten_tokens:
+            if isinstance(token, Token):
+                operating_tokens.append(token)
+            else:
+                operating_tokens += list(token.scan_values(lambda t: t.type == 'CONTROLWORD'))
+        log_validators(f"Header tokens being evaluated: {operating_tokens}")
+
+        for token in operating_tokens:
+            cw_found,found_token = self.check_from_token(token=token, cw_found=cw_found)
+            if cw_found['from'] is True and cw_found["malformed"] is True:
+                raise MalformedEncapsulatedRtf("RTF file looks like is was supposed to be encapsulated HTML/TEXT but the headers are malformed. Turn on debugging to see specific information")
+            # Save content type token available for id-ing type of content later
+            if found_token is not None:
+                self.content_type_token = found_token
+
+        if cw_found['from'] is False:
+            log.debug("FROMHTML/TEXT control word not found in first 10 RTF tokens. This is not an HTML/TEXT encapsulated RTF document.")
+            raise NotEncapsulatedRtf("FROMHTML/TEXT control word not found.")
+
+    @staticmethod
+    def check_from_token(token:Token, cw_found:dict) -> Tuple[Dict,Union[None,str]] :
+        """Checks if fromhtml1 or fromtext tokens are in the proper place in the header based on the state passed to it by the validate_FROM_in_doc_header function.
+
+Args:
+        token: The token to check for in the cw_found state dictionary.
+        cw_found: The state dictionary which is used to track the position of the from token within the header.
+
+        `cw_found = {"rtf1":<BOOL>, "from":<BOOL>, "fonttbl":<BOOL>, "malformed":<BOOL>}`
+
+
+Returns:
+        cw_found: Updated state dictionary
+        found_token: The content_type_token found in the header.
+
+        """
+        from_cws = [b'\\fromhtml1', b'\\fromtext']
+        # This control word MUST appear before the \fonttbl control word and after the \rtf1 control word, as specified in [MSFT-RTF].
+        rtf1_cw = b"\\rtf1"
+        found_token = None
+        fonttbl_cw = b"\\fonttbl"
+        if token.type == "CONTROLWORD":
+            if token.value.strip() in from_cws:
+                if cw_found['from'] is True:
+                    cw_found["malformed"] = True
+                    log.debug("Multiple FROM HTML/TXT tokens found in the header. This encapsulated RTF is malformed.")
+                if cw_found['rtf1'] is True:
+                    cw_found['from'] = True
+                    found_token = token.value
+                else:
+                    log.debug("FROMHTML/TEXT control word found before rtf1 control word. That's not allowed in the RTF spec.")
+                    cw_found['from'] = True
+                    cw_found["malformed"] = True
+            elif token.value.strip() == rtf1_cw:
+                cw_found['rtf1'] = True
+            elif token.value.strip() == fonttbl_cw:
+                cw_found['fonttbl'] = True
+                if cw_found['from'] is not True:
+                    log.debug("\\fonttbl code word found before FROMTML/TEXT was defined. This is not allowed for encapsulated HTML/TEXT. So... this is not encapsulated HTML/TEXT or it was badly encapsulated.")
+                    cw_found["malformed"] = True
+        return cw_found, found_token
+
+
+    @staticmethod
+    def validate_rtf_doc_header(doc_tree: Tree):
+        """Check if doc starts with a valid RTF header `\\rtf1`.
+
+        "Before the de-encapsulating RTF reader tries to recognize the encapsulation, the reader SHOULD ensure that the document has a valid RTF document heading according to [MSFT-RTF] (that is, it starts with the character sequence "{\\rtf1")." - MS-OXRTFEX
+
+Raises:
+        MalformedRtf: The .rtf headers do not include \\rtf1.
+"""
+        first_token = doc_tree.children[0].value
+        if first_token != b"\\rtf1":
+            log.debug("RTF stream does not contain valid valid RTF document heading. The file must start with \"{\\rtf1\"")
+            log_validators(f"First child object in document tree is: {first_token!r}")
+            raise MalformedRtf("RTF stream does not start with {\\rtf1")
+
+    @staticmethod
+    def catch_common_validation_issues(raw_rtf: AnyStr):
+        """Checks for likely common valid input mistakes that may occur when folks try to use this library and raises exceptions to try and help identify them.
+
+Args:
+        raw_rtf: A raw .rtf string or byte-string.
+
+Raises:
+        TypeError: The data passed is the wrong type of data.
+        MalformedRtf: The data passed is not a correctly formatted .rtf string.
+"""
+        if isinstance(raw_rtf, BufferedReader):
+            raise TypeError("Data passed as file pointer. DeEncapsulator only accepts byte objects.")
+        if raw_rtf is None:
+            raise TypeError("Data passed as raw RTF file is a null object `None` keyword.")
+        if raw_rtf[:8] == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1":
+            raise TypeError("Data passed is a full MSG object. You must extract the encapsulated RTF body first.")
+        if raw_rtf in (b'', ''):
+            raise MalformedRtf("Data passed as raw RTF file is an empty string.")
+
+

Static methods

+
+
+def catch_common_validation_issues(raw_rtf: ~AnyStr) +
+
+

Checks for common input mistakes that are likely to occur when using this library and raises exceptions to help identify them.

+

Args

+
+
raw_rtf
+
A raw .rtf string or byte-string.
+
+

Raises

+
+
TypeError
+
The data passed is the wrong type of data.
+
MalformedRtf
+
The data passed is not a correctly formatted .rtf string.
+
+
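For instance (an illustrative sketch; the file name is hypothetical), passing an open file object rather than its bytes is caught here:

    with open("message.rtf", "rb") as fp:
        DeEncapsulator.catch_common_validation_issues(fp)  # raises TypeError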
    @staticmethod
+    def catch_common_validation_issues(raw_rtf: AnyStr):
+        """Checks for likely common valid input mistakes that may occur when folks try to use this library and raises exceptions to try and help identify them.
+
+Args:
+        raw_rtf: A raw .rtf string or byte-string.
+
+Raises:
+        TypeError: The data passed is the wrong type of data.
+        MalformedRtf: The data passed is not a correctly formatted .rtf string.
+"""
+        if isinstance(raw_rtf, BufferedReader):
+            raise TypeError("Data passed as file pointer. DeEncapsulator only accepts byte objects.")
+        if raw_rtf is None:
+            raise TypeError("Data passed as raw RTF file is a null object `None` keyword.")
+        if raw_rtf[:8] == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1":
+            raise TypeError("Data passed is a full MSG object. You must extract the encapsulated RTF body first.")
+        if raw_rtf in (b'', ''):
+            raise MalformedRtf("Data passed as raw RTF file is an empty string.")
+
+
+
+def check_from_token(token: lark.lexer.Token, cw_found: dict) -> Tuple[Dict, Optional[str]] +
+
+

Checks if fromhtml1 or fromtext tokens are in the proper place in the header based on the state passed to it by the validate_FROM_in_doc_header function.

+

Args

+
+
token
+
The token to check for in the cw_found state dictionary.
+
cw_found
+
The state dictionary which is used to track the position of the from token within the header.
+
+

cw_found = {"rtf1":<BOOL>, "from":<BOOL>, "fonttbl":<BOOL>, "malformed":<BOOL>}

+

Returns

+
+
cw_found
+
Updated state dictionary
+
found_token
+
The content_type_token found in the header.
+
+
+ +Expand source code + +
    @staticmethod
+    def check_from_token(token: Token, cw_found: dict) -> Tuple[Dict, Union[None, str]]:
+        """Checks if fromhtml1 or fromtext tokens are in the proper place in the header based on the state passed to it by the validate_FROM_in_doc_header function.
+
+Args:
+        token: The token to check for in the cw_found state dictionary.
+        cw_found: The state dictionary which is used to track the position of the from token within the header.
+
+        `cw_found = {"rtf1":<BOOL>, "from":<BOOL>, "fonttbl":<BOOL>, "malformed":<BOOL>}`
+
+
+Returns:
+        cw_found: Updated state dictionary
+        found_token: The content_type_token found in the header.
+
+        """
+        from_cws = [b'\\fromhtml1', b'\\fromtext']
+        # This control word MUST appear before the \fonttbl control word and after the \rtf1 control word, as specified in [MSFT-RTF].
+        rtf1_cw = b"\\rtf1"
+        found_token = None
+        fonttbl_cw = b"\\fonttbl"
+        if token.type == "CONTROLWORD":
+            if token.value.strip() in from_cws:
+                if cw_found['from'] is True:
+                    cw_found["malformed"] = True
+                    log.debug("Multiple FROM HTML/TXT tokens found in the header. This encapsulated RTF is malformed.")
+                if cw_found['rtf1'] is True:
+                    cw_found['from'] = True
+                    found_token = token.value
+                else:
+                    log.debug("FROMHTML/TEXT control word found before rtf1 control word. That's not allowed in the RTF spec.")
+                    cw_found['from'] = True
+                    cw_found["malformed"] = True
+            elif token.value.strip() == rtf1_cw:
+                cw_found['rtf1'] = True
+            elif token.value.strip() == fonttbl_cw:
+                cw_found['fonttbl'] = True
+                if cw_found['from'] is not True:
+                    log.debug("\\fonttbl code word found before FROMTML/TEXT was defined. This is not allowed for encapsulated HTML/TEXT. So... this is not encapsulated HTML/TEXT or it was badly encapsulated.")
+                    cw_found["malformed"] = True
+        return cw_found, found_token
+
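+
+A sketch of driving this state check by hand; the token sequence below is hypothetical, while in practice the tokens come from the parsed header via validate_FROM_in_doc_header:
+
+    from lark.lexer import Token
+    from RTFDE.deencapsulate import DeEncapsulator
+
+    # Hypothetical header tokens for a well-formed encapsulated HTML document.
+    tokens = [Token("CONTROLWORD", b"\\rtf1 "),
+              Token("CONTROLWORD", b"\\fromhtml1 ")]
+    cw_found = {"rtf1": False, "from": False, "fonttbl": False, "malformed": False}
+    for tok in tokens:
+        cw_found, found_token = DeEncapsulator.check_from_token(tok, cw_found)
+    print(cw_found["from"], cw_found["malformed"])  # True False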
+
+
+def validate_rtf_doc_header(doc_tree: lark.tree.Tree) +
+
+

Check if doc starts with a valid RTF header \rtf1.

+
    "Before the de-encapsulating RTF reader tries to recognize the encapsulation, the reader SHOULD ensure that the document has a valid RTF document heading according to [MSFT-RTF] (that is, it starts with the character sequence "{\rtf1")." - MS-OXRTFEX
+
+

Raises

+
+
MalformedRtf
+
The .rtf headers do not include \rtf1.
+
+
+ +Expand source code + +
    @staticmethod
+    def validate_rtf_doc_header(doc_tree: Tree):
+        """Check if doc starts with a valid RTF header `\\rtf1`.
+
+        "Before the de-encapsulating RTF reader tries to recognize the encapsulation, the reader SHOULD ensure that the document has a valid RTF document heading according to [MSFT-RTF] (that is, it starts with the character sequence "{\\rtf1")." - MS-OXRTFEX
+
+Raises:
+        MalformedRtf: The .rtf headers do not include \\rtf1.
+"""
+        first_token = doc_tree.children[0].value
+        if first_token != b"\\rtf1":
+            log.debug("RTF stream does not contain valid valid RTF document heading. The file must start with \"{\\rtf1\"")
+            log_validators(f"First child object in document tree is: {first_token!r}")
+            raise MalformedRtf("RTF stream does not start with {\\rtf1")
+
+
+
+

Methods

+
+
+def deencapsulate(self) +
+
+

De-encapsulate the RTF content loaded into the De-Encapsulator.

+

Once you have loaded the raw RTF, this function sets the properties containing the encapsulated content. The content property stores the content no matter what format it is in. The html and text properties are populated based on the type of content extracted (self.html if it is HTML and self.text if it is plain text).

+
+ +Expand source code + +
    def deencapsulate(self):
+        """De-encapsulate the RTF content loaded into the De-Encapsulator.
+
+Once you have loaded the raw RTF, this function sets the properties containing the encapsulated content. The `content` property stores the content no matter what format it is in. The `html` and `text` properties are populated based on the type of content extracted (self.html if it is HTML and self.text if it is plain text).
+        """
+        stripped_data = strip_binary_objects(self.raw_rtf)
+        non_binary_rtf = stripped_data[0]
+        found_binary = stripped_data[1]
+        if len(found_binary) > 0:
+            self.found_binary = found_binary
+            log.info("Binary data found and extracted from rtf file.")
+        escaped_rtf = encode_escaped_control_chars(non_binary_rtf)
+        log_transformations(escaped_rtf)
+        self.parse_rtf(escaped_rtf)
+        Decoder = TextDecoder()
+        Decoder.update_children(self.full_tree)
+        self.get_doc_tree()
+        self.validate_encapsulation()
+
+        # remove htmlrtf escaped values
+        htmlrtf_stripped = self.strip_htmlrtf_tokens()
+        # Strips whitespace from control words
+        control_stripped = StripControlWords().transform(htmlrtf_stripped)
+        # Strip unused control chars
+        special_stripper = StripUnusedSpecialCharacters()
+        non_special_tree = special_stripper.transform(control_stripped)
+        # Strip out non-visible RTF groups
+        stripper = StripNonVisibleRTFGroups()
+        stripped_tree = stripper.transform(non_special_tree)
+        # Converts any remaining tokens
+        cleaner = RTFCleaner(visit_tokens=True)
+        cleaned_text = cleaner.transform(stripped_tree)
+
+        self.content = cleaned_text
+        self.set_content() # Populates self.html || self.text
+
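+
+A minimal end-to-end sketch (the file path is hypothetical; any encapsulated .rtf body read as bytes will do):
+
+    from RTFDE.deencapsulate import DeEncapsulator
+
+    with open("encapsulated_body.rtf", "rb") as fp:  # hypothetical path
+        raw_rtf = fp.read()
+
+    rtf_obj = DeEncapsulator(raw_rtf)
+    rtf_obj.deencapsulate()
+    if rtf_obj.content_type == "html":
+        print(rtf_obj.html)
+    else:
+        print(rtf_obj.text)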
+
+
+def get_ansicpg_header(self) ‑> Optional[lark.lexer.Token] +
+
+

Extract the ansicpg control word from the .rtf header.

+

Returns

+

A lark CONTROLWORD Token with the \ansicpg value. Returns None if the \ansicpg control word is not included, as it is only required if there is Unicode which needs to be converted to ANSI within a .rtf file.

+
+ +Expand source code + +
    def get_ansicpg_header(self) -> Union[Token,None]:
+        """Extract the ansicpg control word from the .rtf header.
+
+Returns:
+        A lark CONTROLWORD Token with the `\\ansicpg` value. Returns None if the `\\ansicpg` control word is not included, as it is only required if there is Unicode which needs to be converted to ANSI within a .rtf file.
+"""
+        headers = self.get_header_control_words_before_first_group()
+        for item in headers:
+            if item.value.startswith(b'\\ansicpg'):
+                return item
+        return None
+
+
+
+def get_content_type(self) +
+
+

Provide the type of content encapsulated in RTF.

+

NOTE: This function will only work after the header validation has completed. Header validation also extracts the content type of the encapsulated data.

+

Raises

+
+
NotEncapsulatedRtf
+
The .rtf object is missing an encapsulated content type header, which means it is likely just a regular .rtf file.
+
+
+ +Expand source code + +
    def get_content_type(self):
+        """Provide the type of content encapsulated in RTF.
+
+NOTE: This function will only work after the header validation has completed. Header validation also extracts the content type of the encapsulated data.
+
+Raises:
+        NotEncapsulatedRtf: The .rtf object is missing an encapsulated content type header, which means it is likely just a regular .rtf file.
+"""
+        if self.content_type_token is None:
+            self.validate_FROM_in_doc_header()
+        elif self.content_type_token == b'\\fromhtml1':
+            return 'html'
+        elif self.content_type_token == b'\\fromtext':
+            return "text"
+
+        raise NotEncapsulatedRtf("Data is missing encapsulated content type header (the FROM header).")
+
+
+
+def get_doc_tree(self) +
+
+

Extract the document portion of the .rtf full_tree object. Populates the class's doc_tree attribute.

+

Raises

+
+
ValueError
+
The .rtf document object is missing or misplaced in the .rtf's full_tree object.
+
+
+ +Expand source code + +
    def get_doc_tree(self):
+        """Extract the document portion of the .rtf full_tree object. Populates the classes doc_tree attribute.
+
+Raises:
+        ValueError: The .rtf document object is missing or mis-located in the .rtf's full_tree object.
+"""
+        if self.full_tree.children[1].data == "document":
+            self.doc_tree = self.full_tree.children[1]
+        else:
+            raise ValueError("Document object in the wrong place after parsing.")
+
+
+
+def get_header_control_words_before_first_group(self) ‑> list +
+
+

Extracts all the control words in the first 20 tokens of the document or all the tokens which occur before the first group (whichever comes first).

+

This is used to extract initial header values for validation functions.

+

Returns

+

A list containing the header tokens in the .rtf data.

+
+ +Expand source code + +
    def get_header_control_words_before_first_group(self) -> list:
+        """Extracts all the control words in the first 20 tokens of the document or all the tokens which occur before the first group (whichever comes first.)
+
+This is used to extract initial header values for validation functions.
+
+Returns:
+        A list containing the header tokens in the .rtf data.
+        """
+        initial_control_words = []
+        for token in self.doc_tree.children[:20]:
+            if isinstance(token, Token):
+                initial_control_words.append(token)
+            else:
+                return initial_control_words
+        return initial_control_words
+
+
+
+def parse_rtf(self, rtf: str) +
+
+

Parse RTF file's header and document and extract the objects within the RTF into a Tree. Populates the self.full_tree attribute.

+

Args

+
+
rtf
+
The .rtf string to parse with the project's Lark grammar.
+
+
+ +Expand source code + +
    def parse_rtf(self, rtf: str):
+        """Parse RTF file's header and document and extract the objects within the RTF into a Tree. Populates the self.full_tree attribute.
+
+Args:
+        rtf: The .rtf string to parse with the project's Lark grammar.
+"""
+        # Uncomment the Lark debug argument if you want to enable logging.
+        # Note, this does not enable ALL Lark debug logging.
+        # To do that we would not be able to use the Lark convenience class which we are using here.
+        self.parser = Lark(self.grammar,
+                           parser='lalr',
+                           keep_all_tokens=True,
+                           use_bytes=True,
+                           # debug=True,
+                           propagate_positions=True)
+        self.full_tree = self.parser.parse(rtf)
+        log_transformations(self.full_tree)
+
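+
+The parser options above are plain Lark configuration. As a sketch, the same options applied to a toy grammar (not RTFDE's production grammar, which is composed in RTFDE.grammar):
+
+    from lark import Lark
+
+    # Illustrative grammar only; use_bytes=True means the parser accepts bytes input.
+    toy_grammar = r"""
+    start: WORD+
+    WORD: /[a-z]+/
+    %ignore " "
+    """
+
+    parser = Lark(toy_grammar,
+                  parser='lalr',
+                  keep_all_tokens=True,
+                  use_bytes=True,
+                  propagate_positions=True)
+    tree = parser.parse(b"hello world")
+    print(tree.children)  # [Token('WORD', b'hello'), Token('WORD', b'world')]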
+
+
+def set_content(self) +
+
+

Populate the html or text content based on the content type. Populates self.html and/or self.text variables.

+
+ +Expand source code + +
def set_content(self):
+    """Populate the html or text content based on the content type. Populates self.html and/or self.text variables."""
+    self.content_type = self.get_content_type()
+    if self.content_type == 'html':
+        self.html = self.content
+    else:
+        self.text = self.content
+
+
+
+def strip_htmlrtf_tokens(self) ‑> lark.tree.Tree +
+
+

Strip tokens from within htmlrtf regions of the doc_tree, as they were not part of the original HTML content.

+

Returns

+

.rtf doc_tree stripped of all non-original tokens.

+
+ +Expand source code + +
    def strip_htmlrtf_tokens(self) -> Tree:
+        """Strip tokens from with htmlrtf regions of the doc_tree as they were not part of the original HTML content.
+
+Returns:
+        .rtf doc_tree stripped of all non-original tokens.
+"""
+        # remove htmlrtf escaped values
+        delete_generator = get_stripped_HTMLRTF_values(self.doc_tree)
+        tokens_to_delete = list(delete_generator)
+        deleter = DeleteTokensFromTree(tokens_to_delete)
+        htmlrtf_cleaned_tree = deleter.transform(self.doc_tree)
+        return htmlrtf_cleaned_tree
+
+
+
+def validate_FROM_in_doc_header(self) +
+
+

Inspect the header to identify what type of content (html/plain text) is encapsulated within the document.

+

NOTE: The de-encapsulating RTF reader inspects no more than the first 10 RTF tokens (that is, begin group marks and control words) in the input RTF document, in sequence, starting from the beginning of the RTF document. If one of the control words is the FROMHTML control word, the de-encapsulating RTF reader will conclude that the RTF document contains an encapsulated HTML document and stop further inspection. If one of the control words is the FROMTEXT control word, the de-encapsulating RTF reader concludes that the RTF document was produced from a plain text document and stops further inspection. - MS-OXRTFEX

+

Raises

+
+
MalformedEncapsulatedRtf
+
The .rtf headers are malformed.
+
NotEncapsulatedRtf
+
The .rtf object is missing an encapsulated content type header, which means it is likely just a regular .rtf file.
+
+
+ +Expand source code + +
    def validate_FROM_in_doc_header(self):
+        """Inspect the header to identify what type of content (html/plain text) is encapsulated within the document.
+
+NOTE: The de-encapsulating RTF reader inspects no more than the first 10 RTF tokens (that is, begin group marks and control words) in the input RTF document, in sequence, starting from the beginning of the RTF document. If one of the control words is the FROMHTML control word, the de-encapsulating RTF reader will conclude that the RTF document contains an encapsulated HTML document and stop further inspection. If one of the control words is the FROMTEXT control word, the de-encapsulating RTF reader concludes that the RTF document was produced from a plain text document and stops further inspection. - MS-OXRTFEX
+
+Raises:
+        MalformedEncapsulatedRtf: The .rtf headers are malformed.
+        NotEncapsulatedRtf: The .rtf object is missing an encapsulated content type header, which means it is likely just a regular .rtf file.
+        """
+        cw_found = {"rtf1":False,
+                    "from":False,
+                    "fonttbl":False,
+                    "malformed":False}
+        # The de-encapsulating RTF reader SHOULD inspect no more than the first 10 RTF tokens (that is, begin group marks and control words) in the input RTF document, in sequence, starting from the beginning of the RTF document. This means more than just control words.
+        decoded_tree = StripControlWords().transform(self.doc_tree)
+        first_ten_tokens = decoded_tree.children[:10]
+        operating_tokens = []
+        found_token = None
+        for token in first_ten_tokens:
+            if isinstance(token, Token):
+                operating_tokens.append(token)
+            else:
+                operating_tokens += list(token.scan_values(lambda t: t.type == 'CONTROLWORD'))
+        log_validators(f"Header tokens being evaluated: {operating_tokens}")
+
+        for token in operating_tokens:
+            cw_found,found_token = self.check_from_token(token=token, cw_found=cw_found)
+            if cw_found['from'] is True and cw_found["malformed"] is True:
+                raise MalformedEncapsulatedRtf("RTF file looks like is was supposed to be encapsulated HTML/TEXT but the headers are malformed. Turn on debugging to see specific information")
+            # Save content type token available for id-ing type of content later
+            if found_token is not None:
+                self.content_type_token = found_token
+
+        if cw_found['from'] is False:
+            log.debug("FROMHTML/TEXT control word not found in first 10 RTF tokens. This is not an HTML/TEXT encapsulated RTF document.")
+            raise NotEncapsulatedRtf("FROMHTML/TEXT control word not found.")
+
+
+
+def validate_charset(self, fallback_to_default: bool = False) ‑> bytes +
+
+

Validate and return the RTF charset keyword from the RTF stream's header.

+

Args

+
+
fallback_to_default : bool
+
Allows you to force the use of the default charset "\ansi" if one is not found.
+
+

Raises

+
+
MalformedRtf
+
RTF stream does not include charset control word.
+
+

Returns

+

The RTF charset keyword from the RTF stream's header.

+
+ +Expand source code + +
    def validate_charset(self, fallback_to_default:bool =False) -> bytes:
+        """Validate and return the RTF charset keyword from the RTF streams header.
+
+Args:
+        fallback_to_default (bool): Allows you to force the use of the default charset "\\ansi" if one is not found.
+
+Raises:
+        MalformedRtf: RTF stream does not include charset control word.
+
+Returns:
+        The RTF charset keyword from the RTF streams header.
+"""
+        main_headers = self.get_header_control_words_before_first_group()
+
+        for token in main_headers:
+            if token.value in [b'\\ansi', b'\\mac', b'\\pc', b'\\pca']:
+                return token
+
+        log.debug("Acceptable charset not found as the second token in the RTF stream. The control word for the character set must precede any plain text or any table control words. So, if this stream doesn't have one it is malformed or corrupted.")
+        if fallback_to_default is False:
+            raise MalformedRtf("RTF stream does not include charset control word.")
+
+        log.warning("The fallback_to_default option on _get_charset is considered DANGEROUS if used on possibly malicious samples. Make sure you know what you are doing before using it.")
+        log.info("Attempting to decode RTF using the default charset ansi. This is not recommended and could have unforeseen consequences for the resulting file and your systems security.")
+        log.debug("You have a malformed RTF stream. Are you sure you really want to be parsing it? It might not just be corrupted. It could be maliciously constructed.")
+        return b"\\ansi"
+
+
+
+def validate_encapsulation(self) +
+
+

Runs simple tests to validate that the file in question is an RTF document which contains encapsulation.

+
+ +Expand source code + +
def validate_encapsulation(self):
+    """Runs simple tests to validate that the file in question is an rtf document which contains encapsulation."""
+    self.validate_rtf_doc_header(self.doc_tree)
+    self.validate_charset()
+    self.validate_FROM_in_doc_header()
+    ansicpg = self.get_ansicpg_header()
+    if ansicpg is not None: # ansicpg is not mandatory
+        validate_ansi_cpg(ansicpg.value)
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/RTFDE/deencapsulate.md b/docs/RTFDE/deencapsulate.md new file mode 100644 index 0000000..0921bd4 --- /dev/null +++ b/docs/RTFDE/deencapsulate.md @@ -0,0 +1,148 @@ +Module RTFDE.deencapsulate +========================== + +Classes +------- + +`DeEncapsulator(raw_rtf: bytes, grammar: Optional[str] = None)` +: De-Encapsulating RTF converter of HTML/TEXT found in .msg files. + + De-encapsulation enables previously encapsulated HTML and plain text content to be extracted and rendered as HTML and plain text instead of the encapsulating RTF content. After de-encapsulation, the HTML and plain text should differ only minimally from the original HTML or plain text content. + + + Parameters: + raw_rtf: (bytes): It's the raw RTF file as bytes. + grammar: (str): OPTIONAL - Lark parsing grammar which defines the RTF language. https://github.com/lark-parser/lark If you think my grammar is shoddy this is your chance to test out a better one and make a pull request. + + Attributes: + content: (bytes) The deencapsulated content no matter what format it is in. Populated by the `deencapsulate` function. + html: (bytes) The deencapsulated content IF it is HTML content. Populated by the `set_content` function. + text: (bytes) The deencapsulated content IF it is plain text content. Populated by the `set_content` function. + found_binary: List of dictionaries containing binary data extracted from the rtf file. + content_type: The type of content encapsulated in .rtf data (html or text). Populated by the `get_content_type` function. + full_tree: The full .rtf object parsed into an object Tree using the grammar. Populated by the `parse_rtf` function. + doc_tree: The `document` portion of the .rtf full_tree object. + raw_rtf: The raw encapsulated .rtf data in byte format. + grammar: The Lark parsing grammer used to parse the .rtf data. + content_type_token: The .rtf header token identifying the content type. (\fromhtml1 OR \fromtext) + parser: The lark parser. Should not need to be manipulated directly. But, useful for debugging and saving the parsed object. + + + Load in the Encapsulated test and setup the grammar used to parse the encapsulated RTF. + + NOTE: This does not do the parsing in the init so that you can initiate the object and do the parsing step by step. + + Parameters: + raw_rtf: (bytes): It's the raw RTF string. + grammar: (str): OPTIONAL - Lark parsing grammar which defines the RTF language. https://github.com/lark-parser/lark If you think my grammar is shoddy this is your chance to test out a better one and make a pull request. + + Raises: + TypeError: The raw_rtf data passed is not the correct type of data (string/byte string). + + ### Static methods + + `catch_common_validation_issues(raw_rtf: ~AnyStr)` + : Checks for likely common valid input mistakes that may occur when folks try to use this library and raises exceptions to try and help identify them. + + Args: + raw_rtf: A raw .rtf string or byte-string. + + Raises: + TypeError: The data passed is the wrong type of data. + MalformedRtf: The data passed is not a correctly formatted .rtf string. + + `check_from_token(token: lark.lexer.Token, cw_found: dict) ‑> Tuple[Dict, Optional[None]]` + : Checks if fromhtml1 or fromtext tokens are in the proper place in the header based on the state passed to it by the validate_FROM_in_doc_header function. + + Args: + token: The token to check for in the cw_found state dictionary. 
+ cw_found: The state dictionary which is used to track the position of the from token within the header. + + `cw_found = {"rtf1":, "from":, "fonttbl":, "malformed":}` + + + Returns: + cw_found: Updated state dictionary + found_token: The content_type_token found in the header. + + `validate_rtf_doc_header(doc_tree: lark.tree.Tree)` + : Check if doc starts with a valid RTF header `\rtf1`. + + "Before the de-encapsulating RTF reader tries to recognize the encapsulation, the reader SHOULD ensure that the document has a valid RTF document heading according to [MSFT-RTF] (that is, it starts with the character sequence "{\rtf1")." - MS-OXRTFEX + + Raises: + MalformedRtf: The .rtf headers do not include \rtf1. + + ### Methods + + `deencapsulate(self)` + : De-encapsulate the RTF content loaded into the De-Encapsulator. + + Once you have loaded in the raw rtf this function will set the properties containing the encapsulated content. The `content` property will store the content no matter what format it is in. The `html` and `text` properties will be populated based on the type of content that is extracted. (self.html will be populated if it is html and self.text if it is plain text.) + + `get_ansicpg_header(self) ‑> Optional[lark.lexer.Token]` + : Extract the ansicpg control word from the .rtf header. + + Returns: + A lark CONTROLWORD Token with the `\ansicpg` value. Returns None if the `\ansicpg` control word is not included as this is only required if there is Unicode which needs to be converted to ANSI within a .rtf file. + + `get_content_type(self)` + : Provide the type of content encapsulated in RTF. + + NOTE: This function will only work after the header validation has completed. Header validation also extracts the content type of the encapsulated data. + + Raises: + NotEncapsulatedRtf: The .rtf object is missing an encapsulated content type header. Which means that it is likely just a regular .rtf file. + + `get_doc_tree(self)` + : Extract the document portion of the .rtf full_tree object. Populates the classes doc_tree attribute. + + Raises: + ValueError: The .rtf document object is missing or mis-located in the .rtf's full_tree object. + + `get_header_control_words_before_first_group(self) ‑> list` + : Extracts all the control words in the first 20 tokens of the document or all the tokens which occur before the first group (whichever comes first.) + + This is used to extract initial header values for validation functions. + + Returns: + A list containing the header tokens in the .rtf data. + + `parse_rtf(self, rtf: str)` + : Parse RTF file's header and document and extract the objects within the RTF into a Tree. Populates the self.full_tree attribute. + + Args: + rtf: The .rtf string to parse with the projects lark grammar. + + `set_content(self)` + : Populate the html or text content based on the content type. Populates self.html and/or self.text variables. + + `strip_htmlrtf_tokens(self) ‑> lark.tree.Tree` + : Strip tokens from with htmlrtf regions of the doc_tree as they were not part of the original HTML content. + + Returns: + .rtf doc_tree stripped of all non-original tokens. + + `validate_FROM_in_doc_header(self)` + : Inspect the header to identify what type of content (html/plain text) is encapsulated within the document. + + NOTE: The de-encapsulating RTF reader inspects no more than the first 10 RTF tokens (that is, begin group marks and control words) in the input RTF document, in sequence, starting from the beginning of the RTF document. 
If one of the control words is the FROMHTML control word, the de-encapsulating RTF reader will conclude that the RTF document contains an encapsulated HTML document and stop further inspection. If one of the control words is the FROMTEXT control word, the de-encapsulating RTF reader concludes that the RTF document was produced from a plain text document and stops further inspection. - MS-OXRTFEX + + Raises: + MalformedEncapsulatedRtf: The .rtf headers are malformed. + NotEncapsulatedRtf: The .rtf object is missing an encapsulated content type header. Which means that it is likely just a regular .rtf file. + + `validate_charset(self, fallback_to_default: bool = False) ‑> bytes` + : Validate and return the RTF charset keyword from the RTF streams header. + + Args: + fallback_to_default (bool): Allows you to force the use of the default charset "\ansi" if one is not found. + + Raises: + MalformedRtf: RTF stream does not include charset control word. + + Returns: + The RTF charset keyword from the RTF streams header. + + `validate_encapsulation(self)` + : Runs simple tests to validate that the file in question is an rtf document which contains encapsulation. \ No newline at end of file diff --git a/docs/RTFDE/exceptions.html b/docs/RTFDE/exceptions.html new file mode 100644 index 0000000..5b18b63 --- /dev/null +++ b/docs/RTFDE/exceptions.html @@ -0,0 +1,203 @@ + + + + + + +RTFDE.exceptions API documentation + + + + + + + + + + + +
+
+
+

Module RTFDE.exceptions

+
+
+
+ +Expand source code + +
#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# This file is part of RTFDE, a RTF De-Encapsulator.
+# Copyright © 2020 seamus tuohy, <code@seamustuohy.com>
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.
+
+class UnsupportedRTFFormat(Exception):
+    """An exception which signifies that the file might be a totally valid RTF encapsulation. But, that it is unsupported at this time."""
+
+class NotEncapsulatedRtf(TypeError):
+    """An exception which signifies that the data being provided in not a valid RTF encapsulation.
+
+    You might have passed us a RTF file with no HTML/RTF encapsulation or it may simply be that the tool which did the encapsulation didn't follow the spec so the encapsulation is incorrect. We'll give more information in the error message, but we're not going to try to de-encapsulate it.
+    """
+
+class MalformedEncapsulatedRtf(TypeError):
+    """An exception which signifies that the data being provided in not a valid RTF encapsulation.
+
+    You might have passed us a RTF file with no HTML/RTF encapsulation or it may simply be that the tool which did the encapsulation didn't follow the spec so the encapsulation is incorrect. We'll give more information in the error message, but we're not going to try to de-encapsulate it.
+    """
+
+class MalformedRtf(TypeError):
+    """An exception which signifies that the data being provided in not a valid RTF.
+
+    You might have passed us a new variation of RTF lazily created by someone who only read the spec in passing; it's possibly some polyglot file; a RTF file that is intended to be malicious; or even somthing that only looks like RTF in passing. We'll give more information in the error message, but we're not going to try to de-encapsulate it.
+    """
+
+
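+
+All of these exceptions surface during de-encapsulation, so callers can branch on them; a brief sketch, assuming raw_rtf already holds the RTF body as bytes:
+
+    from RTFDE.deencapsulate import DeEncapsulator
+    from RTFDE.exceptions import NotEncapsulatedRtf, MalformedEncapsulatedRtf
+
+    rtf_obj = DeEncapsulator(raw_rtf)
+    try:
+        rtf_obj.deencapsulate()
+    except NotEncapsulatedRtf:
+        print("Plain RTF: nothing to de-encapsulate.")
+    except MalformedEncapsulatedRtf:
+        print("Looks encapsulated, but the headers are malformed.")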
+
+
+
+
+
+
+
+

Classes

+
+
+class MalformedEncapsulatedRtf +(*args, **kwargs) +
+
+

An exception which signifies that the data being provided is not a valid RTF encapsulation.

+

You might have passed us an RTF file with no HTML/RTF encapsulation or it may simply be that the tool which did the encapsulation didn't follow the spec, so the encapsulation is incorrect. We'll give more information in the error message, but we're not going to try to de-encapsulate it.

+
+ +Expand source code + +
class MalformedEncapsulatedRtf(TypeError):
+    """An exception which signifies that the data being provided in not a valid RTF encapsulation.
+
+    You might have passed us a RTF file with no HTML/RTF encapsulation or it may simply be that the tool which did the encapsulation didn't follow the spec so the encapsulation is incorrect. We'll give more information in the error message, but we're not going to try to de-encapsulate it.
+    """
+
+

Ancestors

+
    +
  • builtins.TypeError
  • +
  • builtins.Exception
  • +
  • builtins.BaseException
  • +
+
+
+class MalformedRtf +(*args, **kwargs) +
+
+

An exception which signifies that the data being provided is not a valid RTF.

+

You might have passed us a new variation of RTF lazily created by someone who only read the spec in passing; it's possibly some polyglot file; an RTF file that is intended to be malicious; or even something that only looks like RTF in passing. We'll give more information in the error message, but we're not going to try to de-encapsulate it.

+
+ +Expand source code + +
class MalformedRtf(TypeError):
+    """An exception which signifies that the data being provided in not a valid RTF.
+
+    You might have passed us a new variation of RTF lazily created by someone who only read the spec in passing; it's possibly some polyglot file; a RTF file that is intended to be malicious; or even somthing that only looks like RTF in passing. We'll give more information in the error message, but we're not going to try to de-encapsulate it.
+    """
+
+

Ancestors

+
    +
  • builtins.TypeError
  • +
  • builtins.Exception
  • +
  • builtins.BaseException
  • +
+
+
+class NotEncapsulatedRtf +(*args, **kwargs) +
+
+

An exception which signifies that the data being provided is not a valid RTF encapsulation.

+

You might have passed us an RTF file with no HTML/RTF encapsulation or it may simply be that the tool which did the encapsulation didn't follow the spec, so the encapsulation is incorrect. We'll give more information in the error message, but we're not going to try to de-encapsulate it.

+
+ +Expand source code + +
class NotEncapsulatedRtf(TypeError):
+    """An exception which signifies that the data being provided in not a valid RTF encapsulation.
+
+    You might have passed us a RTF file with no HTML/RTF encapsulation or it may simply be that the tool which did the encapsulation didn't follow the spec so the encapsulation is incorrect. We'll give more information in the error message, but we're not going to try to de-encapsulate it.
+    """
+
+

Ancestors

+
    +
  • builtins.TypeError
  • +
  • builtins.Exception
  • +
  • builtins.BaseException
  • +
+
+
+class UnsupportedRTFFormat +(*args, **kwargs) +
+
+

An exception which signifies that the file might be a totally valid RTF encapsulation. But, that it is unsupported at this time.

+
+ +Expand source code + +
class UnsupportedRTFFormat(Exception):
+    """An exception which signifies that the file might be a totally valid RTF encapsulation. But, that it is unsupported at this time."""
+
+

Ancestors

+
    +
  • builtins.Exception
  • +
  • builtins.BaseException
  • +
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/RTFDE/exceptions.md b/docs/RTFDE/exceptions.md new file mode 100644 index 0000000..7421842 --- /dev/null +++ b/docs/RTFDE/exceptions.md @@ -0,0 +1,46 @@ +Module RTFDE.exceptions +======================= + +Classes +------- + +`MalformedEncapsulatedRtf(*args, **kwargs)` +: An exception which signifies that the data being provided in not a valid RTF encapsulation. + + You might have passed us a RTF file with no HTML/RTF encapsulation or it may simply be that the tool which did the encapsulation didn't follow the spec so the encapsulation is incorrect. We'll give more information in the error message, but we're not going to try to de-encapsulate it. + + ### Ancestors (in MRO) + + * builtins.TypeError + * builtins.Exception + * builtins.BaseException + +`MalformedRtf(*args, **kwargs)` +: An exception which signifies that the data being provided in not a valid RTF. + + You might have passed us a new variation of RTF lazily created by someone who only read the spec in passing; it's possibly some polyglot file; a RTF file that is intended to be malicious; or even somthing that only looks like RTF in passing. We'll give more information in the error message, but we're not going to try to de-encapsulate it. + + ### Ancestors (in MRO) + + * builtins.TypeError + * builtins.Exception + * builtins.BaseException + +`NotEncapsulatedRtf(*args, **kwargs)` +: An exception which signifies that the data being provided in not a valid RTF encapsulation. + + You might have passed us a RTF file with no HTML/RTF encapsulation or it may simply be that the tool which did the encapsulation didn't follow the spec so the encapsulation is incorrect. We'll give more information in the error message, but we're not going to try to de-encapsulate it. + + ### Ancestors (in MRO) + + * builtins.TypeError + * builtins.Exception + * builtins.BaseException + +`UnsupportedRTFFormat(*args, **kwargs)` +: An exception which signifies that the file might be a totally valid RTF encapsulation. But, that it is unsupported at this time. + + ### Ancestors (in MRO) + + * builtins.Exception + * builtins.BaseException \ No newline at end of file diff --git a/docs/RTFDE/grammar.html b/docs/RTFDE/grammar.html new file mode 100644 index 0000000..2552307 --- /dev/null +++ b/docs/RTFDE/grammar.html @@ -0,0 +1,598 @@ + + + + + + +RTFDE.grammar API documentation + + + + + + + + + + + +
+
+
+

Module RTFDE.grammar

+
+
+
+ +Expand source code + +
#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# This file is part of RTFDE, a RTF De-Encapsulator.
+# Copyright © 2020 seamus tuohy, <code@seamustuohy.com>
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.
+
+
+# TODO: Remove
+HTMLRTF_GRAMMAR = """
+start : obj+
+obj : HTMLRTF | OTHER | WS
+%import common.DIGIT
+%import common.LETTER
+%import common.WS
+_SPACE_DELETE : " "
+HTMLRTF : "\\htmlrtf" DIGIT~0..3
+OTHER : /((?!\\\\htmlrtf).)+/s
+"""
+
+
+GRAMMAR = {
+    "imports": r"""
+%import common.ESCAPED_STRING
+%import common.SIGNED_NUMBER
+%import common.DIGIT
+%import common.NEWLINE
+%import common.LETTER""",
+    "ignore": r"""%ignore NEWLINE""",
+    "_LBRACE": r'"{"',
+    "_RBRACE": r'"}"',
+    "BACKSLASH": r'"\\"',
+    "start": r"_LBRACE document _RBRACE",
+    "document": r"""(CONTROLWORD
+                    | control_symbol
+                    | string
+                    | group
+                    | HTMLRTF
+                    | hexarray
+                    | _SPACE_DELETE
+                    | SPACE_SAVE
+                    | UNICODE)+""",
+    "group": r"""_LBRACE (CONTROLWORD
+                        | control_symbol
+                        | string
+                        | htmltag_group
+                        | mhtmltag_group
+                        | group
+                        | SPACE_SAVE
+                        | _SPACE_DELETE
+                        | HTMLRTF
+                        | UNICODE
+                        | hexarray
+                        | NEWLINE )* _RBRACE""",
+    "htmltag_group": r"STAR_ESCAPE HTMLTAG ( string | group )*",
+    "HTMLTAG": r'"\\htmltag" DIGIT~0..3 _SPACE_DELETE?',
+    "MHTMLTAG": r'"\\mhtmltag" DIGIT~0..3 _SPACE_DELETE?',
+    "mhtmltag_group": r"STAR_ESCAPE MHTMLTAG ( string | group )*",
+    "NUMERICALDEL": r"SIGNED_NUMBER*",
+    "_SPACE_DELETE": r'" "',
+    "SPACE_SAVE": r'" "',
+    "DELIMITER": r"NUMERICALDEL _SPACE_DELETE?",
+    "ASCIILETTERSEQUENCE" : r"LETTER+",
+    "CONTROLWORD": "BACKSLASH ASCIILETTERSEQUENCE~1..32 DELIMITER",
+    "STAR_ESCAPE": r'BACKSLASH "*"',
+    "NONBREAKING_HYPHEN": r'BACKSLASH "_"',
+    "OPTIONAL_HYPHEN": r'BACKSLASH "-"',
+    "NONBREAKING_SPACE": r'BACKSLASH "~"',
+    "FORMULA_CHARACTER": r'BACKSLASH "|"',
+    "INDEX_SUBENTRY": r'BACKSLASH ":"',
+    "control_symbol": r"(STAR_ESCAPE | INDEX_SUBENTRY | FORMULA_CHARACTER | NONBREAKING_SPACE | OPTIONAL_HYPHEN | NONBREAKING_HYPHEN )",
+    "STRING": r'/.+?/',
+    "?string": r"STRING+ SPACE_SAVE?",
+    "_QUESTION_MARK": r'"?"',
+    "UNICODE" : r"""("\\u" /[-]*[0-9]+/)+""",
+    "HEXENCODED": """("\\'" /[0-9A-Fa-f]/~2)""",
+    "hexarray": "HEXENCODED+",
+    "HTMLRTF": r'"\\htmlrtf" DIGIT~0..3 _SPACE_DELETE?',
+   }
+
+
+# // == Priority Levels ==
+# This dictionary sets the priority level for each type of object in the lexer.
+# Higher numbers give a greater priority.
+# All must start with a period
+# EXPLICIT IS BETTER THEN RELYING ON DEFAULTS
+# // 0 = Raw String Matching // text should defer to everything else if conflicting
+# // 1 = Generic undefined object (i.e. group, CONTROL_WORD, CONTROL_SYMBOL, etc.)
+# // 2 = Specific instances of objects (i.e. HTMLTAG, MHTMLTAG, etc.)
+
+PRIORITY_LEVELS = {
+    "_LBRACE": ".2",
+    "_RBRACE": ".2",
+    "BACKSLASH" : ".1",
+    "start" : ".1",
+    "document": ".1",
+    "group": ".1",
+    "htmltag_group" : ".2",
+    "HTMLRTF" : ".2",
+    "HTMLTAG" : ".2",
+    "MHTMLTAG" : ".2",
+    "mhtmltag_group" : ".2",
+    "NUMERICALDEL" : ".1",
+    "_SPACE_DELETE" : ".1",
+    "SPACE_SAVE" : ".1",
+    "DELIMITER" : ".1",
+    "ASCIILETTERSEQUENCE" : ".1",
+    "CONTROLWORD": ".1",
+    "STAR_ESCAPE": ".1",
+    "NONBREAKING_HYPHEN": ".1",
+    "OPTIONAL_HYPHEN": ".1",
+    "NONBREAKING_SPACE": ".1",
+    "FORMULA_CHARACTER": ".1",
+    "INDEX_SUBENTRY": ".1",
+    "control_symbol": ".1",
+    "STRING" : ".0",
+    "_QUESTION_MARK": ".1",
+    "?string" : ".0",
+    "UNICODE" : ".2",
+    "HEXENCODED" : ".1",
+    "hexarray" : ".2",
+}
+
+def make_concise_grammar():
+    """Make a grammar string to use with the lexer.
+    """
+    grammar = r""""""
+    for key, priority in PRIORITY_LEVELS.items():
+        grammar += "{0}{1} : {2}\n".format(key,priority,GRAMMAR[key])
+    grammar += GRAMMAR['imports'] + "\n"
+    grammar += GRAMMAR['ignore'] + "\n"
+    return (grammar)
+
+
+
+def make_literate_grammar():
+    """Create a VERBOSE grammar string which can be used to understand the grammar.
+
+    This SHOULD be updated to include any changes to the grammar.
+    This is valuable when debugging and/or otherwise trying to understand the grammar.
+    """
+    grammar = r"""
+
+// ===== Precedence =========
+// Literals are matched according to the following precedence:
+// 1. Highest priority first (priority is specified as: TERM.number: …)
+// 2. Length of match (for regexps, the longest theoretical match is used)
+// 3. Length of literal / pattern definition
+// 4. Name
+//
+// == Priority Levels ==
+// WARNING: Priority Levels are not shown in this literate grammar.
+// NOTE: Look at PRIORITY_LEVELS for the prioritized levels used in production.
+// 0 = Raw String Matching // text should defer to everything else if conflicting
+// 1 = Generic undefined object (i.e. group, CONTROL_WORD, CONTROL_SYMBOL, etc.)
+// 2 = Specific instances of objects (i.e. HTMLTAG, MHTMLTAG, etc.)
+
+// ====== GRAMMAR OBJECT IMPORTS FROM LARK COMMONS ======
+// https://github.com/lark-parser/lark/blob/master/lark/grammars/common.lark
+{imports}
+
+
+// ====== Ignore Newlines ======
+// The real carriage returns are stored in \par or \line tags.
+{ignore}
+
+// ====== SIMPLE GRAMMAR OBJECTS USED THROUGHOUT ======
+// RTF is braces all the way down
+// We don't have to worry about escaped braces since we are pre-processing out escaped braces already
+_LBRACE: {_LBRACE}
+_RBRACE: {_RBRACE}
+
+// We don't have to worry about escaped backslashes since we are pre-processing out escaped backslashes already
+BACKSLASH: {BACKSLASH}
+
+// RTF control words are made up of ASCII alphabetical characters (a through z and A through Z)
+ASCIILETTERSEQUENCE: {ASCIILETTERSEQUENCE}
+
+// A space that should be deleted (See Delimiters below)
+_SPACE_DELETE: {_SPACE_DELETE}
+
+// But, we want to save spaces within strings. So, we have a special space for that.
+SPACE_SAVE : {SPACE_SAVE}
+
+// ====== UNMATCHED RAW TEXT ======
+// In order to split out everything that is simply plain text and not a special RTF object I've had to match all raw text characters individually. This allows us to store them all in their own rule branch (string) for transformation later on.
+
+STRING : {STRING}
+
+// We use the ? char to inline this rule to remove the branch and replace it with its children if it has one match. This will make it easier to parse later and remove unnecessary matches of it.
+?string: {?string}
+
+
+
+
+// ====== HIGH LEVEL DOCUMENT PARSING ======
+
+// The start object is the top level object in the tree
+// An RTF file has the following syntax: '{{' <header & document> '}}'
+start: {start}
+
+// Parse <header & document>
+document: {document}
+
+// A group consists of text and control words or control symbols enclosed in braces ({{}}).
+// The opening brace ({{ ) indicates the start of the group and the closing brace ( }}) indicates the end of the group.
+group: {group}
+
+
+// ====== CONTROL WORD(s) ======
+
+// A control word is defined by: \<ASCII Letter Sequence><Delimiter>
+// A control word’s name cannot be longer than 32 letters.
+CONTROLWORD: {CONTROLWORD}
+
+// === Delimiter ==
+
+DELIMITER: {DELIMITER}
+
+// The <Delimiter> can be one of the following:
+// 1. A numeric digit or an ASCII minus sign (-), which indicates that a numeric parameter is associated with the control word.
+NUMERICALDEL: {NUMERICALDEL}
+// 2. A space: When a space is used as the delimiter, it is discarded. This means that it’s not included in subsequent processing. So, we are using a discarded terminal (by putting an underscore in front of the name) to ensure it is tossed.
+// See: "_SPACE_DELETE" under SIMPLE GRAMMAR OBJECTS
+
+// 3. Any character other than a letter or a digit. In this case, the delimiting character terminates the control word and is not part of the control word. So, it's not included in the grammar here.
+
+
+// ====== CONTROL SYMBOLS(s) ======
+
+// A control symbol consists of a backslash followed by a single, nonalphabetic character.
+// For example, \~ represents a nonbreaking space.
+
+// The STAR_ESCAPE special construct means that if the program understands the \command, it takes this to mean {\command ...}, but if it doesn’t understand \command, the program ignores not just \command (as it would anyway) but everything in this group.
+STAR_ESCAPE: {STAR_ESCAPE}
+NONBREAKING_HYPHEN: {NONBREAKING_HYPHEN}
+OPTIONAL_HYPHEN: {OPTIONAL_HYPHEN}
+NONBREAKING_SPACE: {NONBREAKING_SPACE}
+FORMULA_CHARACTER: {FORMULA_CHARACTER}
+INDEX_SUBENTRY: {INDEX_SUBENTRY}
+
+// Control symbols take no delimiters.
+control_symbol: {control_symbol}
+
+
+
+
+
+// ====== SPECIAL CONTROL WORD(s) ======
+
+// ====== HEADER OBJECTS ======
+
+// The FROMHTML control word specifies that the RTF document contains encapsulated HTML text.
+// This control word MUST be \fromhtml1. Any other form, such as \fromhtml or \fromhtml0, will not be considered encapsulated
+// FROMTEXT: {FROMTEXT}
+
+//The FROMHTML control word specifies that the RTF document contains encapsulated HTML text.
+// This control word MUST be \fromhtml1. Any other form, such as \fromhtml or \fromhtml0, will not be considered encapsulated.
+//FROMHTML : {FROMHTML}
+
+
+// ====== SPECIFIC CONTROL WORD OBJECTS ======
+
+// HTMLRTF Toggle Control Word
+// The HTMLRTF control word identifies fragments of RTF that were not in the original HTML content
+// If the flag is "\htmlrtf" or "\htmlrtf1" then do not process anything else until you encounter "\htmlrtf0" which will toggle this off again.
+// A de-encapsulating RTF reader MUST support the HTMLRTF control word within nested groups. The state of the HTMLRTF control word MUST transfer when entering groups and be restored when exiting groups.
+// This means that you can only turn this off on its own level (turning it off in an object nested within it does nothing). And, if the object it's in ends then it doesn't transfer up the tree to objects that contain it. So, if you don't find a closing "\htmlrtf0" you can delete from the opening "\htmlrtf" all the way until the end of the current object, but not above.
+HTMLRTF : {HTMLRTF}
+
+// The HTMLTAG destination group encapsulates HTML fragments that cannot be directly represented in RTF
+htmltag_group: {htmltag_group}
+
+// The "DIGIT~0..3" in the following definition is the HTMLTagParameter from the spec.
+    // A space MUST be used to separate the CONTENT HTML fragment from the HTMLTagParameter HTML fragment if the text starts with a DIGIT, or if the HTMLTagParameter HTML fragment is omitted. As such, we throw away this space by using _SPACE_DELETE if we encounter one.
+HTMLTAG: {HTMLTAG}
+
+
+
+content : {content}
+
+// \*\mhtmltag[HTMLTagParameter] [CONTENT]
+// The values and format of the numeric parameter are identical to the numeric parameter in the HTMLTAG destination group.
+// This RTF control word SHOULD be skipped on de-encapsulation and SHOULD NOT be written when encapsulating.
+// TODO: https://datatracker.ietf.org/doc/html/draft-ietf-mhtml-cid-00#section-1
+// NOTE: mhtmltag groups contain the original URL, which has been replaced in the corresponding htmltag with the CID of an object. As such, they contain possibly useful URI data that, while not useful for the direct output, should be saved.
+MHTMLTAG : {MHTMLTAG}
+mhtmltag_group: {mhtmltag_group}
+
+
+// TODO: Check if really needed
+// Increased priority of escape chars to make unescaping easier
+// Multiple char acceptance is important here because if you just catch one escape at a time you mess up multi-byte values.
+_QUESTION_MARK: {_QUESTION_MARK}
+
+// TODO Define these objects
+
+// RTFESCAPE no longer used
+// RTFESCAPE : {RTFESCAPE}
+
+// UNICODE unicode chars
+UNICODE : {UNICODE}
+
+// Hex chars [HEXENCODED] are stored in an array [hexarray]
+// We often need to parse hex chars as a set so this is the easiest way
+HEXENCODED : {HEXENCODED}
+hexarray : {hexarray}
+
+    """.format(**GRAMMAR)
+    return grammar
+
+
+if __name__ == '__main__':
+    # print(make_literate_grammar())
+    print(make_concise_grammar())
+
+
+
+
+
+
+
+

Functions

+
+
+def make_concise_grammar() +
+
+

Make a grammar string to use with the lexer.

+
+ +Expand source code + +
def make_concise_grammar():
+    """Make a grammar string to use with the lexer.
+    """
+    grammar = r""""""
+    for key, priority in PRIORITY_LEVELS.items():
+        grammar += "{0}{1} : {2}\n".format(key,priority,GRAMMAR[key])
+    grammar += GRAMMAR['imports'] + "\n"
+    grammar += GRAMMAR['ignore'] + "\n"
+    return (grammar)
+
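+
+A quick sketch of the composed output; each line follows the "<name><priority> : <definition>" shape from the format call above:
+
+    from RTFDE.grammar import make_concise_grammar
+
+    grammar = make_concise_grammar()
+    print(grammar.splitlines()[0])  # _LBRACE.2 : "{"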
+
+
+def make_literate_grammar() +
+
+

Create a VERBOSE grammar string which can be used to understand the grammar.

+

This SHOULD be updated to include any changes to the grammar.
+This is valuable when debugging and/or otherwise trying to understand the grammar.

+
+ +Expand source code + +
def make_literate_grammar():
+    """Create a VERBOSE grammar string which can be used to understand the grammar.
+
+    This SHOULD be updated to include any changes to the grammar.
+    This is valuable when debugging and/or otherwise trying to understand the grammar.
+    """
+    grammar = r"""
+
+// ===== Precedence =========
+// Literals are matched according to the following precedence:
+// 1. Highest priority first (priority is specified as: TERM.number: …)
+// 2. Length of match (for regexps, the longest theoretical match is used)
+// 3. Length of literal / pattern definition
+// 4. Name
+//
+// == Priority Levels ==
+// WARNING: Priority Levels are not shown in this literate grammar.
+// NOTE: Look at PRIORITY_LEVELS for the prioritized levels used in production.
+// 0 = Raw String Matching // text should defer to everything else if conflicting
+// 1 = Generic undefined object (i.e. group, CONTROL_WORD, CONTROL_SYMBOL, etc.)
+// 2 = Specific instances of objects (i.e. HTMLTAG, MHTMLTAG, etc.)
+
+// ====== GRAMMAR OBJECT IMPORTS FROM LARK COMMONS ======
+// https://github.com/lark-parser/lark/blob/master/lark/grammars/common.lark
+{imports}
+
+
+// ====== Ignore Newlines ======
+// The real carriage returns are stored in \par or \line tags.
+{ignore}
+
+// ====== SIMPLE GRAMMAR OBJECTS USED THROUGHOUT ======
+// RTF is braces all the way down
+// We don't have to worry about escaped braces since we are pre-processing out escaped braces already
+_LBRACE: {_LBRACE}
+_RBRACE: {_RBRACE}
+
+// We don't have to worry about escaped backslashes since we are pre-processing out escaped backslashes already
+BACKSLASH: {BACKSLASH}
+
+// RTF control words are made up of ASCII alphabetical characters (a through z and A through Z)
+ASCIILETTERSEQUENCE: {ASCIILETTERSEQUENCE}
+
+// A space that should be deleted (See Delimiters below)
+_SPACE_DELETE: {_SPACE_DELETE}
+
+// But, we want to save spaces within strings. So, we have a special space for that.
+SPACE_SAVE : {SPACE_SAVE}
+
+// ====== UNMATCHED RAW TEXT ======
+// In order to split out everything that is simply plain text and not a special RTF object I've had to match all raw text characters individually. This allows us to store them all in their own rule branch (string) for transformation later on.
+
+STRING : {STRING}
+
+// We use the ? char to inline this rule to remove the branch and replace it with its children if it has one match. This will make it easier to parse later and remove unnecessary matches of it.
+?string: {?string}
+
+
+
+
+// ====== HIGH LEVEL DOCUMENT PARSING ======
+
+// The start object is the top level object in the tree
+// An RTF file has the following syntax: '{{' <header & document> '}}'
+start: {start}
+
+// Parse <header & document>
+document: {document}
+
+// A group consists of text and control words or control symbols enclosed in braces ({{}}).
+// The opening brace ({{ ) indicates the start of the group and the closing brace ( }}) indicates the end of the group.
+group: {group}
+
+
+// ====== CONTROL WORD(s) ======
+
+// A control word is defined by: \<ASCII Letter Sequence><Delimiter>
+// A control word’s name cannot be longer than 32 letters.
+CONTROLWORD: {CONTROLWORD}
+
+// === Delimiter ==
+
+DELIMITER: {DELIMITER}
+
+// The <Delimiter> can be one of the following:
+// 1. A numeric digit or an ASCII minus sign (-), which indicates that a numeric parameter is associated with the control word.
+NUMERICALDEL: {NUMERICALDEL}
+// 2. A space: When a space is used as the delimiter, it is discarded. This means that it’s not included in subsequent processing. So, we are using a discarded terminal (by putting an underscore in front of the name) to ensure it is tossed.
+// See: "_SPACE_DELETE" under SIMPLE GRAMMAR OBJECTS
+
+// 3. Any character other than a letter or a digit. In this case, the delimiting character terminates the control word and is not part of the control word. So, it's not included in the grammar here.
+
+
+// ====== CONTROL SYMBOLS(s) ======
+
+// A control symbol consists of a backslash followed by a single, nonalphabetic character.
+// For example, \~ represents a nonbreaking space.
+
+// The STAR_ESCAPE special construct means that if the program understands the \command, it takes this to mean {\command ...}, but if it doesn’t understand \command, the program ignores not just \command (as it would anyway) but everything in this group.
+STAR_ESCAPE: {STAR_ESCAPE}
+NONBREAKING_HYPHEN: {NONBREAKING_HYPHEN}
+OPTIONAL_HYPHEN: {OPTIONAL_HYPHEN}
+NONBREAKING_SPACE: {NONBREAKING_SPACE}
+FORMULA_CHARACTER: {FORMULA_CHARACTER}
+INDEX_SUBENTRY: {INDEX_SUBENTRY}
+
+// Control symbols take no delimiters.
+control_symbol: {control_symbol}
+
+
+
+
+
+// ====== SPECIAL CONTROL WORD(s) ======
+
+// ====== HEADER OBJECTS ======
+
+// The FROMHTML control word specifies that the RTF document contains encapsulated HTML text.
+// This control word MUST be \fromhtml1. Any other form, such as \fromhtml or \fromhtml0, will not be considered encapsulated
+// FROMTEXT: {FROMTEXT}
+
+//The FROMHTML control word specifies that the RTF document contains encapsulated HTML text.
+// This control word MUST be \fromhtml1. Any other form, such as \fromhtml or \fromhtml0, will not be considered encapsulated.
+//FROMHTML : {FROMHTML}
+
+
+// ====== SPECIFIC CONTROL WORD OBJECTS ======
+
+// HTMLRTF Toggle Control Word
+// The HTMLRTF control word identifies fragments of RTF that were not in the original HTML content
+// If the flag is "\htmlrtf" or "\htmlrtf1" then do not process anything else until you encounter "\htmlrtf0" which will toggle this off again.
+// A de-encapsulating RTF reader MUST support the HTMLRTF control word within nested groups. The state of the HTMLRTF control word MUST transfer when entering groups and be restored when exiting groups.
+// This means that you can only turn this off on its own level (turning it off in an object nested within it does nothing). And, if the object it's in ends then it doesn't transfer up the tree to objects that contain it. So, if you don't find a closing "\htmlrtf0" you can delete from the opening "\htmlrtf" all the way until the end of the current object, but not above.
+HTMLRTF : {HTMLRTF}
+
+// The HTMLTAG destination group encapsulates HTML fragments that cannot be directly represented in RTF
+htmltag_group: {htmltag_group}
+
+// The "DIGIT~0..3" in the following definition is the HTMLTagParameter from the spec.
+    // A space MUST be used to separate the CONTENT HTML fragment from the HTMLTagParameter HTML fragment if the text starts with a DIGIT, or if the HTMLTagParameter HTML fragment is omitted. As such, we throw away this space by using _SPACE_DELETE if we encounter one.
+HTMLTAG: {HTMLTAG}
+
+
+
+content : {content}
+
+// \*\mhtmltag[HTMLTagParameter] [CONTENT]
+// The values and format of the numeric parameter are identical to the numeric parameter in the HTMLTAG destination group.
+// This RTF control word SHOULD be skipped on de-encapsulation and SHOULD NOT be written when encapsulating.
+// TODO: https://datatracker.ietf.org/doc/html/draft-ietf-mhtml-cid-00#section-1
+// NOTE: mhtmltag groups contain the original URL, which has been replaced in the corresponding htmltag with the CID of an object. As such, they contain possibly useful URI data that, while not useful for the direct output, should be saved.
+MHTMLTAG : {MHTMLTAG}
+mhtmltag_group: {mhtmltag_group}
+
+
+// TODO: Check if really needed
+// Increased priority of escape chars to make unescaping easier
+// Multiple char acceptance is important here because if you just catch one escape at a time you mess up multi-byte values.
+_QUESTION_MARK: {_QUESTION_MARK}
+
+// TODO Define these objects
+
+// RTFESCAPE no longer used
+// RTFESCAPE : {RTFESCAPE}
+
+// UNICODE unicode chars
+UNICODE : {UNICODE}
+
+// Hex chars [HEXENCODED] are stored in an array [hexarray]
+// We often need to parse hex chars as a set so this is the easiest way
+HEXENCODED : {HEXENCODED}
+hexarray : {hexarray}
+
+    """.format(**GRAMMAR)
+    return grammar
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/RTFDE/grammar.md b/docs/RTFDE/grammar.md new file mode 100644 index 0000000..079cb2a --- /dev/null +++ b/docs/RTFDE/grammar.md @@ -0,0 +1,16 @@ +Module RTFDE.grammar +==================== + +Functions +--------- + + +`make_concise_grammar()` +: Make a grammar string to use with the lexer. + + +`make_literate_grammar()` +: Create a VERBOSE grammar string which can be used to understand the grammar. + + This SHOULD be updated to include any changes to the grammar. + This is valuable when debugging and/or otherwise trying to understand the grammar. \ No newline at end of file diff --git a/docs/RTFDE/index.html b/docs/RTFDE/index.html new file mode 100644 index 0000000..0991f2d --- --- /dev/null +++ b/docs/RTFDE/index.html @@ -0,0 +1,127 @@ + + + + + + +RTFDE API documentation + + + + + + + + + + + +
+
+
+

Package RTFDE

+
+
+

RTFDE: A python3 library for extracting HTML content from RTF encapsulated HTML.

+

https://github.com/seamustuohy/RTFDE

+
+ +Expand source code + +
#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Date Format: YYYY-MM-DD
+#
+# This file is part of RTFDE, a RTF De-Encapsulator.
+# Copyright © 2020 seamus tuohy, <code@seamustuohy.com>
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.
+
+"""
+RTFDE: A python3 library for extracting HTML content from RTF encapsulated HTML.
+
+https://github.com/seamustuohy/RTFDE
+"""
+
+__author__ = 'seamus tuohy'
+__date__ = '2023-06-18'
+__version__ = '0.1.0'
+
+import logging
+from logging import NullHandler
+
+logging.getLogger(__name__).addHandler(NullHandler())
+logging.getLogger(__name__ + ".tree_logger").addHandler(NullHandler())
+
+
+
+from RTFDE.deencapsulate import DeEncapsulator
+
+
+
+

Sub-modules

+
+
RTFDE.deencapsulate
+
+
+
+
RTFDE.exceptions
+
+
+
+
RTFDE.grammar
+
+
+
+
RTFDE.text_extraction
+
+
+
+
RTFDE.transformers
+
+
+
+
RTFDE.utils
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/RTFDE/index.md b/docs/RTFDE/index.md new file mode 100644 index 0000000..c655016 --- /dev/null +++ b/docs/RTFDE/index.md @@ -0,0 +1,14 @@ +Module RTFDE +============ +RTFDE: A python3 library for extracting HTML content from RTF encapsulated HTML. + +https://github.com/seamustuohy/RTFDE + +Sub-modules +----------- +* RTFDE.deencapsulate +* RTFDE.exceptions +* RTFDE.grammar +* RTFDE.text_extraction +* RTFDE.transformers +* RTFDE.utils \ No newline at end of file diff --git a/docs/RTFDE/text_extraction.html b/docs/RTFDE/text_extraction.html new file mode 100644 index 0000000..f3feeb6 --- /dev/null +++ b/docs/RTFDE/text_extraction.html @@ -0,0 +1,2154 @@ + + + + + + +RTFDE.text_extraction API documentation + + + + + + + + + + + +
+
+
+

Module RTFDE.text_extraction

+
+
+
+ +Expand source code + +
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# This file is part of RTFDE, a RTF De-Encapsulator.
+# Copyright © 2022 seamus tuohy, <code@seamustuohy.com>
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.
+
+import re
+from collections import namedtuple
+from typing import Union, Any, List, Tuple, Dict
+
+from oletools.common import codepages
+
+from lark.lexer import Token
+from lark.tree import Tree
+
+from RTFDE.exceptions import MalformedRtf
+from RTFDE.utils import is_codeword_with_numeric_arg
+from RTFDE.utils import flatten_tree_to_string_array
+from RTFDE.utils import log_text_extraction
+
+import logging
+log = logging.getLogger("RTFDE")
+
+fontdef = namedtuple("fontdef", ["fnum", "codepage", "codec", "fontdef_tree"])
+
+
+def get_font_table(tree: Tree) -> Tree:
+    """Extract the font table group from the first 20 tokens of a .rtf document.
+
+Args:
+    tree (Tree): A .rtf document object parsed into a Tree object
+
+Raises:
+    ValueError: If no group with a `\\fonttbl` token as its first controlword is found.
+
+Returns:
+    The font table group as a Tree object.
+    """
+    for item in tree.children[:20]:
+        if isinstance(item, Tree):
+            try:
+                ctrl_value = item.children[1]
+            except IndexError as _e:
+                continue
+            if isinstance(ctrl_value, Token):
+                table_type = ctrl_value.value.strip()
+                if table_type == b"\\fonttbl":
+                    return item
+    raise ValueError("No font table found in tree")
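+# Illustrative: for a document containing {\fonttbl{\f0\fswiss\fcharset0 Arial;}},
+# this returns the Tree for that fonttbl group.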
+
+
+def is_font_number(token: Token) -> bool:
+    """Checks if an object is a "font number".
+
+Returns:
+    True if an object is a "font number" controlword `\\fN`. False if not.
+
+"""
+    try:
+        if is_codeword_with_numeric_arg(token, b'\\f'):
+            return True
+    except AttributeError: # pragma: no cover
+        return False
+    return False
+
+def get_codepage_num_from_fcharset(fcharsetN: int) -> Union[int,None]:
+    """Return the codepage to use with a specific fcharsetN.
+
+Args:
+    fcharsetN (int): The numeric argument N for a \\fcharsetN control word.
+
+Returns:
+    (int OR None) Returns the int for a codepage if known. Returns None for unknown charsets or charsets with no corresponding codepage (such as OEM or DEFAULT.)
+
+    """
+    # Charset table retrieved on 2022-08-19
+    # https://web.archive.org/web/20220819215334/https://docs.microsoft.com/en-us/previous-versions/cc194829%28v=msdn.10%29?redirectedfrom=MSDN
+    charsets: dict[int,dict[str,Any]] = {
+        0:{"name":"ANSI_CHARSET","hex":"0x00","decimal":0,"id":1252},
+        1:{"name":"DEFAULT_CHARSET","hex":"0x01","decimal":1,"id":None},
+        2:{"name":"SYMBOL_CHARSET","hex":"0x02","decimal":2,"id":None},
+        128:{"name":"SHIFTJIS_CHARSET","hex":"0x80","decimal":128,"id":932},
+        129:{"name":"HANGUL_CHARSET","hex":"0x81","decimal":129,"id":949},
+        134:{"name":"GB2312_CHARSET","hex":"0x86","decimal":134,"id":936},
+        136:{"name":"CHINESEBIG5_CHARSET","hex":"0x88","decimal":136,"id":950},
+        161:{"name":"GREEK_CHARSET","hex":"0xA1","decimal":161,"id":1253},
+        162:{"name":"TURKISH_CHARSET","hex":"0xA2","decimal":162,"id":1254},
+        177:{"name":"HEBREW_CHARSET","hex":"0xB1","decimal":177,"id":1255},
+        178:{"name":"ARABIC_CHARSET","hex":"0xB2","decimal":178,"id":1256},
+        186:{"name":"BALTIC_CHARSET","hex":"0xBA","decimal":186,"id":1257},
+        204:{"name":"RUSSIAN_CHARSET","hex":"0xCC","decimal":204,"id":1251},
+        222:{"name":"THAI_CHARSET","hex":"0xDE","decimal":222,"id":874},
+        238:{"name":"EE_CHARSET","hex":"0xEE","decimal":238,"id":1250},
+        255:{"name":"OEM_CHARSET","hex":"0xFF","decimal":255,"id":None},
+}
+    log_text_extraction(f"Getting charset for {fcharsetN}")
+    charset = charsets.get(fcharsetN, None)
+    if charset is not None:
+        charset_id = charset.get('id', None)
+        return charset_id
+    return None
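+# Illustrative: get_codepage_num_from_fcharset(204) returns 1251 (Cyrillic), while
+# get_codepage_num_from_fcharset(2) returns None (SYMBOL has no codepage).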
+
+
+def get_default_font(tree: Tree) -> Union[str,None]:
+    """Extract the font number controlword default font if it exists.
+
+If an RTF file uses a default font, the default font number is specified with the \\deffN control word, which must precede the font-table group.
+
+Args:
+    tree (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree object.
+
+Returns:
+    The default font control number from the first `\\deffN` if it exists. None if not found.
+"""
+    deff_gen = tree.scan_values(
+        lambda v: is_codeword_with_numeric_arg(v, b'\\deff')
+    )
+    deff_options = list(deff_gen)
+    try:
+        # We just want the first \\deffN. It shouldn't be set multiple times.
+        deff = deff_options[0]
+        deff_num = deff.value[5:]
+        return b'\\f' + deff_num
+    except IndexError:
+        return None
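+# Illustrative: for a header containing b"\\deff0" this returns b"\\f0".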
+
+def parse_font_tree(font_tree: Tree) -> dict:
+    """Create a font tree dictionary with appropriate codeces to decode text.
+
+Args:
+    font_tree (Tree): The .rtf font table object decoded as a tree.
+
+Returns:
+    A dictionary which maps font numbers to appropriate python codecs needed to decode text.
+"""
+    parsed_font_tree = {}
+    for tree in font_tree.children:
+        if isinstance(tree, Tree):
+            fnum = None
+            fcharset = None
+            cpg_num = None
+            for tok in tree.children:
+                if is_codeword_with_numeric_arg(tok, b'\\f'):
+                    fnum = tok.value
+                elif is_codeword_with_numeric_arg(tok, b'\\fcharset'):
+                    fchar_num = int(tok.value[9:])
+                    fcharset = get_codepage_num_from_fcharset(fchar_num)
+                elif is_codeword_with_numeric_arg(tok, b'\\cpg'):
+                    cpg_num = int(tok.value[4:])
+            if fnum is not None:
+                # get the codepage
+                codepage_num = None
+
+                if fcharset is not None:
+                    try:
+                        codepage_num = check_codepage_num(fcharset)
+                    except ValueError: # pragma: no cover
+                        codepage_num = None
+                # if both \\fcharset and \\cpg appear in the font table, \\cpg is ignored.
+                if ((codepage_num is None) and (cpg_num is not None)):
+                    try:
+                        codepage_num = check_codepage_num(cpg_num)
+                    except ValueError: # pragma: no cover
+                        codepage_num = None
+                # Get the appropriate codec
+                if codepage_num is not None:
+                    codec = get_python_codec(codepage_num)
+                else:
+                    codec = None
+                # Only add if there is a font definition
+                tree_str =  b"".join(list(flatten_tree_to_string_array(tree)))
+                parsed_font_tree[fnum] = fontdef(fnum, codepage_num, codec, tree_str)
+    return parsed_font_tree
+
+
+def get_python_codec(codepage_num: int) -> str:
+    """Returns the python codec needed to decode bytes to unicode.
+
+Args:
+    codepage_num (int): A codepage number.
+
+Returns:
+    The name of the codec in the Python codec registry. Used as the name for encoding/decoding.
+"""
+    text_codec = codepages.codepage2codec(codepage_num)
+    log.debug('Found python codec corresponding to code page {0}: {1}'.format(codepage_num, text_codec))
+    return text_codec
+
+def check_codepage_num(codepage_num: int) -> int:
+    """Provide the codepage number back to you if it is valid.
+
+Args:
+    codepage_num (int): A possible codepage number.
+
+Returns:
+    The codepage number IF it is a valid codepage number
+
+Raises:
+    ValueError: The codepage_num provided isn't a valid codepage number.
+
+"""
+    # This keyword should be emitted in the RTF header section right after the \ansi, \mac, \pc or \pca keyword. But, various document tags like \fbids often are thrown all over the header so we have to check the first group of headers for it.
+    # Code page names from https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers
+    # Retrieved on 2020-12-18
+    allowed_codepage_nums = set([37, 437, 500, 708, 709, 710, 720, 737, 775, 850, 852, 855, 857, 858, 860, 861, 862, 863, 864, 865, 866, 869, 870, 874, 875, 932, 936, 949, 950, 1026, 1047, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1200, 1201, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1361, 10000, 10001, 10002, 10003, 10004, 10005, 10006, 10007, 10008, 10010, 10017, 10021, 10029, 10079, 10081, 10082, 12000, 12001, 20000, 20001, 20002, 20003, 20004, 20005, 20105, 20106, 20107, 20108, 20127, 20261, 20269, 20273, 20277, 20278, 20280, 20284, 20285, 20290, 20297, 20420, 20423, 20424, 20833, 20838, 20866, 20871, 20880, 20905, 20924, 20932, 20936, 20949, 21025, 21027, 21866, 28591, 28592, 28593, 28594, 28595, 28596, 28597, 28598, 28599, 28603, 28605, 29001, 38598, 50220, 50221, 50222, 50225, 50227, 50229, 50930, 50931, 50933, 50935, 50936, 50937, 50939, 51932, 51936, 51949, 51950, 52936, 54936, 57002, 57003, 57004, 57005, 57006, 57007, 57008, 57009, 57010, 57011, 65000, 65001])
+    if codepage_num in allowed_codepage_nums:
+        return codepage_num
+    # Note: If support for a specific codepage ever becomes an issue we can look at adding support using the actual code pages.
+    # Conversion tables for codepages can be retrieved from here: https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/
+    raise ValueError(f"Unsupported unicode codepage number `{codepage_num}` found in the header")
+
+
+def validate_ansi_cpg(header: str) -> None:
+    """Check an '\\ansicpgNNNN' string to see if the number NNNN is an actual codepage.
+
+Args:
+    header (str): The value from the lark `\\ansicpg` CONTROLWORD Token.
+
+Raises:
+    MalformedRtf: If the value passed is not a valid ansi codepage.
+"""
+    try:
+        possible_cpg_num = int(header.strip()[8:])
+        check_codepage_num(possible_cpg_num)
+    except ValueError as _e:
+        raise MalformedRtf(f"Unsupported unicode codepage number `{header}` found in the header") from _e
+
+
+# UNICODE CHARS
+def unicode_escape_to_chr(item: bytes) -> str:
+    """Convert unicode char from it's decimal to its unicode character representation. From "\\u[-]NNNNN" to the string representing the character whose Unicode code point that decimal represents.
+
+Args:
+    item (bytes): An RTF Escape in the format \\u[-]NNNNN.
+
+Returns:
+    The unicode character representation of the identified character
+
+Raises:
+    ValueError: The escaped unicode character is not valid.
+"""
+    try:
+        nnnn = int(item.removeprefix(b'\\u')) # raises ValueError if not int.
+    except ValueError as _e:
+        raise ValueError(f"`{item}` is not a valid escaped unicode character.") from _e
+    if nnnn < 0: # § -NNNNN is a negative integer expressed in decimal digits
+        ncr = 65536 + nnnn
+    else: # § NNNNN is a positive integer expressed in decimal digits
+        ncr = nnnn
+    # § HHHH is the hexadecimal equivalent of NNNNN or -NNNNN
+    return chr(ncr)
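+# Illustrative: b"\\u8364" -> chr(8364) == '€', and b"\\u-10179" -> chr(65536 - 10179)
+# == chr(0xD83D), a high-surrogate code point (see the surrogate helpers below).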
+
+def is_hex_encoded(item: Token) -> bool:
+    """Identify if a token contains a HEXENCODED token.
+Args:
+    item (token): A token to check if it is HEXENCODED.
+
+Return:
+    True if HEXENCODED. False if not.
+    """
+    if isinstance(item, Token):
+        if item.type == "HEXENCODED":
+            return True
+    return False
+
+def is_valid_ANSI_representation_char(item: Token) -> bool:
+    """Is token contain a valid ANSI representation string for a Unicode char.
+
+Args:
+    item (token): A token to check if it is a valid ANSI representation.
+
+Return:
+    True if token is an ansi representation of a unicode char. False if not.
+"""
+    if isinstance(item, Token):
+        # print(f"found TOKEN posssible ansi {repr(item)}")
+        if is_hex_encoded(item):
+            # print(f"found hex posssible ansi {repr(item)}")
+            return True
+        if item.type == 'STRING':
+            # print(f"found STRING posssible ansi {repr(item)}")
+            if not item.value.isspace(): # whitespace doesn't count.
+                # print(f"found posssible ansi {repr(item)}")
+                return True
+            # else:
+            #     print(f"found SPACE posssible ansi {repr(item)}")
+    # print(f"found NON TOKEN posssible ansi {repr(item)}")
+    return False
+
+def is_unicode_encoded(item: Token) -> bool:
+    """Is token contain a unicode char.
+
+Args:
+    item (token): A token to check if contains a unicode char.
+
+Return:
+    True if token contains a unicode char. False if not.
+"""
+    if isinstance(item, Token):
+        if item.type == "UNICODE":
+            return True
+    return False
+
+def includes_unicode_chars(children: List[Token]) -> bool:
+    """Does a list include Tokens which contain unicode characters. Not recursive.
+
+Args:
+    children (list): A Tree.children list to check to see if it includes unicode characters.
+
+Returns:
+    True if list includes tokens which contain unicode chars. False if not.
+"""
+    for child in children:
+        if is_unicode_encoded(child):
+            return True
+    return False
+
+
+def remove_unicode_replacements(children: List[Token],
+                                return_ascii_map: bool = True,
+                                byte_count: int = 1) -> Union[
+                                    Tuple[List[Token], Dict[Token,List[Token]]],
+                                    List[Token]]:
+    """Remove all unicode replacement characters from a list of Tokens.
+
+Args:
+    children (list): A Tree.children list to remove unicode replacement characters from.
+    return_ascii_map (bool): On True, have this function return a map of the ASCII tokens that were removed.
+    byte_count (int): The number of bytes corresponding to a given \\uN Unicode character.  A default of 1 should be assumed if no \\uc keyword has been seen in the current or outer scopes.
+
+Returns:
+    new_children (list): The list of Tokens with all unicode replacement characters removed.
+    ascii_map (dict): All the Tokens which were removed from the provided children list, keyed by the unicode Token they are the ASCII fallback for.
+
+"""
+    if byte_count is None:
+        # Default to 1 if no \uc byte count was provided.
+        byte_count = 1
+    ascii_map: Dict[Token,List[Token]]  = {}
+    new_children = []
+    removal_map: List[Token] = []
+    log_text_extraction(f"Removing unicode replacements on {repr(children)}")
+    for child in children:
+        if len(removal_map) > 0:
+            if isinstance(child, Token):
+                # Delete all spaces between a unicode char and the last ANSI representation
+                # print(f"FOUND SPACE STRING with RM: {removal_map}")
+                if child.value.isspace():
+                    ascii_map.setdefault(removal_map[0], []).append(child)
+                    continue
+            if is_valid_ANSI_representation_char(child):
+                # Found an ansi representation removing unicode char from removal map.
+                # print(f"FOUND ASCII STRING {child} to RM with RM: {removal_map}")
+                ascii_map.setdefault(removal_map.pop(), []).append(child)
+                continue
+            elif isinstance(child, Tree) and (
+                    (child.data == "string") or (child.data == "hexarray")):
+                # print(f"FOUND ASCII STRING {child} with RM: {removal_map}")
+                ansi_children = child.children
+                new_ansi_children = []
+                for aci,ac in enumerate(ansi_children):
+                    # print(f"AC CHILD {repr(ac)}")
+                    if is_valid_ANSI_representation_char(ac):
+                        # print(f"AC CHILD VALID {repr(ac)}")
+                        if len(removal_map) > 0:
+                            # print(f"AC CHILD MAP >0 {repr(ac)}")
+                            # print(f"Popping removal for {repr(ac)}")
+                            ascii_map.setdefault(removal_map.pop(), []).append(ac)
+                        else:
+                            # print(f"AC CHILD MAP < 0 {repr(ac)}")
+                            new_ansi_children.append(ac)
+                    else:
+                        # print(f"AC CHILD NOT VALID {repr(ac)}")
+                        new_ansi_children.append(ac)
+                # print(f"NEW Children = {new_ansi_children}")
+                if new_ansi_children == []:
+                    from RTFDE.utils import make_token_replacement
+                    # from RTFDE.utils import embed
+                    # embed()
+                    child = make_token_replacement("STRING", b"", child)
+                else:
+                    child.children = new_ansi_children
+                # print(f"NEW Tree = {child}")
+            # else:
+                # print(f"FOUND ASCII STRING {child} with RM: {removal_map}")
+                # print(f"{repr(child)} not a valid ANSI representation? with RM: {removal_map}")
+        # Modify char byte count if we encounter it.
+        if is_unicode_char_byte_count(child):
+            byte_count = get_unicode_char_byte_count(child)
+            # print(f"Changing byte count because {child} to {byte_count}")
+        if is_unicode_encoded(child):
+            # print(f"Found unicode {child}")
+            for j in range(byte_count):
+                # Add the unicode key to the removal map once per byte
+                # This ensures we remove the right number of ANSI representation chars
+                removal_map.append(child)
+        new_children.append(child)
+    if return_ascii_map is True:
+        return new_children, ascii_map
+    return new_children
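+# Illustrative: for children like [<\u21704 UNICODE token>, <STRING b'?'>] with a
+# byte_count of 1, the ASCII fallback b'?' is removed from the output and recorded
+# in ascii_map under the \u21704 token.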
+
+
+# UNICODE SURROGATE CHARACTERS
+def is_surrogate_high_char(item: bytes) -> bool:
+    """Check's if chr is a is in the high-surrogate code point rage. "High-surrogate code point: A Unicode code point in the range U+D800 to U+DBFF." High-surrogate also sometimes known as the leading surrogate.
+
+        item (bytes): A bytes representation of a string representing a unicode character. "\\u-10179"
+    """
+    if item.startswith(b"\\u"):
+        item = item[2:]
+    if 0xD800 <= ord(chr(65536+int(item))) <= 0xDBFF:
+        return True
+    # In case unicode is NOT using the 16 bit signed integer
+    elif 0xD800 <= int(item) <= 0xDBFF:
+        return True
+    return False
+
+def is_surrogate_low_char(item: bytes) -> bool:
+    """Check's if chr is a is in the low-surrogate code point rage. "Low-surrogate code point: A Unicode code point in the range U+DC00 to U+DFFF."  Low-surrogate also sometimes known as following surrogates.
+
+        item (bytes): A bytes representation of a string representing a unicode character.
+    """
+    if item.startswith(b"\\u"):
+        item = item[2:]
+    if 0xDC00 <= ord(chr(65536+int(item))) <= 0xDFFF:
+        return True
+    # In case unicode is NOT using the 16 bit signed integer
+    elif 0xDC00 <= int(item) <= 0xDFFF:
+        return True
+    return False
+
+def is_surrogate_16bit(item: bytes, cp_range) -> bool:
+    """Checks if a unicode char is 16 bit signed integer or the raw unicode char. This should first check if it is a surrogate code using the is_surrogate_XXXX_char functions.
+
+Args:
+    item (bytes): A bytes representation of a string representing a unicode character.
+    cp_range (str): ['low' OR 'high'] The code point range (low-surrogate or high-surrogate).
+    """
+    if cp_range == 'low':
+        if 0xDC00 <= ord(chr(65536+int(item))) <= 0xDFFF:
+            return True
+    elif cp_range == 'high':
+        if 0xD800 <= ord(chr(65536+int(item))) <= 0xDBFF:
+            return True
+    else:
+        raise ValueError("cp_range must be either 'low' or 'high'")
+    return False
+
+
+def is_surrogate_pair(first: bytes, second: bytes) -> bool:
+    """Check if a pair of unicode characters are a surrogate pair. Must be passed in the correct order.
+
+Args:
+    first (bytes): A bytes representation of a string representing the high-order byte in a surrogate char.
+    second (bytes): A bytes representation of a string representing the low-order byte in a surrogate char.
+    """
+    if is_surrogate_high_char(first):
+        if is_surrogate_low_char(second):
+            return True
+        else:
+            log.info("RTFDE encountered a standalone high-surrogate point without a corresponding low-surrogate. Standalone surrogate code points have either a high surrogate without an adjacent low surrogate, or vice versa. These code points are invalid and are not supported. Their behavior is undefined. Codepoints encountered: {0}, {1}".format(first, second))
+    return False
+
+def decode_surrogate_pair(high: bytes, low: bytes, encoding: str ='utf-16-le') -> bytes:
+    """ Convert a pair of surrogate chars into the corresponding utf-16 encoded text string they should represent.
+
+Args:
+        high (bytes): the high-surrogate code point
+        low (bytes): the low-surrogate code point
+        encoding (str): The encoding to apply to the final value. Defaults to 'utf-16-le' because:  Microsoft uses UTF-16, little endian byte order. ( https://learn.microsoft.com/en-us/windows/win32/intl/using-byte-order-marks ) The Msg format is a Microsoft standard. Therefore, man is mortal.
+    """
+    # The equation for turning surrogate pairs into a unicode scalar value which can
+    # be used with utf-16 can ONLY be found in the Unicode 3.0.0 standard.
+    # Unicode scalar value means the same thing as "code position" or "code point"
+    # https://www.unicode.org/versions/Unicode3.0.0/
+    # section 3.7 https://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#page=9
+    if high.startswith(b"\\u"):
+        high = high[2:]
+    if low.startswith(b"\\u"):
+        low = low[2:]
+    if is_surrogate_16bit(high, "high"):
+        char_high = chr(65536+int(high))
+    else:
+        char_high = chr(int(high))
+    if is_surrogate_16bit(low, "low"):
+        char_low = chr(65536+int(low))
+    else:
+        char_low = chr(int(low))
+    unicode_scalar_value = ((ord(char_high) - 0xD800) * 0x400) + (ord(char_low) - 0xDC00) + 0x10000
+    unicode_bytes = chr(unicode_scalar_value).encode(encoding)
+    return unicode_bytes.decode(encoding).encode()
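+# Worked example (illustrative): for the pair b"\\u-10179" and b"\\u-8704",
+# high = chr(65536-10179) = chr(0xD83D) and low = chr(65536-8704) = chr(0xDE00), so
+# ((0xD83D-0xD800)*0x400) + (0xDE00-0xDC00) + 0x10000 = 0x1F600 (U+1F600).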
+
+def merge_surrogate_chars(children,
+                          ascii_map,
+                          use_ASCII_alternatives_on_unicode_decode_failure = False):
+    """
+
+
+Raises:
+    ValueError: A standalone high-surrogate was found. High surrogate followed by an illegal low-surrogate character.
+    """
+    surrogate_start = None
+    surrogate_high = None
+    for i,c in enumerate(children):
+        if isinstance(c, Tree):
+            continue
+        if is_unicode_encoded(c):
+            if is_surrogate_high_char(c.value):
+                surrogate_start = i
+                surrogate_high = c
+            elif surrogate_start is not None:
+                if is_surrogate_low_char(c.value):
+                    surrogate_low = c
+                    try:
+                        surrogate_value = decode_surrogate_pair(surrogate_high.value,
+                                                                surrogate_low.value)
+                        # Convert into STRING token
+                        surrogate_tok = Token('STRING',
+                                              surrogate_value,
+                                              start_pos=surrogate_high.start_pos,
+                                              end_pos=surrogate_low.end_pos,
+                                              line=surrogate_high.line,
+                                              end_line=surrogate_low.end_line,
+                                              column=surrogate_high.column,
+                                              end_column=surrogate_low.end_column)
+                        children[surrogate_start] = surrogate_tok
+                        blank_tok = Token('STRING',
+                                          b"",
+                                          start_pos=surrogate_high.start_pos+1,
+                                          end_pos=surrogate_low.end_pos+1,
+                                          line=surrogate_high.line,
+                                          end_line=surrogate_low.end_line,
+                                          column=surrogate_high.column,
+                                          end_column=surrogate_low.end_column)
+                        children[i] = blank_tok
+                        surrogate_start = None
+                        surrogate_high = None
+                    except UnicodeDecodeError as _e:
+                        if use_ASCII_alternatives_on_unicode_decode_failure is True:
+                            children[surrogate_start] = b"".join([i.value for i in ascii_map[surrogate_high]])
+                            children[i] = b"".join([i.value for i in ascii_map[surrogate_low]])
+                        else:
+                            raise _e
+                else:
+                    log.info("RTFDE encountered a standalone high-surrogate point without a corresponding low-surrogate. Standalone surrogate code points have either a high surrogate without an adjacent low surrogate, or vice versa. These code points are invalid and are not supported. Their behavior is undefined. Codepoints encountered: {0}, {1}".format(surrogate_high, surrogate_low))
+                    if use_ASCII_alternatives_on_unicode_decode_failure is True:
+                        children[surrogate_start] = b"".join([i.value for i in ascii_map[surrogate_high]])
+                    else:
+                        raise ValueError("Standalone high-surrogate found. High surrogate followed by a illegal low-surrogate character.")
+    return children
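+# Illustrative: adjacent b"\\u-10179" and b"\\u-8704" UNICODE tokens merge into a
+# single STRING token containing the UTF-8 bytes for U+1F600, with the second
+# token replaced by an empty STRING token so list positions are preserved.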
+
+
+
+def is_unicode_char_byte_count(item: Token) -> bool:
+    if isinstance(item, Token):
+        if item.type == "CONTROLWORD":
+            if item.value.startswith(b'\\uc'):
+                return True
+    return False
+
+def get_unicode_char_byte_count(item: Token) -> int:
+    item = item.value.decode()
+    cur_uc = int(item[3:])
+    return cur_uc
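+# Illustrative: a CONTROLWORD token with value b"\\uc2" yields 2, meaning two
+# fallback bytes follow each \uN character.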
+
+
+# Hex Encoded Chars
+def has_hexarray(children: List[Union[Token, Tree]]) -> bool:
+    """Checks if an tree's children includes a hexarray tree.
+
+    children (array): the children object from a tree.
+    """
+    for item in children:
+        if is_hexarray(item):
+            return True
+    return False
+
+def is_hexarray(item):
+    """Checks if an item is a hexarray tree.
+
+    item (Tree or Token): an item to check to see if it's a hexarray
+    """
+    if isinstance(item, Tree):
+        if item.data.value == 'hexarray':
+            return True
+    return False
+
+def get_bytes_from_hex_encoded(item):
+    """Convert hex encoded string to bytes.
+
+    item (str): a hex encoded string in format \\'XX
+    """
+    hexstring = item.replace(b"\\'", b"")
+    hex_bytes = bytes.fromhex(hexstring.decode())
+    return hex_bytes
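+# Illustrative: get_bytes_from_hex_encoded(b"\\'82\\'a0") returns b'\x82\xa0'.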
+
+def decode_hex_char(item, codec):
+    """Decode a bytes object using a specified codec.
+
+    item (bytes): A bytes object.
+    codec (str): The name of the codec to use to decode the bytes
+    """
+    log_text_extraction("decoding char {0} with font {1}".format(item, codec))
+    if codec is None:
+        # Default to U.S. Windows default codepage
+        codec = 'CP1252'
+    decoded = item.decode(codec)
+    decoded = decoded.encode()
+    log_text_extraction("char {0} decoded into {1} using codec {2}".format(item, decoded, codec))
+    return decoded
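+# Illustrative: decode_hex_char(b'\x82\xa0', 'cp932') decodes the Shift-JIS bytes
+# and returns the UTF-8 encoded bytes for 'あ'.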
+
+
+class TextDecoder:
+
+    def __init__(self, keep_fontdef=False,
+               initial_byte_count=None, use_ASCII_alternatives_on_unicode_decode_failure=False):
+        """
+        keep_fontdef: (bool) If False (default), will remove fontdef's from object tree once they are processed.
+        initial_byte_count: (int) The initial Unicode Character Byte Count. Does not need to be set unless you are only providing an RTF snippet which does not contain the RTF header which sets the information.
+        use_ASCII_alternatives_on_unicode_decode_failure: (bool) If we encounter errors when decoding unicode chars we will use the ASCII alternative since that's what they are included for.
+
+        """
+        self.keep_fontdef = keep_fontdef
+        self.ucbc = initial_byte_count
+        self.use_ASCII_alternatives_on_unicode_decode_failure = use_ASCII_alternatives_on_unicode_decode_failure
+
+        # Font table values set in set_font_info
+        self.default_font = None
+        self.font_stack = []
+        self.font_table = {}
+
+
+    def set_font_info(self, obj: Tree):
+        """
+
+        obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
+        """
+        self.default_font = get_default_font(obj)
+        self.font_stack = [self.default_font]
+        raw_fonttbl = get_font_table(obj.children[1])
+        self.font_table = parse_font_tree(raw_fonttbl)
+        log_text_extraction(f"FONT TABLE FOUND: {raw_fonttbl}")
+
+
+    def update_children(self, obj: Tree):
+        """
+
+        obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
+        """
+        # Reset font info
+        self.set_font_info(obj)
+        children = obj.children
+        obj.children = [i for i in self.iterate_on_children(children)]
+
+    def prep_unicode(self, children: List[Token]):
+        if includes_unicode_chars(children):
+            # Clean out all replacement chars
+            # log_text_extraction("Prepping Unicode Chars:" + repr(children))
+            children, ascii_map = remove_unicode_replacements(children,
+                                                              byte_count=self.ucbc)
+            # print("===\nCHILD:" + repr(children))
+            # print("===\nASCII:" + repr(ascii_map))
+            # Merge all surrogate pairs
+            children = merge_surrogate_chars(children,
+                                             ascii_map,
+                                             self.use_ASCII_alternatives_on_unicode_decode_failure)
+            # print("FINAL CHILDREN")
+            # log_text_extraction("Replaced Unicode Chars With: " + repr(children))
+        return children
+
+    def iterate_on_children(self, children): # Children should be 'List[Union[Token,Tree]]' but lark's Tree typing is defined badly.
+        set_fonts = []
+        log_text_extraction("Starting to iterate on text extraction children...")
+        log_text_extraction("PREP-BEFORE: "+repr(children))
+        children = self.prep_unicode(children)
+        log_text_extraction("PREP-AFTER: "+repr(children))
+
+        for item in children:
+            if is_font_number(item): # Font Definitions
+                self.font_stack.append(item.value.strip())
+                set_fonts.append(item.value)
+                if self.keep_fontdef is True:
+                    yield item
+            elif is_unicode_char_byte_count(item):
+                # Track the current \ucN byte count for subsequent unicode handling
+                self.ucbc = get_unicode_char_byte_count(item)
+            elif is_unicode_encoded(item): # Unicode Chars
+                decoded = unicode_escape_to_chr(item.value).encode()
+                # Convert into STRING token
+                decoded_tok = Token('STRING',
+                                    decoded,
+                                    start_pos=item.start_pos,
+                                    end_pos=item.end_pos,
+                                    line=item.line,
+                                    end_line=item.end_line,
+                                    column=item.column,
+                                    end_column=item.end_column)
+                print(f"UNICODE TOKEN {item}: {decoded_tok}")
+                yield decoded_tok
+            # Decode a hex array
+            elif is_hexarray(item):
+                # print("IS Hex?? {0}".format(item))
+                base_bytes = None
+                for hexchild in item.children:
+                    if base_bytes is None:
+                        base_bytes = get_bytes_from_hex_encoded(hexchild.value)
+                    else:
+                        base_bytes += get_bytes_from_hex_encoded(hexchild.value)
+                current_fontdef = self.font_table[self.font_stack[-1]]
+                current_codec = current_fontdef.codec
+                decoded_hex = decode_hex_char(base_bytes, current_codec)
+                # We are replacing a Tree. So, need item.data to access it's info token
+                decoded_hex_tok = Token('STRING',
+                                        decoded_hex,
+                                        start_pos=item.data.start_pos,
+                                        end_pos=item.data.end_pos,
+                                        line=item.data.line,
+                                        end_line=item.data.end_line,
+                                        column=item.data.column,
+                                        end_column=item.data.end_column)
+                yield decoded_hex_tok
+            elif isinstance(item, Tree):
+                # Run this same function recursively on nested trees
+                item.children = [i for i in self.iterate_on_children(item.children)]
+                yield item
+            else:
+                yield item
+        for i in set_fonts:
+            # Remove all fonts defined while in this group
+            self.font_stack.pop()
+
+
+
+
+
+
+
+
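+# Usage sketch (illustrative): given `full_tree`, a parsed tree such as
+# DeEncapsulator.full_tree, decode text in place with:
+#   decoder = TextDecoder()
+#   decoder.update_children(full_tree)
+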

Functions

+
+
+def check_codepage_num(codepage_num: int) ‑> int +
+
+

Provide the codepage number back to you if it is valid.

+

Args

+
+
codepage_num : int
+
A possible codepage number.
+
+

Returns

+

The codepage number IF it is a valid codepage number

+

Raises

+
+
ValueError
+
The codepage_num provided isn't a valid codepage number.
+
+
+ +Expand source code + +
def check_codepage_num(codepage_num: int) -> int:
+    """Provide the codepage number back to you if it is valid.
+
+Args:
+    codepage_num (int): A possible codepage number.
+
+Returns:
+    The codepage number IF it is a valid codepage number
+
+Raises:
+    ValueError: The codepage_num provided isn't a valid codepage number.
+
+"""
+    # This keyword should be emitted in the RTF header section right after the \ansi, \mac, \pc or \pca keyword. But, various document tags like \fbids often are thrown all over the header so we have to check the first group of headers for it.
+    # Code page names from https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers
+    # Retrieved on 2020-12-18
+    allowed_codepage_nums = set([37, 437, 500, 708, 709, 710, 720, 737, 775, 850, 852, 855, 857, 858, 860, 861, 862, 863, 864, 865, 866, 869, 870, 874, 875, 932, 936, 949, 950, 1026, 1047, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1200, 1201, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1361, 10000, 10001, 10002, 10003, 10004, 10005, 10006, 10007, 10008, 10010, 10017, 10021, 10029, 10079, 10081, 10082, 12000, 12001, 20000, 20001, 20002, 20003, 20004, 20005, 20105, 20106, 20107, 20108, 20127, 20261, 20269, 20273, 20277, 20278, 20280, 20284, 20285, 20290, 20297, 20420, 20423, 20424, 20833, 20838, 20866, 20871, 20880, 20905, 20924, 20932, 20936, 20949, 21025, 21027, 21866, 28591, 28592, 28593, 28594, 28595, 28596, 28597, 28598, 28599, 28603, 28605, 29001, 38598, 50220, 50221, 50222, 50225, 50227, 50229, 50930, 50931, 50933, 50935, 50936, 50937, 50939, 51932, 51936, 51949, 51950, 52936, 54936, 57002, 57003, 57004, 57005, 57006, 57007, 57008, 57009, 57010, 57011, 65000, 65001])
+    if codepage_num in allowed_codepage_nums:
+        return codepage_num
+    # Note: If support for a specific codepage ever becomes an issue we can look at adding support using the actual code pages.
+    # Conversion tables for codepages can be retrieved from here: https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/
+    raise ValueError(f"Unsupported unicode codepage number `{codepage_num}` found in the header")
+
+
+
+def decode_hex_char(item, codec) +
+
+

Decode a bytes object using a specified codec.

+

item (bytes): A bytes object. codec (str): The name of the codec to use to decode the bytes

+
+ +Expand source code + +
def decode_hex_char(item, codec):
+    """Decode a bytes object using a specified codec.
+
+    item (bytes): A bytes object.
+    codec (str): The name of the codec to use to decode the bytes
+    """
+    log_text_extraction("decoding char {0} with font {1}".format(item, codec))
+    if codec is None:
+        # Default to U.S. Windows default codepage
+        codec = 'CP1252'
+    decoded = item.decode(codec)
+    decoded = decoded.encode()
+    log_text_extraction("char {0} decoded into {1} using codec {2}".format(item, decoded, codec))
+    return decoded
+
+
+
+def decode_surrogate_pair(high: bytes, low: bytes, encoding: str = 'utf-16-le') ‑> bytes +
+
+

Convert a pair of surrogate chars into the corresponding utf-16 encoded text string they should represent.

+

Args

+
+
high : bytes
+
the high-surrogate code point
+
low : bytes
+
the low-surrogate code point
+
encoding : str
+
The encoding to apply to the final value. Defaults to 'utf-16-le' because: Microsoft uses UTF-16, little endian byte order. ( https://learn.microsoft.com/en-us/windows/win32/intl/using-byte-order-marks ) The Msg format is a Microsoft standard. Therefore, man is mortal.
+
+
+ +Expand source code + +
def decode_surrogate_pair(high: bytes, low: bytes, encoding: str ='utf-16-le') -> bytes:
+    """ Convert a pair of surrogate chars into the corresponding utf-16 encoded text string they should represent.
+
+Args:
+        high (bytes): the high-surrogate code point
+        low (bytes): the low-surrogate code point
+        encoding (str): The encoding to apply to the final value. Defaults to 'utf-16-le' because:  Microsoft uses UTF-16, little endian byte order. ( https://learn.microsoft.com/en-us/windows/win32/intl/using-byte-order-marks ) The Msg format is a Microsoft standard. Therefore, man is mortal.
+    """
+    # The equation for turning surrogate pairs into a unicode scalar value which can
+    # be used with utf-16 can ONLY be found in the Unicode 3.0.0 standard.
+    # Unicode scalar value means the same thing as "code position" or "code point"
+    # https://www.unicode.org/versions/Unicode3.0.0/
+    # section 3.7 https://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#page=9
+    if high.startswith(b"\\u"):
+        high = high[2:]
+    if low.startswith(b"\\u"):
+        low = low[2:]
+    if is_surrogate_16bit(high, "high"):
+        char_high = chr(65536+int(high))
+    else:
+        char_high = chr(int(high))
+    if is_surrogate_16bit(low, "low"):
+        char_low = chr(65536+int(low))
+    else:
+        char_low = chr(int(low))
+    unicode_scalar_value = ((ord(char_high) - 0xD800) * 0x400) + (ord(char_low) - 0xDC00) + 0x10000
+    unicode_bytes = chr(unicode_scalar_value).encode(encoding)
+    return unicode_bytes.decode(encoding).encode()
+
+
+
+def get_bytes_from_hex_encoded(item) +
+
+

Convert hex encoded string to bytes.

+

item (str): a hex encoded string in format 'XX

+
+ +Expand source code + +
def get_bytes_from_hex_encoded(item):
+    """Convert hex encoded string to bytes.
+
+    item (str): a hex encoded string in format \\'XX
+    """
+    hexstring = item.replace(b"\\'", b"")
+    hex_bytes = bytes.fromhex(hexstring.decode())
+    return hex_bytes
+
+
+
+def get_codepage_num_from_fcharset(fcharsetN: int) ‑> Optional[int] +
+
+

Return the codepage to use with a specific fcharsetN.

+

Args

+
+
fcharsetN : int
+
The numeric argument N for a \fcharsetN control word.
+
+

Returns

+

(int OR None) Returns the int for a codepage if known. Returns None for unknown charsets or charsets with no corresponding codepage (such as OEM or DEFAULT.)

+
+ +Expand source code + +
def get_codepage_num_from_fcharset(fcharsetN: int) -> Union[int,None]:
+    """Return the codepage to use with a specific fcharsetN.
+
+Args:
+    fcharsetN (int): The numeric argument N for a \\fcharsetN control word.
+
+Returns:
+    (int OR None) Returns the int for a codepage if known. Returns None for unknown charsets or charsets with no corresponding codepage (such as OEM or DEFAULT.)
+
+    """
+    # Charset table retrieved on 2022-08-19
+    # https://web.archive.org/web/20220819215334/https://docs.microsoft.com/en-us/previous-versions/cc194829%28v=msdn.10%29?redirectedfrom=MSDN
+    charsets: dict[int,dict[str,Any]] = {
+        0:{"name":"ANSI_CHARSET","hex":"0x00","decimal":0,"id":1252},
+        1:{"name":"DEFAULT_CHARSET","hex":"0x01","decimal":1,"id":None},
+        2:{"name":"SYMBOL_CHARSET","hex":"0x02","decimal":2,"id":None},
+        128:{"name":"SHIFTJIS_CHARSET","hex":"0x80","decimal":128,"id":932},
+        129:{"name":"HANGUL_CHARSET","hex":"0x81","decimal":129,"id":949},
+        134:{"name":"GB2312_CHARSET","hex":"0x86","decimal":134,"id":936},
+        136:{"name":"CHINESEBIG5_CHARSET","hex":"0x88","decimal":136,"id":950},
+        161:{"name":"GREEK_CHARSET","hex":"0xA1","decimal":161,"id":1253},
+        162:{"name":"TURKISH_CHARSET","hex":"0xA2","decimal":162,"id":1254},
+        177:{"name":"HEBREW_CHARSET","hex":"0xB1","decimal":177,"id":1255},
+        178:{"name":"ARABIC_CHARSET","hex":"0xB2","decimal":178,"id":1256},
+        186:{"name":"BALTIC_CHARSET","hex":"0xBA","decimal":186,"id":1257},
+        204:{"name":"RUSSIAN_CHARSET","hex":"0xCC","decimal":204,"id":1251},
+        222:{"name":"THAI_CHARSET","hex":"0xDE","decimal":222,"id":874},
+        238:{"name":"EE_CHARSET","hex":"0xEE","decimal":238,"id":1250},
+        255:{"name":"OEM_CHARSET","hex":"0xFF","decimal":255,"id":None},
+}
+    log_text_extraction(f"Getting charset for {fcharsetN}")
+    charset = charsets.get(fcharsetN, None)
+    if charset is not None:
+        charset_id = charset.get('id', None)
+        return charset_id
+    return None
+
+
+
+def get_default_font(tree: lark.tree.Tree) ‑> Optional[str] +
+
+

Extract the font number controlword default font if it exists.

+

If an RTF file uses a default font, the default font number is specified with the \deffN control word, which must precede the font-table group.

+

Args

+
+
tree : Tree
+
A lark Tree object. Should be the DeEncapsulator.full_tree object.
+
+

Returns

+

The default font control number from the first \deffN if it exists. None if not found.

+
+ +Expand source code + +
def get_default_font(tree: Tree) -> Union[str,None]:
+    """Extract the font number controlword default font if it exists.
+
+If an RTF file uses a default font, the default font number is specified with the \\deffN control word, which must precede the font-table group.
+
+Args:
+    tree (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree object.
+
+Returns:
+    The default font control number from the first `\\deffN` if it exists. None if not found.
+"""
+    deff_gen = tree.scan_values(
+        lambda v: is_codeword_with_numeric_arg(v, b'\\deff')
+    )
+    deff_options = list(deff_gen)
+    try:
+        # We just want the first \\deffN. It shouldn't be set multiple times.
+        deff = deff_options[0]
+        deff_num = deff.value[5:]
+        return b'\\f' + deff_num
+    except IndexError:
+        return None
+
+
+
+def get_font_table(tree: lark.tree.Tree) ‑> lark.tree.Tree +
+
+

Extract the font table group from the first 20 tokens of a .rtf document.

+

Args

+
+
tree : Tree
+
A .rtf document object parsed into a Tree object
+
+

Raises

+
+
ValueError
+
If no group with a \fonttbl token as its first controlword is found.
+
+

Returns

+

The font table group as a Tree object.

+
+ +Expand source code + +
def get_font_table(tree: Tree) -> Tree:
+    """Extract the font table group from the first 20 tokens of a .rtf document.
+
+Args:
+    tree (Tree): A .rtf document object parsed into a Tree object
+
+Raises:
+    ValueError: If no group with a `\\fonttbl` token as its first controlword is found.
+
+Returns:
+    The font table group as a Tree object.
+    """
+    for item in tree.children[:20]:
+        if isinstance(item, Tree):
+            try:
+                ctrl_value = item.children[1]
+            except IndexError as _e:
+                continue
+            if isinstance(ctrl_value, Token):
+                table_type = ctrl_value.value.strip()
+                if table_type == b"\\fonttbl":
+                    return item
+    raise ValueError("No font table found in tree")
+
+
+
+def get_python_codec(codepage_num: int) ‑> str +
+
+

Returns the python codec needed to decode bytes to unicode.

+

Args

+
+
codepage_num : int
+
A codepage number.
+
+

Returns

+

The name of the codec in the Python codec registry. Used as the name for encoding/decoding.

+
+ +Expand source code + +
def get_python_codec(codepage_num: int) -> str:
+    """Returns the python codec needed to decode bytes to unicode.
+
+Args:
+    codepage_num (int): A codepage number.
+
+Returns:
+    The name of the codec in the Python codec registry. Used as the name for encoding/decoding.
+"""
+    text_codec = codepages.codepage2codec(codepage_num)
+    log.debug('Found python codec corresponding to code page {0}: {1}'.format(codepage_num, text_codec))
+    return text_codec
+
+
+
+def get_unicode_char_byte_count(item: lark.lexer.Token) ‑> int +
+
+
+
+ +Expand source code + +
def get_unicode_char_byte_count(item: Token) -> int:
+    item = item.value.decode()
+    cur_uc = int(item[3:])
+    return cur_uc
+
+
+
+def has_hexarray(children: List[Union[lark.lexer.Token, lark.tree.Tree]]) ‑> bool +
+
+

Checks if a tree's children include a hexarray tree.

+

children (array): the children object from a tree.

+
+ +Expand source code + +
def has_hexarray(children: List[Union[Token, Tree]]) -> bool:
+    """Checks if an tree's children includes a hexarray tree.
+
+    children (array): the children object from a tree.
+    """
+    for item in children:
+        if is_hexarray(item):
+            return True
+    return False
+
+
+
+def includes_unicode_chars(children: List[lark.lexer.Token]) ‑> bool +
+
+

Does a list include Tokens which contain unicode characters. Not recursive.

+

Args

+
+
children : list
+
A Tree.children list to check to see if it includes unicode characters.
+
+

Returns

+

True if list includes tokens which contain unicode chars. False if not.

+
+ +Expand source code + +
def includes_unicode_chars(children: List[Token]) -> bool:
+    """Does a list include Tokens which contain unicode characters. Not recursive.
+
+Args:
+    children (list): A Tree.children list to check to see if it includes unicode characters.
+
+Returns:
+    True if list includes tokens which contain unicode chars. False if not.
+"""
+    for child in children:
+        if is_unicode_encoded(child):
+            return True
+    return False
+
+
+
+def is_font_number(token: lark.lexer.Token) ‑> bool +
+
+

Checks if an object is a "font number".

+

Returns

+

True if an object is a "font number" controlword \fN. False if not.

+
+ +Expand source code + +
def is_font_number(token: Token) -> bool:
+    """Checks if an object is a "font number".
+
+Returns:
+    True if an object is a "font number" controlword `\\fN`. False if not.
+
+"""
+    try:
+        if is_codeword_with_numeric_arg(token, b'\\f'):
+            return True
+    except AttributeError: # pragma: no cover
+        return False
+    return False
+
+
+
+def is_hex_encoded(item: lark.lexer.Token) ‑> bool +
+
+

Identify if a token contains a HEXENCODED token.

+

Args

+
+
item : token
+
A token to check if it is HEXENCODED.
+
+

Return

+

True if HEXENCODED. False if not.

+
+ +Expand source code + +
def is_hex_encoded(item: Token) -> bool:
+    """Identify if a token contains a HEXENCODED token.
+Args:
+    item (token): A token to check if it is HEXENCODED.
+
+Return:
+    True if HEXENCODED. False if not.
+    """
+    if isinstance(item, Token):
+        if item.type == "HEXENCODED":
+            return True
+    return False
+
+
+
+def is_hexarray(item) +
+
+

Checks if an item is a hexarray tree.

+

item (Tree or Token): an item to check to see if it's a hexarray

+
+ +Expand source code + +
def is_hexarray(item):
+    """Checks if an item is a hexarray tree.
+
+    item (Tree or Token): an item to check to see if it's a hexarray
+    """
+    if isinstance(item, Tree):
+        if item.data.value == 'hexarray':
+            return True
+    return False
+
+
+
+def is_surrogate_16bit(item: bytes, cp_range) ‑> bool +
+
+

Checks if a unicode char is a 16 bit signed integer or the raw unicode char. This should first check if it is a surrogate code using the is_surrogate_XXXX_char functions.

+

Args

+
+
item : bytes
+
A bytes representation of a string representing a unicode character.
+
cp_range : str
+
['low' OR 'high'] The code point range (low-surrogate or high-surrogate).
+
+
+ +Expand source code + +
def is_surrogate_16bit(item: bytes, cp_range) -> bool:
+    """Checks if a unicode char is 16 bit signed integer or the raw unicode char. This should first check if it is a surrogate code using the is_surrogate_XXXX_char functions.
+
+Args:
+    item (bytes): A bytes representation of a string representing a unicode character.
+    cp_range (str): ['low' OR 'high'] The code point range (low-surrogate or high-surrogate).
+    """
+    if cp_range == 'low':
+        if 0xDC00 <= ord(chr(65536+int(item))) <= 0xDFFF:
+            return True
+    elif cp_range == 'high':
+        if 0xD800 <= ord(chr(65536+int(item))) <= 0xDBFF:
+            return True
+    else:
+        raise ValueError("cp_range must be either 'low' or 'high'")
+    return False
+
+
+
+def is_surrogate_high_char(item: bytes) ‑> bool +
+
+

Checks if a char is in the high-surrogate code point range. "High-surrogate code point: A Unicode code point in the range U+D800 to U+DBFF." The high-surrogate is also sometimes known as the leading surrogate.

+

item (bytes): A bytes representation of a string representing a unicode character. "\u-10179"

+
+ +Expand source code + +
def is_surrogate_high_char(item: bytes) -> bool:
+    """Check's if chr is a is in the high-surrogate code point rage. "High-surrogate code point: A Unicode code point in the range U+D800 to U+DBFF." High-surrogate also sometimes known as the leading surrogate.
+
+        item (bytes): A bytes representation of a string representing a unicode character. "\\u-10179"
+    """
+    if item.startswith(b"\\u"):
+        item = item[2:]
+    if 0xD800 <= ord(chr(65536+int(item))) <= 0xDBFF:
+        return True
+    # In case unicode is NOT using the 16 bit signed integer
+    elif 0xD800 <= int(item) <= 0xDBFF:
+        return True
+    return False
+
+
+
+def is_surrogate_low_char(item: bytes) ‑> bool
+
+Checks if chr is in the low-surrogate code point range. "Low-surrogate code point: A Unicode code point in the range U+DC00 to U+DFFF." The low-surrogate is also sometimes known as the trailing surrogate.
+
+    item (bytes): A bytes representation of a string representing a unicode character.
+
+def is_surrogate_low_char(item: bytes) -> bool:
+    """Checks if chr is in the low-surrogate code point range. "Low-surrogate code point: A Unicode code point in the range U+DC00 to U+DFFF."  The low-surrogate is also sometimes known as the trailing surrogate.
+
+        item (bytes): A bytes representation of a string representing a unicode character.
+    """
+    if item.startswith(b"\\u"):
+        item = item[2:]
+    if 0xDC00 <= ord(chr(65536+int(item))) <= 0xDFFF:
+        return True
+    # In case unicode is NOT using the 16 bit signed integer
+    elif 0xDC00 <= int(item) <= 0xDFFF:
+        return True
+    return False
+
+
+
+def is_surrogate_pair(first: bytes, second: bytes) ‑> bool
+
+Check if a pair of unicode characters are a surrogate pair. Must be passed in the correct order.
+
+Args:
+    first (bytes): A bytes representation of a string representing the high-order byte in a surrogate char.
+    second (bytes): A bytes representation of a string representing the low-order byte in a surrogate char.
+
+def is_surrogate_pair(first: bytes, second: bytes) -> bool:
+    """Check if a pair of unicode characters are a surrogate pair. Must be passed in the correct order.
+
+Args:
+    first (bytes): A bytes representation of a string representing the high-order byte in a surrogate char.
+    second (bytes): A bytes representation of a string representing the low-order byte in a surrogate char.
+    """
+    if is_surrogate_high_char(first):
+        if is_surrogate_low_char(second):
+            return True
+        else:
+            log.info("RTFDE encountered a standalone high-surrogate point without a corresponding low-surrogate. Standalone surrogate code points have either a high surrogate without an adjacent low surrogate, or vice versa. These code points are invalid and are not supported. Their behavior is undefined. Codepoints encountered: {0}, {1}".format(first, second))
+    return False
+
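+A usage sketch (editorial): the same U+1F600 pair from above, and the same values passed in the wrong order.
+
+    >>> is_surrogate_pair(b"-10179", b"-8704")
+    True
+    >>> is_surrogate_pair(b"-8704", b"-10179")  # low before high
+    False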
+
+
+def is_unicode_char_byte_count(item: lark.lexer.Token) ‑> bool
+
+def is_unicode_char_byte_count(item: Token) -> bool:
+    """Checks if a token is a `\\ucN` Unicode byte count CONTROLWORD."""
+    if isinstance(item, Token):
+        if item.type == "CONTROLWORD":
+            if item.value.startswith(b'\\uc'):
+                return True
+    return False
+
+
+
+def is_unicode_encoded(item: lark.lexer.Token) ‑> bool
+
+Checks whether a token contains a unicode char.
+
+Args:
+    item (token): A token to check if it contains a unicode char.
+
+Returns:
+    True if token contains a unicode char. False if not.
+
+def is_unicode_encoded(item: Token) -> bool:
+    """Checks whether a token contains a unicode char.
+
+Args:
+    item (token): A token to check if it contains a unicode char.
+
+Returns:
+    True if token contains a unicode char. False if not.
+"""
+    if isinstance(item, Token):
+        if item.type == "UNICODE":
+            return True
+    return False
+
+
+
+def is_valid_ANSI_representation_char(item: lark.lexer.Token) ‑> bool
+
+Checks whether a token contains a valid ANSI representation string for a Unicode char.
+
+Args:
+    item (token): A token to check if it is a valid ANSI representation.
+
+Returns:
+    True if token is an ansi representation of a unicode char. False if not.
+
+def is_valid_ANSI_representation_char(item: Token) -> bool:
+    """Checks whether a token contains a valid ANSI representation string for a Unicode char.
+
+Args:
+    item (token): A token to check if it is a valid ANSI representation.
+
+Returns:
+    True if token is an ansi representation of a unicode char. False if not.
+"""
+    if isinstance(item, Token):
+        # print(f"found TOKEN possible ansi {repr(item)}")
+        if is_hex_encoded(item):
+            # print(f"found hex possible ansi {repr(item)}")
+            return True
+        if item.type == 'STRING':
+            # print(f"found STRING possible ansi {repr(item)}")
+            if not item.value.isspace(): # whitespace doesn't count.
+                # print(f"found possible ansi {repr(item)}")
+                return True
+            # else:
+            #     print(f"found SPACE possible ansi {repr(item)}")
+    # print(f"found NON TOKEN possible ansi {repr(item)}")
+    return False
+
+
+
+def merge_surrogate_chars(children, ascii_map, use_ASCII_alternatives_on_unicode_decode_failure=False)
+
+Raises:
+    ValueError: A standalone high-surrogate was found: a high surrogate followed by an illegal low-surrogate character.
+
+def merge_surrogate_chars(children,
+                          ascii_map,
+                          use_ASCII_alternatives_on_unicode_decode_failure = False):
+    """Merge adjacent high and low surrogate tokens in a children list into single STRING tokens.
+
+Raises:
+    ValueError: A standalone high-surrogate was found: a high surrogate followed by an illegal low-surrogate character.
+    """
+    surrogate_start = None
+    surrogate_high = None
+    for i,c in enumerate(children):
+        if isinstance(c, Tree):
+            continue
+        if is_unicode_encoded(c):
+            if is_surrogate_high_char(c.value):
+                surrogate_start = i
+                surrogate_high = c
+            elif surrogate_start is not None:
+                if is_surrogate_low_char(c.value):
+                    surrogate_low = c
+                    try:
+                        surrogate_value = decode_surrogate_pair(surrogate_high.value,
+                                                                surrogate_low.value)
+                        # Convert into STRING token
+                        surrogate_tok = Token('STRING',
+                                              surrogate_value,
+                                              start_pos=surrogate_high.start_pos,
+                                              end_pos=surrogate_low.end_pos,
+                                              line=surrogate_high.line,
+                                              end_line=surrogate_low.end_line,
+                                              column=surrogate_high.column,
+                                              end_column=surrogate_low.end_column)
+                        children[surrogate_start] = surrogate_tok
+                        blank_tok = Token('STRING',
+                                          b"",
+                                          start_pos=surrogate_high.start_pos+1,
+                                          end_pos=surrogate_low.end_pos+1,
+                                          line=surrogate_high.line,
+                                          end_line=surrogate_low.end_line,
+                                          column=surrogate_high.column,
+                                          end_column=surrogate_low.end_column)
+                        children[i] = blank_tok
+                        surrogate_start = None
+                        surrogate_high = None
+                    except UnicodeDecodeError as _e:
+                        if use_ASCII_alternatives_on_unicode_decode_failure is True:
+                            children[surrogate_start] = b"".join([i.value for i in ascii_map[surrogate_high]])
+                            children[i] = b"".join([i.value for i in ascii_map[surrogate_low]])
+                        else:
+                            raise _e
+                else:
+                    log.info("RTFDE encountered a standalone high-surrogate point without a corresponding low-surrogate. Standalone surrogate code points have either a high surrogate without an adjacent low surrogate, or vice versa. These code points are invalid and are not supported. Their behavior is undefined. Codepoints encountered: {0}, {1}".format(surrogate_high, c))
+                    if use_ASCII_alternatives_on_unicode_decode_failure is True:
+                        children[surrogate_start] = b"".join([i.value for i in ascii_map[surrogate_high]])
+                    else:
+                        raise ValueError("Standalone high-surrogate found. High surrogate followed by an illegal low-surrogate character.")
+    return children
+
+
+
+def parse_font_tree(font_tree: lark.tree.Tree) ‑> dict
+
+Create a font tree dictionary with the appropriate codecs to decode text.
+
+Args:
+    font_tree (Tree): The .rtf font table object decoded as a tree.
+
+Returns:
+    A dictionary which maps font numbers to the appropriate python codecs needed to decode text.
+
+def parse_font_tree(font_tree: Tree) -> dict:
+    """Create a font tree dictionary with the appropriate codecs to decode text.
+
+Args:
+    font_tree (Tree): The .rtf font table object decoded as a tree.
+
+Returns:
+    A dictionary which maps font numbers to the appropriate python codecs needed to decode text.
+"""
+    parsed_font_tree = {}
+    for tree in font_tree.children:
+        if isinstance(tree, Tree):
+            fnum = None
+            fcharset = None
+            cpg_num = None
+            for tok in tree.children:
+                if is_codeword_with_numeric_arg(tok, b'\\f'):
+                    fnum = tok.value
+                elif is_codeword_with_numeric_arg(tok, b'\\fcharset'):
+                    fchar_num = int(tok.value[9:])
+                    fcharset = get_codepage_num_from_fcharset(fchar_num)
+                elif is_codeword_with_numeric_arg(tok, b'\\cpg'):
+                    cpg_num = int(tok.value[4:])
+            if fnum is not None:
+                # get the codepage
+                codepage_num = None
+
+                if fcharset is not None:
+                    try:
+                        codepage_num = check_codepage_num(fcharset)
+                    except ValueError: # pragma: no cover
+                        codepage_num = None
+                # if both \\fcharset and \\cpg appear in the font table, \\cpg is ignored.
+                if ((codepage_num is None) and (cpg_num is not None)):
+                    try:
+                        codepage_num = check_codepage_num(cpg_num)
+                    except ValueError: # pragma: no cover
+                        codepage_num = None
+                # Get the appropriate codec
+                if codepage_num is not None:
+                    codec = get_python_codec(codepage_num)
+                else:
+                    codec = None
+                # Only add if there is a font definition
+                tree_str =  b"".join(list(flatten_tree_to_string_array(tree)))
+                parsed_font_tree[fnum] = fontdef(fnum, codepage_num, codec, tree_str)
+    return parsed_font_tree
+
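+A sketch of the resulting mapping (editorial; the font table entry is illustrative and mirrors the example in the module docs): `{\f0\fswiss\fcharset128 MS PGothic;}` yields fcharset 128, which maps to codepage 932 and the Python codec 'cp932'.
+
+    >>> parsed = parse_font_tree(font_table_tree)  # hypothetical Tree input
+    >>> parsed['\f0']
+    fontdef(fnum='\f0', codepage=932, codec='cp932', fontdef_tree='{\f0\fswiss\fcharset128 MS PGothic;}')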
+
+
+def remove_unicode_replacements(children: List[lark.lexer.Token], return_ascii_map: bool = True, byte_count: int = 1) ‑> Union[Tuple[List[lark.lexer.Token], Dict[lark.lexer.Token, List[lark.lexer.Token]]], List[lark.lexer.Token]]
+
+Remove all unicode replacement characters from a list of Tokens.
+
+Args:
+    children (list): A Tree.children list to remove unicode replacement characters from.
+    return_ascii_map (bool): On True, have this function return a map of the ASCII tokens that were removed.
+    byte_count (int): The number of bytes corresponding to a given \uN Unicode character. A default of 1 should be assumed if no \uc keyword has been seen in the current or outer scopes.
+
+Returns:
+    new_children (list): The list of Tokens with all unicode replacement characters removed.
+    ascii_map (dict): All the Tokens which were removed from the provided children list, keyed by the unicode token they replaced.
+
+def remove_unicode_replacements(children: List[Token],
+                                return_ascii_map: bool = True,
+                                byte_count: int = 1) -> Union[
+                                    Tuple[List[Token], Dict[Token,List[Token]]],
+                                    List[Token]]:
+    """Remove all unicode replacement characters from a list of Tokens.
+
+Args:
+    children (list): A Tree.children list to remove unicode replacement characters from.
+    return_ascii_map (bool): On True, have this function return a map of the ASCII tokens that were removed.
+    byte_count (int): The number of bytes corresponding to a given \\uN Unicode character.  A default of 1 should be assumed if no \\uc keyword has been seen in the current or outer scopes.
+
+Returns:
+    new_children (list): The list of Tokens with all unicode replacement characters removed.
+    ascii_map (dict): All the Tokens which were removed from the provided children list, keyed by the unicode token they replaced.
+
+"""
+    if byte_count is None:
+        # Assumption (editorial bug fix): fall back to the documented default of 1
+        # rather than unconditionally discarding the caller's byte_count.
+        byte_count = 1
+    ascii_map: Dict[Token,List[Token]]  = {}
+    new_children = []
+    removal_map: List[Token] = []
+    log_text_extraction(f"Removing unicode replacements on {repr(children)}")
+    for child in children:
+        if len(removal_map) > 0:
+            if isinstance(child, Token):
+                # Delete all spaces between a unicode char and the last ANSI representation
+                # print(f"FOUND SPACE STRING with RM: {removal_map}")
+                if child.value.isspace():
+                    ascii_map.setdefault(removal_map[0], []).append(child)
+                    continue
+            if is_valid_ANSI_representation_char(child):
+                # Found an ansi representation removing unicode char from removal map.
+                # print(f"FOUND ASCII STRING {child} to RM with RM: {removal_map}")
+                ascii_map.setdefault(removal_map.pop(), []).append(child)
+                continue
+            elif isinstance(child, Tree) and (
+                    (child.data == "string") or (child.data == "hexarray")):
+                # print(f"FOUND ASCII STRING {child} with RM: {removal_map}")
+                ansi_children = child.children
+                new_ansi_children = []
+                for aci,ac in enumerate(ansi_children):
+                    # print(f"AC CHILD {repr(ac)}")
+                    if is_valid_ANSI_representation_char(ac):
+                        # print(f"AC CHILD VALID {repr(ac)}")
+                        if len(removal_map) > 0:
+                            # print(f"AC CHILD MAP >0 {repr(ac)}")
+                            # print(f"Popping removal for {repr(ac)}")
+                            ascii_map.setdefault(removal_map.pop(), []).append(ac)
+                        else:
+                            # print(f"AC CHILD MAP < 0 {repr(ac)}")
+                            new_ansi_children.append(ac)
+                    else:
+                        # print(f"AC CHILD NOT VALID {repr(ac)}")
+                        new_ansi_children.append(ac)
+                # print(f"NEW Children = {new_ansi_children}")
+                if new_ansi_children == []:
+                    from RTFDE.utils import make_token_replacement
+                    # from RTFDE.utils import embed
+                    # embed()
+                    child = make_token_replacement("STRING", b"", child)
+                else:
+                    child.children = new_ansi_children
+                # print(f"NEW Tree = {child}")
+            # else:
+                # print(f"FOUND ASCII STRING {child} with RM: {removal_map}")
+                # print(f"{repr(child)} not a valid ANSI representation? with RM: {removal_map}")
+        # Modify char byte count if we encounter it.
+        if is_unicode_char_byte_count(child):
+            byte_count = get_unicode_char_byte_count(child)
+            # print(f"Changing byte count because {child} to {byte_count}")
+        if is_unicode_encoded(child):
+            # print(f"Found unicode {child}")
+            for j in range(byte_count):
+                # Add the unicode key to the removal map once per byte
+                # This ensures we remove the right number of ANSI representation chars
+                removal_map.append(child)
+        new_children.append(child)
+    if return_ascii_map is True:
+        return new_children, ascii_map
+    return new_children
+
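+A usage sketch (editorial): for an RTF fragment like `\uc1\u952 ?`, the "?" STRING token is the one byte ANSI fallback for U+03B8, so it is removed from the children and recorded in the returned map.
+
+    >>> new_children, ascii_map = remove_unicode_replacements(children, byte_count=1)  # hypothetical children list
+    >>> # ascii_map now maps the \u952 UNICODE token to the removed [Token('STRING', b'?')]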
+
+
+def unicode_escape_to_chr(item: bytes) ‑> str
+
+Convert unicode char from its decimal to its unicode character representation. From "\u[-]NNNNN" to the string representing the character whose Unicode code point that decimal represents.
+
+Args:
+    item (bytes): An RTF escape in the format \u[-]NNNNN.
+
+Returns:
+    The unicode character representation of the identified character
+
+Raises:
+    ValueError: The escaped unicode character is not valid.
+
+def unicode_escape_to_chr(item: bytes) -> str:
+    """Convert unicode char from its decimal to its unicode character representation. From "\\u[-]NNNNN" to the string representing the character whose Unicode code point that decimal represents.
+
+Args:
+    item (bytes): An RTF escape in the format \\u[-]NNNNN.
+
+Returns:
+    The unicode character representation of the identified character
+
+Raises:
+    ValueError: The escaped unicode character is not valid.
+"""
+    try:
+        nnnn = int(item.removeprefix(b'\\u')) # raises ValueError if not int.
+    except ValueError as _e:
+        raise ValueError(f"`{item}` is not a valid escaped unicode character.") from _e
+    if nnnn < 0: # § -NNNNN is a negative integer expressed in decimal digits
+        ncr = 65536 + nnnn
+    else: # § NNNNN is a positive integer expressed in decimal digits
+        ncr = nnnn
+    # § HHHH is the hexadecimal equivalent of NNNNN or -NNNNN
+    return chr(ncr)
+
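+A usage sketch (editorial): `\u952` is a positive code point (U+03B8) while `\u-10179` uses the negative 16 bit signed form.
+
+    >>> unicode_escape_to_chr(b"\\u952")
+    'θ'
+    >>> hex(ord(unicode_escape_to_chr(b"\\u-10179")))
+    '0xd83d'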
+
+
+def validate_ansi_cpg(header: str) ‑> None
+
+Check an '\ansicpgNNNN' string to see if the number NNNN is an actual codepage.
+
+Args:
+    header (str): The value from the lark `\ansicpg` CONTROLWORD Token.
+
+Raises:
+    MalformedRtf: If the value passed is not a valid ansi codepage.
+
+def validate_ansi_cpg(header: str) -> None:
+    """Check an '\\ansicpgNNNN' string to see if the number NNNN is an actual codepage.
+
+Args:
+    header (str): The value from the lark `\\ansicpg` CONTROLWORD Token.
+
+Raises:
+    MalformedRtf: If the value passed is not a valid ansi codepage.
+"""
+    try:
+        possible_cpg_num = int(header.strip()[8:])
+        check_codepage_num(possible_cpg_num)
+    except ValueError as _e:
+        raise MalformedRtf(f"Unsupported unicode codepage number `{header}` found in the header") from _e
+
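+A usage sketch (editorial): a known codepage passes silently, an invalid one raises.
+
+    >>> validate_ansi_cpg("\\ansicpg1252")   # valid: returns None
+    >>> validate_ansi_cpg("\\ansicpg99999")  # invalid: raises MalformedRtf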
+
+
+
+
+

+Classes
+
+class TextDecoder(keep_fontdef=False, initial_byte_count=None, use_ASCII_alternatives_on_unicode_decode_failure=False)
+
+keep_fontdef: (bool) If False (default), fontdefs will be removed from the object tree once they are processed.
+initial_byte_count: (int) The initial Unicode Character Byte Count. Does not need to be set unless you are only providing an RTF snippet which does not contain the RTF header which sets the information.
+use_ASCII_alternatives_on_unicode_decode_failure: (bool) If we encounter errors when decoding unicode chars we will use the ASCII alternative since that's what they are included for.
+
+class TextDecoder:
+
+    def __init__(self, keep_fontdef=False,
+               initial_byte_count=None, use_ASCII_alternatives_on_unicode_decode_failure=False):
+        """
+        keep_fontdef: (bool) If False (default), fontdefs will be removed from the object tree once they are processed.
+        initial_byte_count: (int) The initial Unicode Character Byte Count. Does not need to be set unless you are only providing an RTF snippet which does not contain the RTF header which sets the information.
+        use_ASCII_alternatives_on_unicode_decode_failure: (bool) If we encounter errors when decoding unicode chars we will use the ASCII alternative since that's what they are included for.
+
+        """
+        self.keep_fontdef = keep_fontdef
+        self.ucbc = initial_byte_count
+        self.use_ASCII_alternatives_on_unicode_decode_failure = use_ASCII_alternatives_on_unicode_decode_failure
+
+        # Font table values are set in set_font_info
+        self.default_font = None
+        self.font_stack = []
+        self.font_table = {}
+
+
+    def set_font_info(self, obj: Tree):
+        """
+
+        obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
+        """
+        self.default_font = get_default_font(obj)
+        self.font_stack = [self.default_font]
+        raw_fonttbl = get_font_table(obj.children[1])
+        self.font_table = parse_font_tree(raw_fonttbl)
+        log_text_extraction(f"FONT TABLE FOUND: {raw_fonttbl}")
+
+
+    def update_children(self, obj: Tree):
+        """
+
+        obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
+        """
+        # Reset font info
+        self.set_font_info(obj)
+        children = obj.children
+        obj.children = [i for i in self.iterate_on_children(children)]
+
+    def prep_unicode(self, children: List[Token]):
+        if includes_unicode_chars(children):
+            # Clean out all replacement chars
+            # log_text_extraction("Prepping Unicode Chars:" + repr(children))
+            children, ascii_map = remove_unicode_replacements(children,
+                                                              byte_count=self.ucbc)
+            # print("===\nCHILD:" + repr(children))
+            # print("===\nASCII:" + repr(ascii_map))
+            # Merge all surrogate pairs
+            children = merge_surrogate_chars(children,
+                                             ascii_map,
+                                             self.use_ASCII_alternatives_on_unicode_decode_failure)
+            # print("FINAL CHILDREN")
+            # log_text_extraction("Replaced Unicode Chars With: " + repr(children))
+        return children
+
+    def iterate_on_children(self, children): # Children should be 'List[Union[Token,Tree]]' but lark's Tree typing is defined badly.
+        set_fonts = []
+        log_text_extraction("Starting to iterate on text extraction children...")
+        log_text_extraction("PREP-BEFORE: "+repr(children))
+        children = self.prep_unicode(children)
+        log_text_extraction("PREP-AFTER: "+repr(children))
+
+        for item in children:
+            if is_font_number(item): # Font Definitions
+                self.font_stack.append(item.value.strip())
+                set_fonts.append(item.value)
+                if self.keep_fontdef is True:
+                    yield item
+            elif is_unicode_char_byte_count(item):
+                bc = get_unicode_char_byte_count(item)
+            elif is_unicode_encoded(item): # Unicode Chars
+                decoded = unicode_escape_to_chr(item.value).encode()
+                # Convert into STRING token
+                decoded_tok = Token('STRING',
+                                    decoded,
+                                    start_pos=item.start_pos,
+                                    end_pos=item.end_pos,
+                                    line=item.line,
+                                    end_line=item.end_line,
+                                    column=item.column,
+                                    end_column=item.end_column)
+                log_text_extraction(f"UNICODE TOKEN {item}: {decoded_tok}")
+                yield decoded_tok
+            # Decode a hex array
+            elif is_hexarray(item):
+                # print("IS Hex?? {0}".format(item))
+                base_bytes = None
+                for hexchild in item.children:
+                    if base_bytes is None:
+                        base_bytes = get_bytes_from_hex_encoded(hexchild.value)
+                    else:
+                        base_bytes += get_bytes_from_hex_encoded(hexchild.value)
+                current_fontdef = self.font_table[self.font_stack[-1]]
+                current_codec = current_fontdef.codec
+                decoded_hex = decode_hex_char(base_bytes, current_codec)
+                # We are replacing a Tree. So, we need item.data to access its info token
+                decoded_hex_tok = Token('STRING',
+                                        decoded_hex,
+                                        start_pos=item.data.start_pos,
+                                        end_pos=item.data.end_pos,
+                                        line=item.data.line,
+                                        end_line=item.data.end_line,
+                                        column=item.data.column,
+                                        end_column=item.data.end_column)
+                yield decoded_hex_tok
+            elif isinstance(item, Tree):
+                # Run this same function recursively on nested trees
+                item.children = [i for i in self.iterate_on_children(item.children)]
+                yield item
+            else:
+                yield item
+        for i in set_fonts:
+            # Remove all fonts defined while in this group
+            self.font_stack.pop()
+
+

+Methods
+
+def iterate_on_children(self, children)
+
+def iterate_on_children(self, children): # Children should be 'List[Union[Token,Tree]]' but lark's Tree typing is defined badly.
+    set_fonts = []
+    log_text_extraction("Starting to iterate on text extraction children...")
+    log_text_extraction("PREP-BEFORE: "+repr(children))
+    children = self.prep_unicode(children)
+    log_text_extraction("PREP-AFTER: "+repr(children))
+
+    for item in children:
+        if is_font_number(item): # Font Definitions
+            self.font_stack.append(item.value.strip())
+            set_fonts.append(item.value)
+            if self.keep_fontdef is True:
+                yield item
+        elif is_unicode_char_byte_count(item):
+            bc = get_unicode_char_byte_count(item)
+        elif is_unicode_encoded(item): # Unicode Chars
+            decoded = unicode_escape_to_chr(item.value).encode()
+            # Convert into STRING token
+            decoded_tok = Token('STRING',
+                                decoded,
+                                start_pos=item.start_pos,
+                                end_pos=item.end_pos,
+                                line=item.line,
+                                end_line=item.end_line,
+                                column=item.column,
+                                end_column=item.end_column)
+            log_text_extraction(f"UNICODE TOKEN {item}: {decoded_tok}")
+            yield decoded_tok
+        # Decode a hex array
+        elif is_hexarray(item):
+            # print("IS Hex?? {0}".format(item))
+            base_bytes = None
+            for hexchild in item.children:
+                if base_bytes is None:
+                    base_bytes = get_bytes_from_hex_encoded(hexchild.value)
+                else:
+                    base_bytes += get_bytes_from_hex_encoded(hexchild.value)
+            current_fontdef = self.font_table[self.font_stack[-1]]
+            current_codec = current_fontdef.codec
+            decoded_hex = decode_hex_char(base_bytes, current_codec)
+            # We are replacing a Tree. So, we need item.data to access its info token
+            decoded_hex_tok = Token('STRING',
+                                    decoded_hex,
+                                    start_pos=item.data.start_pos,
+                                    end_pos=item.data.end_pos,
+                                    line=item.data.line,
+                                    end_line=item.data.end_line,
+                                    column=item.data.column,
+                                    end_column=item.data.end_column)
+            yield decoded_hex_tok
+        elif isinstance(item, Tree):
+            # Run this same function recursively on nested trees
+            item.children = [i for i in self.iterate_on_children(item.children)]
+            yield item
+        else:
+            yield item
+    for i in set_fonts:
+        # Remove all fonts defined while in this group
+        self.font_stack.pop()
+
+
+
+def prep_unicode(self, children: List[lark.lexer.Token])
+
+def prep_unicode(self, children: List[Token]):
+    if includes_unicode_chars(children):
+        # Clean out all replacement chars
+        # log_text_extraction("Prepping Unicode Chars:" + repr(children))
+        children, ascii_map = remove_unicode_replacements(children,
+                                                          byte_count=self.ucbc)
+        # print("===\nCHILD:" + repr(children))
+        # print("===\nASCII:" + repr(ascii_map))
+        # Merge all surrogate pairs
+        children = merge_surrogate_chars(children,
+                                         ascii_map,
+                                         self.use_ASCII_alternatives_on_unicode_decode_failure)
+        # print("FINAL CHILDREN")
+        # log_text_extraction("Replaced Unicode Chars With: " + repr(children))
+    return children
+
+
+
+def set_font_info(self, obj: lark.tree.Tree)
+
+obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
+
+def set_font_info(self, obj: Tree):
+    """
+
+    obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
+    """
+    self.default_font = get_default_font(obj)
+    self.font_stack = [self.default_font]
+    raw_fonttbl = get_font_table(obj.children[1])
+    self.font_table = parse_font_tree(raw_fonttbl)
+    log_text_extraction(f"FONT TABLE FOUND: {raw_fonttbl}")
+
+
+
+def update_children(self, obj: lark.tree.Tree)
+
+obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
+
+def update_children(self, obj: Tree):
+    """
+
+    obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
+    """
+    # Reset font info
+    self.set_font_info(obj)
+    children = obj.children
+    obj.children = [i for i in self.iterate_on_children(children)]
+
+
+
+
+
+class fontdef(fnum, codepage, codec, fontdef_tree)
+
+fontdef(fnum, codepage, codec, fontdef_tree)
+
+Ancestors
+    • builtins.tuple
+
+Instance variables
+
+var codec
+    Alias for field number 2
+
+var codepage
+    Alias for field number 1
+
+var fnum
+    Alias for field number 0
+
+var fontdef_tree
+    Alias for field number 3
\ No newline at end of file
diff --git a/docs/RTFDE/text_extraction.md b/docs/RTFDE/text_extraction.md
new file mode 100644
index 0000000..7f2d11f
--- /dev/null
+++ b/docs/RTFDE/text_extraction.md
@@ -0,0 +1,279 @@
+Module RTFDE.text_extraction
+============================
+
+Functions
+---------
+
+
+`check_codepage_num(codepage_num: int) ‑> int`
+:   Provide the codepage number back to you if it is valid.
+
+    Args:
+        codepage_num (int): A possible codepage number.
+
+    Returns:
+        The codepage number IF it is a valid codepage number
+
+    Raises:
+        ValueError: The codepage_num provided isn't a valid codepage number.
+
+
+`decode_hex_char(item, codec)`
+:   Decode a bytes object using a specified codec.
+
+    item (bytes): A bytes object.
+    codec (str): The name of the codec to use to decode the bytes
+
+
+`decode_surrogate_pair(high: bytes, low: bytes, encoding: str = 'utf-16-le') ‑> bytes`
+:   Convert a pair of surrogate chars into the corresponding utf-16 encoded text string they should represent.
+
+    Args:
+        high (bytes): the high-surrogate code point
+        low (bytes): the low-surrogate code point
+        encoding (str): The encoding to apply to the final value. Defaults to 'utf-16-le' because: Microsoft uses UTF-16, little endian byte order. ( https://learn.microsoft.com/en-us/windows/win32/intl/using-byte-order-marks ) The Msg format is a Microsoft standard. Therefore, man is mortal.
+
+
+`get_bytes_from_hex_encoded(item)`
+:   Convert hex encoded string to bytes.
+
+    item (str): a hex encoded string in format \'XX
+
+
+`get_codepage_num_from_fcharset(fcharsetN: int) ‑> Optional[int]`
+:   Return the codepage to use with a specific fcharsetN.
+
+    Args:
+        fcharsetN (int): The numeric argument N for an \fcharsetN control word.
+
+    Returns:
+        (int OR None) Returns the int for a codepage if known. Returns None for unknown charsets or charsets with no corresponding codepage (such as OEM or DEFAULT.)
+
+
+`get_default_font(tree: lark.tree.Tree) ‑> Optional[str]`
+:   Extract the default font number controlword if it exists.
+
+    If an RTF file uses a default font, the default font number is specified with the \deffN control word, which must precede the font-table group.
+
+    Args:
+        tree (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree object.
+
+    Returns:
+        The default font control number if it exists from the first `\deffN`. None if not found.
+
+
+`get_font_table(tree: lark.tree.Tree) ‑> lark.tree.Tree`
+:   Extract the font table group from the first 20 tokens of a .rtf document.
+
+    Args:
+        tree (Tree): A .rtf document object parsed into a Tree object
+
+    Raises:
+        ValueError: If no group with a `\fonttbl` token as its first controlword is found.
+
+    Returns:
+        {'\f0': fontdef(fnum='\f0', codepage=932, codec='cp932', fontdef_tree='{\f0\fswiss\fcharset128 MS PGothic;}'),
+        '\f1': fontdef(fnum='\f1', codepage=None, codec=None, fontdef_tree='{\f1\fmodern MS Gothic;}'),
+        '\f2': fontdef(fnum='\f2', codepage=None, codec=None, fontdef_tree='{\f2\fnil\fcharset2 Symbol;}'),
+        '\f3': fontdef(fnum='\f3', codepage=1252, codec='cp1252', fontdef_tree='{\f3\fmodern\fcharset0 Courier New;}'),
+        '\f4': fontdef(fnum='\f4', codepage=932, codec='cp932', fontdef_tree='{\f4\fswiss\fcharset128 "PMingLiU";}'),
+        '\f5': fontdef(fnum='\f5', codepage=None, codec=None, fontdef_tree='{\f5\fswiss "Amnesty Trade Gothic";}'),
+        '\f6': fontdef(fnum='\f6', codepage=None, codec=None, fontdef_tree='{\f6\fswiss "Arial";}')}
+
+
+`get_python_codec(codepage_num: int) ‑> str`
+:   Returns the python codec needed to decode bytes to unicode.
+
+    Args:
+        codepage_num (int): A codepage number.
+
+    Returns:
+        The name of the codec in the Python codec registry. Used as the name for encoding/decoding.
+
+
+`get_unicode_char_byte_count(item: lark.lexer.Token) ‑> int`
+:
+
+
+`has_hexarray(children: List[Union[lark.lexer.Token, lark.tree.Tree]]) ‑> bool`
+:   Checks if a tree's children include a hexarray tree.
+
+    children (array): the children object from a tree.
+
+
+`includes_unicode_chars(children: List[lark.lexer.Token]) ‑> bool`
+:   Checks whether a list includes Tokens which contain unicode characters. Not recursive.
+
+    Args:
+        children (list): A Tree.children list to check to see if it includes unicode characters.
+
+    Returns:
+        True if the list includes tokens which contain unicode chars. False if not.
+
+
+`is_font_number(token: lark.lexer.Token) ‑> bool`
+:   Checks if an object is a "font number".
+
+    Returns:
+        True if an object is a "font number" controlword `\fN`. False if not.
+
+
+`is_hex_encoded(item: lark.lexer.Token) ‑> bool`
+:   Identify if a token contains a HEXENCODED token.
+
+    Args:
+        item (token): A token to check if it is HEXENCODED.
+
+    Returns:
+        True if HEXENCODED. False if not.
+
+
+`is_hexarray(item)`
+:   Checks if an item is a hexarray tree.
+
+    item (Tree or Token): an item to check to see if it is a hex array
+
+
+`is_surrogate_16bit(item: bytes, cp_range) ‑> bool`
+:   Checks if a unicode char is encoded as a 16 bit signed integer or as the raw unicode code point. Callers should first check that the char is a surrogate code point using the is_surrogate_XXXX_char functions.
+
+    Args:
+        item (bytes): A bytes representation of a string representing a unicode character.
+        cp_range (str): ['low' OR 'high'] The code point range (low-surrogate or high-surrogate).
+
+
+`is_surrogate_high_char(item: bytes) ‑> bool`
+:   Checks if chr is in the high-surrogate code point range. "High-surrogate code point: A Unicode code point in the range U+D800 to U+DBFF." The high-surrogate is also sometimes known as the leading surrogate.
+
+    item (bytes): A bytes representation of a string representing a unicode character. "\u-10179"
+
+
+`is_surrogate_low_char(item: bytes) ‑> bool`
+:   Checks if chr is in the low-surrogate code point range. "Low-surrogate code point: A Unicode code point in the range U+DC00 to U+DFFF." The low-surrogate is also sometimes known as the trailing surrogate.
+
+    item (bytes): A bytes representation of a string representing a unicode character.
+
+
+`is_surrogate_pair(first: bytes, second: bytes) ‑> bool`
+:   Check if a pair of unicode characters are a surrogate pair. Must be passed in the correct order.
+
+    Args:
+        first (bytes): A bytes representation of a string representing the high-order byte in a surrogate char.
+        second (bytes): A bytes representation of a string representing the low-order byte in a surrogate char.
+
+
+`is_unicode_char_byte_count(item: lark.lexer.Token) ‑> bool`
+:
+
+
+`is_unicode_encoded(item: lark.lexer.Token) ‑> bool`
+:   Checks whether a token contains a unicode char.
+
+    Args:
+        item (token): A token to check if it contains a unicode char.
+
+    Returns:
+        True if token contains a unicode char. False if not.
+
+
+`is_valid_ANSI_representation_char(item: lark.lexer.Token) ‑> bool`
+:   Checks whether a token contains a valid ANSI representation string for a Unicode char.
+
+    Args:
+        item (token): A token to check if it is a valid ANSI representation.
+
+    Returns:
+        True if token is an ansi representation of a unicode char. False if not.
+
+
+`merge_surrogate_chars(children, ascii_map, use_ASCII_alternatives_on_unicode_decode_failure=False)`
+:   Raises:
+        ValueError: A standalone high-surrogate was found: a high surrogate followed by an illegal low-surrogate character.
+
+
+`parse_font_tree(font_tree: lark.tree.Tree) ‑> dict`
+:   Create a font tree dictionary with the appropriate codecs to decode text.
+
+    Args:
+        font_tree (Tree): The .rtf font table object decoded as a tree.
+
+    Returns:
+        A dictionary which maps font numbers to the appropriate python codecs needed to decode text.
+
+
+`remove_unicode_replacements(children: List[lark.lexer.Token], return_ascii_map: bool = True, byte_count: int = 1) ‑> Union[Tuple[List[lark.lexer.Token], Dict[lark.lexer.Token, List[lark.lexer.Token]]], List[lark.lexer.Token]]`
+:   Remove all unicode replacement characters from a list of Tokens.
+
+    Args:
+        children (list): A Tree.children list to remove unicode replacement characters from.
+        return_ascii_map (bool): On True, have this function return a map of the ASCII tokens that were removed.
+        byte_count (int): The number of bytes corresponding to a given \uN Unicode character. A default of 1 should be assumed if no \uc keyword has been seen in the current or outer scopes.
+
+    Returns:
+        new_children (list): The list of Tokens with all unicode replacement characters removed.
+        ascii_map (dict): All the Tokens which were removed from the provided children list, keyed by the unicode token they replaced.
+
+
+`unicode_escape_to_chr(item: bytes) ‑> str`
+:   Convert unicode char from its decimal to its unicode character representation. From "\u[-]NNNNN" to the string representing the character whose Unicode code point that decimal represents.
+
+    Args:
+        item (bytes): An RTF escape in the format \u[-]NNNNN.
+
+    Returns:
+        The unicode character representation of the identified character
+
+    Raises:
+        ValueError: The escaped unicode character is not valid.
+
+
+`validate_ansi_cpg(header: str) ‑> None`
+:   Check an '\ansicpgNNNN' string to see if the number NNNN is an actual codepage.
+
+    Args:
+        header (str): The value from the lark `\ansicpg` CONTROLWORD Token.
+
+    Raises:
+        MalformedRtf: If the value passed is not a valid ansi codepage.
+
+Classes
+-------
+
+`TextDecoder(keep_fontdef=False, initial_byte_count=None, use_ASCII_alternatives_on_unicode_decode_failure=False)`
+:   keep_fontdef: (bool) If False (default), fontdefs will be removed from the object tree once they are processed.
+    initial_byte_count: (int) The initial Unicode Character Byte Count. Does not need to be set unless you are only providing an RTF snippet which does not contain the RTF header which sets the information.
+    use_ASCII_alternatives_on_unicode_decode_failure: (bool) If we encounter errors when decoding unicode chars we will use the ASCII alternative since that's what they are included for.
+
+    ### Methods
+
+    `iterate_on_children(self, children)`
+    :
+
+    `prep_unicode(self, children: List[lark.lexer.Token])`
+    :
+
+    `set_font_info(self, obj: lark.tree.Tree)`
+    :   obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
+
+    `update_children(self, obj: lark.tree.Tree)`
+    :   obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
+
+`fontdef(fnum, codepage, codec, fontdef_tree)`
+:   fontdef(fnum, codepage, codec, fontdef_tree)
+
+    ### Ancestors (in MRO)
+
+    * builtins.tuple
+
+    ### Instance variables
+
+    `codec`
+    :   Alias for field number 2
+
+    `codepage`
+    :   Alias for field number 1
+
+    `fnum`
+    :   Alias for field number 0
+
+    `fontdef_tree`
+    :   Alias for field number 3
\ No newline at end of file
diff --git a/docs/RTFDE/transformers.html b/docs/RTFDE/transformers.html
new file mode 100644
index 0000000..c81b97e
--- /dev/null
+++ b/docs/RTFDE/transformers.html
@@ -0,0 +1,1582 @@
+RTFDE.transformers API documentation
+
+
+

+Module RTFDE.transformers
+
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# This file is part of RTFDE, a RTF De-Encapsulator.
+# Copyright © 2020 seamus tuohy, <code@seamustuohy.com>
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.
+
+
+from typing import Union, List, Tuple
+from typing import TypedDict
+# From Python 3.9, typing.Generator is deprecated in favour of collections.abc.Generator
+from collections.abc import Generator
+
+from lark.visitors import Transformer
+from lark.visitors import v_args, Discard
+from lark.tree import Tree
+from lark.lexer import Token
+import re
+
+from RTFDE.utils import log_htmlrtf_stripping
+
+import logging
+log = logging.getLogger("RTFDE")
+
+class StripNonVisibleRTFGroups(Transformer):
+    """Visits each Token in provided RTF Trees and strips out any RTF groups which are non-visible when de-encapsulated into HTML.
+    """
+
+    @v_args(tree=True)
+    def group(self, tree: Tree):
+        """Transformer which aggressively seeks out possible non-visible RTF groups and replaces them with empty strings.
+
+NOTE: Currently deleting all groups that don't have an htmltag. Please file an issue if you find one that should be included in de-encapsulated HTML. I will refine what gets deleted and what is converted based on identified needs for greater functionality or specific issues which need to be addressed.
+
+Args:
+        tree: A .rtf group (Tree object) which needs its contents decoded.
+"""
+        children = tree.children
+        if len(children) == 0:
+            return b""
+        first_child = children[0]
+
+        known_control_groups = ["htmltag_group"]
+        if isinstance(first_child, Tree):
+            if first_child.data in known_control_groups:
+                return tree
+        known_non_visible_control_groups = ["mhtmltag_group"]
+        if isinstance(first_child, Tree):
+            if first_child.data in known_non_visible_control_groups:
+                # print(f"DELETING: {first_child} : because mhtmltag")
+                return b""
+
+        # process known non-visible groups
+        non_visible_control_words = [b"\\context", b"\\colortbl", b"\\fonttbl"]
+        first_control = self.get_first_controlword(children)
+        # print(f"FIRST: {first_control}")
+        if first_control in non_visible_control_words:
+            return b""
+
+        # Process star escaped groups
+        # NOTE: `understood_commands` is where we can include commands we decide to actively process during deencapsulation in the future.
+        # For example, if we added support for `destination text` we would need to add '\\bkmkstart' and '\\ud' so our processor doesn't delete those groups
+        understood_commands: List[str] = []
+        is_star_escaped = None
+        if (isinstance(first_child, Tree) and
+             len(first_child.children) != 0 ):
+            first_item = first_child.children[0]
+            if isinstance(first_item, Token):
+                if first_item.type == "STAR_ESCAPE":
+                    is_star_escaped = True
+        control_word = None
+        if is_star_escaped is True:
+            # print(f"STAR: {children}")
+            first_token = children[1]
+            if isinstance(first_token, Token):
+                if first_token.type == "CONTROLWORD":
+                    control_word = first_token
+                    if control_word.value in understood_commands:
+                        return tree
+                    return b""
+        return tree
+
+    @staticmethod
+    def get_first_controlword(children: List) -> Union[str,None]:
+        """Extracts the first control word from a .rtf group.
+
+Args:
+        children: A list of child objects within a .rtf group
+
+Returns:
+        The first controlword found in a group. Returns None if no controls words are found.
+        """
+        for i in children:
+            try:
+                if i.type == "CONTROLWORD":
+                    return i.value
+            except AttributeError:
+                continue
+        return None
+
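+A usage sketch (editorial): applied to a parsed tree, this transformer replaces non-visible groups such as `{\fonttbl ...}` and `{\colortbl ...}` with empty bytes while leaving `htmltag_group` groups intact.
+
+    >>> stripped_tree = StripNonVisibleRTFGroups().transform(parsed_tree)  # hypothetical Tree input
+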
+class RTFCleaner(Transformer):
+    """Visits each Token in provided RTF Trees. Converts all tokens that need converting. Deletes all tokens that shouldn't be visible. And, joins all strings that are left into one final string.
+    """
+
+    def start(self, args: List) -> bytes:
+        """Joins the .rtf object's string representations together at highest level object `start`.
+
+This is the final string combination. """
+        return b"".join(args)
+
+    def STRING(self, string: Token) -> bytes:
+        """Convert a string object into a raw string."""
+        if string.value is not None:
+            return string.value
+        return b""
+
+    def SPACE_SAVE(self, string: Token) -> bytes:
+        return string.value
+
+    def string(self, strings: List) -> bytes:
+        """Convert all string objects withing a string group into a single string."""
+        # print(strings)
+        return b"".join(strings)
+
+    def group(self, grp: List) -> bytes:
+        """Join the strings in all group objects."""
+        _new_children = []
+        for i in grp:
+            if isinstance(i, type(Discard)):
+                pass
+            else:
+                _new_children.append(i)
+        return b"".join(_new_children)
+
+    def document(self, args: List) -> bytes:
+        """Join the all the strings in an .rtf object into a single string representation of the document."""
+        args = [i for i in args if i is not None]
+        return b"".join(args)
+
+    def OPENPAREN(self, args: Token) -> bytes:
+        """Delete all open parens."""
+        return b""
+
+    def CLOSEPAREN(self, args: Token) -> bytes:
+        """Delete all closed parens."""
+        return b""
+
+    def mhtmltag_group(self, args: List):
+        """Process MHTMLTAG groups
+
+        Currently discarding because they don't need to be processed.
+
+Returns:
+        Always returns a discard object."""
+        return Discard
+
+    def htmltag_group(self, strings: List) -> bytes:
+        """HTMLTAG processing.
+
+Takes any string values within an HTMLTAG and returns them.
+        """
+        return b"".join(strings)
+
+    def HTMLTAG(self, htmltag: Token) -> bytes:
+        """Delete all HTMLTAG objects"""
+        return b""
+
+    def STAR_ESCAPE(self, char: Token) -> bytes:
+        """Delete all star escape objects"""
+        # '\\*': ''
+        return b""
+
+    def control_symbol(self, symbols: List) -> bytes:
+        """Join all visible symbols from in control symbol groups."""
+        return b"".join(symbols)
+
+    def NONBREAKING_SPACE(self, args: Token) -> bytes:
+        """Convert non-breaking spaces into visible representation."""
+        # '\\~': '\u00A0',
+        return u'\u00A0'.encode()
+
+    def NONBREAKING_HYPHEN(self, args: Token) -> bytes:
+        """Convert non-breaking hyphens into visible representation."""
+        # '\\_': '\u00AD'
+        return u'\u00AD'.encode()
+
+    def OPTIONAL_HYPHEN(self, args: Token) -> bytes:
+        """Convert hyphen control char into visible representation."""
+        # '\\-': '\u2027'
+        return u'\u2027'.encode()
+
+    def FORMULA_CHARACTER(self, args: Token) -> bytes:
+        """Convert a formula character into an empty string.
+
+If we are attempting to represent formula characters the scope for this library has grown too inclusive. This was only used by Word 5.1 for the Macintosh as the beginning delimiter for a string of formula typesetting commands."""
+        return b""
+
+    def INDEX_SUBENTRY(self, args: Token) -> bytes:
+        """Process index subentry items
+
+Discard index sub-entries, because we don't care about indexes when de-encapsulating at this time."""
+        return b""
+
+    def CONTROLSYMBOL(self, args: Token) -> bytes:
+        """Convert encoded chars which are mis-categorized as control symbols into their respective chars. Delete all the other ones."""
+        symbols = {
+            b'\\{': b'\x7B',
+            b'\\}': b'\x7D',
+            b'\\\\': b'\x5C',
+        }
+        replacement = symbols.get(args.value, None)
+        # If this is simply a character to replace then return the value
+        if replacement is not None:
+            return replacement
+        return b""
+
+    def CONTROLWORD(self, args: Token) -> bytes:
+        """Convert encoded chars which are mis-categorized as control words into their respective chars. Delete all the other ones.
+        """
+        words = {
+            b'\\par': b'\n',
+            b'\\tab': b'\t',
+            b'\\line': b'\n',
+            # NOTE: '\uXXXX' escapes are not interpreted inside bytes literals,
+            # so these characters are encoded from str to produce the real bytes.
+            b'\\lquote': '\u2018'.encode(),
+            b'\\rquote': '\u2019'.encode(),
+            b'\\ldblquote': '\u201C'.encode(),
+            b'\\rdblquote': '\u201D'.encode(),
+            b'\\bullet': '\u2022'.encode(),
+            b'\\endash': '\u2013'.encode(),
+            b'\\emdash': '\u2014'.encode()
+        }
+        replacement = words.get(args.value, None)
+        # If this is simply a character to replace then return the value as a string
+        if replacement is not None:
+            return replacement
+        return b""
+
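+A usage sketch (editorial): after the stripping transformers have run, RTFCleaner collapses the remaining tree into the final de-encapsulated bytes; for example, each `\par` CONTROLWORD becomes b'\n'.
+
+    >>> html_bytes = RTFCleaner(visit_tokens=True).transform(stripped_tree)  # hypothetical Tree; visit_tokens enables the token-level methods above
+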
+def get_stripped_HTMLRTF_values(tree: Tree, current_state: Union[bool,None] = None) -> Generator:
+    """Get a list of Tokens which should be suppressed by HTMLRTF control words.
+
+
+    NOTE: This de-encapsulation supports the HTMLRTF control word within nested groups. The state of the HTMLRTF control word transfers when entering groups and is restored when exiting groups, as specified in [MSFT-RTF].
+
+Returns:
+    A list of Tokens which should be suppressed by HTMLRTF control words.
+    """
+    if current_state is None:
+        htmlrtf_stack = [False]
+    else:
+        htmlrtf_stack = [current_state]
+    for child in tree.children:
+        is_htmlrtf = None
+        if isinstance(child, Tree):
+            # A de-encapsulating RTF reader MUST support the HTMLRTF control word within nested groups. The state of the HTMLRTF control word MUST transfer when entering groups and be restored when exiting groups, as specified in [MSFT-RTF].
+            for toggle in get_stripped_HTMLRTF_values(child, htmlrtf_stack[-1]):
+                yield toggle
+        else:
+            is_htmlrtf = toggle_htmlrtf(child)
+            if is_htmlrtf is not None:
+                htmlrtf_stack.append(is_htmlrtf)
+                yield child
+            elif htmlrtf_stack[-1] is True:
+                yield child
+
+def toggle_htmlrtf(child: Union[Token,str]) -> Union[bool,None]:
+    """Identify if htmlrtf is being turned on or off.
+
+Returns:
+    Bool representing if htmlrtf is being enabled or disabled. None if object is not an HTMLRTF token.
+"""
+    if isinstance(child, Token):
+        if child.type == "HTMLRTF":
+            htmlrtfstr = child.value.decode().strip()
+            if (len(htmlrtfstr) > 0 and htmlrtfstr[-1] == "0"):
+                return False
+            return True
+    return None
+
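+A usage sketch (editorial): `\htmlrtf` enables suppression, `\htmlrtf0` disables it, and any non-HTMLRTF token returns None.
+
+    >>> toggle_htmlrtf(Token('HTMLRTF', b'\\htmlrtf '))
+    True
+    >>> toggle_htmlrtf(Token('HTMLRTF', b'\\htmlrtf0 '))
+    False
+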
+class DeleteTokensFromTree(Transformer):
+    """Removes a series of tokens from a Tree.
+
+Parameters:
+    tokens_to_delete: A list of tokens to delete from the Tree object. (sets self.to_delete)
+
+Attributes:
+    to_delete: A list of tokens to delete from the Tree object.
+    delete_start_pos: The starting position for all the identified tokens. Used to identify which tokens to delete.
+"""
+
+    def __init__(self, tokens_to_delete: List[Token]):
+        """Setup attributes including token start_pos tracking.
+
+Args:
+    tokens_to_delete: A list of tokens to delete from the Tree object. (sets self.to_delete)
+"""
+        super().__init__()
+        self.to_delete = tokens_to_delete
+        self.delete_start_pos = {i.start_pos for i in self.to_delete}
+
+    def __default_token__(self, token: Token):
+        """Discard any identified tokens.
+
+Args:
+        token: All tokens within the transformed tree.
+
+Returns:
+        Returns all non-identified tokens. Returns Discard objects for any identified tokens.
+"""
+        # print"(Evaluating token {0} at {1} to consider deleting".format(child.value, child.end_pos))
+        if isinstance(token, Token):
+            if token.start_pos in self.delete_start_pos:
+                for i in self.to_delete:
+                    if (i.start_pos == token.start_pos and
+                        i.end_pos == token.end_pos and
+                        i.value == token.value):
+                        log_htmlrtf_stripping(i)
+                        # print(f"DELETING: {i}")
+                        return Discard
+        return token
+
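+A usage sketch (editorial): combined with get_stripped_HTMLRTF_values, this removes every HTMLRTF-suppressed token from a tree.
+
+    >>> to_delete = list(get_stripped_HTMLRTF_values(tree))        # hypothetical Tree input
+    >>> cleaned_tree = DeleteTokensFromTree(to_delete).transform(tree)
+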
+class StripUnusedSpecialCharacters(Transformer):
+    """Strip all unused tokens which lark has extracted from the RTF.
+
+These tokens are largely artifacts of the RTF format.
+
+We have to do this because we use the "keep_all_tokens" option in our lark parser. It's better to be explicit than to allow for ambiguity because of the grammar.
+    """
+
+    def _LBRACE(self, token: Token):
+        """Remove RTF braces.
+
+Returns:
+        Always returns a discard object."""
+        return Discard
+
+    def _RBRACE(self, token: Token):
+        """Remove RTF braces.
+
+Returns:
+        Always returns a discard object."""
+        return Discard
+
+    def _SPACE_DELETE(self, token: Token):
+        """Remove spaces which are not a part of the content
+
+These are mostly spaces used to separate control words from the content they precede.
+
+Returns:
+        Always returns a discard object.
+        """
+        return Discard
+
+
+class StripControlWords(Transformer):
+    """Visits each control word and strips the whitespace from around it.
+    """
+
+    def CONTROLWORD(self, token: Token):
+        """Strips the whitespace from around a provided control word.
+
+Args:
+        token: A CONTROLWORD token to strip whitespace from.
+        """
+        tok = token.update(value=token.value.strip())
+        return tok
+
+
+def strip_binary_objects(raw_rtf: bytes) -> tuple:
+    """Extracts binary objects from a rtf file.
+
+Parameters:
+    raw_rtf: (bytes): The raw RTF file as bytes.
+
+Returns:
+    A tuple containing (new_raw, found_bytes)
+        new_raw: (bytes) A bytes object where any binary data has been removed.
+        found_bytes: (list) List of dictionaries containing binary data extracted from the rtf file. Each dictionary includes the data extracted, where it was extracted from in the original rtf file and where it can be inserted back into the stripped output.
+
+    Description of found_bytes dictionaries:
+
+        "bytes": (bytes) The binary data contained which was extracted.
+        "ctrl_char": (tuple) Tuple containing the binary control word and its numeric parameter
+        "start_pos": (int) The position (in the original raw rtf data) where the binary control word started.
+        "bin_start_pos": (int) The position (in the original raw rtf data) where the binary data starts.
+        "end_pos": (int) The position (in the original raw rtf data) where the binary data ends.
+
+    Here is an example of what this looks like (displayed using the printable representation so you can see the bytes, with the dict keys split onto new lines for readability):
+        >> print(repr(found_bytes))
+
+        "{'bytes': b'\\xf4UP\\x13\\xdb\\xe4\\xe6CO\\xa8\\x16\\x10\\x8b\\n\\xfbA\\x9d\\xc5\\xd1C',
+          'ctrl_char': (b'\\\\bin', b'20'),
+          'start_pos': 56,
+          'end_pos': 83,
+          'bin_start_pos': 63}"
+    """
+    found_bytes = []
+    byte_finder = rb'(\\bin)([0-9]+)[ ]?'
+    for matchitem in re.finditer(byte_finder, raw_rtf):
+        param = int(matchitem[2])
+        bin_start_pos = matchitem.span()[-1]
+        byte_obj = {"bytes": raw_rtf[bin_start_pos:bin_start_pos+param],
+                    "ctrl_char": matchitem.groups(),
+                    "start_pos": matchitem.span()[0],
+                    "end_pos": bin_start_pos+param,
+                    "bin_start_pos": bin_start_pos
+                    }
+        # byte_obj : dict[str, Union[bytes, int, Tuple[bytes, bytes]]]
+        found_bytes.append(byte_obj)
+    new_raw = b''
+    start_buffer = 0
+    for new_bytes in found_bytes:
+        new_raw += raw_rtf[start_buffer:new_bytes["start_pos"]]
+        start_buffer = new_bytes["end_pos"]
+    new_raw += raw_rtf[start_buffer:]
+    return (new_raw, found_bytes)
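+
+# Editorial usage sketch (not part of this patch):
+#
+#     raw = b"{\\rtf1 before\\bin5 ABCDEafter}"
+#     new_raw, found = strip_binary_objects(raw)
+#     new_raw                  # b"{\\rtf1 beforeafter}"
+#     found[0]["bytes"]        # b"ABCDE"
+#     found[0]["ctrl_char"]    # (b"\\bin", b"5")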
+
+
+
+
+
+
+
+

Functions

+
+
+def get_stripped_HTMLRTF_values(tree: lark.tree.Tree, current_state: Optional[bool] = None) ‑> collections.abc.Generator +
+
+

Get a list of Tokens which should be suppressed by HTMLRTF control words.

+
NOTE: This de-encapsulation supports the HTMLRTF control word within nested groups. The state of the HTMLRTF control word transfers when entering groups and is restored when exiting groups, as specified in [MSFT-RTF].
+
+

Returns

+

A list of Tokens which should be suppressed by HTMLRTF control words.

+
+ +Expand source code + +
def get_stripped_HTMLRTF_values(tree: Tree, current_state: Union[bool,None] = None) -> Generator:
+    """Get a list of Tokens which should be suppressed by HTMLRTF control words.
+
+
+    NOTE: This de-encapsulation supports the HTMLRTF control word within nested groups. The state of the HTMLRTF control word transfers when entering groups and is restored when exiting groups, as specified in [MSFT-RTF].
+
+Returns:
+    A list of Tokens which should be suppressed by HTMLRTF control words.
+    """
+    if current_state is None:
+        htmlrtf_stack = [False]
+    else:
+        htmlrtf_stack = [current_state]
+    for child in tree.children:
+        is_htmlrtf = None
+        if isinstance(child, Tree):
+            # A de-encapsulating RTF reader MUST support the HTMLRTF control word within nested groups. The state of the HTMLRTF control word MUST transfer when entering groups and be restored when exiting groups, as specified in [MSFT-RTF].
+            for toggle in get_stripped_HTMLRTF_values(child, htmlrtf_stack[-1]):
+                yield toggle
+        else:
+            is_htmlrtf = toggle_htmlrtf(child)
+            if is_htmlrtf is not None:
+                htmlrtf_stack.append(is_htmlrtf)
+                yield child
+            elif htmlrtf_stack[-1] is True:
+                yield child
+
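+
A minimal editorial sketch (not from the patch) of the suppression behavior:

    from lark import Tree
    from lark.lexer import Token

    tree = Tree("group", [
        Token("HTMLRTF", b"\\htmlrtf "),       # suppression on
        Token("STRING", b"RTF-only content"),
        Token("HTMLRTF", b"\\htmlrtf0 "),      # suppression off
        Token("STRING", b"kept for HTML"),
    ])
    suppressed = list(get_stripped_HTMLRTF_values(tree))
    # suppressed holds both HTMLRTF tokens plus b"RTF-only content";
    # b"kept for HTML" is not suppressed.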
+
+
+def strip_binary_objects(raw_rtf: bytes) ‑> tuple +
+
+

Extracts binary objects from an RTF file.

+

Parameters

+

raw_rtf: (bytes) The raw RTF file as bytes.

+

Returns

+
+
A tuple containing (new_raw, found_bytes)
+
+new_raw
+
(bytes) A bytes object where any binary data has been removed.
+
+found_bytes
+
(list) List of dictionaries containing binary data extracted from the RTF file. Each dictionary includes the data extracted, where it was extracted from in the original RTF file, and where it can be inserted back into the stripped output.
+
+

Description of found_bytes dictionaries:

+
"bytes": (bytes) The binary data contained which was extracted.
+"ctrl_char": (tuple) Tuple containing the binary control word and its numeric parameter
+"start_pos": (int) The position (in the original raw rtf data) where the binary control word started.
+"bin_start_pos": (int) The position (in the original raw rtf data) where the binary data starts.
+"end_pos": (int) The position (in the original raw rtf data) where the binary data ends.
+
+

Here is an example of what this looks like (displayed using the printable representation so you can see the bytes, with the dict keys split onto new lines for readability):
+>> print(repr(found_bytes))

+
"{'bytes': b'\xf4UP\x13\xdb\xe4\xe6CO\xa8\x16\x10\x8b\n\xfbA\x9d\xc5\xd1C',
+  'ctrl_char': (b'\\bin', b'20'),
+  'start_pos': 56,
+  'end_pos': 83,
+  'bin_start_pos': 63}"
+
+
+ +Expand source code + +
def strip_binary_objects(raw_rtf: bytes) -> tuple:
+    """Extracts binary objects from a rtf file.
+
+Parameters:
+    raw_rtf: (bytes) The raw RTF file as bytes.
+
+Returns:
+    A tuple containing (new_raw, found_bytes)
+        new_raw: (bytes) A bytes object where any binary data has been removed.
+        found_bytes: (list) List of dictionaries containing binary data extracted from the rtf file. Each dictionary includes the data extracted, where it was extracted from in the original rtf file and where it can be inserted back into the stripped output.
+
+    Description of found_bytes dictionaries:
+
+        "bytes": (bytes) The binary data contained which was extracted.
+        "ctrl_char": (tuple) Tuple containing the binary control word and its numeric parameter
+        "start_pos": (int) The position (in the original raw rtf data) where the binary control word started.
+        "bin_start_pos": (int) The position (in the original raw rtf data) where the binary data starts.
+        "end_pos": (int) The position (in the original raw rtf data) where the binary data ends.
+
+    Here is an example of what this looks like (displayed using the printable representation so you can see the bytes, with the dict keys split onto new lines for readability):
+        >> print(repr(found_bytes))
+
+        "{'bytes': b'\\xf4UP\\x13\\xdb\\xe4\\xe6CO\\xa8\\x16\\x10\\x8b\\n\\xfbA\\x9d\\xc5\\xd1C',
+          'ctrl_char': (b'\\\\bin', b'20'),
+          'start_pos': 56,
+          'end_pos': 83,
+          'bin_start_pos': 63}"
+    """
+    found_bytes = []
+    byte_finder = rb'(\\bin)([0-9]+)[ ]?'
+    for matchitem in re.finditer(byte_finder, raw_rtf):
+        param = int(matchitem[2])
+        bin_start_pos = matchitem.span()[-1]
+        byte_obj = {"bytes": raw_rtf[bin_start_pos:bin_start_pos+param],
+                    "ctrl_char": matchitem.groups(),
+                    "start_pos": matchitem.span()[0],
+                    "end_pos": bin_start_pos+param,
+                    "bin_start_pos": bin_start_pos
+                    }
+        # byte_obj : dict[str, Union[bytes, int, Tuple[bytes, bytes]]]
+        found_bytes.append(byte_obj)
+    new_raw = b''
+    start_buffer = 0
+    for new_bytes in found_bytes:
+        new_raw += raw_rtf[start_buffer:new_bytes["start_pos"]]
+        start_buffer = new_bytes["end_pos"]
+    new_raw += raw_rtf[start_buffer:]
+    return (new_raw, found_bytes)
+
+
+
+def toggle_htmlrtf(child: Union[lark.lexer.Token, str]) ‑> Optional[bool] +
+
+

Identify if htmlrtf is being turned on or off.

+

Returns

+

Bool representing if htmlrtf is being enabled or disabled. None if object is not an HTMLRTF token.

+
+ +Expand source code + +
def toggle_htmlrtf(child: Union[Token,str]) -> Union[bool,None]:
+    """Identify if htmlrtf is being turned on or off.
+
+Returns:
+    Bool representing if htmlrtf is being enabled or disabled. None if object is not an HTMLRTF token.
+"""
+    if isinstance(child, Token):
+        if child.type == "HTMLRTF":
+            htmlrtfstr = child.value.decode().strip()
+            if (len(htmlrtfstr) > 0 and htmlrtfstr[-1] == "0"):
+                return False
+            return True
+    return None
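+
For example, a minimal editorial sketch (not from the patch):

    from lark.lexer import Token

    toggle_htmlrtf(Token("HTMLRTF", b"\\htmlrtf "))   # True: suppression turned on
    toggle_htmlrtf(Token("HTMLRTF", b"\\htmlrtf0 "))  # False: suppression turned off
    toggle_htmlrtf("not a token")                     # None: not an HTMLRTF token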
+
+
+
+
+
+

Classes

+
+
+class DeleteTokensFromTree +(tokens_to_delete: List[lark.lexer.Token]) +
+
+

Removes a series of tokens from a Tree.

+

Parameters

+

tokens_to_delete: A list of tokens to delete from the Tree object. (sets self.to_delete)

+

Attributes

+
+
to_delete
+
A list of tokens to delete from the Tree object.
+
delete_start_pos
+
The starting position for all the identified tokens. Used to identify which tokens to delete.
+
+

Setup attributes including token start_pos tracking.

+

Args

+
+
tokens_to_delete
+
A list of tokens to delete from the Tree object. (sets self.to_delete)
+
+
+ +Expand source code + +
class DeleteTokensFromTree(Transformer):
+    """Removes a series of tokens from a Tree.
+
+Parameters:
+    tokens_to_delete: A list of tokens to delete from the Tree object. (sets self.to_delete)
+
+Attributes:
+    to_delete: A list of tokens to delete from the Tree object.
+    delete_start_pos: The starting position for all the identified tokens. Used to identify which tokens to delete.
+"""
+
+    def __init__(self, tokens_to_delete: List[Token]):
+        """Setup attributes including token start_pos tracking.
+
+Args:
+    tokens_to_delete: A list of tokens to delete from the Tree object. (sets self.to_delete)
+"""
+        super().__init__()
+        self.to_delete = tokens_to_delete
+        self.delete_start_pos = {i.start_pos for i in self.to_delete}
+
+    def __default_token__(self, token: Token):
+        """Discard any identified tokens.
+
+Args:
+        token: All tokens within the transformed tree.
+
+Returns:
+        Returns all non-identified tokens. Returns Discard objects for any identified tokens.
+"""
+        # print"(Evaluating token {0} at {1} to consider deleting".format(child.value, child.end_pos))
+        if isinstance(token, Token):
+            if token.start_pos in self.delete_start_pos:
+                for i in self.to_delete:
+                    if (i.start_pos == token.start_pos and
+                        i.end_pos == token.end_pos and
+                        i.value == token.value):
+                        log_htmlrtf_stripping(i)
+                        # print(f"DELETING: {i}")
+                        return Discard
+        return token
+
+

Ancestors

+
  • lark.visitors.Transformer
  • lark.visitors._Decoratable
  • abc.ABC
  • typing.Generic
+
+
+class RTFCleaner +(visit_tokens: bool = True) +
+
+

Visits each Token in the provided RTF Trees: converts all tokens that need converting, deletes all tokens that shouldn't be visible, and joins all remaining strings into one final string.

+
+ +Expand source code + +
class RTFCleaner(Transformer):
+    """Visits each Token in provided RTF Trees. Converts all tokens that need converting. Deletes all tokens that shouldn't be visible. And, joins all strings that are left into one final string.
+    """
+
+    def start(self, args: List) -> bytes:
+        """Joins the .rtf object's string representations together at highest level object `start`.
+
+This is the final string combination. """
+        return b"".join(args)
+
+    def STRING(self, string: Token) -> bytes:
+        """Convert a string object into a raw string."""
+        if string.value is not None:
+            return string.value
+        return b""
+
+    def SPACE_SAVE(self, string: Token) -> bytes:
+        return string.value
+
+    def string(self, strings: List) -> bytes:
+        """Convert all string objects withing a string group into a single string."""
+        # print(strings)
+        return b"".join(strings)
+
+    def group(self, grp: List) -> bytes:
+        """Join the strings in all group objects."""
+        _new_children = []
+        for i in grp:
+            if isinstance(i, type(Discard)):
+                pass
+            else:
+                _new_children.append(i)
+        return b"".join(_new_children)
+
+    def document(self, args: List) -> bytes:
+        """Join the all the strings in an .rtf object into a single string representation of the document."""
+        args = [i for i in args if i is not None]
+        return b"".join(args)
+
+    def OPENPAREN(self, args: Token) -> bytes:
+        """Delete all open parens."""
+        return b""
+
+    def CLOSEPAREN(self, args: Token) -> bytes:
+        """Delete all closed parens."""
+        return b""
+
+    def mhtmltag_group(self, args: List):
+        """Process MHTMLTAG groups
+
+        Currently discarding because they don't need to be processed.
+
+Returns:
+        Always returns a discard object."""
+        return Discard
+
+    def htmltag_group(self, strings: List) -> bytes:
+        """HTMLTAG processing.
+
+Takes any string values within an HTMLTAG and returns them.
+        """
+        return b"".join(strings)
+
+    def HTMLTAG(self, htmltag: Token) -> bytes:
+        """Delete all HTMLTAG objects"""
+        return b""
+
+    def STAR_ESCAPE(self, char: Token) -> bytes:
+        """Delete all star escape objects"""
+        # '\\*': ''
+        return b""
+
+    def control_symbol(self, symbols: List) -> bytes:
+        """Join all visible symbols from in control symbol groups."""
+        return b"".join(symbols)
+
+    def NONBREAKING_SPACE(self, args: Token) -> bytes:
+        """Convert non-breaking spaces into visible representation."""
+        # '\\~': '\u00A0',
+        return u'\u00A0'.encode()
+
+    def NONBREAKING_HYPHEN(self, args: Token) -> bytes:
+        """Convert non-breaking hyphens into visible representation."""
+        # '\\_': '\u00AD'
+        return u'\u00AD'.encode()
+
+    def OPTIONAL_HYPHEN(self, args: Token) -> bytes:
+        """Convert hyphen control char into visible representation."""
+        # '\\-': '\u2027'
+        return u'\u2027'.encode()
+
+    def FORMULA_CHARACTER(self, args: Token) -> bytes:
+        """Convert a formula character into an empty string.
+
+If we are attempting to represent formula characters the scope for this library has grown too inclusive. This was only used by Word 5.1 for the Macintosh as the beginning delimiter for a string of formula typesetting commands."""
+        return b""
+
+    def INDEX_SUBENTRY(self, args: Token) -> bytes:
+        """Process index subentry items
+
+Discard index sub-entries. We don't care about indexes when de-encapsulating at this time."""
+        return b""
+
+    def CONTROLSYMBOL(self, args: Token) -> bytes:
+        """Convert encoded chars which are mis-categorized as control symbols into their respective chars. Delete all the other ones."""
+        symbols = {
+            b'\\{': b'\x7B',
+            b'\\}': b'\x7D',
+            b'\\\\': b'\x5C',
+        }
+        replacement = symbols.get(args.value, None)
+        # If this is simply a character to replace then return the value
+        if replacement is not None:
+            return replacement
+        return b""
+
+    def CONTROLWORD(self, args: Token) -> bytes:
+        """Convert encoded chars which are mis-categorized as control words into their respective chars. Delete all the other ones.
+        """
+        words = {
+            b'\\par': b'\n',
+            b'\\tab': b'\t',
+            b'\\line': b'\n',
+            # NOTE: \uXXXX escapes are not recognized inside bytes literals,
+            # so the punctuation characters must be encoded from str to get
+            # the actual characters rather than the literal text "\uXXXX".
+            b'\\lquote': '\u2018'.encode(),
+            b'\\rquote': '\u2019'.encode(),
+            b'\\ldblquote': '\u201C'.encode(),
+            b'\\rdblquote': '\u201D'.encode(),
+            b'\\bullet': '\u2022'.encode(),
+            b'\\endash': '\u2013'.encode(),
+            b'\\emdash': '\u2014'.encode()
+        }
+        replacement = words.get(args.value, None)
+        # If this is simply a character to replace then return the value as a string
+        if replacement is not None:
+            return replacement
+        return b""
+
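+
A minimal editorial sketch (not from the patch) of the final join behavior:

    from lark import Tree
    from lark.lexer import Token

    tree = Tree("start", [
        Tree("string", [Token("STRING", b"Hello ")]),
        Tree("string", [Token("STRING", b"world")]),
    ])
    RTFCleaner().transform(tree)  # returns b"Hello world"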
+

Ancestors

+
  • lark.visitors.Transformer
  • lark.visitors._Decoratable
  • abc.ABC
  • typing.Generic
+

Methods

+
+
+def CLOSEPAREN(self, args: lark.lexer.Token) ‑> bytes +
+
+

Delete all closed parens.

+
+ +Expand source code + +
def CLOSEPAREN(self, args: Token) -> bytes:
+    """Delete all closed parens."""
+    return b""
+
+
+
+def CONTROLSYMBOL(self, args: lark.lexer.Token) ‑> bytes +
+
+

Convert encoded chars which are mis-categorized as control symbols into their respective chars. Delete all the other ones.

+
+ +Expand source code + +
def CONTROLSYMBOL(self, args: Token) -> bytes:
+    """Convert encoded chars which are mis-categorized as control symbols into their respective chars. Delete all the other ones."""
+    symbols = {
+        b'\\{': b'\x7B',
+        b'\\}': b'\x7D',
+        b'\\\\': b'\x5C',
+    }
+    replacement = symbols.get(args.value, None)
+    # If this is simply a character to replace then return the value
+    if replacement is not None:
+        return replacement
+    return b""
+
+
+
+def CONTROLWORD(self, args: lark.lexer.Token) ‑> bytes +
+
+

Convert encoded chars which are mis-categorized as control words into their respective chars. Delete all the other ones.

+
+ +Expand source code + +
def CONTROLWORD(self, args: Token) -> bytes:
+    """Convert encoded chars which are mis-categorized as control words into their respective chars. Delete all the other ones.
+    """
+    words = {
+        b'\\par': b'\n',
+        b'\\tab': b'\t',
+        b'\\line': b'\n',
+        # NOTE: \uXXXX escapes are not recognized inside bytes literals,
+        # so the punctuation characters must be encoded from str.
+        b'\\lquote': '\u2018'.encode(),
+        b'\\rquote': '\u2019'.encode(),
+        b'\\ldblquote': '\u201C'.encode(),
+        b'\\rdblquote': '\u201D'.encode(),
+        b'\\bullet': '\u2022'.encode(),
+        b'\\endash': '\u2013'.encode(),
+        b'\\emdash': '\u2014'.encode()
+    }
+    replacement = words.get(args.value, None)
+    # If this is simply a character to replace then return the value as a string
+    if replacement is not None:
+        return replacement
+    return b""
+
+
+
+def FORMULA_CHARACTER(self, args: lark.lexer.Token) ‑> bytes +
+
+

Convert a formula character into an empty string.

+

If we are attempting to represent formula characters the scope for this library has grown too inclusive. This was only used by Word 5.1 for the Macintosh as the beginning delimiter for a string of formula typesetting commands.

+
+ +Expand source code + +
    def FORMULA_CHARACTER(self, args: Token) -> bytes:
+        """Convert a formula character into an empty string.
+
+If we are attempting to represent formula characters the scope for this library has grown too inclusive. This was only used by Word 5.1 for the Macintosh as the beginning delimiter for a string of formula typesetting commands."""
+        return b""
+
+
+
+def HTMLTAG(self, htmltag: lark.lexer.Token) ‑> bytes +
+
+

Delete all HTMLTAG objects

+
+ +Expand source code + +
def HTMLTAG(self, htmltag: Token) -> bytes:
+    """Delete all HTMLTAG objects"""
+    return b""
+
+
+
+def INDEX_SUBENTRY(self, args: lark.lexer.Token) ‑> bytes +
+
+

Process index subentry items

+

Discard index sub-entries. We don't care about indexes when de-encapsulating at this time.

+
+ +Expand source code + +
    def INDEX_SUBENTRY(self, args: Token) -> bytes:
+        """Process index subentry items
+
+Discard index sub-entries. We don't care about indexes when de-encapsulating at this time."""
+        return b""
+
+
+
+def NONBREAKING_HYPHEN(self, args: lark.lexer.Token) ‑> bytes +
+
+

Convert non-breaking hyphens into visible representation.

+
+ +Expand source code + +
def NONBREAKING_HYPHEN(self, args: Token) -> bytes:
+    """Convert non-breaking hyphens into visible representation."""
+    # '\\_': '\u00AD'
+    return u'\u00AD'.encode()
+
+
+
+def NONBREAKING_SPACE(self, args: lark.lexer.Token) ‑> bytes +
+
+

Convert non-breaking spaces into visible representation.

+
+ +Expand source code + +
def NONBREAKING_SPACE(self, args: Token) -> bytes:
+    """Convert non-breaking spaces into visible representation."""
+    # '\\~': '\u00A0',
+    return u'\u00A0'.encode()
+
+
+
+def OPENPAREN(self, args: lark.lexer.Token) ‑> bytes +
+
+

Delete all open parens.

+
+ +Expand source code + +
def OPENPAREN(self, args: Token) -> bytes:
+    """Delete all open parens."""
+    return b""
+
+
+
+def OPTIONAL_HYPHEN(self, args: lark.lexer.Token) ‑> bytes +
+
+

Convert hyphen control char into visible representation.

+
+ +Expand source code + +
def OPTIONAL_HYPHEN(self, args: Token) -> bytes:
+    """Convert hyphen control char into visible representation."""
+    # '\\-': '\u2027'
+    return u'\u2027'.encode()
+
+
+
+def SPACE_SAVE(self, string: lark.lexer.Token) ‑> bytes +
+
+
+
+ +Expand source code + +
def SPACE_SAVE(self, string: Token) -> bytes:
+    return string.value
+
+
+
+def STAR_ESCAPE(self, char: lark.lexer.Token) ‑> bytes +
+
+

Delete all star escape objects

+
+ +Expand source code + +
def STAR_ESCAPE(self, char: Token) -> bytes:
+    """Delete all star escape objects"""
+    # '\\*': ''
+    return b""
+
+
+
+def STRING(self, string: lark.lexer.Token) ‑> bytes +
+
+

Convert a string object into a raw string.

+
+ +Expand source code + +
def STRING(self, string: Token) -> bytes:
+    """Convert a string object into a raw string."""
+    if string.value is not None:
+        return string.value
+    return b""
+
+
+
+def control_symbol(self, symbols: List) ‑> bytes +
+
+

Join all visible symbols in control symbol groups.

+
+ +Expand source code + +
def control_symbol(self, symbols: List) -> bytes:
+    """Join all visible symbols from in control symbol groups."""
+    return b"".join(symbols)
+
+
+
+def document(self, args: List) ‑> bytes +
+
+

Join all the strings in an .rtf object into a single string representation of the document.

+
+ +Expand source code + +
def document(self, args: List) -> bytes:
+    """Join the all the strings in an .rtf object into a single string representation of the document."""
+    args = [i for i in args if i is not None]
+    return b"".join(args)
+
+
+
+def group(self, grp: List) ‑> bytes +
+
+

Join the strings in all group objects.

+
+ +Expand source code + +
def group(self, grp: List) -> bytes:
+    """Join the strings in all group objects."""
+    _new_children = []
+    for i in grp:
+        if isinstance(i, type(Discard)):
+            pass
+        else:
+            _new_children.append(i)
+    return b"".join(_new_children)
+
+
+
+def htmltag_group(self, strings: List) ‑> bytes +
+
+

HTMLTAG processing.

+

Takes any string values within an HTMLTAG and returns them.

+
+ +Expand source code + +
    def htmltag_group(self, strings: List) -> bytes:
+        """HTMLTAG processing.
+
+Takes any string values within an HTMLTAG and returns them.
+        """
+        return b"".join(strings)
+
+
+
+def mhtmltag_group(self, args: List) +
+
+

Process MHTMLTAG groups

+
    Currently discarding because they don't need to be processed.
+
+

Returns

+

Always returns a discard object.

+
+ +Expand source code + +
    def mhtmltag_group(self, args: List):
+        """Process MHTMLTAG groups
+
+        Currently discarding because they don't need to be processed.
+
+Returns:
+        Always returns a discard object."""
+        return Discard
+
+
+
+def start(self, args: List) ‑> bytes +
+
+

Joins the .rtf object's string representations together at highest level object start.

+

This is the final string combination.

+
+ +Expand source code + +
    def start(self, args: List) -> bytes:
+        """Joins the .rtf object's string representations together at highest level object `start`.
+
+This is the final string combination. """
+        return b"".join(args)
+
+
+
+def string(self, strings: List) ‑> bytes +
+
+

Convert all string objects within a string group into a single string.

+
+ +Expand source code + +
def string(self, strings: List) -> bytes:
+    """Convert all string objects withing a string group into a single string."""
+    # print(strings)
+    return b"".join(strings)
+
+
+
+
+
+class StripControlWords +(visit_tokens: bool = True) +
+
+

Visits each control word and strips the whitespace from around it.

+
+ +Expand source code + +
class StripControlWords(Transformer):
+    """Visits each control word and strips the whitespace from around it.
+    """
+
+    def CONTROLWORD(self, token: Token):
+        """Strips the whitespace from around a provided control word.
+
+Args:
+        token: A CONTROLWORD token to strip whitespace from.
+        """
+        tok = token.update(value=token.value.strip())
+        return tok
+
+

Ancestors

+
  • lark.visitors.Transformer
  • lark.visitors._Decoratable
  • abc.ABC
  • typing.Generic
+

Methods

+
+
+def CONTROLWORD(self, token: lark.lexer.Token) +
+
+

Strips the whitespace from around a provided control word.

+

Args

+
+
token
+
A CONTROLWORD token to strip whitespace from.
+
+
+ +Expand source code + +
    def CONTROLWORD(self, token: Token):
+        """Strips the whitespace from around a provided control word.
+
+Args:
+        token: A CONTROLWORD token to strip whitespace from.
+        """
+        tok = token.update(value=token.value.strip())
+        return tok
+
+
+
+
+
+class StripNonVisibleRTFGroups +(visit_tokens: bool = True) +
+
+

Visits each Token in provided RTF Trees and strips out any RTF groups which are non-visible when de-encapsulated into HTML.

+
+ +Expand source code + +
class StripNonVisibleRTFGroups(Transformer):
+    """Visits each Token in provided RTF Trees and strips out any RTF groups which are non-visible when de-encapsulated into HTML.
+    """
+
+    @v_args(tree=True)
+    def group(self, tree: Tree):
+        """Transformer which aggressively seeks out possible non-visible RTF groups and replaces them with empty strings.
+
+NOTE: Currently deleting all groups that don't have an htmltag. Please file an issue if you find one that should be included in de-encapsulated HTML. I will refine what gets deleted and what is converted based on identified needs for greater functionality or specific issues which need to be addressed.
+
+Args:
+        tree: A .rtf group (Tree object) which needs its contents decoded.
+"""
+        children = tree.children
+        if len(children) == 0:
+            return b""
+        first_child = children[0]
+
+        known_control_groups = ["htmltag_group"]
+        if isinstance(first_child, Tree):
+            if first_child.data in known_control_groups:
+                return tree
+        known_non_visible_control_groups = ["mhtmltag_group"]
+        if isinstance(first_child, Tree):
+            if first_child.data in known_non_visible_control_groups:
+                # print(f"DELETING: {first_child} : because mhtmltag")
+                return b""
+
+        # process known non-visible groups
+        non_visible_control_words = [b"\\context", b"\\colortbl", b"\\fonttbl"]
+        first_control = self.get_first_controlword(children)
+        # print(f"FIRST: {first_control}")
+        if first_control in non_visible_control_words:
+            return b""
+
+        # Process star escaped groups
+        # NOTE: `understood_commands` is where we can include commands we decide to actively process during deencapsulation in the future.
+        # For example, if we added support for `destination text` we would need to add '\\bkmkstart' and '\\ud' so our processor doesn't delete those groups
+        understood_commands: List[str] = []
+        is_star_escaped = None
+        if (isinstance(first_child, Tree) and
+             len(first_child.children) != 0 ):
+            first_item = first_child.children[0]
+            if isinstance(first_item, Token):
+                if first_item.type == "STAR_ESCAPE":
+                    is_star_escaped = True
+        control_word = None
+        if is_star_escaped is True:
+            # print(f"STAR: {children}")
+            first_token = children[1]
+            if isinstance(first_token, Token):
+                if first_token.type == "CONTROLWORD":
+                    control_word = first_token
+                    if control_word.value in understood_commands:
+                        return tree
+                    return b""
+        return tree
+
+    @staticmethod
+    def get_first_controlword(children: List) -> Union[str,None]:
+        """Extracts the first control word from a .rtf group.
+
+Args:
+        children: A list of child objects within a .rtf group
+
+Returns:
+        The first control word found in a group. Returns None if no control words are found.
+        """
+        for i in children:
+            try:
+                if i.type == "CONTROLWORD":
+                    return i.value
+            except AttributeError:
+                continue
+        return None
+
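+
A minimal editorial sketch (not from the patch): a group whose first control word is non-visible (here \fonttbl) is replaced with an empty byte string.

    from lark import Tree
    from lark.lexer import Token

    fonttbl = Tree("group", [Token("CONTROLWORD", b"\\fonttbl")])
    StripNonVisibleRTFGroups().transform(fonttbl)  # returns b""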
+

Ancestors

+
  • lark.visitors.Transformer
  • lark.visitors._Decoratable
  • abc.ABC
  • typing.Generic
+

Static methods

+
+
+def get_first_controlword(children: List) ‑> Optional[str] +
+
+

Extracts the first control word from a .rtf group.

+

Args

+
+
children
+
A list of child objects within a .rtf group
+
+

Returns

+

The first control word found in a group. Returns None if no control words are found.

+
+ +Expand source code + +
    @staticmethod
+    def get_first_controlword(children: List) -> Union[str,None]:
+        """Extracts the first control word from a .rtf group.
+
+Args:
+        children: A list of child objects within a .rtf group
+
+Returns:
+        The first control word found in a group. Returns None if no control words are found.
+        """
+        for i in children:
+            try:
+                if i.type == "CONTROLWORD":
+                    return i.value
+            except AttributeError:
+                continue
+        return None
+
+
+
+

Methods

+
+
+def group(self, tree: lark.tree.Tree) +
+
+

Transformer which aggressively seeks out possible non-visible RTF groups and replaces them with empty strings.

+

NOTE: Currently deleting all groups that don't have an htmltag. Please file an issue if you find one that should be included in de-encapsulated HTML. I will refine what gets deleted and what is converted based on identified needs for greater functionality or specific issues which need to be addressed.

+

Args

+
+
tree
+
A .rtf group (Tree object) which needs its contents decoded.
+
+
+ +Expand source code + +
    @v_args(tree=True)
+    def group(self, tree: Tree):
+        """Transformer which aggressively seeks out possible non-visible RTF groups and replaces them with empty strings.
+
+NOTE: Currently deleting all groups that don't have an htmltag. Please file an issue if you find one that should be included in de-encapsulated HTML. I will refine what gets deleted and what is converted based on identified needs for greater functionality or specific issues which need to be addressed.
+
+Args:
+        tree: A .rtf group (Tree object) which needs its contents decoded.
+"""
+        children = tree.children
+        if len(children) == 0:
+            return b""
+        first_child = children[0]
+
+        known_control_groups = ["htmltag_group"]
+        if isinstance(first_child, Tree):
+            if first_child.data in known_control_groups:
+                return tree
+        known_non_visible_control_groups = ["mhtmltag_group"]
+        if isinstance(first_child, Tree):
+            if first_child.data in known_non_visible_control_groups:
+                # print(f"DELETING: {first_child} : because mhtmltag")
+                return b""
+
+        # process known non-visible groups
+        non_visible_control_words = [b"\\context", b"\\colortbl", b"\\fonttbl"]
+        first_control = self.get_first_controlword(children)
+        # print(f"FIRST: {first_control}")
+        if first_control in non_visible_control_words:
+            return b""
+
+        # Process star escaped groups
+        # NOTE: `understood_commands` is where we can include commands we decide to actively process during deencapsulation in the future.
+        # For example, if we added support for `destination text` we would need to add '\\bkmkstart' and '\\ud' so our processor doesn't delete those groups
+        understood_commands: List[str] = []
+        is_star_escaped = None
+        if (isinstance(first_child, Tree) and
+             len(first_child.children) != 0 ):
+            first_item = first_child.children[0]
+            if isinstance(first_item, Token):
+                if first_item.type == "STAR_ESCAPE":
+                    is_star_escaped = True
+        control_word = None
+        if is_star_escaped is True:
+            # print(f"STAR: {children}")
+            first_token = children[1]
+            if isinstance(first_token, Token):
+                if first_token.type == "CONTROLWORD":
+                    control_word = first_token
+                    if control_word.value in understood_commands:
+                        return tree
+                    return b""
+        return tree
+
+
+
+
+
+class StripUnusedSpecialCharacters +(visit_tokens: bool = True) +
+
+

Strip all unused tokens which lark has extracted from the RTF.

+

These tokens are largely artifacts of the RTF format.

+

We have to do this because we use the "keep_all_tokens" option in our lark parser. It's better to be explicit than to allow for ambiguity because of the grammar.

+
+ +Expand source code + +
class StripUnusedSpecialCharacters(Transformer):
+    """Strip all unused tokens which lark has extracted from the RTF.
+
+These tokens are largely artifacts of the RTF format.
+
+We have to do this because we use the "keep_all_tokens" option in our lark parser. It's better to be explicit than to allow for ambiguity because of the grammar.
+    """
+
+    def _LBRACE(self, token: Token):
+        """Remove RTF braces.
+
+Returns:
+        Always returns a discard object."""
+        return Discard
+
+    def _RBRACE(self, token: Token):
+        """Remove RTF braces.
+
+Returns:
+        Always returns a discard object."""
+        return Discard
+
+    def _SPACE_DELETE(self, token: Token):
+        """Remove spaces which are not a part of the content
+
+These are mostly spaces used to separate control words from the content they precede.
+
+Returns:
+        Always returns a discard object.
+        """
+        return Discard
+
+

Ancestors

+
  • lark.visitors.Transformer
  • lark.visitors._Decoratable
  • abc.ABC
  • typing.Generic
+
+
+
+
+ +
+
+
+
\ No newline at end of file
diff --git a/docs/RTFDE/transformers.md b/docs/RTFDE/transformers.md
new file mode 100644
index 0000000..2fd51a0
--- /dev/null
+++ b/docs/RTFDE/transformers.md
@@ -0,0 +1,224 @@
+Module RTFDE.transformers
+=========================
+
+Functions
+---------
+
+
+`get_stripped_HTMLRTF_values(tree: lark.tree.Tree, current_state: Optional[bool] = None) ‑> collections.abc.Generator`
+:   Get a list of Tokens which should be suppressed by HTMLRTF control words.
+
+    NOTE: This de-encapsulation supports the HTMLRTF control word within nested groups. The state of the HTMLRTF control word transfers when entering groups and is restored when exiting groups, as specified in [MSFT-RTF].
+
+    Returns:
+        A list of Tokens which should be suppressed by HTMLRTF control words.
+
+
+`strip_binary_objects(raw_rtf: bytes) ‑> tuple`
+:   Extracts binary objects from an RTF file.
+
+    Parameters:
+        raw_rtf: (bytes) The raw RTF file as bytes.
+
+    Returns:
+        A tuple containing (new_raw, found_bytes)
+            new_raw: (bytes) A bytes object where any binary data has been removed.
+            found_bytes: (list) List of dictionaries containing binary data extracted from the RTF file. Each dictionary includes the data extracted, where it was extracted from in the original RTF file and where it can be inserted back into the stripped output.
+
+        Description of found_bytes dictionaries:
+
+            "bytes": (bytes) The binary data which was extracted.
+            "ctrl_char": (tuple) Tuple containing the binary control word and its numeric parameter
+            "start_pos": (int) The position (in the original raw RTF data) where the binary control word started.
+            "bin_start_pos": (int) The position (in the original raw RTF data) where the binary data starts.
+            "end_pos": (int) The position (in the original raw RTF data) where the binary data ends.
+
+        Here is an example of what this looks like (displayed using the printable representation so you can see the bytes, with the dict keys split onto new lines for readability):
+            >> print(repr(found_bytes))
+
+            "{'bytes': b'\xf4UP\x13\xdb\xe4\xe6CO\xa8\x16\x10\x8b\n\xfbA\x9d\xc5\xd1C',
+              'ctrl_char': (b'\\bin', b'20'),
+              'start_pos': 56,
+              'end_pos': 83,
+              'bin_start_pos': 63}"
+
+
+`toggle_htmlrtf(child: Union[lark.lexer.Token, str]) ‑> Optional[bool]`
+:   Identify if htmlrtf is being turned on or off.
+
+    Returns:
+        Bool representing if htmlrtf is being enabled or disabled. None if object is not an HTMLRTF token.
+
+Classes
+-------
+
+`DeleteTokensFromTree(tokens_to_delete: List[lark.lexer.Token])`
+:   Removes a series of tokens from a Tree.
+
+    Parameters:
+        tokens_to_delete: A list of tokens to delete from the Tree object. (sets self.to_delete)
+
+    Attributes:
+        to_delete: A list of tokens to delete from the Tree object.
+        delete_start_pos: The starting position for all the identified tokens. Used to identify which tokens to delete.
+
+    Setup attributes including token start_pos tracking.
+
+    Args:
+        tokens_to_delete: A list of tokens to delete from the Tree object. (sets self.to_delete)
+
+    ### Ancestors (in MRO)
+
+    * lark.visitors.Transformer
+    * lark.visitors._Decoratable
+    * abc.ABC
+    * typing.Generic
+
+`RTFCleaner(visit_tokens: bool = True)`
+:   Visits each Token in the provided RTF Trees: converts all tokens that need converting, deletes all tokens that shouldn't be visible, and joins all remaining strings into one final string.
+
+    ### Ancestors (in MRO)
+
+    * lark.visitors.Transformer
+    * lark.visitors._Decoratable
+    * abc.ABC
+    * typing.Generic
+
+    ### Methods
+
+    `CLOSEPAREN(self, args: lark.lexer.Token) ‑> bytes`
+    :   Delete all closed parens.
+
+    `CONTROLSYMBOL(self, args: lark.lexer.Token) ‑> bytes`
+    :   Convert encoded chars which are mis-categorized as control symbols into their respective chars. Delete all the other ones.
+
+    `CONTROLWORD(self, args: lark.lexer.Token) ‑> bytes`
+    :   Convert encoded chars which are mis-categorized as control words into their respective chars. Delete all the other ones.
+
+    `FORMULA_CHARACTER(self, args: lark.lexer.Token) ‑> bytes`
+    :   Convert a formula character into an empty string.
+
+        If we are attempting to represent formula characters the scope for this library has grown too inclusive. This was only used by Word 5.1 for the Macintosh as the beginning delimiter for a string of formula typesetting commands.
+
+    `HTMLTAG(self, htmltag: lark.lexer.Token) ‑> bytes`
+    :   Delete all HTMLTAG objects
+
+    `INDEX_SUBENTRY(self, args: lark.lexer.Token) ‑> bytes`
+    :   Process index subentry items
+
+        Discard index sub-entries. We don't care about indexes when de-encapsulating at this time.
+
+    `NONBREAKING_HYPHEN(self, args: lark.lexer.Token) ‑> bytes`
+    :   Convert non-breaking hyphens into visible representation.
+
+    `NONBREAKING_SPACE(self, args: lark.lexer.Token) ‑> bytes`
+    :   Convert non-breaking spaces into visible representation.
+
+    `OPENPAREN(self, args: lark.lexer.Token) ‑> bytes`
+    :   Delete all open parens.
+
+    `OPTIONAL_HYPHEN(self, args: lark.lexer.Token) ‑> bytes`
+    :   Convert hyphen control char into visible representation.
+
+    `SPACE_SAVE(self, string: lark.lexer.Token) ‑> bytes`
+    :
+
+    `STAR_ESCAPE(self, char: lark.lexer.Token) ‑> bytes`
+    :   Delete all star escape objects
+
+    `STRING(self, string: lark.lexer.Token) ‑> bytes`
+    :   Convert a string object into a raw string.
+
+    `control_symbol(self, symbols: List) ‑> bytes`
+    :   Join all visible symbols in control symbol groups.
+
+    `document(self, args: List) ‑> bytes`
+    :   Join all the strings in an .rtf object into a single string representation of the document.
+
+    `group(self, grp: List) ‑> bytes`
+    :   Join the strings in all group objects.
+
+    `htmltag_group(self, strings: List) ‑> bytes`
+    :   HTMLTAG processing.
+
+        Takes any string values within an HTMLTAG and returns them.
+
+    `mhtmltag_group(self, args: List)`
+    :   Process MHTMLTAG groups
+
+        Currently discarding because they don't need to be processed.
+
+        Returns:
+            Always returns a discard object.
+
+    `start(self, args: List) ‑> bytes`
+    :   Joins the .rtf object's string representations together at highest level object `start`.
+
+        This is the final string combination.
+
+    `string(self, strings: List) ‑> bytes`
+    :   Convert all string objects within a string group into a single string.
+
+`StripControlWords(visit_tokens: bool = True)`
+:   Visits each control word and strips the whitespace from around it.
+
+    ### Ancestors (in MRO)
+
+    * lark.visitors.Transformer
+    * lark.visitors._Decoratable
+    * abc.ABC
+    * typing.Generic
+
+    ### Methods
+
+    `CONTROLWORD(self, token: lark.lexer.Token)`
+    :   Strips the whitespace from around a provided control word.
+
+        Args:
+            token: A CONTROLWORD token to strip whitespace from.
+
+`StripNonVisibleRTFGroups(visit_tokens: bool = True)`
+:   Visits each Token in provided RTF Trees and strips out any RTF groups which are non-visible when de-encapsulated into HTML.
+
+    ### Ancestors (in MRO)
+
+    * lark.visitors.Transformer
+    * lark.visitors._Decoratable
+    * abc.ABC
+    * typing.Generic
+
+    ### Static methods
+
+    `get_first_controlword(children: List) ‑> Optional[str]`
+    :   Extracts the first control word from a .rtf group.
+
+        Args:
+            children: A list of child objects within a .rtf group
+
+        Returns:
+            The first control word found in a group. Returns None if no control words are found.
+
+    ### Methods
+
+    `group(self, tree: lark.tree.Tree)`
+    :   Transformer which aggressively seeks out possible non-visible RTF groups and replaces them with empty strings.
+
+        NOTE: Currently deleting all groups that don't have an htmltag. Please file an issue if you find one that should be included in de-encapsulated HTML. I will refine what gets deleted and what is converted based on identified needs for greater functionality or specific issues which need to be addressed.
+
+        Args:
+            tree: A .rtf group (Tree object) which needs its contents decoded.
+
+`StripUnusedSpecialCharacters(visit_tokens: bool = True)`
+:   Strip all unused tokens which lark has extracted from the RTF.
+
+    These tokens are largely artifacts of the RTF format.
+
+    We have to do this because we use the "keep_all_tokens" option in our lark parser. It's better to be explicit than to allow for ambiguity because of the grammar.
+
+    ### Ancestors (in MRO)
+
+    * lark.visitors.Transformer
+    * lark.visitors._Decoratable
+    * abc.ABC
+    * typing.Generic
\ No newline at end of file
diff --git a/docs/RTFDE/utils.html b/docs/RTFDE/utils.html
new file mode 100644
index 0000000..2eea8a7
--- /dev/null
+++ b/docs/RTFDE/utils.html
@@ -0,0 +1,886 @@
+
+
+
+
+
+
+RTFDE.utils API documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

Module RTFDE.utils

+
+
+
+ +Expand source code + +
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# This file is part of RTFDE, a library for de-encapsulating HTML from RTF.
+# Copyright © 2022 seamus tuohy, <code@seamustuohy.com>
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.
+
+import difflib
+import sys
+import re
+from typing import Union, AnyStr, Any
+# From Python 3.9, typing.Generator is deprecated in favour of collections.abc.Generator.
+from collections.abc import Generator
+from lark.lexer import Token
+from lark.tree import Tree
+from lark import Lark
+
+
+import logging
+log = logging.getLogger("RTFDE")
+
+def get_control_parameter_as_hex_strings(control_parameter: Union[str,int]) -> str:
+    """Returns the hex encoded value of a .rtf control parameter.
+
+Args:
+    control_parameter: (int/str) Int or a string which represents an int.
+
+Returns:
+    Zero-padded 6-character hexadecimal string.
+"""
+    try:
+        return f"{control_parameter:#06x}"
+    except ValueError:
+        # If passed as string convert first
+        control_parameter = int(control_parameter)
+        return f"{control_parameter:#06x}"
+
+def print_to_tmp_file(data: Union[AnyStr,bytes,bytearray], path: str):
+    """Prints binary object to a dump file for quick debugging.
+
+Warning: Not for normal use. Only use when debugging.
+
+Args:
+    data (bytes|str): Data to write to path
+    path (str): The file path to write data to
+    """
+    # Be able to print binary objects easily
+    if isinstance(data, (bytes, bytearray)) is True:
+        open_as = 'wb+'
+    else:
+        open_as = 'w+'
+    with open(path, open_as) as fp:
+        original_stdout = sys.stdout
+        sys.stdout = fp
+        print(data)
+        sys.stdout = original_stdout
+
+def encode_escaped_control_chars(raw_text: bytes) -> bytes:
+    """Replaces escaped control chars within the text with their RTF encoded versions \\'HH.
+
+Args:
+    raw_text (bytes): bytes which need escape characters encoded
+
+Returns:
+    Bytes with the escaped control chars encoded
+    """
+    cleaned = raw_text.replace(b'\\\\', b"\\'5c")
+    cleaned = cleaned.replace(b'\\{', b"\\'7b")
+    cleaned = cleaned.replace(b'\\}', b"\\'7d")
+    return cleaned
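+
+# Editorial usage sketch (not part of this patch):
+#     encode_escaped_control_chars(b"a \\{ b \\} c \\\\ d")
+#     # returns b"a \\'7b b \\'7d c \\'5c d"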
+
+def is_codeword_with_numeric_arg(token: Union[Token,Any], codeword: bytes) -> bool:
+    """Checks if a Token is a codeword with a numeric argument.
+
+Returns:
+    True if a Token is a codeword with a numeric argument. False if not.
+"""
+    try:
+        val = token.value.strip()
+        # print(val, codeword)
+        if (val.startswith(codeword) and
+            val[len(codeword):].isdigit()):
+            return True
+    except AttributeError:
+        return False
+    return False
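+
+# Editorial usage sketch (not part of this patch):
+#     from lark.lexer import Token
+#     is_codeword_with_numeric_arg(Token("CONTROLWORD", b"\\ansicpg1252"), b"\\ansicpg")  # True
+#     is_codeword_with_numeric_arg(Token("CONTROLWORD", b"\\ansicpg"), b"\\ansicpg")      # False (no digits)
+#     is_codeword_with_numeric_arg(None, b"\\ansicpg")                                    # False (not a Token)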
+
+def print_lark_parser_evaluated_grammar(parser):
+    """Prints the final evaluated grammar.
+
+Can be useful for debugging possible errors in grammar evaluation.
+
+Args:
+    parser (Lark obj): Lark object to extract grammar from.
+    """
+    if not isinstance(parser, Lark):
+        raise ValueError("Requires a Lark object.")
+    eq = "="*15
+    eq = " " + eq + " "
+    print(eq + "RULES" + eq + "\n")
+    for i in parser.rules:
+        print("    " + i)
+    print(eq + "TERMINALS" + eq + "\n")
+    for i in parser.terminals:
+        print("    " + i)
+    print(eq + "IGNORED TOKENS" + eq + "\n")
+    for i in parser.ignore_tokens:
+        print("    " + i)
+
+def log_validators(data):
+    """Log validator logging only if RTFDE.validation_logger set to debug.
+    """
+    logger = logging.getLogger("RTFDE.validation_logger")
+    if logger.level == logging.DEBUG:
+        logger.debug(data)
+
+def log_transformations(data):
+    """Log transform logging only if RTFDE.transform_logger set to debug.
+    """
+    logger = logging.getLogger("RTFDE.transform_logger")
+    if logger.level == logging.DEBUG:
+        logger.debug(data)
+
+def log_text_extraction(data):
+    """Log additional text decoding/encoding logging only if RTFDE.text_extraction set to debug.
+    """
+    logger = logging.getLogger("RTFDE.text_extraction")
+    if logger.level == logging.DEBUG:
+        logger.debug(data)
+
+def log_htmlrtf_stripping(data: Token):
+    """Log HTMLRTF Stripping logging only if RTFDE.HTMLRTF_Stripping_logger set to debug.
+
+Raises:
+    AttributeError: Will occur if you pass this something that is not a token.
+"""
+    logger = logging.getLogger("RTFDE.HTMLRTF_Stripping_logger")
+    if logger.level == logging.DEBUG:
+        if not isinstance(data, Token):
+            raise AttributeError("HTMLRTF Stripping logger only logs Tokens")
+        tok_desc = "HTMLRTF Removed: {value}, {line}, {end_line}, {start_pos}, {end_pos}"
+        log_msg = tok_desc.format(value=data.value,
+                                  line=data.line,
+                                  end_line=data.end_line,
+                                  start_pos=data.start_pos,
+                                  end_pos = data.end_pos)
+        logger.debug(log_msg)
+
+def log_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None):
+    """Log diff of two strings. Defaults to splitting by newlines and keeping the ends.
+
+Logs the result in the main RTFDE logger as a debug log. Warning: Only use when debugging as this is too verbose to be used in regular logging.
+
+Args:
+    original: The original string
+    revised: The changed version of the string
+    sep (string): A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise.
+"""
+    log.debug(get_string_diff(original, revised, sep))
+
+def get_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None):
+    """Get the diff of two strings. Defaults to splitting by newlines and keeping the ends.
+
+Args:
+    original: The original string
+    revised: The changed version of the string
+    sep (string): A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise.
+
+Returns:
+    A string object representing the diff of the two strings provided.
+"""
+    if sep is None:
+        orig_split = original.decode().splitlines(keepends=True)
+        revised_split = revised.decode().splitlines(keepends=True)
+    else:
+        original = original.replace(b'\n',b'')
+        revised = revised.replace(b'\n',b'')
+        orig_split = [i.decode() for i in re.split(sep, original) if i != b'']
+        revised_split = [i.decode() for i in re.split(sep, revised) if i != b'']
+    return "\n".join(list(difflib.context_diff(orig_split,
+                                               revised_split)))
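+
+# Editorial usage sketch (not part of this patch):
+#     get_string_diff(b"line one\nline two\n", b"line one\nline 2\n")
+#     # returns a difflib context diff showing "line two" changed to "line 2"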
+
+def get_tree_diff(original: Tree, revised: Tree):
+    """Get the diff of two trees.
+
+Args:
+    original (lark Tree): A lark tree before transformation
+    revised (lark Tree): A lark tree after transformation
+
+Returns:
+    A string object representing the diff of the two Trees provided.
+
+Example:
+    rtf_obj = DeEncapsulator(raw_rtf)
+    rtf_obj.deencapsulate()
+    transformed_tree = SomeTransformer.transform(rtf_obj.full_tree)
+    get_tree_diff(rtf_obj.full_tree, transformed_tree)
+
+    """
+    log = logging.getLogger("RTFDE")
+    flat_original  = list(flatten_tree(original))
+    flat_revised  = list(flatten_tree(revised))
+    return "\n".join(list(difflib.context_diff(flat_original,
+                                               flat_revised)))
+
+def flatten_tree(tree: Tree) -> Generator:
+    """Flatten a lark Tree into a list of repr's of tree objects.
+
+Args:
+    tree (lark Tree): A lark tree
+"""
+    yield f"Tree('{tree.data}')"
+    for child in tree.children:
+        if isinstance(child, Token):
+            yield repr(child)
+        elif isinstance(child, Tree):
+            for i in flatten_tree(child):
+                yield i
+        else:
+            yield repr(child)
+
+def flatten_tree_to_string_array(tree: Tree) -> Generator:
+    """Flatten a lark Tree into a list of repr's of tree objects.
+
+Args:
+    tree (lark Tree): A lark tree
+"""
+    for child in tree.children:
+        if isinstance(child, Tree):
+            for i in flatten_tree_to_string_array(child):
+                yield i
+        elif isinstance(child, Token):
+            yield child.value
+        else:
+            yield child
+
+def make_token_replacement(ttype, value, example):
+    """Make a Token of type `ttype` with value `value`, copying position metadata from an example Token or Tree."""
+    if isinstance(example, Token):
+        fake_tok = Token(ttype,
+                        value,
+                        start_pos=example.start_pos,
+                        end_pos=example.end_pos,
+                        line=example.line,
+                        end_line=example.end_line,
+                        column=example.column,
+                        end_column=example.end_column)
+    elif isinstance(example, Tree):
+        fake_tok = Token(ttype,
+                         value,
+                         start_pos=example.meta.start_pos,
+                         end_pos=example.meta.end_pos,
+                         line=example.meta.line,
+                         end_line=example.meta.end_line,
+                         column=example.meta.column,
+                         end_column=example.meta.end_column)
+    else:
+        # Fail loudly on unexpected types rather than leaving fake_tok unbound.
+        raise TypeError("example must be a lark Token or Tree")
+
+    return fake_tok
+
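+# Editorial usage sketch (not part of this patch); the token type
+# "DELETED_CONTROL" below is a hypothetical name used only for illustration:
+#
+#     from lark.lexer import Token
+#     src = Token("CONTROLWORD", b"\\htmlrtf ", start_pos=5, end_pos=14,
+#                 line=1, end_line=1, column=6, end_column=15)
+#     fake = make_token_replacement("DELETED_CONTROL", b"", src)
+#     assert fake.start_pos == 5 and fake.value == b""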
+
+def embed():
+    import os
+    import readline
+    import rlcompleter
+    import code
+    import inspect
+    import traceback
+
+    history = os.path.join(os.path.expanduser('~'), '.python_history')
+    if os.path.isfile(history):
+        readline.read_history_file(history)
+
+    frame = inspect.currentframe().f_back
+    namespace = frame.f_locals.copy()
+    namespace.update(frame.f_globals)
+
+    readline.set_completer(rlcompleter.Completer(namespace).complete)
+    readline.parse_and_bind("tab: complete")
+
+    file = frame.f_code.co_filename
+    line = frame.f_lineno
+    function = frame.f_code.co_name
+
+    stack = ''.join(traceback.format_stack()[:-1])
+    print(stack)
+    banner = f" [ {os.path.basename(file)}:{line} in {function}() ]"
+    banner += "\n Entering interactive mode (Ctrl-D to exit) ..."
+    try:
+        code.interact(banner=banner, local=namespace)
+    finally:
+        readline.write_history_file(history)
+
+
+
+
+
+
+
+

Functions

+
+
+def embed() +
+
+
+
+ +Expand source code + +
def embed():
+    import os
+    import readline
+    import rlcompleter
+    import code
+    import inspect
+    import traceback
+
+    history = os.path.join(os.path.expanduser('~'), '.python_history')
+    if os.path.isfile(history):
+        readline.read_history_file(history)
+
+    frame = inspect.currentframe().f_back
+    namespace = frame.f_locals.copy()
+    namespace.update(frame.f_globals)
+
+    readline.set_completer(rlcompleter.Completer(namespace).complete)
+    readline.parse_and_bind("tab: complete")
+
+    file = frame.f_code.co_filename
+    line = frame.f_lineno
+    function = frame.f_code.co_name
+
+    stack = ''.join(traceback.format_stack()[:-1])
+    print(stack)
+    banner = f" [ {os.path.basename(file)}:{line} in {function}() ]"
+    banner += "\n Entering interactive mode (Ctrl-D to exit) ..."
+    try:
+        code.interact(banner=banner, local=namespace)
+    finally:
+        readline.write_history_file(history)
+
+
+
+def encode_escaped_control_chars(raw_text: bytes) ‑> bytes +
+
+

Replaces escaped control chars within the text with their RTF encoded versions \'HH.

+

Args

+
+
raw_text : bytes
+
bytes which need escape characters encoded
+
+

Returns

+

Bytes with the escaped control chars encoded

+
+ +Expand source code + +
def encode_escaped_control_chars(raw_text: bytes) -> bytes:
+    """Replaces escaped control chars within the text with their RTF encoded versions \\'HH.
+
+Args:
+    raw_text (bytes): bytes which need escape characters encoded
+
+Returns:
+    Bytes with the escaped control chars encoded
+    """
+    cleaned = raw_text.replace(b'\\\\', b"\\'5c")
+    cleaned = cleaned.replace(b'\\{', b"\\'7b")
+    cleaned = cleaned.replace(b'\\}', b"\\'7d")
+    return cleaned
+
+
+
+def flatten_tree(tree: lark.tree.Tree) ‑> collections.abc.Generator +
+
+

Flatten a lark Tree into a list of repr's of tree objects.

+

Args

+
+
tree : lark Tree
+
A lark tree
+
+
+ +Expand source code + +
def flatten_tree(tree: Tree) -> Generator:
+    """Flatten a lark Tree into a list of repr's of tree objects.
+
+Args:
+    tree (lark Tree): A lark tree
+"""
+    yield f"Tree('{tree.data}')"
+    for child in tree.children:
+        if isinstance(child, Token):
+            yield repr(child)
+        elif isinstance(child, Tree):
+            for i in flatten_tree(child):
+                yield i
+        else:
+            yield repr(child)
+
+
+
+def flatten_tree_to_string_array(tree: lark.tree.Tree) ‑> collections.abc.Generator +
+
+

Flatten a lark Tree into a list of the string values of its tokens.

+

Args

+
+
tree : lark Tree
+
A lark tree
+
+
+ +Expand source code + +
def flatten_tree_to_string_array(tree: Tree) -> Generator:
+    """Flatten a lark Tree into a generator of the values of its tokens.
+
+Args:
+    tree (lark Tree): A lark tree
+"""
+    for child in tree.children:
+        if isinstance(child, Tree):
+            for i in flatten_tree_to_string_array(child):
+                yield i
+        elif isinstance(child, Token):
+            yield child.value
+        else:
+            yield child
+
+
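Using the same hypothetical tree as above, only the raw token values are yielded:
+    list(flatten_tree_to_string_array(tree))
+    # => [b'hi', b'there']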
+
+def get_control_parameter_as_hex_strings(control_parameter: Union[str, int]) ‑> str +
+
+

Returns the hex encoded value of a .rtf control parameter.

+

Args

+
+
control_parameter
+
(int/str) Int or a string which represents an int.
+
+

Returns

+

Zero padded 6 char long hexadecimal string (including the 0x prefix).

+
+ +Expand source code + +
def get_control_parameter_as_hex_strings(control_parameter: Union[str,int]) -> str:
+    """Returns the hex encoded value of a .rtf control parameter.
+
+Args:
+    control_parameter: (int/str) Int or a string which represents an int.
+
+Returns:
+    Zero padded 6 char long hexadecimal string (including the 0x prefix).
+"""
+    try:
+        return f"{control_parameter:#06x}"
+    except ValueError:
+        # If passed as string convert first
+        control_parameter = int(control_parameter)
+        return f"{control_parameter:#06x}"
+
+
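For example, an int and its string form produce the same zero-padded result:
+    get_control_parameter_as_hex_strings(169)    # => '0x00a9'
+    get_control_parameter_as_hex_strings("169")  # => '0x00a9'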
+
+def get_string_diff(original: bytes, revised: bytes, sep: Optional[bytes] = None) +
+
+

Get the diff of two strings. Defaults to splitting by newlines and keeping the ends.

+

Args

+
+
original
+
The original string
+
revised
+
The changed version of the string
+
sep : bytes
+
A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise.
+
+

Returns

+

A string object representing the diff of the two strings provided.

+
+ +Expand source code + +
def get_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None):
+    """Get the diff of two strings. Defaults to splitting by newlines and keeping the ends.
+
+Args:
+    original: The original string
+    revised: The changed version of the string
+    sep (bytes): A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise.
+
+Returns:
+    A string object representing the diff of the two strings provided.
+"""
+    if sep is None:
+        orig_split = original.decode().splitlines(keepends=True)
+        revised_split = revised.decode().splitlines(keepends=True)
+    else:
+        original = original.replace(b'\n',b'')
+        revised = revised.replace(b'\n',b'')
+        orig_split = [i.decode() for i in re.split(sep, original) if i != b'']
+        revised_split = [i.decode() for i in re.split(sep, revised) if i != b'']
+    return "\n".join(list(difflib.context_diff(orig_split,
+                                               revised_split)))
+
+
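A minimal sketch with two illustrative byte strings; the return value is a difflib context diff:
+    print(get_string_diff(b"one\ntwo\nthree\n", b"one\n2\nthree\n"))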
+
+def get_tree_diff(original: lark.tree.Tree, revised: lark.tree.Tree) +
+
+

Get the diff of two trees.

+

Args

+
+
original : lark Tree
+
A lark tree before transformation
+
revised : lark Tree
+
A lark tree after transformation
+
+

Returns

+

A string object representing the diff of the two Trees provided.

+

Example

+

rtf_obj = DeEncapsulator(raw_rtf)
rtf_obj.deencapsulate()
transformed_tree = SomeTransformer.transform(rtf_obj.full_tree)
get_tree_diff(rtf_obj.full_tree, transformed_tree)

+
+ +Expand source code + +
def get_tree_diff(original: Tree, revised: Tree):
+    """Get the diff of two trees.
+
+Args:
+    original (lark Tree): A lark tree before transformation
+    revised (lark Tree): A lark tree after transformation
+
+Returns:
+    A string object representing the diff of the two Trees provided.
+
+Example:
+    rtf_obj = DeEncapsulator(raw_rtf)
+    rtf_obj.deencapsulate()
+    transformed_tree = SomeTransformer.transform(rtf_obj.full_tree)
+    get_tree_diff(rtf_obj.full_tree, transformed_tree)
+
+    """
+    log = logging.getLogger("RTFDE")
+    flat_original  = list(flatten_tree(original))
+    flat_revised  = list(flatten_tree(revised))
+    return "\n".join(list(difflib.context_diff(flat_original,
+                                               flat_revised)))
+
+
+
+def is_codeword_with_numeric_arg(token: Union[lark.lexer.Token, Any], codeword: bytes) ‑> bool +
+
+

Checks if a Token is a codeword with a numeric argument.

+

Returns

+

True if a Token is a codeword with a numeric argument. False if not.

+
+ +Expand source code + +
def is_codeword_with_numeric_arg(token: Union[Token,Any], codeword: bytes) -> bool:
+    """Checks if a Token is a codeword with a numeric argument.
+
+Returns:
+    True if a Token is a codeword with a numeric argument. False if not.
+"""
+    try:
+        val = token.value.strip()
+        # print(val, codeword)
+        if (val.startswith(codeword) and
+            val[len(codeword):].isdigit()):
+            return True
+    except AttributeError:
+        return False
+    return False
+
+
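For example (the token type name is illustrative):
+    from lark import Token
+    is_codeword_with_numeric_arg(Token('CONTROLWORD', b'\\bin1024 '), b'\\bin')  # => True
+    is_codeword_with_numeric_arg(Token('CONTROLWORD', b'\\binX'), b'\\bin')      # => False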
+
+def log_htmlrtf_stripping(data: lark.lexer.Token) +
+
+

Log HTMLRTF Stripping logging only if RTFDE.HTMLRTF_Stripping_logger set to debug.

+

Raises

+
+
AttributeError
+
Raised if you pass this function something that is not a Token.
+
+
+ +Expand source code + +
def log_htmlrtf_stripping(data: Token):
+    """Log HTMLRTF Stripping logging only if RTFDE.HTMLRTF_Stripping_logger set to debug.
+
+Raises:
+    AttributeError: Raised if you pass this function something that is not a Token.
+"""
+    logger = logging.getLogger("RTFDE.HTMLRTF_Stripping_logger")
+    if logger.level == logging.DEBUG:
+        if not isinstance(data, Token):
+            raise AttributeError("HTMLRTF Stripping logger only logs Tokens")
+        tok_desc = "HTMLRTF Removed: {value}, {line}, {end_line}, {start_pos}, {end_pos}"
+        log_msg = tok_desc.format(value=data.value,
+                                  line=data.line,
+                                  end_line=data.end_line,
+                                  start_pos=data.start_pos,
+                                  end_pos = data.end_pos)
+        logger.debug(log_msg)
+
+
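This logger is opt-in. A sketch of enabling it before deencapsulating (the same pattern applies to the other log_* helpers below, since each compares its logger's level to logging.DEBUG directly):
+    import logging
+    logging.getLogger("RTFDE.HTMLRTF_Stripping_logger").setLevel(logging.DEBUG)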
+
+def log_string_diff(original: bytes, revised: bytes, sep: Optional[bytes] = None) +
+
+

Log diff of two strings. Defaults to splitting by newlines and keeping the ends.

+

Logs the result in the main RTFDE logger as a debug log. Warning: Only use when debugging as this is too verbose to be used in regular logging.

+

Args

+
+
original
+
The original string
+
revised
+
The changed version of the string
+
sep : bytes
+
A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise.
+
+
+ +Expand source code + +
def log_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None):
+    """Log diff of two strings. Defaults to splitting by newlines and keeping the ends.
+
+Logs the result in the main RTFDE logger as a debug log. Warning: Only use when debugging as this is too verbose to be used in regular logging.
+
+Args:
+    original: The original string
+    revised: The changed version of the string
+    sep (bytes): A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise.
+"""
+    log.debug(get_string_diff(original, revised, sep))
+
+
+
+def log_text_extraction(data) +
+
+

Log additional text decoding/encoding logging only if RTFDE.text_extraction set to debug.

+
+ +Expand source code + +
def log_text_extraction(data):
+    """Log additional text decoding/encoding logging only if RTFDE.text_extraction set to debug.
+    """
+    logger = logging.getLogger("RTFDE.text_extraction")
+    if logger.level == logging.DEBUG:
+        logger.debug(data)
+
+
+
+def log_transformations(data) +
+
+

Log transform logging only if RTFDE.transform_logger set to debug.

+
+ +Expand source code + +
def log_transformations(data):
+    """Log transform logging only if RTFDE.transform_logger set to debug.
+    """
+    logger = logging.getLogger("RTFDE.transform_logger")
+    if logger.level == logging.DEBUG:
+        logger.debug(data)
+
+
+
+def log_validators(data) +
+
+

Log validator logging only if RTFDE.validation_logger set to debug.

+
+ +Expand source code + +
def log_validators(data):
+    """Log validator logging only if RTFDE.validation_logger set to debug.
+    """
+    logger = logging.getLogger("RTFDE.validation_logger")
+    if logger.level == logging.DEBUG:
+        logger.debug(data)
+
+
+
+def make_token_replacement(ttype, value, example) +
+
+
+
+ +Expand source code + +
def make_token_replacement(ttype, value, example):
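+    # Build a synthetic Token that reuses the positional metadata
+    # (line/column/offsets) of an existing Token or Tree.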
+    if isinstance(example, Token):
+        fake_tok = Token(ttype,
+                        value,
+                        start_pos=example.start_pos,
+                        end_pos=example.end_pos,
+                        line=example.line,
+                        end_line=example.end_line,
+                        column=example.column,
+                        end_column=example.end_column)
+    elif isinstance(example, Tree):
+        fake_tok = Token(ttype,
+                         value,
+                         start_pos=example.meta.start_pos,
+                         end_pos=example.meta.end_pos,
+                         line=example.meta.line,
+                         end_line=example.meta.end_line,
+                         column=example.meta.column,
+                         end_column=example.meta.end_column)
+    else:
+        raise ValueError("Requires a lark Token or Tree to copy metadata from.")
+
+    return fake_tok
+
+
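A sketch (the token type and values are illustrative) of swapping a token's content while preserving its location metadata:
+    from lark import Token
+    old = Token('CONTROLWORD', b'\\htmlrtf0 ', start_pos=0, end_pos=10, line=1, end_line=1, column=1, end_column=11)
+    make_token_replacement('TEXT', b' ', old)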
+
+def print_lark_parser_evaluated_grammar(parser) +
+
+

Prints the final evaluated grammar.

+

Can be useful for debugging possible errors in grammar evaluation.

+

Args

+
+
parser : Lark obj
+
Lark object to extract grammar from.
+
+
+ +Expand source code + +
def print_lark_parser_evaluated_grammar(parser):
+    """Prints the final evaluated grammar.
+
+Can be useful for debugging possible errors in grammar evaluation.
+
+Args:
+    parser (Lark obj): Lark object to extract grammar from.
+    """
+    if not isinstance(parser, Lark):
+        raise ValueError("Requires a Lark object.")
+    eq = "="*15
+    eq = " " + eq + " "
+    print(eq + "RULES" + eq + "\n")
+    for i in parser.rules:
+        print("    " + i)
+    print(eq + "TERMINALS" + eq + "\n")
+    for i in parser.terminals:
+        print("    " + i)
+    print(eq + "IGNORED TOKENS" + eq + "\n")
+    for i in parser.ignore_tokens:
+        print("    " + i)
+
+
+
+def print_to_tmp_file(data: Union[~AnyStr, bytes, bytearray], path: str) +
+
+

Prints binary object to a dump file for quick debugging.

+

Warning: Not for normal use. Only use when debugging.

+

Args

+
+
data (bytes|str): Data to write to path
+
path : str
+
The file path to write data to
+
+
+ +Expand source code + +
def print_to_tmp_file(data: Union[AnyStr,bytes,bytearray], path: str):
+    """Prints binary object to a dump file for quick debugging.
+
+Warning: Not for normal use. Only use when debugging.
+
+Args:
+    data (bytes|str): Data to write to path
+    path (str): The file path to write data to
+    """
+    # Be able to print binary objects easily
+    if isinstance(data, (bytes, bytearray)) is True:
+        open_as = 'wb+'
+    else:
+        open_as = 'w+'
+    with open(path, open_as) as fp:
+        original_stdout = sys.stdout
+        sys.stdout = fp
+        print(data)
+        sys.stdout = original_stdout
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/RTFDE/utils.md b/docs/RTFDE/utils.md new file mode 100644 index 0000000..49c5e39 --- /dev/null +++ b/docs/RTFDE/utils.md @@ -0,0 +1,132 @@ +Module RTFDE.utils +================== + +Functions +--------- + + +`embed()` +: + + +`encode_escaped_control_chars(raw_text: bytes) ‑> bytes` +: Replaces escaped control chars within the text with their RTF encoded versions \'HH. + + Args: + raw_text (bytes): bytes which need escape characters encoded + + Returns: + The bytes with the escaped control chars encoded + + +`flatten_tree(tree: lark.tree.Tree) ‑> collections.abc.Generator` +: Flatten a lark Tree into a generator of repr's of its trees and tokens. + + Args: + tree (lark Tree): A lark tree + + +`flatten_tree_to_string_array(tree: lark.tree.Tree) ‑> collections.abc.Generator` +: Flatten a lark Tree into a generator of the values of its tokens. + + Args: + tree (lark Tree): A lark tree + + +`get_control_parameter_as_hex_strings(control_parameter: Union[str, int]) ‑> str` +: Returns the hex encoded value of a .rtf control parameter. + + Args: + control_parameter: (int/str) Int or a string which represents an int. + + Returns: + Zero padded 6 char long hexadecimal string (including the 0x prefix). + + +`get_string_diff(original: bytes, revised: bytes, sep: Optional[bytes] = None)` +: Get the diff of two strings. Defaults to splitting by newlines and keeping the ends. + + Args: + original: The original string + revised: The changed version of the string + sep (bytes): A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise. + + Returns: + A string object representing the diff of the two strings provided. + + +`get_tree_diff(original: lark.tree.Tree, revised: lark.tree.Tree)` +: Get the diff of two trees. + + Args: + original (lark Tree): A lark tree before transformation + revised (lark Tree): A lark tree after transformation + + Returns: + A string object representing the diff of the two Trees provided. + + Example: + rtf_obj = DeEncapsulator(raw_rtf) + rtf_obj.deencapsulate() + transformed_tree = SomeTransformer.transform(rtf_obj.full_tree) + get_tree_diff(rtf_obj.full_tree, transformed_tree) + + +`is_codeword_with_numeric_arg(token: Union[lark.lexer.Token, Any], codeword: bytes) ‑> bool` +: Checks if a Token is a codeword with a numeric argument. + + Returns: + True if a Token is a codeword with a numeric argument. False if not. + + +`log_htmlrtf_stripping(data: lark.lexer.Token)` +: Log HTMLRTF Stripping logging only if RTFDE.HTMLRTF_Stripping_logger set to debug. + + Raises: + AttributeError: Raised if you pass this function something that is not a Token. + + +`log_string_diff(original: bytes, revised: bytes, sep: Optional[bytes] = None)` +: Log diff of two strings. Defaults to splitting by newlines and keeping the ends. + + Logs the result in the main RTFDE logger as a debug log. Warning: Only use when debugging as this is too verbose to be used in regular logging. + + Args: + original: The original string + revised: The changed version of the string + sep (bytes): A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise. + + +`log_text_extraction(data)` +: Log additional text decoding/encoding logging only if RTFDE.text_extraction set to debug. + + +`log_transformations(data)` +: Log transform logging only if RTFDE.transform_logger set to debug.
+ + +`log_validators(data)` +: Log validator logging only if RTFDE.validation_logger set to debug. + + +`make_token_replacement(ttype, value, example)` +: + + +`print_lark_parser_evaluated_grammar(parser)` +: Prints the final evaluated grammar. + + Can be useful for debugging possible errors in grammar evaluation. + + Args: + parser (Lark obj): Lark object to extract grammar from. + + +`print_to_tmp_file(data: Union[~AnyStr, bytes, bytearray], path: str)` +: Prints binary object to a dump file for quick debugging. + + Warning: Not for normal use. Only use when debugging. + + Args: + data (bytes|str): Data to write to path + path (str): The file path to write data to \ No newline at end of file diff --git a/scripts/extract_rtf_from_msg.py b/scripts/extract_rtf_from_msg.py index 5186a34..f9e470e 100644 --- a/scripts/extract_rtf_from_msg.py +++ b/scripts/extract_rtf_from_msg.py @@ -13,11 +13,14 @@ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details. import argparse from RTFDE.deencapsulate import DeEncapsulator import logging logging.basicConfig(level=logging.ERROR) log = logging.getLogger(__name__) +from os import walk +import os + try: import extract_msg @@ -28,6 +31,9 @@ def main(): args = parse_arguments() set_logging(args.verbose, args.debug) + if args.show_folder_msg_stats is not None: + show_folder_msg_stats(args.show_folder_msg_stats) + return msg_path = args.msg_path with extract_msg.openMsg(msg_path) as msg: attachments = None @@ -40,12 +46,31 @@ def main(): else: log.debug("{0} attachments found in msg.".format(len(attachments))) raw_rtf = msg.rtfBody - if args.extract_raw: + # print(raw_rtf.decode()) + if args.extract_raw is True: if args.outfile: with open(args.outfile, 'wb') as fp: fp.write(raw_rtf) + with open(args.outfile + ".compressed", 'wb') as fp: + fp.write(msg.compressedRtf) else: print(raw_rtf.decode()) + if args.extract_all is True: + # Get the raw HTML and text body (don't de-encapsulate RTF if it exists) + _html_body = msg._ensureSet('_htmlBody', '__substg1.0_10130102', False) + _text_body = msg._ensureSet('_body', '__substg1.0_1000') + others = {"html":_html_body, + "txt": _text_body} + for item_type, item in others.items(): + if item is not None: + if args.outfile: + outname = "{0}.{1}".format(args.outfile, item_type) + with open(outname, 'wb') as fp: + if item_type == "txt": + item = item.encode() + fp.write(item) + else: + print(item) else: rtf_obj = DeEncapsulator(raw_rtf.decode()) rtf_obj.deencapsulate() @@ -54,6 +79,30 @@ def main(): else: print(rtf_obj.text) +def show_folder_msg_stats(folder_path): + for (dirpath, dirnames, filenames) in walk(folder_path): + for f in filenames: + print("processing {0}".format(f)) + if not f.endswith('.msg'): + continue + else: + abf = os.path.join(os.path.abspath(dirpath), f) + parts = get_body_parts(abf) + for k,v in parts.items(): + if v: + print(k,"True") + else: + print(k,"False") + +def get_body_parts(msg_path): + parts = {} + with extract_msg.openMsg(msg_path) as msg: + parts["html"] = msg._ensureSet('_htmlBody', '__substg1.0_10130102', False) + parts["plain"] = msg._ensureSet('_body', '__substg1.0_1000') + parts["rtf"] = msg._ensureSet('_compressedRtf', '__substg1.0_10090102', False) + return parts + + def get_attachments(msg): """ Gets all attachments from a MSG file which will be embedded in the encapsulated RTF body.
@@ -102,8 +151,13 @@ def parse_arguments(): parser.add_argument("--extract_raw", "-r", help="Only extract raw rtf encapsulated HTML file from msg.", action='store_true') + parser.add_argument("--extract_all", "-a", + help="Extract HTML and Plain Text alongside RTF from msg.", + action='store_true') parser.add_argument("--outfile", "-o", help="Write the output instead of piping it out.") + parser.add_argument("--show_folder_msg_stats", "-S", + help="Process a folder of msg items printing which ones have different elements.") args = parser.parse_args() return args diff --git a/scripts/make_docs.sh b/scripts/make_docs.sh new file mode 100755 index 0000000..cfda8c8 --- /dev/null +++ b/scripts/make_docs.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# +# This file is part of RTFDE +# Copyright © seamus tuohy, +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details. + +# Setup + +#Bash should terminate in case a command or chain of command finishes with a non-zero exit status. +#Terminate the script in case an uninitialized variable is accessed. +#See: https://github.com/azet/community_bash_style_guide#style-conventions +set -e +set -u + +# TODO remove DEBUGGING +set -x + +# Read Only variables + +# readonly PROG_DIR=$(readlink -m $(dirname $0)) +# readonly PROGNAME="$( cd "$( dirname "BASH_SOURCE[0]" )" && pwd )" + + + +main() { + pdoc --force --output-dir docs RTFDE + pdoc --force --html --output-dir docs RTFDE +} + + +cleanup() { + # put cleanup needs here + exit 0 +} + +trap 'cleanup' EXIT + + +main diff --git a/scripts/prep_private_rtf_test_folder.sh b/scripts/prep_private_rtf_test_folder.sh new file mode 100755 index 0000000..4daef88 --- /dev/null +++ b/scripts/prep_private_rtf_test_folder.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +# +# This file is part of RTFDE. +# Copyright © seamus tuohy, +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details. + +# ============= +# INSTRUCTIONS +# ============= + +# Run this script to extract .rtf msg bodies from .msg files. Can be run in multiple ways. +# +# 1) extract the .rtf bodies from a folder of .msg files into another folder. +#> ./prep_private_rtf_test_folder.sh -i /tmp/msg_files/ -o /tmp/extracted_rtf/ +# +# 2) extract the .rtf body from a single .msg file into a folder +#> ./prep_private_rtf_test_folder.sh -i /tmp/msg_files/email.msg -o /tmp/extracted_rtf/ +# +# 3) extract the .rtf body from a single .msg file to a specific filename +#> ./prep_private_rtf_test_folder.sh -i /tmp/msg_files/email.msg -o /tmp/extracted_rtf/extracted_msg.rtf +# + +# Setup +#Bash should terminate in case a command or chain of command finishes with a non-zero exit status.
+#Terminate the script in case an uninitialized variable is accessed. +#See: https://github.com/azet/community_bash_style_guide#style-conventions +set -e +set -u + +readonly PROG_DIR=$(readlink -m $(dirname $0)) + +while getopts "i:o:" option; do + case "${option}" in + i) INPATH="${OPTARG}" + ;; + o) OUTPATH="${OPTARG}" + ;; + esac +done + +main() { + if [[ -d "${INPATH}" ]] && [[ -d "${OUTPATH}" ]]; + then + convert_folder + else + convert_file + fi +} + +convert_folder() { + echo "Extracting RTF from a folder full of .msg files" + for i in "${INPATH}"/*.msg; do + python3 "${PROG_DIR}/extract_rtf_from_msg.py" -r \ + -f "${i}" \ + -o "${OUTPATH}/$(basename "${i/%.msg/.rtf}")" + done +} + +convert_file() { + echo "Extracting RTF from a single .msg file" + if [[ -d "${OUTPATH}" ]]; + then + extract_location="${OUTPATH}/$(basename "${INPATH/%.msg/.rtf}")" + else + extract_location="${OUTPATH/%msg/rtf}" + fi + python3 "${PROG_DIR}/extract_rtf_from_msg.py" -r \ + -f "${INPATH}" \ + -o "${extract_location}" +} + +main diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh new file mode 100755 index 0000000..adbc0bc --- /dev/null +++ b/scripts/run_tests.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +# +# This file is part of RTFDE. +# Copyright © 2022 seamus tuohy, +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details. + +# Setup + +#Bash should terminate in case a command or chain of command finishes with a non-zero exit status.
+#See: https://github.com/azet/community_bash_style_guide#style-conventions +set -e +set -u +# set -x + +MYPY="false" +COVERAGE="false" +DEBUG="false" +while getopts "mcd" option; do + case "${option}" in + m) MYPY="true" + ;; + c) COVERAGE="true" + ;; + d) DEBUG="true" + ;; + esac +done + +main() { + if [[ "$COVERAGE" == "true" ]]; then + coverage run -m unittest discover -v + coverage report -m + elif [[ "$DEBUG" == "true" ]]; then + python3 -m unittest discover -v --locals + else + python3 -m unittest discover -v + fi + + if [[ "$MYPY" == "true" ]]; then + mypy --ignore-missing-imports RTFDE/utils.py + fi +} + +cleanup() { + # put cleanup needs here + exit 0 +} +trap 'cleanup' EXIT + +main diff --git a/setup.py b/setup.py index 9652700..31db40b 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="RTFDE", - version="0.0.2", + version="0.1.0", author="seamus tuohy", author_email="code@seamustuohy.com", description="A library for extracting HTML content from RTF encapsulated HTML as commonly found in the exchange MSG email format.", @@ -23,7 +23,11 @@ "Topic :: Communications :: Email :: Filters" ], python_requires='>=3.6', - install_requires=['lark-parser>=0.11', 'oletools>=0.56'], + install_requires=['lark==1.1.5', + 'oletools>=0.56'], extras_require={'msg_parse': ['extract_msg>=0.27'], - 'dev': ['lxml>=4.6']} + 'dev': ['lxml>=4.6', + 'mypy>=1.1.1', + 'pdoc3>=0.10.0', + 'coverage>=7.2.2']} ) diff --git a/tests/deencapsulate/test_de_encapsulate.py b/tests/deencapsulate/test_de_encapsulate.py index ed73905..39af1cc 100644 --- a/tests/deencapsulate/test_de_encapsulate.py +++ b/tests/deencapsulate/test_de_encapsulate.py @@ -2,54 +2,187 @@ """ import unittest +import secrets import quopri # Decode MIME quoted-printable data -from os.path import join +from os.path import join, abspath, isfile +from os import walk +from os import environ from lxml.html.diff import InsensitiveSequenceMatcher, tokenize from RTFDE.deencapsulate import DeEncapsulator +from RTFDE.utils import encode_escaped_control_chars +from lark.exceptions import UnexpectedToken ## Directory with test data, independent of current working directory from tests.test_utils import DATA_BASE_DIR -class TestHtmlCleanDeEncapsulate(unittest.TestCase): - """ Test minimal deviation of original and de-encapsulated HTML content. +from html.parser import HTMLParser - After de-encapsulation, the HTML and plain text should differ only minimally from the original HTML or plain text content.""" +class ExtractHTMLText(HTMLParser): + text = "" - def test_u_encoded_html(self): - "Tests that de-encapsulation on u encoded encoded HTML works." - rtf_path = join(DATA_BASE_DIR, "html", "multiple-encodings.rtf") - txt_path = join(DATA_BASE_DIR, "html", "multiple-encodings.txt") - with open(txt_path, 'r') as fp: - raw_text = fp.read() - original_text = self.clean_whitespace(raw_text) - with open(rtf_path, 'r') as fp: - raw_rtf = fp.read() - rtf_obj = DeEncapsulator(raw_rtf) - rtf_obj.deencapsulate() - output_text = self.clean_whitespace(rtf_obj.html) - self.compare_html(original_text, output_text) + def handle_data(self, data): + self.text += data + # Remove to ignore newline differences + # self.text = self.text.replace("\n"," ") + # Remove to ignore whitespace differences + # self.text = ' '.join(self.text.split()) + +class TestPrivateMsgTestCases(unittest.TestCase): + """Test against a private folder full of RTF files exported from .msg files. 
+ + This test will run against whatever path is set in the RTFDE_PRIVATE_MSG_FOLDER environment variable. If you do not set this environment variable this test will look for .rtf files in the `tests/test_data/personal_rtf_test_files` folder. If it finds none there it will exit without any error. + + > export RTFDE_PRIVATE_MSG_FOLDER="/path/to/folder/with/messages/" + > python3 -m unittest discover -v --locals + + If you want to run tests that check the contents of the original HTML file against the encapsulated version you can use the RTFDE_PRIVATE_MSG_OUTPUT_FOLDER environment variable. If you do not set this environment variable this test will look for .html files in the `tests/test_data/personal_rtf_test_output_files` folder. If it does not find a file with the same name as the .rtf file that it is testing it will not attempt to compare it to anything. (`file.rtf` needs a corresponding `file.html`). + + > export RTFDE_PRIVATE_MSG_OUTPUT_FOLDER="/path/to/folder/with/html/outputs/" + > python3 -m unittest discover -v --locals + + It is important to use the --locals variable for unittest since it will expose the filename where an error occurs alongside the unittest failure. Look for the variable named `FAILING_FILENAME` - def compare_html(self, original_text, output_text): - """Do a diff of two HTML files. + The folder `tests/test_data/personal_rtf_test_files` in the source code repository has been included in the .gitignore to allow developers to safely use it for this purpose. - Only the text, tags and attributes in the HTML are diffed. + + See code in `scripts/prep_private_rtf_test_folder.sh` for guidance on how to populate a private test folder with RTF files extracted from a folder full of .msg files. +""" + def test_private_msg_folder(self): + + try: + folder_path = environ["RTFDE_PRIVATE_MSG_FOLDER"] + except KeyError: + folder_path = join(DATA_BASE_DIR, "personal_rtf_test_files") + try: + output_path = environ["RTFDE_PRIVATE_MSG_OUTPUT_FOLDER"] + except KeyError: + output_path = join(DATA_BASE_DIR, "personal_rtf_test_output_files") + for (dirpath, dirnames, filenames) in walk(folder_path): + for f in filenames: + if not f.endswith('.rtf'): + continue + else: + raw_rtf = None + abfpath = join(abspath(dirpath), f) + FAILING_FILENAME = f + with open(abfpath, 'rb') as fp: + raw_rtf = fp.read() + rtf_obj = DeEncapsulator(raw_rtf) + try: + rtf_obj.deencapsulate() + except: + self.fail(f"\nFailed to deencapsulate test file: {f}") + # If output result then compare it + html_filename = f.replace('.rtf', '.html') + output_to_compare = join(abspath(output_path), + html_filename) + if isfile(output_to_compare): + with open(output_to_compare, 'rb') as fp: + html_output = fp.read() + f = ExtractHTMLText() + f.feed(html_output.decode()) + html_output_text = f.text + f = ExtractHTMLText() + f.feed(rtf_obj.content.decode()) + rtf_output_text = f.text + self.assertEqual(rtf_output_text, html_output_text, + msg=f"\nFailed comparing personal test file: {html_filename}") + + +class TestBinaryData(unittest.TestCase): + """Test binary data in RTF files.""" + + def test_encoded_bytes_stay_encoded_character(self): + """Test that any hexbytes that are not encoded into the RTF stay in the bytes returned without being modified.""" + raw_rtf = self.get_small_template() + bin_data = secrets.token_bytes(1) + binary_string = b'This test is one string ' + bin_data + b'that is it.' 
+ rep_rtf = raw_rtf.replace(b"REPLACE_ME", binary_string) + rtf_obj = self.deencapsulate_string(rep_rtf) + if not (binary_string == rtf_obj.content): + with open('/tmp/bin_data_fail_input.bytes', 'wb') as fp: + fp.write(binary_string) + with open('/tmp/bin_data_fail_output.bytes', 'wb') as fp: + fp.write(rtf_obj.content) + self.assertEqual(binary_string, rtf_obj.content) + + def test_bin_data_captured(self): + """Tests that binary data is captured. """ + # Test one bin string + raw_rtf = self.get_small_template() + bin_data = secrets.token_bytes(20) + binary_string = b'This test is one string \\bin20' + bin_data + b'that is it.' + rep_rtf = raw_rtf.replace(b"REPLACE_ME", binary_string) + rtf_obj = self.deencapsulate_string(rep_rtf) + + self.assertIsNotNone(rtf_obj.found_binary) + self.assertEqual(b'This test is one string that is it.', + rtf_obj.content) + # Prove that the stripped data contains the bin_data at the right place. + # re_built_string = rtf_obj.content[:] + bin_dat = rtf_obj.found_binary[0] + self.assertEqual(bin_dat['start_pos'], 216) + self.assertEqual(bin_dat['end_pos'], 242) + self.assertEqual(bin_dat['bin_start_pos'], 222) + self.assertEqual(bin_dat['bytes'], bin_data) + self.assertEqual(bin_dat['ctrl_char'][0], b'\\bin') + self.assertEqual(bin_dat['ctrl_char'][1], b'20') + + # Make sure spaces after the control char are handled correctly + binary_string = b'This test is one string \\bin20 ' + bin_data + b'that is it.' + rep_rtf = raw_rtf.replace(b"REPLACE_ME", binary_string) + rtf_obj = self.deencapsulate_string(rep_rtf) + bin_dat = rtf_obj.found_binary[0] + self.assertEqual(bin_dat['bytes'], bin_data) + self.assertEqual(bin_dat['ctrl_char'][1], b'20') + self.assertEqual(bin_dat['start_pos'], 216) + self.assertEqual(bin_dat['end_pos'], 243) + self.assertEqual(bin_dat['bin_start_pos'], 223) + + # Test multiple bin strings in an rtf file + bin_addition = b' and more \\bin20 ' + bin_data + bin_addition = bin_addition*5 + binary_string = b'This test is one string ' + bin_addition + b'that is it.' + rep_rtf = raw_rtf.replace(b"REPLACE_ME", binary_string) + rtf_obj = self.deencapsulate_string(rep_rtf) + self.assertEqual(len(rtf_obj.found_binary), 5) + + # Test bin with negative params are ignored as they are invalid (i.e.
\\bin-1234) + binary_string = b'This test is one string \\bin-20 ' + b'that is it.' + rep_rtf = raw_rtf.replace(b"REPLACE_ME", binary_string) + rtf_obj = self.deencapsulate_string(rep_rtf) + with self.assertRaises(AttributeError): + _test = rtf_obj.found_binary + + # Test bin with no params are ignored as they are invalid + binary_string = b'This test is one string \\bin ' + b'that is it.' + rep_rtf = raw_rtf.replace(b"REPLACE_ME", binary_string) + rtf_obj = self.deencapsulate_string(rep_rtf) + with self.assertRaises(AttributeError): + _test = rtf_obj.found_binary + + # Test bin with 0 length params only include 0 length bytes (i.e. \\bin0 AND \\bin00000000 ) + binary_string = b'This test is one string \\bin0 \\bin0000000' + b'that is it.' + rep_rtf = raw_rtf.replace(b"REPLACE_ME", binary_string) + rtf_obj = self.deencapsulate_string(rep_rtf) + self.assertEqual(len(rtf_obj.found_binary), 2) + for byte_obj in rtf_obj.found_binary: + self.assertEqual(byte_obj['bytes'], b'') + + + def deencapsulate_string(self, raw_rtf): + rtf_obj = DeEncapsulator(raw_rtf) + rtf_obj.deencapsulate() + return rtf_obj + + def get_small_template(self): + template_path = join(DATA_BASE_DIR, + "rtf_parsing", + "small_template.rtf") + with open(template_path, 'rb') as fp: + raw_rtf = fp.read() + return raw_rtf class TestTextCleanDeEncapsulate(unittest.TestCase): """ Test minimal deviation of original and de-encapsulated plain text content. @@ -59,12 +192,12 @@ class TestTextCleanDeEncapsulate(unittest.TestCase): def test_japanese_encoded_text(self): """ """ rtf_path = join(DATA_BASE_DIR, "plain_text", "japanese_iso_2022.rtf") - original_body = "すみません。" - with open(rtf_path, 'r') as fp: + original_body = "すみません。\n".encode() + with open(rtf_path, 'rb') as fp: raw_rtf = fp.read() rtf_obj = DeEncapsulator(raw_rtf) rtf_obj.deencapsulate() - output_text = self.clean_newlines(rtf_obj.text) + output_text = rtf_obj.text self.assertEqual(output_text, original_body) def test_quoted_printable(self): @@ -73,40 +206,253 @@ def test_quoted_printable(self): This test checks that it is STILL NOT IMPLEMENTED. 
So, if you fix it this test will expose that and we will need to change the test.""" quote_printable_rtf_path = join(DATA_BASE_DIR, "plain_text", "quoted_printable_01.rtf") quote_printable_txt_path = join(DATA_BASE_DIR, "plain_text", "quoted_printable_01.txt") - # quote_printable_eml_path = join(DATA_BASE_DIR, "plain_text", "quoted_printable_01.eml") - # quote_printable_msg_path = join(DATA_BASE_DIR, "plain_text", "quoted_printable_01.msg") - with open(quote_printable_txt_path, 'r') as fp: + with open(quote_printable_txt_path, 'rb') as fp: raw_text = fp.read() - original_decoded_text = self.clean_newlines(raw_text) - with open(quote_printable_rtf_path, 'r') as fp: + original_decoded_text = raw_text + with open(quote_printable_rtf_path, 'rb') as fp: raw_rtf = fp.read() rtf_obj = DeEncapsulator(raw_rtf) rtf_obj.deencapsulate() - output_text = self.clean_newlines(rtf_obj.text) + output_text = rtf_obj.text self.assertNotEqual(original_decoded_text, output_text) def test_decoded_quoted_printable(self): """Test that decoded text in an original quoted printable message is still quoted when de-encapsulated.""" quote_printable_rtf_path = join(DATA_BASE_DIR, "plain_text", "quoted_printable_01.rtf") quote_printable_txt_path = join(DATA_BASE_DIR, "plain_text", "quoted_printable_01.txt") - # quote_printable_eml_path = join(DATA_BASE_DIR, "plain_text", "quoted_printable_01.eml") - # quote_printable_msg_path = join(DATA_BASE_DIR, "plain_text", "quoted_printable_01.msg") charset = "cp1251" - with open(quote_printable_txt_path, 'r') as fp: + with open(quote_printable_txt_path, 'rb') as fp: raw_text = fp.read() original_decoded_text = quopri.decodestring(raw_text) original_decoded_text = original_decoded_text.decode(charset) - original_decoded_text = self.clean_newlines(original_decoded_text) - with open(quote_printable_rtf_path, 'r') as fp: + with open(quote_printable_rtf_path, 'rb') as fp: + raw_rtf = fp.read() + rtf_obj = DeEncapsulator(raw_rtf) + rtf_obj.deencapsulate() + output_text = rtf_obj.text + self.assertEqual(original_decoded_text, output_text.decode()) + + + +class TestTextDecoding(unittest.TestCase): + """Test text decoding code""" + + def test_theta(self): + """ """ + rtf_path = join(DATA_BASE_DIR, "rtf_parsing", "theta.rtf") + original_body = ' ф\n'.encode() + with open(rtf_path, 'rb') as fp: + raw_rtf = fp.read() + rtf_obj = DeEncapsulator(raw_rtf) + rtf_obj.deencapsulate() + # print(rtf_obj.full_tree) + output_text = rtf_obj.text + # print(repr(output_text)) + self.assertEqual(output_text, original_body) + + def test_translated_by(self): + """ """ + rtf_path = join(DATA_BASE_DIR, "rtf_parsing", "translated_by.rtf") + original_body = 'ترجمة: سمير المجذوب\n'.encode() + with open(rtf_path, 'rb') as fp: + raw_rtf = fp.read() + rtf_obj = DeEncapsulator(raw_rtf) + rtf_obj.deencapsulate() + # print(rtf_obj.full_tree) + output_text = rtf_obj.text + # print(repr(output_text)) + self.assertEqual(output_text, original_body) + + + def test_unicode_decoding(self): + """ """ + # print("\n") + rtf_path = join(DATA_BASE_DIR, "rtf_parsing", "surrogates.rtf") + rtf_obj = self.deencapsulate(rtf_path) + self.assertTrue("😊".encode() in rtf_obj.content) + rtf_path = join(DATA_BASE_DIR, "rtf_parsing", "surrogate_pairs.rtf") + rtf_obj = self.deencapsulate(rtf_path) + correct_repr = 'Ǣ?\nǢ\n😊\n😊??\n😊\n'.encode() + # print(correct_repr) + # print(rtf_obj.content) + self.assertEqual(correct_repr, rtf_obj.content) + + # Test that non unicode chars do not decode + from RTFDE.text_extraction import 
unicode_escape_to_chr + surh = b"\\u-10179" + surl = b"\\u-8694" + bad_encodings = [ + b"0x000000A9", + b"0x00A9", + b"0xC2 0xA9", + b"©", + b"©", + b"©" + ] + for bad in bad_encodings: + with self.assertRaises(ValueError): + unicode_escape_to_chr(bad) + surrogate_encodings = [b"\\u-10179", b"\\u-8694"] + for sur in surrogate_encodings: + self.assertIsInstance(unicode_escape_to_chr(sur), str) + + + def test_surrogate_in_htmlrtf(self): + """Don't show surrogate text within htmlrtf block.""" + rtf_path = join(DATA_BASE_DIR, "rtf_parsing", "surrogate_pairs_02.rtf") + rtf_obj = self.deencapsulate(rtf_path) + correct_repr = b'😊' + self.assertEqual(correct_repr, rtf_obj.content) + #print(rtf_obj.content) + + def test_surrogate_without_16bitsigned(self): + """Test surrogate which doesn't use a 16 signed integer.""" + rtf_path = join(DATA_BASE_DIR, "rtf_parsing", "surrogate_pairs_03.rtf") + rtf_obj = self.deencapsulate(rtf_path) + #print(rtf_obj.content) + correct_repr = b'😊' + self.assertEqual(correct_repr, rtf_obj.content) + rtf_path = join(DATA_BASE_DIR, "rtf_parsing", "surrogate_pairs_04.rtf") + rtf_obj = self.deencapsulate(rtf_path) + correct_repr = b'😊 \xf0\x9f\x93\x9e' + self.assertEqual(correct_repr, rtf_obj.content) + + def test_hexencoded(self): + original_body = '【 This is a test 】'.encode() + raw_rtf = self.get_small_template() + # print(raw_rtf) + # Add fontdef + font_def = b"""{\\fonttbl{\\f0\\fnil\\fcharset0 Calibri;}{\\f1\\fswiss\\fcharset128 "PMingLiU";}{\\f2\\fnil\\fcharset1 Arial;}}""" + base = b"""{\\fonttbl{\\f0\\fnil\\fcharset0 Calibri;}}""" + new_rtf = raw_rtf.replace(base, font_def) + # print(new_rtf) + # Add hex encoded item + hex_encoded_text = b"""{\\lang1028 \\f1 \\'81\\'79} This is a test {\\lang1028 \\f1 \\'81\\'7a}""" + rep_rtf = new_rtf.replace(b"REPLACE_ME", hex_encoded_text) + # print(rep_rtf) + rtf_obj = self.deencapsulate_string(rep_rtf) + # rtf_obj.deencapsulate() + output_text = rtf_obj.content + # 'not a valid ANSI representation' + self.assertEqual(output_text, original_body) + # print(repr(output_text)) + # print(repr(rtf_obj.content)) + + def test_hexencode_as_replacement(self): + """test that unicode text with hex encoded replacement works.""" + rtf_path = join(DATA_BASE_DIR, "rtf_parsing", "unicode_HH_replacement.rtf") + rtf_obj = self.deencapsulate(rtf_path) + correct_repr = b'😊' + self.assertEqual(correct_repr, rtf_obj.content) + rtf_path = join(DATA_BASE_DIR, "rtf_parsing", "unicode_HH_replacement_01.rtf") + rtf_obj = self.deencapsulate(rtf_path) + correct_repr = b'😊 \xf0\x9f\x93\x9e' + self.assertEqual(correct_repr, rtf_obj.content) + + def test_windows_950_codec(self): + """Windows 950 codecs currently fail. Ensure that they still fail in tests so we can identify when the underlying libraries fix this. + + https://github.com/seamustuohy/RTFDE/issues/19 + """ + rtf_path = join(DATA_BASE_DIR, "rtf_parsing", "windows_950.rtf") + # Word successfully parses this, showing "Hello" followed by a space then a single character, though it's either one it doesn't know how to render or is meant to look like a box. + original_body = "Hello ??" # TODO: Fix once we know what the char is.
+ with open(rtf_path, 'rb') as fp: + raw_rtf = fp.read() + rtf_obj = DeEncapsulator(raw_rtf) + with self.assertRaises(UnicodeDecodeError): + rtf_obj.deencapsulate() + + def test_font_table_variation(self): + from RTFDE.text_extraction import get_font_table,parse_font_tree + raw_rtf = self.get_small_template() + base = b"""{\\fonttbl{\\f0\\fnil\\fcharset0 Calibri;}}""" + + # Test \\cpg is consistant with fcharset + base_w_cpg = b"""{\\fonttbl{\\f0\\fnil\\fcharset0 Calibri;}{\\f2\\cpg1253\\fcharset161 Arial;}}""" + new_rtf = raw_rtf.replace(base, base_w_cpg) + rtf_obj = self.full_tree_from_string(new_rtf) + raw_font_table = get_font_table(rtf_obj.full_tree.children[1]) + font_table = parse_font_tree(raw_font_table) + # print(repr(font_table)) + # print(type(font_table)) + self.assertEqual(font_table[b'\\f2'].codepage, 1253) + + # Test \\fcharset takes precedence over \\cpg + base_w_cpg = b"""{\\fonttbl{\\f0\\fnil\\fcharset0 Calibri;}{\\f2\\cpg1253\\fcharset204 Arial;}}""" + new_rtf = raw_rtf.replace(base, base_w_cpg) + rtf_obj = self.full_tree_from_string(new_rtf) + raw_font_table = get_font_table(rtf_obj.full_tree.children[1]) + font_table = parse_font_tree(raw_font_table) + # \\fcharset204 == codepage 1251 + self.assertEqual(font_table[b'\\f2'].codepage, 1251) + + def test_text_decoder(self): + # TODO + from RTFDE.text_extraction import TextDecoder + pass + + def test_default_font(self): + from RTFDE.text_extraction import get_default_font + raw_rtf = self.get_small_template() + + # set_font_info with missing deff0 + new_rtf = raw_rtf.replace(b'\\deff0', b'') + rtf_obj = self.deencapsulate_string(new_rtf) + # deff0 is not required + self.assertIsNone(get_default_font(rtf_obj.full_tree)) + + # multiple deff0 is fine. Only use the first found + new_rtf = raw_rtf.replace(b'\\deff0', b'\\deff0\\deff1\\deff2') + rtf_obj = self.deencapsulate_string(new_rtf) + self.assertEqual(get_default_font(rtf_obj.full_tree), + b'\\f0') + + + + + def test_codepage_num_from_charset(self): + from RTFDE.text_extraction import get_codepage_num_from_fcharset + acceptable = [0,128,129,134,136,161,162,177,178,186,204,222,238] + for i in acceptable: + self.assertIsNotNone(get_codepage_num_from_fcharset(i)) + acceptable_none = [1,2,255] + for i in acceptable_none: + self.assertIsNone(get_codepage_num_from_fcharset(i)) + from random import randrange + for i in range(20): + test = randrange(600) + if test in acceptable: + continue + else: + self.assertIsNone(get_codepage_num_from_fcharset(test)) + + def get_small_template(self): + template_path = join(DATA_BASE_DIR, + "rtf_parsing", + "small_template.rtf") + with open(template_path, 'rb') as fp: + raw_rtf = fp.read() + return raw_rtf + + def full_tree_from_string(self, raw_rtf): + rtf_obj = DeEncapsulator(raw_rtf) + escaped_rtf = encode_escaped_control_chars(rtf_obj.raw_rtf) + rtf_obj.parse_rtf(escaped_rtf) + return rtf_obj + + def deencapsulate_string(self, raw_rtf): + rtf_obj = DeEncapsulator(raw_rtf) + rtf_obj.deencapsulate() + return rtf_obj + + def deencapsulate(self, rtf_path): + with open(rtf_path, 'rb') as fp: raw_rtf = fp.read() rtf_obj = DeEncapsulator(raw_rtf) rtf_obj.deencapsulate() - output_text = self.clean_newlines(rtf_obj.text) - self.assertEqual(original_decoded_text, output_text) + return rtf_obj - def clean_newlines(self, string): - """Getting the newlines correct when decoding is not perfect. 
So, we compare strings by looking at them without newlines.""" - return string.replace('\n',"") # just in case somebody calls this file as a script if __name__ == '__main__': diff --git a/tests/parse_rtf/test_parse_rtf.py b/tests/parse_rtf/test_parse_rtf.py index 23032a2..daa40a5 100644 --- a/tests/parse_rtf/test_parse_rtf.py +++ b/tests/parse_rtf/test_parse_rtf.py @@ -8,10 +8,16 @@ """ import unittest +import logging from os.path import join from RTFDE.exceptions import MalformedRtf, MalformedEncapsulatedRtf, NotEncapsulatedRtf from RTFDE.deencapsulate import DeEncapsulator +from RTFDE.utils import encode_escaped_control_chars +from RTFDE.text_extraction import get_python_codec +from RTFDE.text_extraction import TextDecoder + from tests.test_utils import DATA_BASE_DIR +from lark import logger as lark_logger class TestParseRtf(unittest.TestCase): """Test proper parsing of RTF files @@ -30,29 +36,45 @@ def test_get_first_20_header_control_words(self): "rtf_parsing", "from_header_template.rtf") # Has 20 control words - ctrl_words = ("\\deff0"*16) + "\\fromhtml1" - rtf = self.replace_from_header(template_path, ctrl_words) + ctrl_words = (b"\\deff0"*16) + b"\\fromhtml1" + rtf = self.replace_text_in_template(template_path, ctrl_words) output = self.run_parsing(rtf) - ctrl_wds = output._get_header_control_words_before_first_group() - ctrl_wds =[i.value for i in ctrl_wds] - correct_ctrl = ['\\rtf1', '\\ansi', '\\ansicpg1252', '\\deff0', '\\deff0', '\\deff0', - '\\deff0', '\\deff0', '\\deff0', '\\deff0', '\\deff0', '\\deff0', - '\\deff0', '\\deff0', '\\deff0', '\\deff0', '\\deff0', '\\deff0', - '\\deff0', '\\fromhtml1'] + ctrl_wds = output.get_header_control_words_before_first_group() + ctrl_wds =[i.value.strip() for i in ctrl_wds] + correct_ctrl = [b'\\rtf1', + b'\\ansi', + b'\\ansicpg1252', + b'\\deff0', + b'\\deff0', + b'\\deff0', + b'\\deff0', + b'\\deff0', + b'\\deff0', + b'\\deff0', + b'\\deff0', + b'\\deff0', + b'\\deff0', + b'\\deff0', + b'\\deff0', + b'\\deff0', + b'\\deff0', + b'\\deff0', + b'\\deff0', + b'\\fromhtml1'] self.assertEqual(ctrl_wds, correct_ctrl) # group comes before 20 control words - ctrl_words = ("\\deff0"*5) + "\\fromhtml1 \\deff0" + '{\colortbl\red0\green0\blue0;\red5\green99\blue193;}' - rtf = self.replace_from_header(template_path, ctrl_words) + ctrl_words = (b"\\deff0"*5) + b"\\fromhtml1 \\deff0" + b'{\\colortbl\\red0\\green0\\blue0;\\red5\\green99\\blue193;}' + rtf = self.replace_text_in_template(template_path, ctrl_words) output = self.run_parsing(rtf) - ctrl_wds = output._get_header_control_words_before_first_group() - ctrl_wds = [i.value for i in ctrl_wds] - correct_ctrl = ['\\rtf1', '\\ansi', '\\ansicpg1252', '\\deff0', '\\deff0', '\\deff0', '\\deff0', '\\deff0', '\\fromhtml1', '\\deff0'] + ctrl_wds = output.get_header_control_words_before_first_group() + ctrl_wds = [i.value.strip() for i in ctrl_wds] + correct_ctrl = [b'\\rtf1', b'\\ansi', b'\\ansicpg1252', b'\\deff0', b'\\deff0', b'\\deff0', b'\\deff0', b'\\deff0', b'\\fromhtml1', b'\\deff0'] self.assertEqual(ctrl_wds, correct_ctrl) # fromhtml header parsing not affected by this function - ctrl_words = '{\\colortbl\\red0\\green0\\blue0;\\red5\\green99\\blue193;}' + "\\fromhtml1 \\deff0" - rtf = self.replace_from_header(template_path, ctrl_words) + ctrl_words = b'{\\colortbl\\red0\\green0\\blue0;\\red5\\green99\\blue193;}' + b"\\fromhtml1 \\deff0" + rtf = self.replace_text_in_template(template_path, ctrl_words) # The fromhtml header needs to be in the first 10 tokens. Tokens include groups in this case.
So this should succeed. self.check_deencapsulate_validity(rtf, expect_error=None, @@ -64,10 +86,10 @@ def test_codepage_num(self): "from_header_template.rtf") # bad - bad_charsets = ["\\ansicpg1234", "\\pccpg1252", "\\ansicpg"] + bad_charsets = [b"\\ansicpg1234", b"\\ansicpg"] for badchar in bad_charsets: - rtf = self.replace_from_header(template_path, "\\fromhtml1") - rtf = self.replace_from_header(None, badchar, rep_str="\\ansicpg1252", string=rtf) + rtf = self.replace_text_in_template(template_path, b"\\fromhtml1") + rtf = self.replace_text_in_template(None, badchar, rep_str=b"\\ansicpg1252", string=rtf) self.check_deencapsulate_validity(rtf, expect_error=MalformedRtf, name="bad codepage keyword: {0}".format(badchar)) @@ -80,21 +102,16 @@ def test_codepage_num(self): # NOTE: oletools supports some codepages which MS doesn't include so those are not included here: (32768, 32769) supported_codepage_nums = [37, 708, 709, 710, 870, 1047, 1141, 1201, 10000, 10001, 10002, 10003, 10004, 10005, 10006, 10007, 10008, 10021, 10029, 10079, 10081, 12000, 12001, 20127, 28591, 28592, 28593, 28594, 28595, 28596, 28597, 28598, 28599, 28603, 28605, 38598, 65000] for goodpg in supported_codepage_nums: - rtf = self.replace_from_header(template_path, "\\fromhtml1") - good_codepage = "\\ansicpg{0}".format(goodpg) - rtf = self.replace_from_header(None, good_codepage, rep_str="\\ansicpg1252", string=rtf) - output = self.run_parsing(rtf) - output._validate_encapsulation() - codec = output._get_python_codec() + codec = get_python_codec(goodpg) self.assertIsInstance(codec, str) self.assertNotEqual(codec, 'utf8') - # missing - rtf = self.replace_from_header(template_path, "\\fromhtml1") - rtf = self.replace_from_header(None, "", rep_str="\\ansicpg1252", string=rtf) + # Codepage is optional. Therefore a missing codepage number is fine. 
+ rtf = self.replace_text_in_template(template_path, b"\\fromhtml1") + rtf = self.replace_text_in_template(None, b"", rep_str=b"\\ansicpg1252", string=rtf) self.check_deencapsulate_validity(rtf, - expect_error=MalformedRtf, - name="Missing codepage num") + expect_error=None, + name="Ansi codepage num is optional") def test_charset_header(self): template_path = join(DATA_BASE_DIR, @@ -102,77 +119,78 @@ def test_charset_header(self): "from_header_template.rtf") # Bad charset - bad_charsets = ["\\ANSI", "ansi", "\\PC", "\\osx"] + bad_charsets = [b"\\ANSI", b"ansi", b"\\PC", b"\\osx"] for badchar in bad_charsets: - rtf = self.replace_from_header(template_path, "\\fromhtml1") - rtf = self.replace_from_header(None, badchar, rep_str="\\ansi", string=rtf) + rtf = self.replace_text_in_template(template_path, b"\\fromhtml1") + rtf = self.replace_text_in_template(None, badchar, rep_str=b"\\ansi", string=rtf) self.check_deencapsulate_validity(rtf, expect_error=MalformedRtf, name="bad charset keyword: {0}".format(badchar)) # Good charset # included \\ after the charset keyword to ensure we don't capture the \\ansicpg - good_charsets = ["\\ansi\\", "\\mac\\", "\\pc\\", "\\pac\\"] + good_charsets = [b"\\ansi\\", b"\\mac\\", b"\\pc\\", b"\\pca\\"] for goodchar in good_charsets: - rtf = self.replace_from_header(template_path, "\\fromhtml1") - rtf = self.replace_from_header(None, goodchar, rep_str="\\ansi\\", string=rtf) + rtf = self.replace_text_in_template(template_path, b"\\fromhtml1") + rtf = self.replace_text_in_template(None, goodchar, rep_str=b"\\ansi\\", string=rtf) self.check_deencapsulate_validity(rtf, expect_error=None, name="Good charset keyword: {0}".format(goodchar)) # group before charset - new_front = '{\\rtf1{\\colortbl\\red0\\green0\\blue0;\\red5\\green99\\blue193;}' - rtf = self.replace_from_header(template_path, "\\fromhtml1") + new_front = b'{\\rtf1{\\colortbl\\red0\\green0\\blue0;\\red5\\green99\\blue193;}' + rtf = self.replace_text_in_template(template_path, b"\\fromhtml1") rtf = new_front + rtf[6:] # Replaces `{\\rtf1` self.check_deencapsulate_validity(rtf, expect_error=MalformedRtf, name="no groups before charset") # Missing - rtf = self.replace_from_header(template_path, "\\fromhtml1") - rtf = self.replace_from_header(None, "\\", rep_str="\\ansi\\", string=rtf) + rtf = self.replace_text_in_template(template_path, b"\\fromhtml1") + rtf = self.replace_text_in_template(None, b"\\", rep_str=b"\\ansi\\", string=rtf) self.check_deencapsulate_validity(rtf, expect_error=MalformedRtf, name="missing charset") # Test missing using default fallback to ansi output = self.run_parsing(rtf) - output._validate_encapsulation() - charset = output._get_charset(fallback_to_default=True) - self.assertEqual("\\ansi", charset) + charset = output.validate_charset(fallback_to_default=True) + self.assertEqual(b"\\ansi", charset) def test_get_python_codec(self): """Test getting correct python codec.""" template_path = join(DATA_BASE_DIR, "rtf_parsing", "from_header_template.rtf") - base = self.replace_from_header(template_path, "\\fromhtml1") + base = self.replace_text_in_template(template_path, b"\\fromhtml1") # Big-5 big5string = b'\xb3o\xacO\xa4@\xad\xd3\xa4\xe5\xa5\xbb\xa6r\xb2\xc5\xa6\xea\xa1C' - rtf = self.replace_from_header(None, - replacement="\\ansicpg10002", - rep_str="\\ansicpg1252", + rtf = self.replace_text_in_template(None, + replacement=b"\\ansicpg10002", + rep_str=b"\\ansicpg1252", string=base) output = self.run_parsing(rtf) - output._validate_encapsulation() - output.charset = 
output._get_charset() - output.text_codec = output._get_python_codec() - big5string.decode(output.text_codec) - self.assertEqual(output.text_codec, "big5") - self.assertEqual(big5string.decode(output.text_codec), + # output.validate_encapsulation() + # charset = output._validate_charset() + ansicpg_header = output.get_ansicpg_header() + possible_cpg_num = int(ansicpg_header.value.strip()[8:]) + text_codec = get_python_codec(possible_cpg_num) + big5string.decode(text_codec) + self.assertEqual(text_codec, "big5") + self.assertEqual(big5string.decode(text_codec), '這是一個文本字符串。') # Hebrew hebrew = b'\xe6\xe4\xe5 \xee\xe7\xf8\xe5\xe6\xfa \xe8\xf7\xf1\xe8.' - rtf = self.replace_from_header(None, - replacement="\\ansicpg10005", - rep_str="\\ansicpg1252", + rtf = self.replace_text_in_template(None, + replacement=b"\\ansicpg10005", + rep_str=b"\\ansicpg1252", string=base) output = self.run_parsing(rtf) - output._validate_encapsulation() - output.charset = output._get_charset() - output.text_codec = output._get_python_codec() - self.assertEqual(output.text_codec, "hebrew") - self.assertEqual(hebrew.decode(output.text_codec), "זהו מחרוזת טקסט.") + ansicpg_header = output.get_ansicpg_header() + possible_cpg_num = int(ansicpg_header.value.strip()[8:]) + text_codec = get_python_codec(possible_cpg_num) + self.assertEqual(text_codec, "hebrew") + self.assertEqual(hebrew.decode(text_codec), "זהו מחרוזת טקסט.") def test_from_header_html(self): @@ -180,18 +198,17 @@ def test_from_header_html(self): from_html = join(DATA_BASE_DIR, "rtf_parsing", "from_header_template.rtf") - rtf = self.replace_from_header(from_html, "\\fromhtml1") + rtf = self.replace_text_in_template(from_html, b"\\fromhtml1") self.check_deencapsulate_validity(rtf, expect_error=None, name="working fromheaderhtml") - def test_from_header_text(self): """Check that a basic fromtext header works.""" from_text = join(DATA_BASE_DIR, "rtf_parsing", "from_header_template.rtf") - rtf = self.replace_from_header(from_text, "\\fromtext") + rtf = self.replace_text_in_template(from_text, b"\\fromtext") self.check_deencapsulate_validity(rtf, expect_error=None, name="working fromheader text") @@ -201,7 +218,7 @@ def test_missing_from_header(self): missing_from = join(DATA_BASE_DIR, "rtf_parsing", "from_header_template.rtf") - rtf = self.replace_from_header(missing_from, "") + rtf = self.replace_text_in_template(missing_from, b"") self.check_deencapsulate_validity(rtf, expect_error=NotEncapsulatedRtf, name="missing fromheader text") @@ -209,9 +226,9 @@ def test_missing_from_header(self): missing_but_one_in_body = join(DATA_BASE_DIR, "rtf_parsing", "from_header_template.rtf") - rtf = self.replace_from_header(missing_but_one_in_body, "") - rtf = self.replace_from_header(None, "\\fromtext", - rep_str="INSERT_BODY_TEXT_HERE", + rtf = self.replace_text_in_template(missing_but_one_in_body, b"") + rtf = self.replace_text_in_template(None, b"\\fromtext", + rep_str=b"INSERT_BODY_TEXT_HERE", string=rtf) self.check_deencapsulate_validity(rtf, expect_error=NotEncapsulatedRtf, @@ -222,7 +239,7 @@ def test_multiple_from_headers(self): multiple_from_html = join(DATA_BASE_DIR, "rtf_parsing", "from_header_template.rtf") - rtf = self.replace_from_header(multiple_from_html, "\\fromhtml1\\fromhtml1") + rtf = self.replace_text_in_template(multiple_from_html, b"\\fromhtml1\\fromhtml1") self.check_deencapsulate_validity(rtf, expect_error=MalformedEncapsulatedRtf, name="multiple FROM headers means malformed") @@ -230,7 +247,7 @@ def test_multiple_from_headers(self): 
multiple_from_html_first = join(DATA_BASE_DIR, "rtf_parsing", "from_header_template.rtf") - rtf = self.replace_from_header(multiple_from_html_first, "\\fromhtml1\\fromtext") + rtf = self.replace_text_in_template(multiple_from_html_first, b"\\fromhtml1\\fromtext") self.check_deencapsulate_validity(rtf, expect_error=MalformedEncapsulatedRtf, name="multiple FROM headers means malformed") @@ -238,7 +255,7 @@ def test_multiple_from_headers(self): multiple_from_txt_first = join(DATA_BASE_DIR, "rtf_parsing", "from_header_template.rtf") - rtf = self.replace_from_header(multiple_from_txt_first, "\\fromtext\\fromhtml1") + rtf = self.replace_text_in_template(multiple_from_txt_first, b"\\fromtext\\fromhtml1") self.check_deencapsulate_validity(rtf, expect_error=MalformedEncapsulatedRtf, name="multiple FROM headers means malformed") @@ -248,9 +265,10 @@ def test_from_header_before_rtf(self): missing_from = join(DATA_BASE_DIR, "rtf_parsing", "from_header_template.rtf") - rtf = self.replace_from_header(missing_from, "") + rtf = self.replace_text_in_template(missing_from, b"") # Append a new curly and the control word to the start of the rtf file - rtf = "{\\fromhtml1" + rtf[1:] + # TODO: Fix this array use on a bytes object + rtf = b"{\\fromhtml1" + rtf[1:] self.check_deencapsulate_validity(rtf, expect_error=MalformedRtf, name="from header before magic") @@ -260,22 +278,25 @@ def test_broken_magic(self): missing_from = join(DATA_BASE_DIR, "rtf_parsing", "from_header_template.rtf") - rtf = self.replace_from_header(missing_from, "\\fromhtml1") + rtf = self.replace_text_in_template(missing_from, b"\\fromhtml1") # Append a new curly and broken rtf to the start of the rtf file - rtf_no_one = "{\\rtf" + rtf[6:] # Removes `{\\rtf1` + # TODO: Fix this array use on a bytes object + rtf_no_one = b"{\\rtf" + rtf[6:] # Removes `{\\rtf1` self.check_deencapsulate_validity(rtf_no_one, expect_error=MalformedRtf, name="malformed file magic") - - rtf_two = "{\\rtf2" + rtf[6:] + # TODO: Fix this array use on a bytes object + rtf_two = b"{\\rtf2" + rtf[6:] self.check_deencapsulate_validity(rtf_two, expect_error=MalformedRtf, name="malformed file magic") - RTF = "{\\RTF1" + rtf[6:] + # TODO: Fix this array use on a bytes object + RTF = b"{\\RTF1" + rtf[6:] self.check_deencapsulate_validity(RTF, expect_error=MalformedRtf, name="malformed file magic") - PiRTF = "{\\ARRRRRR-TEEE-FFF" + rtf[6:] # Because Pirates + # TODO: Fix this array use on a bytes object + PiRTF = b"{\\ARRRRRR-TEEE-FFF" + rtf[6:] # Because Pirates self.check_deencapsulate_validity(PiRTF, expect_error=MalformedRtf, name="Ahoy, matey! 
this here file magic be broken") @@ -291,12 +312,33 @@ def test_fonttble_too_early(self): "rtf_parsing", "from_header_template.rtf") - early_font_table = '{\\fonttbl\n{\\f0\\fswiss Arial;}\n{\\f1\\fmodern Courier New;}\n{\\f2\\fnil\\fcharset2 Symbol;}\n{\\f3\\fmodern\\fcharset0 Courier New;}}' + "\\fromhtml1 \\deff0" - rtf = self.replace_from_header(template_path, early_font_table) + early_font_table = b'{\\fonttbl\n{\\f0\\fswiss Arial;}\n{\\f1\\fmodern Courier New;}\n{\\f2\\fnil\\fcharset2 Symbol;}\n{\\f3\\fmodern\\fcharset0 Courier New;}}' + b"\\fromhtml1 \\deff0" + rtf = self.replace_text_in_template(template_path, early_font_table) self.check_deencapsulate_validity(rtf, expect_error=MalformedEncapsulatedRtf, name="fonttable before fromhtml in header") + def test_missing_fonttable(self): + """fail when fonttable is missing""" + template_path = join(DATA_BASE_DIR, + "rtf_parsing", + "font_table_template.rtf") + + no_font_table = b"" + rtf = self.replace_text_in_template(template_path, + no_font_table, + rep_str=b"REPLACE_FONT_TABLE_HERE") + NA = b"REPLACE_FONT_TABLE_HERE" + default = b"""{\\fonttbl +{\\f0\\fswiss Arial;} +{\\f1\\fmodern Courier New;} +{\\f2\\fnil\\fcharset2 Symbol;} +{\\f3\\fmodern\\fcharset0 Courier New;}}""" + self.check_deencapsulate_validity(rtf, + expect_error=ValueError, + name="fonttable Missing") + # print_error=True) + def test_extracted_correct_from_header(self): """ - correctly extract the header type @@ -305,33 +347,118 @@ def test_extracted_correct_from_header(self): template_data = join(DATA_BASE_DIR, "rtf_parsing", "from_header_template.rtf") - rtf = self.replace_from_header(template_data, "\\fromhtml1") + rtf = self.replace_text_in_template(template_data, b"\\fromhtml1") output = DeEncapsulator(rtf) output.deencapsulate() self.assertEqual('html', output.get_content_type()) - rtf = self.replace_from_header(template_data, "\\fromtext") + rtf = self.replace_text_in_template(template_data, b"\\fromtext") output = DeEncapsulator(rtf) output.deencapsulate() self.assertEqual('text', output.get_content_type()) # Try with them back to back. First should win. 
-        rtf = self.replace_from_header(template_data, "\\fromhtml1\\fromtext")
+        rtf = self.replace_text_in_template(template_data, b"\\fromtext\\fromhtml1")
         self.check_deencapsulate_validity(rtf,
                                           expect_error=MalformedEncapsulatedRtf,
                                           name="multiple FROM headers means malformed")
-        rtf = self.replace_from_header(template_data, "\\fromhtml1\\fromtext")
+        rtf = self.replace_text_in_template(template_data, b"\\fromhtml1\\fromtext")
         self.check_deencapsulate_validity(rtf,
                                           expect_error=MalformedEncapsulatedRtf,
                                           name="multiple FROM headers means malformed")
+    def test_parse_tilde_control_chars(self):
+        """Correctly parse control chars
+        """
+        path = join(DATA_BASE_DIR,
+                    "rtf_parsing",
+                    "control_chars.rtf")
+        if path is not None:
+            with open(path, 'rb') as fp:
+                rtf = fp.read()
+        self.check_deencapsulate_validity(rtf,
+                                          expect_error=None,
+                                          name="Parse the \~ control char.")
-    def replace_from_header(self, path, replacement,
-                            rep_str="REPLACE_FROM_HEADER_HERE",
+    def test_parse_spaces_in_own_groups(self):
+        """Correctly parse spaces when in their own groups
+        """
+        path = join(DATA_BASE_DIR,
+                    "rtf_parsing",
+                    "five_spaces.rtf")
+        if path is not None:
+            with open(path, 'rb') as fp:
+                rtf = fp.read()
+        self.check_deencapsulate_validity(rtf,
+                                          expect_error=None,
+                                          name="Parse spaces in their own groups.")
+        output = DeEncapsulator(rtf)
+        output.deencapsulate()
+        # self.maxDiff = None
+        self.assertEqual(b'INSERT_BODY_TEXT_HERE ', output.text)
+
+    def test_parse_multiple_features(self):
+        """Correctly parse a file which combines multiple RTF features
+        """
+        path = join(DATA_BASE_DIR,
+                    "rtf_parsing",
+                    "encapsulated_example.rtf")
+        htmlpath = join(DATA_BASE_DIR,
+                        "rtf_parsing",
+                        "encapsulated_example.html")
+
+        if path is not None:
+            with open(path, 'rb') as fp:
+                rtf = fp.read()
+        if htmlpath is not None:
+            with open(htmlpath, 'rb') as fp:
+                html = fp.read()
+
+        self.check_deencapsulate_validity(rtf,
+                                          expect_error=None,
+                                          print_error=True,
+                                          name="Test parse multiple features.")
+        self.maxDiff = None
+        output = DeEncapsulator(rtf)
+        output.deencapsulate()
+        # print("RUNNING DIFF")
+        # print(repr(html))
+        # The CR+LF newlines should not match.
+        self.assertNotEqual(html, output.html)
+        # Once the CRLF newlines are normalized to LF, they should match.
+ html = html.replace(b'\r\n', b'\n') + self.assertEqual(html, output.html) + + + def debug_error(self, err): + print("FOUND ERROR") + # print(dir(err)) + # print(err) + # print(err.start) + # print(err.considered_rules) + # print(err.state) + # print(dir(err.state)) + print("Current stack of tokens") + # print(err.state.state_stack) + # print(err.state.value_stack) + for i in err.state.value_stack: + print(i) + # print(dir(i)) + # print(len(i)) + # print(type(i)) + # 'lexer', 'parse_conf', 'position', 'state_stack', 'value_stack' + print(err.token_history) + # print(err.interactive_parser.lexer_thread ) + + print("FOUND ERROR DONE") + return True + + def replace_text_in_template(self, path, replacement, + rep_str=b"REPLACE_FROM_HEADER_HERE", string=None): if path is not None: - with open(path, 'r') as fp: + with open(path, 'rb') as fp: raw_rtf = fp.read() else: raw_rtf = string @@ -340,12 +467,15 @@ def replace_from_header(self, path, replacement, def run_parsing(self, rtf): output = DeEncapsulator(rtf) - output.stripped_rtf = output._strip_htmlrtf_sections() - output.simplified_rtf = output._simplify_text_for_parsing() - output.doc_tree = output._parse_rtf() + escaped_rtf = encode_escaped_control_chars(output.raw_rtf) + output.parse_rtf(escaped_rtf) + output.get_doc_tree() return output - def check_deencapsulate_validity(self, data, expect_error=None, name="test"): + def check_deencapsulate_validity(self, data, + expect_error=None, + name="test", + print_error=False): """Helper to check if a test input raises or doesn't raise an error.""" found_error = None try: @@ -353,7 +483,12 @@ def check_deencapsulate_validity(self, data, expect_error=None, name="test"): output.deencapsulate() except Exception as _e: found_error = _e - + if print_error is True: + import traceback + traceback.print_exception(type(found_error), + found_error, + found_error.__traceback__) + # output.deencapsulate() if expect_error is not None: if found_error is None: self.fail("Expected {} but DeEncapsulator finished without error on {}.".format(expect_error, name)) diff --git a/tests/parse_rtf/test_validate_file.py b/tests/parse_rtf/test_validate_file.py index 1c3381c..917d993 100644 --- a/tests/parse_rtf/test_validate_file.py +++ b/tests/parse_rtf/test_validate_file.py @@ -8,20 +8,21 @@ import unittest from os.path import join -from RTFDE.exceptions import MalformedRtf +from RTFDE.exceptions import MalformedRtf, MalformedEncapsulatedRtf from RTFDE.deencapsulate import DeEncapsulator from tests.test_utils import DATA_BASE_DIR + class TestInputValidity(unittest.TestCase): """ Tests basic valid and invalid inputs.""" def test_valid_rtf_string(self): - """ Check that a valid encapsulated rtf string returns 0 exit status.""" + """ Check that opening an rtf file as a string returns a TypeError.""" quote_printable_rtf_path = join(DATA_BASE_DIR, "plain_text", "quoted_printable_01.rtf") with open(quote_printable_rtf_path, 'r') as fp: raw_rtf = fp.read() self.check_deencapsulate_validity(raw_rtf, - expect_error=None, + expect_error=TypeError, name="quoted_printable_01.rtf") def test_valid_rtf_bytes(self): @@ -35,7 +36,7 @@ def test_valid_rtf_bytes(self): def test_invalid_none(self): """ Check that passing nothing returns a non-zero exit status.""" - self.check_deencapsulate_validity("", + self.check_deencapsulate_validity(b"", expect_error=MalformedRtf, name="empty string") self.check_deencapsulate_validity(b"", @@ -62,6 +63,16 @@ def test_invalid_file_pointer(self): expect_error=TypeError, name="rtf file pointer") + def 
test_far_too_minimal_rtf_file(self): + raw_rtf = b"{\\rtf1}}" + self.check_deencapsulate_validity(raw_rtf, + expect_error=MalformedEncapsulatedRtf, + name="rtf with an extra brace") + raw_rtf = b"{\\rtf1}1" + self.check_deencapsulate_validity(raw_rtf, + expect_error=MalformedEncapsulatedRtf, + name="rtf with an extra 1") + def check_deencapsulate_validity(self, data, expect_error=None, name="test"): """Helper to check if a test input raises or doesn't raise an error.""" found_error = None diff --git a/tests/test_data/html/multiple-encodings.rtf b/tests/test_data/html/multiple-encodings.rtf index 4dfc8a0..e509f6b 100644 --- a/tests/test_data/html/multiple-encodings.rtf +++ b/tests/test_data/html/multiple-encodings.rtf @@ -2,15 +2,15 @@ {\f0\fswiss Arial;}} {\colortbl\red0\green0\blue0;} \uc1\pard\plain\deftab360 \f0\fs24 -{\*\htmltag96
}\htmlrtf {\htmlrtf0 {\*\htmltag64}\htmlrtf {\htmlrtf0\u36889 ?\u26159 ?\u19968 ?\u20491 ?\u25991 ?\u26412 ?\u23383 ?\u31526 ?\u20018 ? +{\*\htmltag96
}\htmlrtf {\htmlrtf0 {\*\htmltag64}\htmlrtf {\htmlrtf0\u36889 ?\u26159 ?\u19968 ?\u20491 ?\u25991 ?\u26412 ?\u65533 ?=AD\u65533 ?\u31526 ?\u20018 ? {\*\htmltag116
}\htmlrtf \line -\htmlrtf0 \u1494 ?\u1492 ?\u1493 ? \u32 ? \u1502 ?\u1495 ?\u1512 ?\u1493 ?\u1494 ?\u1514 ?\u32 ? \u1496 ?\u1511 ?\u1505 ?\u1496 ?. +\htmlrtf0 \u1494 ?\u1492 ?\u1493 ? \u1502 ?\u1495 ?\u1512 ?\u1493 ?\u65533 ?=96\u1514 ? \u1496 ?\u1511 ?\u1505 ?\u1496 ?. {\*\htmltag240
} {\*\htmltag96
}\htmlrtf {\htmlrtf0 {\*\htmltag64}\htmlrtf {\htmlrtf0 {\*\htmltag116
}\htmlrtf \line \htmlrtf0 {\*\htmltag72}\htmlrtf\par}\htmlrtf0 -{\*\htmltag104
}\htmlrtf }\htmlrtf0 +{\*\htmltag104
}\htmlrtf }\htmlrtf0 {\*\htmltag96
}\htmlrtf {\htmlrtf0 {\*\htmltag64}\htmlrtf {\htmlrtf0 {\*\htmltag96
}\htmlrtf {\htmlrtf0 {\*\htmltag64}\htmlrtf {\htmlrtf0 {\*\htmltag96
}\htmlrtf {\htmlrtf0 {\*\htmltag64}\htmlrtf {\htmlrtf0 @@ -34,6 +34,6 @@ {\*\htmltag104
}\htmlrtf }\htmlrtf0 {\*\htmltag72}\htmlrtf\par}\htmlrtf0 -{\*\htmltag104
}\htmlrtf }\htmlrtf0 +{\*\htmltag104
}\htmlrtf }\htmlrtf0 {\*\htmltag104
}
-{\*\htmltag0 \par }}
+{\*\htmltag0 \par }} \ No newline at end of file
diff --git a/tests/test_data/personal_rtf_test_files/README.md new file mode 100644 index 0000000..fb9b84e --- /dev/null +++ b/tests/test_data/personal_rtf_test_files/README.md @@ -0,0 +1,63 @@
+# Private Data Test Folder
+
+Use this folder to include files to test against a private folder full of RTF files exported from .msg files. The `TestPrivateMsgTestCases` unittest in `test_de_encapsulate.py` will run against this, or any other, folder full of raw encapsulated .rtf files.
+
+This test will first try to run against whatever path is set in the RTFDE_PRIVATE_MSG_FOLDER environment variable.
+
+If you do not set this environment variable, this test will look for .rtf files in the `tests/test_data/personal_rtf_test_files` folder. If it finds none there, it will exit without any error.
+
+```
+> export RTFDE_PRIVATE_MSG_FOLDER="/path/to/folder/with/messages/"
+> python3 -m unittest discover -v
+```
+
+If you want to run tests that check the contents of the original HTML file against the encapsulated version, you can use the RTFDE_PRIVATE_MSG_OUTPUT_FOLDER environment variable. If you do not set this environment variable, this test will look for .html files in the `tests/test_data/personal_rtf_test_output_files` folder. If it does not find a file with the same name as the .rtf file that it is testing, it will not attempt to compare it to anything. (`file.rtf` needs a corresponding `file.html`.)
+
+```
+ > export RTFDE_PRIVATE_MSG_OUTPUT_FOLDER="/path/to/folder/with/html/outputs/"
+ > python3 -m unittest discover -v --locals
+```
+
+It is important to use the --locals flag for unittest, since it will expose the filename where an error occurs alongside the unittest failure. Look for the variable named `FAILING_FILENAME`.
+
+This folder `tests/test_data/personal_rtf_test_files` has been included in the .gitignore to allow developers to safely use it for this purpose.
+
+
+# Populating the private test folder
+See the code in `scripts/prep_private_rtf_test_folder.sh` for guidance on how to populate a private test folder with RTF files extracted from a folder full of .msg files.
+
+Run this script to extract .rtf message bodies from .msg files. It can be run in several ways:
+
+1) extract the .rtf bodies from a folder of .msg files into another folder.
+```
+./prep_private_rtf_test_folder.sh \
+    -i /tmp/msg_files/ \
+    -o /tmp/extracted_rtf/
+```
+
+2) extract the .rtf body from a single .msg file into a folder
+```
+./prep_private_rtf_test_folder.sh \
+    -i /tmp/msg_files/email.msg \
+    -o /tmp/extracted_rtf/
+```
+
+3) extract the .rtf body from a single .msg file to a specific filename
+```
+./prep_private_rtf_test_folder.sh \
+    -i /tmp/msg_files/email.msg \
+    -o /tmp/extracted_rtf/extracted_msg.rtf
+```
+
+
+# Example test files to include
+
+```
+wget 'https://raw.githubusercontent.com/bbottema/rtf-to-html/9e4c42dbd7a8505d862aaf905739c5b6fc5e3be9/src/test/resources/test-messages/input/chinese-exotic-test.rtf'
+wget 'https://raw.githubusercontent.com/bbottema/rtf-to-html/9e4c42dbd7a8505d862aaf905739c5b6fc5e3be9/src/test/resources/test-messages/input/complex-test.rtf'
+wget 'https://raw.githubusercontent.com/bbottema/rtf-to-html/9e4c42dbd7a8505d862aaf905739c5b6fc5e3be9/src/test/resources/test-messages/input/hebrew-test.rtf'
+wget 'https://raw.githubusercontent.com/bbottema/rtf-to-html/9e4c42dbd7a8505d862aaf905739c5b6fc5e3be9/src/test/resources/test-messages/input/mixed-charsets-test.rtf'
+wget 'https://raw.githubusercontent.com/bbottema/rtf-to-html/9e4c42dbd7a8505d862aaf905739c5b6fc5e3be9/src/test/resources/test-messages/input/newlines-test.rtf'
+wget 'https://raw.githubusercontent.com/bbottema/rtf-to-html/9e4c42dbd7a8505d862aaf905739c5b6fc5e3be9/src/test/resources/test-messages/input/russian-test.rtf'
+wget 'https://github.com/bbottema/rtf-to-html/raw/9e4c42dbd7a8505d862aaf905739c5b6fc5e3be9/src/test/resources/test-messages/input/simple-test.rtf'
+```
diff --git a/tests/test_data/personal_rtf_test_output_files/README.md new file mode 100644 index 0000000..019c9e8 --- /dev/null +++ b/tests/test_data/personal_rtf_test_output_files/README.md @@ -0,0 +1,29 @@
+# Private Data Test HTML output Folder
+
+Use this folder to include HTML files which correspond to the RTF files in the private test folder. It is used in the `TestPrivateMsgTestCases` unittest in `test_de_encapsulate.py`.
+
+See the README.md in the `tests/test_data/personal_rtf_test_files` directory for more info on the primary .rtf files.
+
+If you want to run tests that check the contents of the original HTML file against the encapsulated version, you can use the RTFDE_PRIVATE_MSG_OUTPUT_FOLDER environment variable. If you do not set this environment variable, this test will look for .html files in the `tests/test_data/personal_rtf_test_output_files` folder. If it does not find a file with the same name as the .rtf file that it is testing, it will not attempt to compare it to anything. (`file.rtf` needs a corresponding `file.html`.)
+
+```
+ > export RTFDE_PRIVATE_MSG_OUTPUT_FOLDER="/path/to/folder/with/html/outputs/"
+ > python3 -m unittest discover -v --locals
+```
+
+It is important to use the --locals flag for unittest, since it will expose the filename where an error occurs alongside the unittest failure. Look for the variable named `FAILING_FILENAME`.
+
+This folder `tests/test_data/personal_rtf_test_output_files` has been included in the .gitignore to allow developers to safely use it for this purpose.
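+
+For reference, the comparison that these tests run boils down to the sketch below. It is an illustration, not the exact test code: the file names are placeholders, and the CRLF normalization mirrors what `test_parse_multiple_features` does with its reference HTML.
+
+```
+from RTFDE.deencapsulate import DeEncapsulator
+
+# RTFDE 0.1.0 only accepts bytes, so read both files in binary mode.
+with open('file.rtf', 'rb') as fp:
+    rtf = fp.read()
+with open('file.html', 'rb') as fp:
+    expected_html = fp.read()
+
+output = DeEncapsulator(rtf)
+output.deencapsulate()
+
+# Reference HTML is often saved with CRLF newlines while the
+# de-encapsulated HTML uses LF, so normalize before comparing.
+assert output.html == expected_html.replace(b'\r\n', b'\n')
+```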
+ + +# Example test files to include + +``` +wget 'https://raw.githubusercontent.com/bbottema/rtf-to-html/9e4c42dbd7a8505d862aaf905739c5b6fc5e3be9/src/test/resources/test-messages/output/rfcompliant/chinese-exotic-test.html' +wget 'https://raw.githubusercontent.com/bbottema/rtf-to-html/9e4c42dbd7a8505d862aaf905739c5b6fc5e3be9/src/test/resources/test-messages/output/rfcompliant/complex-test.html' +wget 'https://raw.githubusercontent.com/bbottema/rtf-to-html/9e4c42dbd7a8505d862aaf905739c5b6fc5e3be9/src/test/resources/test-messages/output/rfcompliant/hebrew-test.html' +wget 'https://raw.githubusercontent.com/bbottema/rtf-to-html/9e4c42dbd7a8505d862aaf905739c5b6fc5e3be9/src/test/resources/test-messages/output/rfcompliant/mixed-charsets-test.html' +wget 'https://raw.githubusercontent.com/bbottema/rtf-to-html/9e4c42dbd7a8505d862aaf905739c5b6fc5e3be9/src/test/resources/test-messages/output/rfcompliant/newlines-test.html' +wget 'https://raw.githubusercontent.com/bbottema/rtf-to-html/9e4c42dbd7a8505d862aaf905739c5b6fc5e3be9/src/test/resources/test-messages/output/rfcompliant/russian-test.html' +wget 'https://raw.githubusercontent.com/bbottema/rtf-to-html/9e4c42dbd7a8505d862aaf905739c5b6fc5e3be9/src/test/resources/test-messages/output/rfcompliant/simple-test.html' +``` diff --git a/tests/test_data/plain_text/test_data.rtf b/tests/test_data/plain_text/test_data.rtf new file mode 100644 index 0000000..1c40096 --- /dev/null +++ b/tests/test_data/plain_text/test_data.rtf @@ -0,0 +1,8 @@ +{\rtf1\ansi\ansicpg1252\fromtext \fbidis \deff0{\fonttbl +{\f0\fswiss Arial;} +{\f1\fmodern Courier New;} +{\f2\fnil\fcharset2 Symbol;} +{\f3\fmodern\fcharset0 Courier New;}} +{\colortbl\red0\green0\blue0;\red0\green0\blue255;} +\uc1\pard\plain\deftab360 \f0\fs20 body\par +} \ No newline at end of file diff --git a/tests/test_data/rtf_parsing/control_chars.rtf b/tests/test_data/rtf_parsing/control_chars.rtf new file mode 100644 index 0000000..5c454cf --- /dev/null +++ b/tests/test_data/rtf_parsing/control_chars.rtf @@ -0,0 +1,185 @@ +{\rtf1\ansi\fbidis\ansicpg1252\deff0\fromhtml1{\fonttbl{\f0\fswiss\fcharset0 Times New Roman;}{\f1\fswiss\fcharset2 Symbol;}{\f2 +\fswiss\fcharset0 Calibri;}{\f3\fswiss\fcharset0 Verdana;}} +{\colortbl;\red192\green192\blue192;\red255\green255\blue255;\red0\green0\blue1;\red43\green59\blue70;\red0\green0\blue255 +;\red5\green99\blue193;\red167\green207\blue56;\red42\green59\blue69;\red60\green60\blue59;} +{\*\generator Microsoft Exchange Server;} +{\*\formatConverter converted from html;} +\viewkind5\viewscale100 +\htmlrtf{\*\bkmkstart BM_BEGIN}\htmlrtf0{\*\htmltag64}{\*\htmltag0
}{\*\htmltag0

}\pard +\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0 Hi {\*\htmltag0

}\htmlrtf}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0\~{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0 Lorem ipsum dolor sit amet, consectetur adipiscing elit. (\'A30k) Vestibulum malesuada cursus est, eget sollicitudin lorem.{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0\~{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0 Duis convallis dui a scelerisque aliquet. Quisque accumsan auctor convallis. Morbi faucibus nibh tellus, non tincidunt lorem elementum sed. Vestibulum ipsum magna, aliquet a enim sagittis, finibus elementum dolor.{\*\htmltag0
}\htmlrtf +}{\f2\lang2057\fs22\line +\htmlrtf0{\*\htmltag0\par}{\*\htmltag0
}\htmlrtf}{\f2\lang2057\fs22\line +\htmlrtf0{\*\htmltag0\par}Etiam maximus blandit nibh nec porttitor.{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0\~{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0 A brier quality data.{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0\~{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0 Sed a porttitor leo 1{\*\htmltag0 }\htmlrtf +}{\f2\lang2057\super\htmlrtf0 st{\*\htmltag0 }\htmlrtf}{\f2\lang2057\fs22\htmlrtf0 Curabitur laoreet massa ut tincidunt.{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0\~{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0 Vivamus tempor turpis ante?{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0\~{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0 Thanks{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0\~{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0 {\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0\~{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0\~{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0\~{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }\htmlrtf\par +\htmlrtf0{\*\htmltag0

}{\*\htmltag0 }\pard\plain\htmlrtf{\f2\lang2057 +\fs22\htmlrtf0\~{\*\htmltag0 }\htmlrtf}\htmlrtf0{\*\htmltag0

}\htmlrtf\par +\htmlrtf0{\*\htmltag0 }{\*\htmltag0 }\htmlrtf\pard\intbl\cbpat2\cell\htmlrtf0{\*\htmltag0 }\htmlrtf\pard +\intbl\row +\htmlrtf0{\*\htmltag0 }\htmlrtf\pard\intbl\cbpat8\cell\htmlrtf0{\*\htmltag0 }\htmlrtf\pard +\intbl\row +\htmlrtf0{\*\htmltag0 }\htmlrtf\pard\intbl\cbpat2\cell\htmlrtf0{\*\htmltag0 }\htmlrtf\pard +\intbl\row +\htmlrtf0{\*\htmltag0
}{\*\htmltag0 }\htmlrtf +\trowd\irow0\irowband0\trcbpat2\clpadl0\clpadfl3\clpadr0\clpadfr3\clpadb0\clpadfb3\clpadt0\clpadft3\clcbpat2\cellx8856\htmlrtf0 +{\*\htmltag0 }{\*\htmltag0 }\htmlrtf +\nestcell{\nonesttables\tab}\htmlrtf0{\*\htmltag0 }\htmlrtf\pard\intbl\itap2{\*\nesttableprops\trowd\irow0\irowband0\trcbpat2 +\trrh312\clpadl0\clpadfl3\clpadr0\clpadfr3\clpadb0\clpadfb3\clpadt0\clpadft3\clcbpat2\cellx8856\nestrow}{\nonesttables\par} +\htmlrtf0{\*\htmltag0 }\htmlrtf +\nestcell{\nonesttables\tab}\htmlrtf0{\*\htmltag0 }\htmlrtf\pard\intbl\itap2{\*\nesttableprops\trowd\irow1\irowband1\trcbpat2 +\clpadl0\clpadfl3\clpadr0\clpadfr3\clpadb0\clpadfb3\clpadt0\clpadft3\clcbpat2\cellx8856\nestrow}{\nonesttables\par} +\htmlrtf0{\*\htmltag0 }\htmlrtf +\nestcell{\nonesttables\tab}\htmlrtf0{\*\htmltag0 }\htmlrtf\pard\intbl\itap2{\*\nesttableprops\trowd\irow2\irowband2\trcbpat2 +\clpadl0\clpadfl3\clpadr0\clpadfr3\clpadb0\clpadfb3\clpadt0\clpadft3\clcbpat2\cellx8856\nestrow}{\nonesttables\par} +\htmlrtf0{\*\htmltag0 }\htmlrtf\nestcell{\nonesttables\tab}\htmlrtf0 +{\*\htmltag0 }\htmlrtf\pard\intbl\itap2{\*\nesttableprops\trowd\irow3\irowband3\trcbpat2\trrh120\clpadl0\clpadfl3\clpadr0 +\clpadfr3\clpadb0\clpadfb3\clpadt0\clpadft3\clcbpat2\cellx8856\nestrow}{\nonesttables\par} +\htmlrtf0{\*\htmltag0 }\htmlrtf\nestcell{\nonesttables\tab}\htmlrtf0{\*\htmltag0 }\htmlrtf\pard\intbl +\itap2{\*\nesttableprops\trowd\irow4\irowband4\trcbpat2\clpadl0\clpadfl3\clpadr0\clpadfr3\clpadb0\clpadfb3\clpadt0\clpadft3 +\clcbpat2\cellx8856\nestrow}{\nonesttables\par} +\htmlrtf0{\*\htmltag0 }\htmlrtf +\nestcell{\nonesttables\tab}\htmlrtf0{\*\htmltag0 }\htmlrtf\pard\intbl\itap2{\*\nesttableprops\trowd\irow5\irowband5\trcbpat2 +\clpadl0\clpadfl3\clpadr0\clpadfr3\clpadb0\clpadfb3\clpadt0\clpadft3\clcbpat2\cellx8856\nestrow}{\nonesttables\par} +\htmlrtf0{\*\htmltag0 }\htmlrtf +\nestcell{\nonesttables\tab}\htmlrtf0{\*\htmltag0 }\htmlrtf\pard\intbl\itap2{\*\nesttableprops\trowd\irow6\irowband6\trcbpat2 +\clpadl0\clpadfl3\clpadr0\clpadfr3\clpadb0\clpadfb3\clpadt0\clpadft3\clcbpat2\cellx8856\nestrow}{\nonesttables\par} +\htmlrtf0{\*\htmltag0 }\htmlrtf +\nestcell{\nonesttables\tab}\htmlrtf0{\*\htmltag0 }\htmlrtf\pard\intbl\itap2{\*\nesttableprops\trowd\irow7\irowband7\trcbpat2 +\trrh168\clpadl0\clpadfl3\clpadr0\clpadfr3\clpadb0\clpadfb3\clpadt0\clpadft3\clcbpat2\cellx8856\nestrow}{\nonesttables\par} +\htmlrtf0{\*\htmltag0
}{\*\htmltag0

}{\*\htmltag0 }{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain{\*\htmltag0 }\htmlrtf\objattph \htmlrtf0{\*\htmltag0

}\htmlrtf +\par +\htmlrtf0{\*\htmltag0

}{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain\htmlrtf{\f2\lang2057\cf3\fs18\htmlrtf0\~{\*\htmltag0 }\htmlrtf}\htmlrtf0{\*\htmltag0

}\htmlrtf +\par +\htmlrtf0{\*\htmltag0

}\pard +\intbl\itap2\cbpat2\plain\htmlrtf{\f3\lang2057\cf3\fs18\htmlrtf0 Kind Regards{\*\htmltag0 }\htmlrtf}\htmlrtf0{\*\htmltag0

}\htmlrtf +\par +\htmlrtf0{\*\htmltag0

}{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain\htmlrtf{\f2\lang2057\cf3\fs20\htmlrtf0\~{\*\htmltag0 }\htmlrtf}\htmlrtf0{\*\htmltag0

}{\*\htmltag0
}{\*\htmltag0

}{\*\htmltag0 }{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain\htmlrtf{\f3\lang2057\cf4\fs18\b\htmlrtf0 C{\*\htmltag0 }\htmlrtf}\htmlrtf0{\*\htmltag0

}{\*\htmltag0
}{\*\htmltag0

}{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain\htmlrtf{\f3\lang2057\cf4\fs18\htmlrtf0 Viles & Support {\*\htmltag0 }\htmlrtf +}\htmlrtf0{\*\htmltag0

}{\*\htmltag0
}{\*\htmltag0

}{\*\htmltag0 }{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain{\*\htmltag0 }\htmlrtf\objattph \htmlrtf0{\*\htmltag0

}{\*\htmltag0
}{\*\htmltag0

}{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain\htmlrtf{\f3\lang2057\cf3\fs18\htmlrtf0 +44 (0)7788 365 955{\*\htmltag0 }\htmlrtf +}\htmlrtf0{\*\htmltag0

}{\*\htmltag0
}{\*\htmltag0

}{\*\htmltag0 }{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain\htmlrtf{\field{\*\fldinst HYPERLINK "mailto:ceme@e.com" }{\fldrslt{\f3\lang2057\cf6\fs18 +\ul\htmlrtf0 cemery@senceive.com{\*\htmltag0 }\htmlrtf}\htmlrtf0{\*\htmltag0 }\htmlrtf}}\htmlrtf0{\*\htmltag0

}{\*\htmltag0
}{\*\htmltag0

}{\*\htmltag0 }{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain{\*\htmltag0 }\htmlrtf{\field{\*\fldinst HYPERLINK "https://www.linkedin.com/in//" }{\fldrslt +{\f3\lang2057\cf7\fs18\b\ul\htmlrtf0 Connect\~{\*\htmltag0 }\htmlrtf}\htmlrtf0{\*\htmltag0 }\htmlrtf}}{\f3\lang2057 +\cf4\fs18\htmlrtf0 with me on LinkedIn{\*\htmltag0 }\htmlrtf}\htmlrtf0{\*\htmltag0

}{\*\htmltag0
}{\*\htmltag0

}{\*\htmltag0 }{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain{\*\htmltag0 }\htmlrtf\objattph \htmlrtf0{\*\htmltag0

}{\*\htmltag0
}{\*\htmltag0
}{\*\htmltag0 }\htmlrtf +\trowd\irow1\irowband1\trcbpat2\clcbpat8\clpadl0\clpadfl3\clpadr0\clpadfr3\clpadb0\clpadfb3\clpadt0\clpadft3\cellx8856\htmlrtf0 +{\*\htmltag0 }{\*\htmltag0 }\htmlrtf +\nestcell{\nonesttables\tab}\htmlrtf0{\*\htmltag0 }\htmlrtf\pard\intbl\itap2{\*\nesttableprops\trowd\irow0\irowband0\trcbpat8 +\clpadl0\clpadfl3\clpadr0\clpadfr3\clpadb0\clpadfb3\clpadt0\clpadft3\clcbpat8\cellx8856\nestrow}{\nonesttables\par} +\htmlrtf0{\*\htmltag0
}{\*\htmltag0

}{\*\htmltag0 }{\*\htmltag0 }\pard +\intbl\itap2\cbpat8\plain{\*\htmltag0 }\htmlrtf +{\field{\*\fldinst HYPERLINK "https://www.sss.com/" }{\fldrslt\htmlrtf0{\*\htmltag0 }\htmlrtf\objattph \htmlrtf0{\*\htmltag0 }\htmlrtf +}}\htmlrtf0{\*\htmltag0

}{\*\htmltag0
}{\*\htmltag0
}{\*\htmltag0 }\htmlrtf +\trowd\irow2\irowband2\trcbpat2\clpadl0\clpadfl3\clpadr0\clpadfr3\clpadb0\clpadfb3\clpadt0\clpadft3\clcbpat2\cellx8856\htmlrtf0 +{\*\htmltag0 }{\*\htmltag0 }\htmlrtf +\nestcell{\nonesttables\tab}\htmlrtf0{\*\htmltag0 }\htmlrtf\pard\intbl\itap2{\*\nesttableprops\trowd\irow0\irowband0\trcbpat2 +\trrh240\clpadl0\clpadfl3\clpadr0\clpadfr3\clpadb0\clpadfb3\clpadt0\clpadft3\clcbpat2\cellx8856\nestrow}{\nonesttables\par} +\htmlrtf0{\*\htmltag0 }\htmlrtf +\nestcell{\nonesttables\tab}\htmlrtf0{\*\htmltag0 }\htmlrtf\pard\intbl\itap2{\*\nesttableprops\trowd\irow1\irowband1\trcbpat2 +\clpadl0\clpadfl3\clpadr0\clpadfr3\clpadb0\clpadfb3\clpadt0\clpadft3\clcbpat2\cellx8856\nestrow}{\nonesttables\par} +\htmlrtf0{\*\htmltag0 }\htmlrtf +\nestcell{\nonesttables\tab}\htmlrtf0{\*\htmltag0 }\htmlrtf\pard\intbl\itap2{\*\nesttableprops\trowd\irow2\irowband2\trcbpat2 +\trrh240\clpadl0\clpadfl3\clpadr0\clpadfr3\clpadb0\clpadfb3\clpadt0\clpadft3\clcbpat2\cellx8856\nestrow}{\nonesttables\par} +\htmlrtf0{\*\htmltag0
}{\*\htmltag0

}{\*\htmltag0 }{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain{\*\htmltag0 }\htmlrtf\objattph \htmlrtf0{\*\htmltag0

}{\*\htmltag0
}{\*\htmltag0

}{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain\htmlrtf{\f3\lang2057\cf9\fs15\htmlrtf0 7b/7c Imperial Studios, Imperial Road, Fulham, London SW6 2AG{\*\htmltag0
}\htmlrtf +}{\f3\lang2057\cf9\fs15\line +\htmlrtf0{\*\htmltag0\par}United Kingdom Company no. 05608752 (England) VAT no. GB 894344685{\*\htmltag0
}\htmlrtf +}\htmlrtf0{\*\htmltag0

}\htmlrtf\par +\htmlrtf0{\*\htmltag0

}{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain\htmlrtf{\f3\lang2057\cf9\fs15\htmlrtf0\~{\*\htmltag0 }\htmlrtf}\htmlrtf0{\*\htmltag0

}\htmlrtf +\par +\htmlrtf0{\*\htmltag0

}{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain\htmlrtf{\f3\lang2057\cf9\fs15\htmlrtf0\~{\*\htmltag0 }\htmlrtf}\htmlrtf0{\*\htmltag0

}\htmlrtf +\par +\htmlrtf0{\*\htmltag0

}{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain\htmlrtf{\f3\lang2057\cf9\fs15\htmlrtf0\~{\*\htmltag0 }\htmlrtf}\htmlrtf0{\*\htmltag0

}\htmlrtf +\par +\htmlrtf0{\*\htmltag0

}{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain\htmlrtf{\f3\lang2057\cf9\fs15\htmlrtf0\~{\*\htmltag0 }\htmlrtf}\htmlrtf0{\*\htmltag0

}\htmlrtf +\par +\htmlrtf0{\*\htmltag0

}{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain\htmlrtf{\f3\lang2057\cf9\fs15\htmlrtf0\~{\*\htmltag0 }\htmlrtf}\htmlrtf0{\*\htmltag0

}{\*\htmltag0
}{\*\htmltag0

}{\*\htmltag0 }{\*\htmltag0 }\pard +\intbl\itap2\cbpat2\plain{\*\htmltag0 }\htmlrtf\objattph \htmlrtf0{\*\htmltag0

}{\*\htmltag0
}{\*\htmltag0
}{\*\htmltag0

}{\*\htmltag0 }\pard +\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0\~{\*\htmltag0 }\htmlrtf}\htmlrtf0{\*\htmltag0

}\htmlrtf\par +\htmlrtf0{\*\htmltag0

}\pard\plain\htmlrtf{\f2\lang2057\fs22\htmlrtf0\~{\*\htmltag0

}\htmlrtf +}\htmlrtf0{\*\htmltag0 }{\*\htmltag0
}{\*\htmltag0 }{\*\htmltag0 }} diff --git a/tests/test_data/rtf_parsing/encapsulated_example.html b/tests/test_data/rtf_parsing/encapsulated_example.html new file mode 100644 index 0000000..4977ac4 --- /dev/null +++ b/tests/test_data/rtf_parsing/encapsulated_example.html @@ -0,0 +1,23 @@ + + + + + +

Note the line break inside a P tag. This is bold text

+

+This is a normal text with a character references:  < ¨
+characters that have special meaning in RTF: {}\
+

+
    +
  1. This is a list item +
+ + diff --git a/tests/test_data/rtf_parsing/encapsulated_example.rtf b/tests/test_data/rtf_parsing/encapsulated_example.rtf new file mode 100644 index 0000000..8aca315 --- /dev/null +++ b/tests/test_data/rtf_parsing/encapsulated_example.rtf @@ -0,0 +1,38 @@ +{\rtf1\ansi\ansicpg1251\fromhtml1 \deff0 +{\fonttbl {\f0\fmodern Courier New;}{\f1\fswiss Arial;}{\f2\fswiss\fcharset0 Arial;}} +{\colortbl\red0\green0\blue0;\red0\green0\blue255;} +{\*\htmltag64} +\uc1\pard\plain\deftab360 \f0\fs24 +{\*\htmltag \par +\par +\tab \par +\par +\par +} +{\htmlrtf \f1 \htmlrtf0 Note the line break inside a P tag. {\*\htmltag }{\htmlrtf \b +\htmlrtf0 This is bold text{\*\htmltag }} \htmlrtf\par\htmlrtf0} +\htmlrtf \par \htmlrtf0 +{\*\htmltag

\par +

\par} +{\htmlrtf \f1 \htmlrtf0 This is a normal text with a character references: +{\*\htmltag  }\htmlrtf \'a0\htmlrtf0 {\*\htmltag <}\htmlrtf <\htmlrtf0 {\*\htmltag +¨}\htmlrtf {\f2\'a8}\htmlrtf0{\*\htmltag
\par}\htmlrtf\line\htmlrtf0 +characters that have special meaning in RTF: \{\}\\{\*\htmltag +
\par}\htmlrtf\line\htmlrtf0\htmlrtf\par\htmlrtf0} +{\*\htmltag

\par +
    \par +
  1. }{\htmlrtf +{{\*\pn\pnlvlbody\pndec\pnstart1\pnindent360{\pntxta.}}\li360\fi-360{\pntext 1.\tab} \f1 +\htmlrtf0 This is a list item}\htmlrtf\par\htmlrtf0} +{\*\htmltag \par +
\par +\par +\par }} diff --git a/tests/test_data/rtf_parsing/five_spaces.rtf b/tests/test_data/rtf_parsing/five_spaces.rtf new file mode 100644 index 0000000..37ee241 --- /dev/null +++ b/tests/test_data/rtf_parsing/five_spaces.rtf @@ -0,0 +1,11 @@ +{\rtf1\ansi\ansicpg1252\fromtext \fbidis \deff0{\fonttbl +{\f0\fswiss Arial;} +{\f1\fmodern Courier New;} +{\f2\fnil\fcharset2 Symbol;} +{\f3\fmodern\fcharset0 Courier New;}} +{\colortbl\red0\green0\blue0;\red5\green99\blue193;} +\uc1\pard\plain\deftab360 \f0\fs24\lang1033 +{ +INSERT_BODY_TEXT_HERE { {\*\customDest You really shouldn't be seeting this in the rendered file} {\*\customDest Hi there} } +} +} \ No newline at end of file diff --git a/tests/test_data/rtf_parsing/font_table_template.rtf b/tests/test_data/rtf_parsing/font_table_template.rtf new file mode 100644 index 0000000..1073e77 --- /dev/null +++ b/tests/test_data/rtf_parsing/font_table_template.rtf @@ -0,0 +1,27 @@ +{\rtf1\ansi\ansicpg1252\\fromhtml1 \\deff0 \fbidis \deff0REPLACE_FONT_TABLE_HERE +{\colortbl\red0\green0\blue0;\red5\green99\blue193;} +\uc1\pard\plain\deftab360 \f0\fs24 +{\*\htmltag19 } +{\*\htmltag34 } +{\*\htmltag161 } +{\*\htmltag241 } +{\*\htmltag241 } +{\*\htmltag241 } +{\*\htmltag241 } +{\*\htmltag41 } +{\*\htmltag50 }\htmlrtf \lang1033 \htmlrtf0 +{\*\htmltag96
}\htmlrtf {\htmlrtf0 +{\*\htmltag64

}INSERT_BODY_TEXT_HERE +{\*\htmltag72

} +{\*\htmltag64

}\htmlrtf {\htmlrtf0 +{\*\htmltag84 A picture containing text, primate, mammal, sitting

Description automatically generated} +{\*\htmltag244 } +{\*\htmltag252 }\htmlrtf\par}\htmlrtf0 +\htmlrtf \par +\htmlrtf0 +{\*\htmltag72

} +{\*\htmltag104
}\htmlrtf }\htmlrtf0 +{\*\htmltag58 } +{\*\htmltag27 }} \ No newline at end of file diff --git a/tests/test_data/rtf_parsing/small_template.rtf b/tests/test_data/rtf_parsing/small_template.rtf new file mode 100644 index 0000000..81c7862 --- /dev/null +++ b/tests/test_data/rtf_parsing/small_template.rtf @@ -0,0 +1,7 @@ +{\rtf1\ansi\ansicpg1252\deff0\fromhtml1\nouicompat\deflang1033{\fonttbl{\f0\fnil\fcharset0 Calibri;}} +{\*\generator Riched20 10.0.16299}\viewkind4\uc1 +\pard\sa200\sl276\slmult1\f0\fs22\lang9 + +REPLACE_ME + +{}} diff --git a/tests/test_data/rtf_parsing/surrogate_pairs.rtf b/tests/test_data/rtf_parsing/surrogate_pairs.rtf new file mode 100644 index 0000000..815146b --- /dev/null +++ b/tests/test_data/rtf_parsing/surrogate_pairs.rtf @@ -0,0 +1,11 @@ +{\rtf1\ansi\ansicpg1252\deff0\fromhtml1\nouicompat\deflang1033{\fonttbl{\f0\fnil\fcharset0 Calibri;}} +{\*\generator Riched20 10.0.16299}\viewkind4\uc1 +\pard\sa200\sl276\slmult1\f0\fs22\lang9 + +{\uc1\u482??}\line +{\uc2\u482??}\line +{\uc1\u-10179 ?\u-8694 ?}\line +{\uc1\u-10179 ??\u-8694 ??}\line +{\uc2\u-10179 ??\u-8694 ??}\line + +{}} diff --git a/tests/test_data/rtf_parsing/surrogate_pairs_02.rtf b/tests/test_data/rtf_parsing/surrogate_pairs_02.rtf new file mode 100644 index 0000000..24d3dfa --- /dev/null +++ b/tests/test_data/rtf_parsing/surrogate_pairs_02.rtf @@ -0,0 +1,10 @@ +{\rtf1\ansi\ansicpg1252\fromhtml1 \fbidis \deff0{\fonttbl +{\f0\fswiss\fcharset0 Arial;}{\f1\fmodern Courier New;} +{\f2\fnil\fcharset2 Symbol;} +{\f3\fmodern\fcharset0 Courier New;} +{\f4\fswiss\fcharset0 "Segoe UI Emoji";} +{\f5\fswiss\fcharset0 "Century Gothic";}} +{\colortbl\red0\green0\blue0;\red5\green99\blue193;} +\uc1\pard\plain +{\*\htmltag84 😊}\htmlrtf \u-10179 ?\u-8694 ?\htmlrtf0 +} \ No newline at end of file diff --git a/tests/test_data/rtf_parsing/surrogate_pairs_03.rtf b/tests/test_data/rtf_parsing/surrogate_pairs_03.rtf new file mode 100644 index 0000000..c3c6182 --- /dev/null +++ b/tests/test_data/rtf_parsing/surrogate_pairs_03.rtf @@ -0,0 +1,10 @@ +{\rtf1\ansi\ansicpg1252\fromhtml1 \fbidis \deff0{\fonttbl +{\f0\fswiss\fcharset0 Arial;}{\f1\fmodern Courier New;} +{\f2\fnil\fcharset2 Symbol;} +{\f3\fmodern\fcharset0 Courier New;} +{\f4\fswiss\fcharset0 "Segoe UI Emoji";} +{\f5\fswiss\fcharset0 "Century Gothic";}} +{\colortbl\red0\green0\blue0;\red5\green99\blue193;} +\uc1\pard\plain +{\*\htmltag84 😊}\htmlrtf \u55357 ?\u56542 ?\htmlrtf0 +} \ No newline at end of file diff --git a/tests/test_data/rtf_parsing/surrogate_pairs_04.rtf b/tests/test_data/rtf_parsing/surrogate_pairs_04.rtf new file mode 100644 index 0000000..2b03a07 --- /dev/null +++ b/tests/test_data/rtf_parsing/surrogate_pairs_04.rtf @@ -0,0 +1,10 @@ +{\rtf1\ansi\ansicpg1252\fromhtml1 \fbidis \deff0{\fonttbl +{\f0\fswiss\fcharset0 Arial;}{\f1\fmodern Courier New;} +{\f2\fnil\fcharset2 Symbol;} +{\f3\fmodern\fcharset0 Courier New;} +{\f4\fswiss\fcharset0 "Segoe UI Emoji";} +{\f5\fswiss\fcharset0 "Century Gothic";}} +{\colortbl\red0\green0\blue0;\red5\green99\blue193;} +\uc1\pard\plain +{\*\htmltag84 😊} \u55357 ?\u56542 ? 
+} \ No newline at end of file diff --git a/tests/test_data/rtf_parsing/surrogates.rtf b/tests/test_data/rtf_parsing/surrogates.rtf new file mode 100644 index 0000000..dea78b7 --- /dev/null +++ b/tests/test_data/rtf_parsing/surrogates.rtf @@ -0,0 +1,31 @@ +{\rtf1\ansi\ansicpg1252\fromhtml1 \fbidis \deff0{\fonttbl +{\f0\fswiss Arial;} +{\f1\fmodern Courier New;} +{\f2\fnil\fcharset2 Symbol;} +{\f3\fmodern\fcharset0 Courier New;} +{\f4\fswiss\fcharset0 "Segoe UI Emoji";}} +{\colortbl\red0\green0\blue0;\red0\green0\blue255;} +\uc1\pard\plain\deftab360 \f0\fs24 +{\*\htmltag2 \par } +{\*\htmltag18 } +{\*\htmltag34 } +{\*\htmltag161 } +{\*\htmltag241 } +{\*\htmltag241 } +{\*\htmltag241 } +{\*\htmltag41 } +{\*\htmltag50 }\htmlrtf \lang1033 \htmlrtf0 +{\*\htmltag96
}\htmlrtf {\htmlrtf0 +{\*\htmltag64

}\htmlrtf {\htmlrtf0 +{\*\htmltag148 }\htmlrtf {\f4 \htmlrtf0 \u-10179 ?\u-8694 ? +{\*\htmltag156 }\htmlrtf }\htmlrtf0 +{\*\htmltag244 } +{\*\htmltag252 }\htmlrtf\par}\htmlrtf0 +\htmlrtf \par +\htmlrtf0 +{\*\htmltag72

} +{\*\htmltag104
}\htmlrtf }\htmlrtf0 +{\*\htmltag58 } +{\*\htmltag27 }} \ No newline at end of file diff --git a/tests/test_data/rtf_parsing/theta.rtf b/tests/test_data/rtf_parsing/theta.rtf new file mode 100644 index 0000000..17a658e --- /dev/null +++ b/tests/test_data/rtf_parsing/theta.rtf @@ -0,0 +1 @@ +{\rtf1\ansi\ansicpg1252\fromtext \fbidis \deff0{\fonttbl {\f0\fswiss\fcharset0 Arial;} {\f1\fmodern Courier New;} {\f2\fnil\fcharset2 Symbol;} {\f3\fmodern\fcharset0 Courier New;} {\f4\fswiss\fcharset204 Arial;}} {\colortbl\red0\green0\blue0;\red0\green0\blue255;} \uc1\pard\plain\deftab360 \f0\fs20 \htmlrtf{\f4\fs20\htmlrtf0 \'f4\htmlrtf\f0}\htmlrtf0 \par } diff --git a/tests/test_data/rtf_parsing/translated_by.rtf b/tests/test_data/rtf_parsing/translated_by.rtf new file mode 100644 index 0000000..46c10f2 --- /dev/null +++ b/tests/test_data/rtf_parsing/translated_by.rtf @@ -0,0 +1,5 @@ +{\rtf1\fromtext\fbidis\ansi\deff0{\fonttbl{\f0\fnil\fcharset178 MS Sans Serif;}{\f1\fnil\fcharset0 MS Sans Serif;}} + +\viewkind4\uc1\pard\ltrpar\lang12289\f0\rtlch\fs16\'ca\'d1\'cc\'e3\'c9: \'d3\'e3\'ed\'d1 \'c7\'e1\'e3\'cc\'d0\'e6\'c8\f1\ltrch\par + +} \ No newline at end of file diff --git a/tests/test_data/rtf_parsing/unicode_HH_replacement.rtf b/tests/test_data/rtf_parsing/unicode_HH_replacement.rtf new file mode 100644 index 0000000..fb7286d --- /dev/null +++ b/tests/test_data/rtf_parsing/unicode_HH_replacement.rtf @@ -0,0 +1,10 @@ +{\rtf1\ansi\ansicpg1252\fromhtml1 \fbidis \deff0{\fonttbl +{\f0\fswiss\fcharset0 Arial;}{\f1\fmodern Courier New;} +{\f2\fnil\fcharset2 Symbol;} +{\f3\fmodern\fcharset0 Courier New;} +{\f4\fswiss\fcharset0 "Segoe UI Emoji";} +{\f5\fswiss\fcharset0 "Century Gothic";}} +{\colortbl\red0\green0\blue0;\red5\green99\blue193;} +\uc1\pard\plain +{\*\htmltag84 😊}\htmlrtf \u55357\'20\u56542 ?\htmlrtf0 +} \ No newline at end of file diff --git a/tests/test_data/rtf_parsing/unicode_HH_replacement_01.rtf b/tests/test_data/rtf_parsing/unicode_HH_replacement_01.rtf new file mode 100644 index 0000000..df057d3 --- /dev/null +++ b/tests/test_data/rtf_parsing/unicode_HH_replacement_01.rtf @@ -0,0 +1,10 @@ +{\rtf1\ansi\ansicpg1252\fromhtml1 \fbidis \deff0{\fonttbl +{\f0\fswiss\fcharset0 Arial;}{\f1\fmodern Courier New;} +{\f2\fnil\fcharset2 Symbol;} +{\f3\fmodern\fcharset0 Courier New;} +{\f4\fswiss\fcharset0 "Segoe UI Emoji";} +{\f5\fswiss\fcharset0 "Century Gothic";}} +{\colortbl\red0\green0\blue0;\red5\green99\blue193;} +\uc1\pard\plain +{\*\htmltag84 😊} \u55357\'20\u56542 ? 
+} \ No newline at end of file
diff --git a/tests/test_data/rtf_parsing/windows_950.rtf new file mode 100644 index 0000000..9d9405c --- /dev/null +++ b/tests/test_data/rtf_parsing/windows_950.rtf @@ -0,0 +1,14 @@
+{\rtf1\ansi\ansicpg1252\fromhtml1 \fbidis \deff0{\fonttbl
+{\f0\fswiss\fcharset0 Arial;}
+{\f1\fmodern Courier New;}
+{\f2\fnil\fcharset2 Symbol;}
+{\f3\fmodern\fcharset0 Courier New;}
+{\f4\fnil\fcharset2 Symbol;}
+{\f5\fswiss\fcharset136 New MingLiu;}
+{\f6\fswiss\fcharset0 "Century Gothic";}
+{\f7\fswiss\fcharset0 "Arial";}
+{\f8\fswiss "Calibri Light";}
+{\f9\fswiss\fcharset0 "Courier New";}}
+{\colortbl\red0\green0\blue0;\red5\green99\blue193;}
+\uc1\pard\plain\deftab360 \f5\fs24
+Hello \'84\'68} \ No newline at end of file
diff --git a/tests/test_utils/test_main_utils.py new file mode 100644 index 0000000..1e30150 --- /dev/null +++ b/tests/test_utils/test_main_utils.py @@ -0,0 +1,172 @@
+"""Test Utilities
+"""
+
+import unittest
+import logging
+from contextlib import contextmanager
+from io import StringIO
+
+from RTFDE.deencapsulate import DeEncapsulator
+from RTFDE.utils import log_validators, log_transformations
+from RTFDE.utils import encode_escaped_control_chars
+from tests.test_utils import DATA_BASE_DIR
+from RTFDE.utils import get_tree_diff, log_string_diff
+from os.path import join
+
+@contextmanager
+def capture_log(logger):
+    for i in logger.handlers:
+        logger.removeHandler(i)
+    stream = StringIO()
+    strm_handler = logging.StreamHandler(stream)
+    f = '%(name)s - %(levelname)s - %(message)s'
+    formatter = logging.Formatter(f)
+    strm_handler.setFormatter(formatter)
+    logger.addHandler(strm_handler)
+    logger.propagate = False
+    yield stream
+
+def get_logger_defaults(logger):
+    defaults = {}
+    defaults.setdefault('level', logger.level)
+    defaults.setdefault('handlers', logger.handlers)
+    defaults.setdefault('propagate', logger.propagate)
+    return defaults
+
+def set_logger_defaults(logger, defaults):
+    # Restore the state captured by get_logger_defaults; do not reset it here.
+    logger.level = defaults.get('level', logging.NOTSET)
+    logger.handlers = defaults.get('handlers', [])
+    logger.propagate = defaults.get('propagate', True)
+
+
+class TestLogging(unittest.TestCase):
+    """Test that custom logging is working
+    """
+
+    def test_validators(self):
+        logger = logging.getLogger("RTFDE.validation_logger")
+        defaults = get_logger_defaults(logger)
+        data = "we will check validators"
+
+        with capture_log(logger) as log:
+            log_validators(data)
+        log = log.getvalue()
+        self.assertNotIn("RTFDE.transform_logger", log)
+        self.assertNotIn("check transformations", log)
+        self.assertNotIn("DEBUG", log)
+
+        logger.setLevel(logging.DEBUG)
+        with capture_log(logger) as log:
+            log_validators(data)
+        log = log.getvalue()
+        self.assertIn("RTFDE.validation_logger", log)
+        self.assertIn("check validators", log)
+        self.assertIn("DEBUG", log)
+        # Cleaning back up for future tests
+        set_logger_defaults(logger, defaults)
+
+    def test_transformations(self):
+        logger = logging.getLogger("RTFDE.transform_logger")
+        defaults = get_logger_defaults(logger)
+        data = "we will check transformations"
+
+        with capture_log(logger) as log:
+            log_transformations(data)
+        log = log.getvalue()
+        self.assertNotIn("RTFDE.transform_logger", log)
+        self.assertNotIn("check transformations", log)
+        self.assertNotIn("DEBUG", log)
+
+        logger.setLevel(logging.DEBUG)
+        with capture_log(logger) as log:
+            log_transformations(data)
+
+        log = log.getvalue()
+        self.assertIn("RTFDE.transform_logger", log)
+
self.assertIn("check transformations", log) + self.assertIn("DEBUG", log) + # Cleaning back up for future tests + set_logger_defaults(logger, defaults) + + def test_string_diff(self): + logger = logging.getLogger("RTFDE") + logger.setLevel(logging.DEBUG) + rtf_path = join(DATA_BASE_DIR, "plain_text", "test_data.rtf") + with open(rtf_path, 'rb') as fp: + raw_rtf = fp.read() + mod_rtf = raw_rtf + mod_rtf = mod_rtf.replace(b'\\fswiss ', b'\\things ') + mod_rtf = mod_rtf.replace(b'\\blue255', b'\\notblueothernumber') + # log_string_diff(raw_rtf, mod_rtf) + print("===========================sep========================") + with capture_log(logger) as log: + log_string_diff(raw_rtf, mod_rtf, sep=b'\\{|\\}') + log = log.getvalue() + self.assertIn(r"! \f0\fswiss Arial;", log) + self.assertIn(r"! \f0\things Arial;", log) + self.assertIn(r"! \colortbl\red0\green0\blue0;\red0\green0\blue255;", log) + self.assertIn(r"! \colortbl\red0\green0\blue0;\red0\green0\notblueothernumber;", log) + + + def test_tree_diff(self): + rtf_path = join(DATA_BASE_DIR, "plain_text", "test_data.rtf") + with open(rtf_path, 'rb') as fp: + raw_rtf = fp.read() + mod_rtf = raw_rtf + mod_rtf = mod_rtf.replace(b'\\fswiss ', b'\\things ') + mod_rtf = mod_rtf.replace(b'\\blue255', b'\\notblueothernumber') + # Create Trees + rtf_obj = DeEncapsulator(raw_rtf) + rtf_obj.deencapsulate() + mod_rtf_obj = DeEncapsulator(mod_rtf) + mod_rtf_obj.deencapsulate() + log = get_tree_diff(rtf_obj.full_tree, + mod_rtf_obj.full_tree) + self.assertIn(r"! Token('CONTROLWORD', b'\\fswiss ')", log) + self.assertIn(r"! Token('CONTROLWORD', b'\\things ')", log) + self.assertIn(r"! Token('CONTROLWORD', b'\\blue255')", log) + self.assertIn(r"! Token('CONTROLWORD', b'\\notblueothernumber')", log) + +class TestUtilities(unittest.TestCase): + """Test that important utilities are working + """ + + def test_encode_escape_chars(self): + raw_text = r"test\\thing\{stuff\}test".encode() + converted = encode_escaped_control_chars(raw_text) + self.assertIn(b"\\'5c", converted) + self.assertIn(b"\\'7b", converted) + self.assertIn(b"\\'7d", converted) + + +def embed(): + import os + import readline + import rlcompleter + import code + import inspect + import traceback + + history = os.path.join(os.path.expanduser('~'), '.python_history') + if os.path.isfile(history): + readline.read_history_file(history) + + frame = inspect.currentframe().f_back + namespace = frame.f_locals.copy() + namespace.update(frame.f_globals) + + readline.set_completer(rlcompleter.Completer(namespace).complete) + readline.parse_and_bind("tab: complete") + + file = frame.f_code.co_filename + line = frame.f_lineno + function = frame.f_code.co_name + + stack = ''.join(traceback.format_stack()[:-1]) + print(stack) + banner = f" [ {os.path.basename(file)}:{line} in {function}() ]" + banner += "\n Entering interactive mode (Ctrl-D to exit) ..." + try: + code.interact(banner=banner, local=namespace) + finally: + readline.write_history_file(history)
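+
+
+# For context: a minimal usage sketch of encode_escaped_control_chars. The
+# mapping shown below is inferred from test_encode_escape_chars above (an
+# assumption about the behavior, not a documented contract).
+if __name__ == '__main__':
+    raw = rb"test\\thing\{stuff\}test"
+    converted = encode_escaped_control_chars(raw)
+    # The escaped control chars \\, \{ and \} come back as the RTF hex
+    # escapes \'5c, \'7b and \'7d, presumably so later parsing stages only
+    # need to handle a single escape form.
+    assert b"\\'5c" in converted
+    assert b"\\'7b" in converted
+    assert b"\\'7d" in converted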