Skip to content

Commit

Permalink
Add parse_xml method to bloblang
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeffail committed Jan 14, 2021
1 parent 196fa30 commit 13413a3
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 14 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ All notable changes to this project will be documented in this file.
- The bloblang method `hash` and the `hash` processor now support `md5`.
- Field `collector_url` added to the `jaeger` tracer.
- The bloblang method `strip_html` now allows you to specify a list of allowed elements.
- New bloblang method `parse_xml`.

## 3.37.0 - 2021-01-06

Expand Down
43 changes: 43 additions & 0 deletions internal/bloblang/query/methods_strings.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"strings"
"time"

"github.com/Jeffail/benthos/v3/internal/xml"
"github.com/OneOfOne/xxhash"
"github.com/microcosm-cc/bluemonday"
"github.com/tilinna/z85"
Expand Down Expand Up @@ -940,6 +941,48 @@ func parseJSONMethod(target Function, _ ...interface{}) (Function, error) {

//------------------------------------------------------------------------------

// Registers the `parse_xml` bloblang method. The spec places it in the parsing
// category, marks it as beta, attaches one worked example, and requires that
// the method is called with exactly zero arguments. The implementation is
// provided by parseXMLMethod.
//
// NOTE(review): the description below says "process instructions"; the usual
// XML term is "processing instructions". Consider correcting the string here
// together with the generated website docs so the two stay in sync.
var _ = RegisterMethod(
NewMethodSpec(
"parse_xml", "",
).InCategory(
MethodCategoryParsing,
`Attempts to parse a string as an XML document and returns a structured result, where elements appear as keys of an object according to the following rules:
- If an element contains attributes they are parsed by prefixing a hyphen, `+"`-`"+`, to the attribute label.
- If the element is a simple element and has attributes, the element value is given the key `+"`#text`"+`.
- XML comments, directives, and process instructions are ignored.
- When elements are repeated the resulting JSON value is an array.`,
NewExampleSpec("",
`root.doc = this.doc.parse_xml()`,
`{"doc":"<root><title>This is a title</title><content>This is some content</content></root>"}`,
`{"doc":{"root":{"content":"This is some content","title":"This is a title"}}}`,
),
).Beta(),
false, parseXMLMethod,
ExpectNArgs(0),
)

// parseXMLMethod constructs the function backing the `parse_xml` method. The
// resulting function takes the value produced by target, which must be either
// a string or a byte slice, and converts it from XML into a generic structure
// via xml.ToMap. Any other value type yields a type error, and a parse failure
// is returned wrapped with context.
func parseXMLMethod(target Function, _ ...interface{}) (Function, error) {
	parse := func(v interface{}, _ FunctionContext) (interface{}, error) {
		var raw []byte
		if str, isString := v.(string); isString {
			raw = []byte(str)
		} else if data, isBytes := v.([]byte); isBytes {
			raw = data
		} else {
			// Neither of the two accepted input types.
			return nil, NewTypeError(v, ValueString)
		}

		parsed, err := xml.ToMap(raw)
		if err != nil {
			return nil, fmt.Errorf("failed to parse value as XML: %w", err)
		}
		return parsed, nil
	}
	return simpleMethod(target, parse), nil
}

//------------------------------------------------------------------------------

var _ = RegisterMethod(
NewMethodSpec(
"parse_timestamp_unix", "",
Expand Down
29 changes: 29 additions & 0 deletions internal/xml/package.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// Package xml is a temporary way to convert XML to JSON. This package is only
// necessary because github.com/clbanning/mxj has global configuration. If we
// are able to configure a decoder etc at the API level then this package can be
// removed.
package xml

import (
"encoding/xml"

"github.com/clbanning/mxj"
"golang.org/x/net/html/charset"
)

// init installs the package-wide decoder configuration on mxj: a non-strict
// decoder whose charset reader is charset.NewReaderLabel. mxj exposes this
// only as a global (mxj.CustomDecoder), which is why it is set once here.
func init() {
	decoder := xml.NewDecoder(nil)
	decoder.CharsetReader = charset.NewReaderLabel
	decoder.Strict = false

	mxj.CustomDecoder = decoder
}

// ToMap decodes xmlBytes as an XML document using mxj and hands back the
// result as a plain map[string]interface{}, a generic structure that can be
// serialized to JSON. If decoding fails the map is nil and the mxj error is
// returned unchanged.
func ToMap(xmlBytes []byte) (map[string]interface{}, error) {
	parsed, err := mxj.NewMapXml(xmlBytes)
	if err == nil {
		// Convert the result to a plain map so callers get the declared return type.
		return map[string]interface{}(parsed), nil
	}
	return nil, err
}
17 changes: 3 additions & 14 deletions lib/processor/xml.go
Original file line number Diff line number Diff line change
@@ -1,27 +1,18 @@
package processor

import (
"encoding/xml"
"fmt"
"time"

"github.com/Jeffail/benthos/v3/internal/docs"
"github.com/Jeffail/benthos/v3/internal/xml"
"github.com/Jeffail/benthos/v3/lib/log"
"github.com/Jeffail/benthos/v3/lib/metrics"
"github.com/Jeffail/benthos/v3/lib/types"
"github.com/clbanning/mxj"
"github.com/opentracing/opentracing-go"
"golang.org/x/net/html/charset"
)

//------------------------------------------------------------------------------

func init() {
dec := xml.NewDecoder(nil)
dec.Strict = false
dec.CharsetReader = charset.NewReaderLabel
mxj.CustomDecoder = dec

Constructors[TypeXML] = TypeSpec{
constructor: NewXML,
Status: docs.StatusBeta,
Expand Down Expand Up @@ -146,13 +137,13 @@ func (p *XML) ProcessMessage(msg types.Message) ([]types.Message, types.Response
newMsg := msg.Copy()

proc := func(index int, span opentracing.Span, part types.Part) error {
root, err := mxj.NewMapXml(part.Get())
root, err := xml.ToMap(part.Get())
if err != nil {
p.mErr.Incr(1)
p.log.Debugf("Failed to parse part as XML: %v\n", err)
return err
}
if err = part.SetJSON(map[string]interface{}(root)); err != nil {
if err = part.SetJSON(root); err != nil {
p.mErr.Incr(1)
p.log.Debugf("Failed to marshal XML as JSON: %v\n", err)
return err
Expand All @@ -175,5 +166,3 @@ func (p *XML) CloseAsync() {
func (p *XML) WaitForClose(timeout time.Duration) error {
// Reports success unconditionally; the timeout argument is not consulted.
return nil
}

//------------------------------------------------------------------------------
18 changes: 18 additions & 0 deletions website/docs/guides/bloblang/methods.md
Original file line number Diff line number Diff line change
Expand Up @@ -1158,6 +1158,24 @@ root.doc = this.doc.parse_json()
# Out: {"doc":{"foo":"bar"}}
```

### `parse_xml`

BETA: This method is mostly stable but breaking changes could still be made outside of major version releases if a fundamental problem with it is found.

Attempts to parse a string as an XML document and returns a structured result, where elements appear as keys of an object according to the following rules:

- If an element contains attributes they are parsed by prefixing a hyphen, `-`, to the attribute label.
- If the element is a simple element and has attributes, the element value is given the key `#text`.
- XML comments, directives, and process instructions are ignored.
- When elements are repeated the resulting JSON value is an array.

```coffee
root.doc = this.doc.parse_xml()

# In: {"doc":"<root><title>This is a title</title><content>This is some content</content></root>"}
# Out: {"doc":{"root":{"content":"This is some content","title":"This is a title"}}}
```

## Encoding and Encryption

### `encode`
Expand Down

0 comments on commit 13413a3

Please sign in to comment.