Skip to content

Commit

Permalink
Support names with multiple parts in loose parser (#10)
Browse files Browse the repository at this point in the history
This allows the loose parser to include parts of the name before and
after other things like mark, amount or children.

In "cheese (MILK) with 2.3% fat" the whole ingredient name is now
included. Also, for input like "foo* 50%" the amount is now recognized.
  • Loading branch information
wvengen committed May 31, 2024
1 parent 2f4eb9d commit 67bfa0e
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 36 deletions.
15 changes: 11 additions & 4 deletions lib/food_ingredient_parser/loose/node.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ module FoodIngredientParser::Loose
class Node
include ToHtml

attr_accessor :name, :mark, :amount, :contains, :notes
attr_accessor :name_parts, :mark, :amount, :contains, :notes
attr_reader :input, :interval, :auto_close

def initialize(input, interval, auto_close: false)
Expand All @@ -14,7 +14,8 @@ def initialize(input, interval, auto_close: false)
@auto_close = auto_close
@contains = []
@notes = []
@name = @mark = @amount = nil
@name_parts = []
@mark = @amount = nil
end

def ends(index)
Expand All @@ -31,14 +32,20 @@ def text_value

# Serialises this node into a plain Hash.
#
# Only keys that have content are included:
# * +:name+     - the String returned by #name (omitted when it is nil)
# * +:marks+    - one-element Array with the stripped mark text
# * +:amount+   - stripped amount text
# * +:contains+ - hashes of the child nodes, empty hashes removed
# * +:notes+    - stripped note texts, blank ones removed
#
# @return [Hash]
def to_h
  r = {}
  # #name already yields a stripped, joined String (or nil), so it must be
  # used as-is and not be sent #text_value.
  name_text = name
  r[:name] = name_text if name_text
  r[:marks] = [mark.text_value.strip] if mark
  r[:amount] = amount.text_value.strip if amount
  r[:contains] = contains.map(&:to_h).reject { |c| c == {} } if contains.any?
  r[:notes] = notes.map { |n| n.text_value.strip }.reject { |c| c == '' } if notes.any?
  r
end

# Returns the ingredient name assembled from all name parts.
#
# Each part's text is stripped, blank parts are dropped and the remainder is
# joined with a single space. Parts that are nil are skipped.
#
# @return [String, nil] the joined name, or nil when no part has any text
def name
  strings = name_parts.compact.map { |part| part.text_value.strip }.reject(&:empty?)
  strings.join(" ") if strings.any?
end

def inspect(indent="", variant="")
inspect_self(indent, variant) +
inspect_children(indent)
Expand All @@ -47,7 +54,7 @@ def inspect(indent="", variant="")
def inspect_self(indent="", variant="")
[
indent + "Node#{variant} interval=#{@interval}",
name ? "name=#{name.text_value.strip.inspect}" : nil,
name ? "name=#{name.inspect}" : nil,
mark ? "mark=#{mark.text_value.strip.inspect}" : nil,
amount ? "amount=#{amount.text_value.strip.inspect}" : nil,
auto_close ? "auto_close" : nil
Expand Down
31 changes: 20 additions & 11 deletions lib/food_ingredient_parser/loose/scanner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@ class Scanner

def initialize(s, index: 0)
@s = s # input string
@i = index # current index in string
@i = index # current index in string, the iterator looks at this character
@cur = nil # current node we're populating
@curifree = nil # last index in string for current node that we haven't added to a child node yet
@ancestors = [Node.new(@s, @i)] # nesting hierarchy
@iterator = :beginning # scan_iteration_<iterator> to use for parsing
@dest = :contains # append current node to this attribute on parent
Expand Down Expand Up @@ -79,6 +80,7 @@ def scan_iteration_standard
# after bracket check for 'and' to not lose text
if is_and_sep?(@i+1)
@i += and_sep_len(@i+1)
@curifree = @i # don't include 'and' in cur name
add_child
end
elsif is_notes_start? # usually a dot marks the start of notes
Expand Down Expand Up @@ -147,7 +149,11 @@ def parent
end

# Returns the node currently being populated, creating it on first use.
#
# When a new node is created, @curifree is set to the current scan index.
# An already existing node is returned untouched and @curifree is left alone.
def cur
  unless @cur
    @cur = Node.new(@s, @i)
    @curifree = @i
  end
  @cur
end

def is_sep?(chars: SEP_CHARS)
Expand Down Expand Up @@ -201,16 +207,19 @@ def add_child
cur.ends(@i-1)
parent.send(@dest) << cur
@cur = nil
@curifree = nil
end

# Opens a nesting level: the current node becomes the newest ancestor and a
# fresh node, starting right after the current index, becomes the current one.
#
# Any name text seen so far is stored on the current node first. +options+
# are forwarded to Node.new for the new node.
def open_parent(**options)
  name_until_here
  @ancestors.push(cur)
  child_start = @i + 1
  @cur = Node.new(@s, child_start, **options)
  @curifree = child_start
end

def close_parent
return unless @ancestors.count > 1
@curifree = @i + 1
@cur = @ancestors.pop
while @cur.auto_close
add_child
Expand All @@ -227,15 +236,15 @@ def close_all_ancestors
end

# Stores the text between @curifree and the character before the current
# index as a new name part of the current node.
#
# Does nothing when no current node has been started yet (@curifree is nil).
# When a name part is added, @curifree moves up to the current index, so that
# later text can become a further name part.
def name_until_here
  return unless @curifree # no cur started yet

  first = @curifree
  last = @i - 1
  first += mark_len(first) # skip any mark in front
  # Set name if there is any. There is one corner-case that needs to be avoided when
  # a nesting was opened without a name, which would set the name to the nesting text.
  # In this case, the name starts with an open-nesting symbol, which should never happen.
  if last >= first && !"([:".include?(@s[first])
    cur.name_parts << Node.new(@s, first .. last)
    @curifree = @i
  end
end

Expand Down
26 changes: 17 additions & 9 deletions lib/food_ingredient_parser/loose/transform/amount.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,26 @@ def transform!

# Extract amount from name, if any.
def transform_name(node = @node)
if !node.amount && parsed = parse_amount(node.name&.text_value)
offset = node.name.interval.first
if !node.amount
node.name_parts.each_with_index do |name, i|
parsed = parse_amount(name.text_value)
next unless parsed
offset = name.interval.first

amount = parsed.amount.amount
node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)
amount = parsed.amount.amount
node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)

name = parsed.respond_to?(:name) && parsed.name
if name && name.interval.count > 0
node.name = Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
else
node.name = nil
name = parsed.respond_to?(:name) && parsed.name
node.name_parts[i] = if name && name.interval.count > 0
Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
else
nil
end
# found an amount, stop looking in other parts
break
end
# remove cleared name parts
node.name_parts.reject!(&:nil?)
end

# recursively transform contained nodes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def transform_children!(node)
# Apply recursively. Do it before processing to handle multiple depth levels of missing names.
transform_children!(child) if child.contains.any?

if child.name.nil? || child.name.text_value.strip == ''
name = child.name
if name.nil? || name == ''
# Name is empty, we need to do something.
if prev
# there is a previous ingredient: move children to new parent
Expand Down
28 changes: 17 additions & 11 deletions lib/food_ingredient_parser/loose/transform/split_e_numbers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,21 +29,27 @@ def transform!
# Splits name parts that match MATCH_RE into separate child nodes.
#
# Nodes that already have children are not split themselves; the transform is
# applied to each child instead. Otherwise every name part whose text matches
# MATCH_RE is cut at each SPLIT_RE match, each piece is appended to
# +node.contains+, and the name part itself is removed from the node.
#
# @param node [Node] node to transform in place
def transform_node!(node)
  if node.contains.any?
    node.contains.each { |n| transform_node!(n) }
  else
    node.name_parts.reject! do |name|
      text = name.text_value
      next false unless MATCH_RE.match(text)

      # new_node takes offsets relative to the start of the name part, so the
      # end of the part has to be relative as well (not the absolute index).
      last = name.interval.last - name.interval.first
      i = 0
      while (m = text.match(SPLIT_RE, i))
        node.contains << new_node(name, i, m.begin(0) - 1)
        i = m.end(0)
      end
      node.contains << new_node(name, i, last) if i <= last
      true # this name part has been turned into children
    end
  end
end

# Builds a node for a slice of a name part.
#
# @param name [Node] name part the slice is taken from
# @param begins [Integer] first index of the slice, relative to the start of +name+
# @param ends [Integer] last index of the slice, relative to the start of +name+
# @return [Node] node spanning the slice, with one name part covering the same interval
def new_node(name, begins, ends)
  offset = name.interval.first
  new_node = Node.new(name.input, offset + begins .. offset + ends)
  new_node.name_parts = [Node.new(name.input, new_node.interval)]
  new_node
end
end
Expand Down

0 comments on commit 67bfa0e

Please sign in to comment.