Support names with multiple parts in loose parser (#10)

This allows the loose parser to build the name from parts that appear before and after other elements such as a mark, an amount or children. For example, in "cheese (MILK) with 2.3% fat" the whole ingredient name is now included. In addition, the amount is now recognized in inputs like "foo* 50%".
q-m · May 31, 2024 · 67bfa0e
1 parent 2f4eb9d
commit 67bfa0e
Show file tree

Hide file tree

Showing 5 changed files with 67 additions and 36 deletions.
diff --git a/lib/food_ingredient_parser/loose/node.rb b/lib/food_ingredient_parser/loose/node.rb
@@ -5,7 +5,7 @@ module FoodIngredientParser::Loose
   class Node
     include ToHtml
 
-    attr_accessor :name, :mark, :amount, :contains, :notes
+    attr_accessor :name_parts, :mark, :amount, :contains, :notes
     attr_reader :input, :interval, :auto_close
 
     def initialize(input, interval, auto_close: false)
@@ -14,7 +14,8 @@ def initialize(input, interval, auto_close: false)
       @auto_close = auto_close
       @contains = []
       @notes = []
-      @name = @mark = @amount = nil
+      @name_parts = []
+      @mark = @amount = nil
     end
 
     def ends(index)
@@ -31,14 +32,20 @@ def text_value
 
     def to_h
       r = {}
-      r[:name] = name.text_value.strip if name && name.text_value.strip != ''
+      _name = name
+      r[:name] = _name if _name
       r[:marks] = [mark.text_value.strip] if mark
       r[:amount] = amount.text_value.strip if amount
       r[:contains] = contains.map(&:to_h).reject {|c| c == {} } if contains.any?
       r[:notes] = notes.map{|n| n.text_value.strip }.reject {|c| c == '' } if notes.any?
       r
     end
 
+    def name
+      strings = name_parts.map {|n| n.text_value.strip }.reject {|n| n == nil || n == '' }
+      return strings.any? ? strings.join(" ") : nil
+    end
+
     def inspect(indent="", variant="")
       inspect_self(indent, variant) +
       inspect_children(indent)
@@ -47,7 +54,7 @@ def inspect(indent="", variant="")
     def inspect_self(indent="", variant="")
       [
         indent + "Node#{variant} interval=#{@interval}",
-        name ? "name=#{name.text_value.strip.inspect}" : nil,
+        name ? "name=#{name.inspect}" : nil,
         mark ? "mark=#{mark.text_value.strip.inspect}" : nil,
         amount ? "amount=#{amount.text_value.strip.inspect}" : nil,
         auto_close ? "auto_close" : nil

diff --git a/lib/food_ingredient_parser/loose/scanner.rb b/lib/food_ingredient_parser/loose/scanner.rb
@@ -33,8 +33,9 @@ class Scanner
 
     def initialize(s, index: 0)
       @s = s                           # input string
-      @i = index                       # current index in string
+      @i = index                       # current index in string, the iterator looks at this character
       @cur = nil                       # current node we're populating
+      @curifree = nil                  # last index in string for current node that we haven't added to a child node yet
       @ancestors = [Node.new(@s, @i)]  # nesting hierarchy
       @iterator = :beginning           # scan_iteration_<iterator> to use for parsing
       @dest = :contains                # append current node to this attribute on parent
@@ -79,6 +80,7 @@ def scan_iteration_standard
         # after bracket check for 'and' to not lose text
         if is_and_sep?(@i+1)
           @i += and_sep_len(@i+1)
+          @curifree = @i # don't include 'and' in cur name
           add_child
         end
       elsif is_notes_start?       # usually a dot marks the start of notes
@@ -147,7 +149,11 @@ def parent
     end
 
     def cur
-      @cur ||= Node.new(@s, @i)
+      if !@cur
+        @cur ||= Node.new(@s, @i)
+        @curifree = @i
+      end
+      @cur
     end
 
     def is_sep?(chars: SEP_CHARS)
@@ -201,16 +207,19 @@ def add_child
       cur.ends(@i-1)
       parent.send(@dest) << cur
       @cur = nil
+      @curifree = nil
     end
 
     def open_parent(**options)
       name_until_here
       @ancestors << cur
       @cur = Node.new(@s, @i + 1, **options)
+      @curifree = @i + 1
     end
 
     def close_parent
       return unless @ancestors.count > 1
+      @curifree = @i + 1
       @cur = @ancestors.pop
       while @cur.auto_close
         add_child
@@ -227,15 +236,15 @@ def close_all_ancestors
     end
 
     def name_until_here
-      cur.name ||= begin
-        i, j = cur.interval.first, @i - 1
-        i += mark_len(i) # skip any mark in front
-        # Set name if there is any. There is one corner-case that needs to be avoided when
-        # a nesting was opened without a name, which would set the name to the nesting text.
-        # In this case, the name starts with an open-nesting symbol, which should never happen.
-        if j >= i && !"([:".include?(@s[i])
-          Node.new(@s, i .. j)
-        end
+      return unless @curifree # no cur started yet
+      i, j = @curifree, @i - 1
+      i += mark_len(i) # skip any mark in front
+      # Set name if there is any. There is one corner-case that needs to be avoided when
+      # a nesting was opened without a name, which would set the name to the nesting text.
+      # In this case, the name starts with an open-nesting symbol, which should never happen.
+      if j >= i && !"([:".include?(@s[i])
+        cur.name_parts << Node.new(@s, i .. j)
+        @curifree = @i
       end
     end
 

diff --git a/lib/food_ingredient_parser/loose/transform/amount.rb b/lib/food_ingredient_parser/loose/transform/amount.rb
@@ -29,18 +29,26 @@ def transform!
 
       # Extract amount from name, if any.
       def transform_name(node = @node)
-        if !node.amount && parsed = parse_amount(node.name&.text_value)
-          offset = node.name.interval.first
+        if !node.amount
+          node.name_parts.each_with_index do |name, i|
+            parsed = parse_amount(name.text_value)
+            next unless parsed
+            offset = name.interval.first
 
-          amount = parsed.amount.amount
-          node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)
+            amount = parsed.amount.amount
+            node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)
 
-          name = parsed.respond_to?(:name) && parsed.name
-          if name && name.interval.count > 0
-            node.name = Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
-          else
-            node.name = nil
+            name = parsed.respond_to?(:name) && parsed.name
+            node.name_parts[i] = if name && name.interval.count > 0
+              Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
+            else
+              nil
+            end
+            # found an amount, stop looking in other parts
+            break
           end
+          # remove cleared name parts
+          node.name_parts.reject!(&:nil?)
         end
 
         # recursively transform contained nodes

diff --git a/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb b/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb
@@ -42,7 +42,8 @@ def transform_children!(node)
           # Apply recursively. Do it before processing to handle multiple depth levels of missing names.
           transform_children!(child) if child.contains.any?
 
-          if child.name.nil? || child.name.text_value.strip == ''
+          name = child.name
+          if name.nil? || name == ''
             # Name is empty, we need to do something.
             if prev
               # there is a previous ingredient: move children to new parent

diff --git a/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb b/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb
@@ -29,21 +29,27 @@ def transform!
       def transform_node!(node)
         if node.contains.any?
           node.contains.each {|n| transform_node!(n) }
-        elsif node.name && m = MATCH_RE.match(node.name.text_value)
-          i = 0
-          while m = node.name.text_value.match(SPLIT_RE, i)
-            node.contains << new_node(node, i, m.begin(0)-1)
-            i = m.end(0)
+        else
+          node.name_parts.each_with_index do |name, name_index|
+            if m = MATCH_RE.match(name.text_value)
+              i = 0
+              while m = name.text_value.match(SPLIT_RE, i)
+                node.contains << new_node(name, i, m.begin(0)-1)
+                i = m.end(0)
+              end
+              node.contains << new_node(name, i, name.interval.last) if i <= name.interval.last
+              node.name_parts[name_index] = nil
+            end
           end
-          node.contains << new_node(node, i, node.name.interval.last) if i <= node.name.interval.last
-          node.name = nil
+          # remove cleared name parts
+          node.name_parts.reject!(&:nil?)
         end
       end
 
-      def new_node(node, begins, ends)
-        offset = node.name.interval.first
-        new_node = Node.new(node.input, offset + begins .. offset + ends)
-        new_node.name = Node.new(node.input, new_node.interval)
+      def new_node(name, begins, ends)
+        offset = name.interval.first
+        new_node = Node.new(name.input, offset + begins .. offset + ends)
+        new_node.name_parts = [Node.new(name.input, new_node.interval)]
         new_node
       end
     end