Skip to content

Commit

Permalink
Fix brace matching problem inside strings
Browse files Browse the repository at this point in the history
Previously unmatched braces inside strings would cause tokenization
errors. The expected behavior is that braces inside of strings would not
participate in brace matching (as they are part of a string). This
commit handles strings/brace-groups separately, making sure to match
opening/closing braces/quotations correctly.
  • Loading branch information
project-eutopia committed Mar 12, 2020
1 parent 3d434d4 commit 2a3642e
Show file tree
Hide file tree
Showing 4 changed files with 165 additions and 8 deletions.
136 changes: 130 additions & 6 deletions lib/keisan/tokenizer.rb
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
module Keisan
class Tokenizer
TOKEN_CLASSES = [
Tokens::Group,
Tokens::String,
Tokens::Null,
Tokens::Boolean,
Tokens::Word,
Expand All @@ -27,8 +25,21 @@ class Tokenizer

# Normalizes the expression, splits it into string / brace-group / other
# sections (so braces inside string literals never participate in brace
# matching), then converts each section into tokens.
#
# @param expression [String] the raw expression to tokenize
# @raise [Keisan::Exceptions::TokenizingError] on an internal section-type error
def initialize(expression)
  @expression = self.class.normalize_expression(expression)
  @tokens = parse_strings_and_groups.flat_map do |sym, string|
    case sym
    when :string
      Tokens::String.new(string)
    when :group
      Tokens::Group.new(string)
    when :other
      # Only plain sections go through the regex scanner.
      tokenize!(string.scan(TOKEN_REGEX))
    else
      raise Keisan::Exceptions::TokenizingError.new("Internal error, unexpected symbol: #{sym}")
    end
  end
end

def self.normalize_expression(expression)
Expand All @@ -38,6 +49,118 @@ def self.normalize_expression(expression)

private

# Splits @expression into a flat list of [symbol, substring] pairs:
#   :string -- a quoted string literal, quotes included
#   :group  -- a balanced (...) / [...] / {...} group, delimiters included
#   :other  -- any other run of characters
# Braces inside string literals do not participate in brace matching.
#
# @return [Array<Array(Symbol, String)>]
# @raise [Keisan::Exceptions::TokenizingError] on mismatched/unmatched braces
#   or an unterminated string literal
def parse_strings_and_groups
  # Each opening brace and the closing brace required to match it.
  matching_brace = { "(" => ")", "[" => "]", "{" => "}" }

  braces = []          # stack of currently-open braces (non-empty => inside a group)
  braces_start = nil   # index where the outermost open group began

  tokens = []

  current_other = nil  # accumulator for plain text between strings/groups
  current_string = nil # accumulator for an in-progress string literal

  i = 0
  while i < @expression.size
    c = @expression[i]

    if !braces.empty?
      # Inside a brace group: only track string boundaries (so braces in
      # strings are ignored) and brace matching; the group's content is
      # copied wholesale when it closes.
      if current_string
        if c == "\\"
          # Escape character: skip the escaped character
          i += 1
          c = @expression[i]
        elsif c == current_string[0]
          # Exit string
          current_string = nil
        end

      # Not in string
      else
        case c
        # New string
        when '"', "'"
          current_string = c
        # New opening brace
        when "(", "[", "{"
          braces << c
        # Closing brace: must match the most recently opened brace
        when ")", "]", "}"
          if matching_brace[braces[-1]] != c
            raise Keisan::Exceptions::TokenizingError.new("Expected closing brace '#{matching_brace[braces[-1]]}', found '#{c}'")
          end
          braces.pop
        end
      end

      # Outermost brace just closed: emit the whole group verbatim.
      if braces.empty?
        tokens << [:group, @expression[braces_start..i]]
      end
    elsif current_string
      # Inside a top-level string literal: copy characters verbatim.
      if c == "\\"
        # Escape character: append the escaped character verbatim
        i += 1
        c = @expression[i]
        if c.nil?
          raise Keisan::Exceptions::TokenizingError.new("Unterminated string: #{current_string}")
        end
        current_string << c
      else
        current_string << c
        # Exit string once the matching quote character reappears.
        if c == current_string[0]
          tokens << [:string, current_string]
          current_string = nil
        end
      end
    else
      case c
      # New string
      when '"', "'"
        if current_other
          tokens << [:other, current_other]
          current_other = nil
        end

        current_string = c
      # New opening brace
      when "(", "[", "{"
        if current_other
          tokens << [:other, current_other]
          current_other = nil
        end

        braces = [c]
        braces_start = i
      # Closing brace with nothing open
      when ")", "]", "}"
        raise Keisan::Exceptions::TokenizingError.new("Found unmatched closing brace '#{c}'")
      else
        current_other ||= ""
        current_other << c
      end
    end

    i += 1
  end

  if current_other
    tokens << [:other, current_other]
    current_other = nil
  end

  # A string opened but never closed would otherwise be silently dropped.
  if current_string
    raise Keisan::Exceptions::TokenizingError.new("Unterminated string: #{current_string}")
  end

  if !braces.empty?
    raise Keisan::Exceptions::TokenizingError.new("Found unmatched opening brace '#{braces[0]}'")
  end

  tokens
end

# Converts newline line-delimiters into semicolon statement separators.
def self.normalize_line_delimiters(expression)
  expression.tr("\n", ";")
end
Expand All @@ -46,11 +169,12 @@ def self.remove_comments(expression)
expression.gsub(/#[^;]*/, "")
end

def tokenize!
@scan.map do |scan_result|
def tokenize!(scan)
scan.map do |scan_result|
i = scan_result.find_index {|token| !token.nil?}
token_string = scan_result[i]
token = TOKEN_CLASSES[i].new(token_string)
# binding.pry
if token.is_a?(Tokens::Unknown)
raise Keisan::Exceptions::TokenizingError.new("Unexpected token: \"#{token.string}\"")
end
Expand Down
5 changes: 3 additions & 2 deletions lib/keisan/tokens/group.rb
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
module Keisan
module Tokens
class Group < Token
REGEX = /(\((?:[^\[\]\(\)\{\}]*+\g<1>*+)*+\)|\[(?:[^\[\]\(\)\{\}]*+\g<1>*+)*+\]|\{(?:[^\[\]\(\)\{\}]*+\g<1>*+)*+\})/
REGEX = /(\(|\)|\[|\]|\{|\})/

attr_reader :sub_tokens

# Builds a group token from a brace-delimited substring (delimiters included)
# and recursively tokenizes its interior.
def initialize(string)
  super
  # NOTE(review): super presumably stores the string already; confirm this
  # re-assignment of @string is actually needed.
  @string = string
  # REGEX matches a single brace character, so this checks only that the
  # first and last characters are braces -- not that they form a matching
  # open/close pair.
  raise Exceptions::InvalidToken.new(string) unless string[0].match(regex) && string[-1].match(regex)
  # Tokenize the content between the delimiters.
  @sub_tokens = Tokenizer.new(string[1...-1]).tokens
end

Expand Down
6 changes: 6 additions & 0 deletions spec/keisan/calculator_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,12 @@
end
end

# Regression spec: braces inside string literals (including escaped ones)
# must not participate in brace/group matching.
describe "unmatched braces inside strings" do
  it "does not match against actual braces outside strings" do
    expect(calculator.evaluate("'1'+'2'+(']\\]') + (('3') + '4')")).to eq "12]]34"
  end
end

describe "#simplify" do
it "allows for undefined variables to still exist and returns a string representation of the expression" do
expect{calculator.evaluate("0*x+1")}.to raise_error(Keisan::Exceptions::UndefinedVariableError)
Expand Down
26 changes: 26 additions & 0 deletions spec/keisan/tokenizer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,32 @@
end
end

# Verifies that string literals containing brace characters do not break
# group tokenization. (Description renamed: it previously duplicated the
# "has nested groups properly tokenized" example that follows and did not
# describe what this example actually tests.)
it "has strings with embedded braces properly tokenized" do
  tokenizer = described_class.new("'1'+'2'+(']]') + (('3') + '4')")

  expect(tokenizer.tokens.map(&:class)).to match_array([
    Keisan::Tokens::String,
    Keisan::Tokens::ArithmeticOperator,
    Keisan::Tokens::String,
    Keisan::Tokens::ArithmeticOperator,
    Keisan::Tokens::Group,
    Keisan::Tokens::ArithmeticOperator,
    Keisan::Tokens::Group
  ])

  group = tokenizer.tokens[4]
  expect(group.string).to eq "(']]')"
  expect(group.sub_tokens.map(&:class)).to match_array([Keisan::Tokens::String])

  group = tokenizer.tokens[6]
  expect(group.string).to eq "(('3') + '4')"
  expect(group.sub_tokens.map(&:class)).to match_array([
    Keisan::Tokens::Group,
    Keisan::Tokens::ArithmeticOperator,
    Keisan::Tokens::String
  ])
end

it "has nested groups properly tokenized" do
tokenizer = described_class.new("1+(2+(3+4)+5)+(6)")

Expand Down

0 comments on commit 2a3642e

Please sign in to comment.