From d605312d64648e440e58dd19ea1f6566a1d2c10a Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Mon, 8 Aug 2022 09:39:07 -0400 Subject: [PATCH] Initial commit --- .github/workflows/main.yml | 24 + .gitignore | 12 + .gitmodules | 3 + CHANGELOG.md | 5 + Gemfile | 5 + Gemfile.lock | 30 + KNOWN_FAILURES | 652 +++++++++++++++++++++ LICENSE | 21 + README.md | 25 + Rakefile | 64 +++ bin/console | 8 + bin/lex | 20 + ext/yarp/extconf.rb | 4 + ext/yarp/yarp.c | 1106 ++++++++++++++++++++++++++++++++++++ ext/yarp/yarp.h | 260 +++++++++ lib/yarp.rb | 199 +++++++ lib/yarp/version.rb | 5 + test/fixtures/lex.rb | 183 ++++++ test/lex_test.rb | 33 ++ test/test_helper.rb | 6 + vendor/spec | 1 + yarp.gemspec | 32 ++ 22 files changed, 2698 insertions(+) create mode 100644 .github/workflows/main.yml create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 CHANGELOG.md create mode 100644 Gemfile create mode 100644 Gemfile.lock create mode 100644 KNOWN_FAILURES create mode 100644 LICENSE create mode 100644 README.md create mode 100644 Rakefile create mode 100755 bin/console create mode 100755 bin/lex create mode 100644 ext/yarp/extconf.rb create mode 100644 ext/yarp/yarp.c create mode 100644 ext/yarp/yarp.h create mode 100644 lib/yarp.rb create mode 100644 lib/yarp/version.rb create mode 100644 test/fixtures/lex.rb create mode 100644 test/lex_test.rb create mode 100644 test/test_helper.rb create mode 160000 vendor/spec create mode 100644 yarp.gemspec diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 000000000..8d89d26bf --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,24 @@ +name: Ruby + +on: + push: + branches: + - main + + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: head + bundler-cache: true + + - name: Run the default task + run: bundle exec rake diff --git a/.gitignore 
b/.gitignore new file mode 100644 index 000000000..a5fbc8cc6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +/.bundle/ +/.vscode/ +/.yardoc +/_yardoc/ +/coverage/ +/doc/ +/pkg/ +/spec/reports/ +/tmp/ + +/lib/yarp/yarp.* +test.rb diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..5975e2e26 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "vendor/spec"] + path = vendor/spec + url = git@github.com:ruby/spec.git diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000..ace59941d --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,5 @@ +## [Unreleased] + +## [0.1.0] - 2022-08-08 + +- Initial release diff --git a/Gemfile b/Gemfile new file mode 100644 index 000000000..be173b205 --- /dev/null +++ b/Gemfile @@ -0,0 +1,5 @@ +# frozen_string_literal: true + +source "https://rubygems.org" + +gemspec diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 000000000..bfb8e4425 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,30 @@ +PATH + remote: . + specs: + yarp (0.1.0) + +GEM + remote: https://rubygems.org/ + specs: + minitest (5.16.2) + power_assert (2.0.1) + rake (13.0.6) + rake-compiler (1.2.0) + rake + test-unit (3.5.3) + power_assert + +PLATFORMS + arm64-darwin-21 + x86_64-linux + +DEPENDENCIES + bundler (~> 2) + minitest (~> 5) + rake (~> 13) + rake-compiler (~> 1) + test-unit (~> 3) + yarp! 
+ +BUNDLED WITH + 2.3.6 diff --git a/KNOWN_FAILURES b/KNOWN_FAILURES new file mode 100644 index 000000000..a0ef0b424 --- /dev/null +++ b/KNOWN_FAILURES @@ -0,0 +1,652 @@ +vendor/spec/command_line/backtrace_limit_spec.rb +vendor/spec/command_line/dash_n_spec.rb +vendor/spec/command_line/feature_spec.rb +vendor/spec/command_line/fixtures/debug.rb +vendor/spec/command_line/fixtures/freeze_flag_required_diff_enc.rb +vendor/spec/command_line/rubyopt_spec.rb +vendor/spec/core/argf/readpartial_spec.rb +vendor/spec/core/array/comparison_spec.rb +vendor/spec/core/array/cycle_spec.rb +vendor/spec/core/array/element_reference_spec.rb +vendor/spec/core/array/element_set_spec.rb +vendor/spec/core/array/equal_value_spec.rb +vendor/spec/core/array/fill_spec.rb +vendor/spec/core/array/fixtures/classes.rb +vendor/spec/core/array/fixtures/encoded_strings.rb +vendor/spec/core/array/hash_spec.rb +vendor/spec/core/array/include_spec.rb +vendor/spec/core/array/intersection_spec.rb +vendor/spec/core/array/minus_spec.rb +vendor/spec/core/array/multiply_spec.rb +vendor/spec/core/array/pack/m_spec.rb +vendor/spec/core/array/pack/u_spec.rb +vendor/spec/core/array/rindex_spec.rb +vendor/spec/core/array/shared/inspect.rb +vendor/spec/core/array/sum_spec.rb +vendor/spec/core/array/union_spec.rb +vendor/spec/core/array/uniq_spec.rb +vendor/spec/core/basicobject/basicobject_spec.rb +vendor/spec/core/basicobject/equal_spec.rb +vendor/spec/core/basicobject/equal_value_spec.rb +vendor/spec/core/basicobject/instance_eval_spec.rb +vendor/spec/core/basicobject/instance_exec_spec.rb +vendor/spec/core/basicobject/not_equal_spec.rb +vendor/spec/core/basicobject/not_spec.rb +vendor/spec/core/binding/local_variable_get_spec.rb +vendor/spec/core/binding/local_variable_set_spec.rb +vendor/spec/core/comparable/clamp_spec.rb +vendor/spec/core/comparable/equal_value_spec.rb +vendor/spec/core/comparable/gt_spec.rb +vendor/spec/core/comparable/gte_spec.rb +vendor/spec/core/comparable/lt_spec.rb 
+vendor/spec/core/comparable/lte_spec.rb +vendor/spec/core/complex/divide_spec.rb +vendor/spec/core/complex/equal_value_spec.rb +vendor/spec/core/complex/multiply_spec.rb +vendor/spec/core/complex/to_f_spec.rb +vendor/spec/core/complex/to_i_spec.rb +vendor/spec/core/complex/to_r_spec.rb +vendor/spec/core/complex/uminus_spec.rb +vendor/spec/core/dir/children_spec.rb +vendor/spec/core/dir/element_reference_spec.rb +vendor/spec/core/dir/entries_spec.rb +vendor/spec/core/dir/fixtures/common.rb +vendor/spec/core/dir/glob_spec.rb +vendor/spec/core/dir/pwd_spec.rb +vendor/spec/core/dir/shared/exist.rb +vendor/spec/core/dir/shared/glob.rb +vendor/spec/core/encoding/converter/convert_spec.rb +vendor/spec/core/encoding/converter/last_error_spec.rb +vendor/spec/core/encoding/converter/primitive_convert_spec.rb +vendor/spec/core/encoding/converter/replacement_spec.rb +vendor/spec/core/encoding/find_spec.rb +vendor/spec/core/enumerable/grep_v_spec.rb +vendor/spec/core/enumerable/shared/inject.rb +vendor/spec/core/enumerable/slice_after_spec.rb +vendor/spec/core/enumerable/slice_before_spec.rb +vendor/spec/core/enumerable/sum_spec.rb +vendor/spec/core/env/element_set_spec.rb +vendor/spec/core/env/fetch_spec.rb +vendor/spec/core/env/to_s_spec.rb +vendor/spec/core/exception/hierarchy_spec.rb +vendor/spec/core/exception/inspect_spec.rb +vendor/spec/core/exception/interrupt_spec.rb +vendor/spec/core/exception/name_spec.rb +vendor/spec/core/exception/signal_exception_spec.rb +vendor/spec/core/exception/top_level_spec.rb +vendor/spec/core/fiber/resume_spec.rb +vendor/spec/core/file/atime_spec.rb +vendor/spec/core/file/basename_spec.rb +vendor/spec/core/file/expand_path_spec.rb +vendor/spec/core/file/extname_spec.rb +vendor/spec/core/file/flock_spec.rb +vendor/spec/core/file/mkfifo_spec.rb +vendor/spec/core/file/open_spec.rb +vendor/spec/core/file/printf_spec.rb +vendor/spec/core/file/readlink_spec.rb +vendor/spec/core/file/shared/fnmatch.rb +vendor/spec/core/float/case_compare_spec.rb 
+vendor/spec/core/float/ceil_spec.rb +vendor/spec/core/float/divide_spec.rb +vendor/spec/core/float/equal_value_spec.rb +vendor/spec/core/float/exponent_spec.rb +vendor/spec/core/float/fixtures/coerce.rb +vendor/spec/core/float/floor_spec.rb +vendor/spec/core/float/gt_spec.rb +vendor/spec/core/float/gte_spec.rb +vendor/spec/core/float/lt_spec.rb +vendor/spec/core/float/lte_spec.rb +vendor/spec/core/float/minus_spec.rb +vendor/spec/core/float/modulo_spec.rb +vendor/spec/core/float/multiply_spec.rb +vendor/spec/core/float/next_float_spec.rb +vendor/spec/core/float/plus_spec.rb +vendor/spec/core/float/prev_float_spec.rb +vendor/spec/core/float/round_spec.rb +vendor/spec/core/float/shared/modulo.rb +vendor/spec/core/float/shared/to_i.rb +vendor/spec/core/float/uminus_spec.rb +vendor/spec/core/float/uplus_spec.rb +vendor/spec/core/hash/default_proc_spec.rb +vendor/spec/core/hash/element_reference_spec.rb +vendor/spec/core/hash/element_set_spec.rb +vendor/spec/core/hash/equal_value_spec.rb +vendor/spec/core/hash/fetch_spec.rb +vendor/spec/core/hash/fetch_values_spec.rb +vendor/spec/core/hash/gt_spec.rb +vendor/spec/core/hash/gte_spec.rb +vendor/spec/core/hash/lt_spec.rb +vendor/spec/core/hash/lte_spec.rb +vendor/spec/core/hash/ruby2_keywords_hash_spec.rb +vendor/spec/core/hash/shared/each.rb +vendor/spec/core/hash/shared/eql.rb +vendor/spec/core/hash/shared/equal.rb +vendor/spec/core/hash/shared/index.rb +vendor/spec/core/hash/shared/store.rb +vendor/spec/core/hash/shared/to_s.rb +vendor/spec/core/hash/slice_spec.rb +vendor/spec/core/hash/to_proc_spec.rb +vendor/spec/core/hash/transform_keys_spec.rb +vendor/spec/core/integer/case_compare_spec.rb +vendor/spec/core/integer/div_spec.rb +vendor/spec/core/integer/divide_spec.rb +vendor/spec/core/integer/dup_spec.rb +vendor/spec/core/integer/equal_value_spec.rb +vendor/spec/core/integer/exponent_spec.rb +vendor/spec/core/integer/gt_spec.rb +vendor/spec/core/integer/gte_spec.rb +vendor/spec/core/integer/lt_spec.rb 
+vendor/spec/core/integer/lte_spec.rb +vendor/spec/core/integer/minus_spec.rb +vendor/spec/core/integer/modulo_spec.rb +vendor/spec/core/integer/multiply_spec.rb +vendor/spec/core/integer/plus_spec.rb +vendor/spec/core/integer/remainder_spec.rb +vendor/spec/core/integer/round_spec.rb +vendor/spec/core/integer/shared/abs.rb +vendor/spec/core/integer/shared/equal.rb +vendor/spec/core/integer/shared/exponent.rb +vendor/spec/core/integer/to_f_spec.rb +vendor/spec/core/integer/uminus_spec.rb +vendor/spec/core/io/close_spec.rb +vendor/spec/core/io/fixtures/classes.rb +vendor/spec/core/io/getc_spec.rb +vendor/spec/core/io/gets_spec.rb +vendor/spec/core/io/inspect_spec.rb +vendor/spec/core/io/read_spec.rb +vendor/spec/core/io/shared/chars.rb +vendor/spec/core/io/shared/write.rb +vendor/spec/core/io/ungetc_spec.rb +vendor/spec/core/io/write_spec.rb +vendor/spec/core/kernel/Complex_spec.rb +vendor/spec/core/kernel/at_exit_spec.rb +vendor/spec/core/kernel/backtick_spec.rb +vendor/spec/core/kernel/chomp_spec.rb +vendor/spec/core/kernel/chop_spec.rb +vendor/spec/core/kernel/comparison_spec.rb +vendor/spec/core/kernel/eval_spec.rb +vendor/spec/core/kernel/fixtures/chop.rb +vendor/spec/core/kernel/fixtures/chop_f.rb +vendor/spec/core/kernel/fixtures/classes.rb +vendor/spec/core/kernel/freeze_spec.rb +vendor/spec/core/kernel/global_variables_spec.rb +vendor/spec/core/kernel/instance_variable_defined_spec.rb +vendor/spec/core/kernel/instance_variable_get_spec.rb +vendor/spec/core/kernel/instance_variable_set_spec.rb +vendor/spec/core/kernel/instance_variables_spec.rb +vendor/spec/core/kernel/lambda_spec.rb +vendor/spec/core/kernel/match_spec.rb +vendor/spec/core/kernel/not_match_spec.rb +vendor/spec/core/kernel/open_spec.rb +vendor/spec/core/kernel/p_spec.rb +vendor/spec/core/kernel/printf_spec.rb +vendor/spec/core/kernel/remove_instance_variable_spec.rb +vendor/spec/core/kernel/shared/kind_of.rb +vendor/spec/core/kernel/shared/sprintf.rb 
+vendor/spec/core/kernel/shared/sprintf_encoding.rb +vendor/spec/core/kernel/sprintf_spec.rb +vendor/spec/core/kernel/trace_var_spec.rb +vendor/spec/core/kernel/warn_spec.rb +vendor/spec/core/main/using_spec.rb +vendor/spec/core/marshal/dump_spec.rb +vendor/spec/core/marshal/fixtures/marshal_data.rb +vendor/spec/core/marshal/shared/load.rb +vendor/spec/core/matchdata/begin_spec.rb +vendor/spec/core/matchdata/dup_spec.rb +vendor/spec/core/matchdata/end_spec.rb +vendor/spec/core/matchdata/equal_value_spec.rb +vendor/spec/core/matchdata/offset_spec.rb +vendor/spec/core/math/asin_spec.rb +vendor/spec/core/math/atan_spec.rb +vendor/spec/core/math/cos_spec.rb +vendor/spec/core/math/sin_spec.rb +vendor/spec/core/method/arity_spec.rb +vendor/spec/core/method/case_compare_spec.rb +vendor/spec/core/method/compose_spec.rb +vendor/spec/core/method/element_reference_spec.rb +vendor/spec/core/method/equal_value_spec.rb +vendor/spec/core/method/fixtures/classes.rb +vendor/spec/core/method/parameters_spec.rb +vendor/spec/core/method/to_proc_spec.rb +vendor/spec/core/module/attr_writer_spec.rb +vendor/spec/core/module/autoload_spec.rb +vendor/spec/core/module/class_variable_defined_spec.rb +vendor/spec/core/module/class_variable_get_spec.rb +vendor/spec/core/module/class_variable_set_spec.rb +vendor/spec/core/module/class_variables_spec.rb +vendor/spec/core/module/comparison_spec.rb +vendor/spec/core/module/const_added_spec.rb +vendor/spec/core/module/const_defined_spec.rb +vendor/spec/core/module/const_get_spec.rb +vendor/spec/core/module/const_missing_spec.rb +vendor/spec/core/module/const_source_location_spec.rb +vendor/spec/core/module/define_method_spec.rb +vendor/spec/core/module/define_singleton_method_spec.rb +vendor/spec/core/module/equal_value_spec.rb +vendor/spec/core/module/fixtures/classes.rb +vendor/spec/core/module/fixtures/constant_unicode.rb +vendor/spec/core/module/fixtures/name.rb +vendor/spec/core/module/fixtures/repeated_concurrent_autoload.rb 
+vendor/spec/core/module/module_function_spec.rb +vendor/spec/core/module/refine_spec.rb +vendor/spec/core/module/remove_class_variable_spec.rb +vendor/spec/core/module/ruby2_keywords_spec.rb +vendor/spec/core/module/shared/set_visibility.rb +vendor/spec/core/numeric/abs2_spec.rb +vendor/spec/core/numeric/div_spec.rb +vendor/spec/core/numeric/divmod_spec.rb +vendor/spec/core/numeric/eql_spec.rb +vendor/spec/core/numeric/modulo_spec.rb +vendor/spec/core/numeric/negative_spec.rb +vendor/spec/core/numeric/positive_spec.rb +vendor/spec/core/numeric/remainder_spec.rb +vendor/spec/core/numeric/shared/abs.rb +vendor/spec/core/numeric/shared/arg.rb +vendor/spec/core/numeric/shared/quo.rb +vendor/spec/core/numeric/shared/step.rb +vendor/spec/core/numeric/step_spec.rb +vendor/spec/core/numeric/uminus_spec.rb +vendor/spec/core/numeric/uplus_spec.rb +vendor/spec/core/numeric/zero_spec.rb +vendor/spec/core/objectspace/define_finalizer_spec.rb +vendor/spec/core/objectspace/fixtures/classes.rb +vendor/spec/core/objectspace/weakmap/each_key_spec.rb +vendor/spec/core/objectspace/weakmap/each_pair_spec.rb +vendor/spec/core/objectspace/weakmap/each_spec.rb +vendor/spec/core/objectspace/weakmap/each_value_spec.rb +vendor/spec/core/objectspace/weakmap/keys_spec.rb +vendor/spec/core/objectspace/weakmap/values_spec.rb +vendor/spec/core/proc/arity_spec.rb +vendor/spec/core/proc/case_compare_spec.rb +vendor/spec/core/proc/compose_spec.rb +vendor/spec/core/proc/curry_spec.rb +vendor/spec/core/proc/element_reference_spec.rb +vendor/spec/core/proc/equal_value_spec.rb +vendor/spec/core/proc/lambda_spec.rb +vendor/spec/core/proc/new_spec.rb +vendor/spec/core/proc/parameters_spec.rb +vendor/spec/core/proc/ruby2_keywords_spec.rb +vendor/spec/core/proc/shared/call.rb +vendor/spec/core/proc/shared/call_arguments.rb +vendor/spec/core/proc/source_location_spec.rb +vendor/spec/core/process/clock_getres_spec.rb +vendor/spec/core/process/egid_spec.rb +vendor/spec/core/process/euid_spec.rb 
+vendor/spec/core/process/exec_spec.rb +vendor/spec/core/process/fixtures/kill.rb +vendor/spec/core/process/spawn_spec.rb +vendor/spec/core/process/uid_spec.rb +vendor/spec/core/queue/append_spec.rb +vendor/spec/core/range/case_compare_spec.rb +vendor/spec/core/range/each_spec.rb +vendor/spec/core/range/equal_value_spec.rb +vendor/spec/core/range/fixtures/classes.rb +vendor/spec/core/range/max_spec.rb +vendor/spec/core/range/min_spec.rb +vendor/spec/core/range/minmax_spec.rb +vendor/spec/core/range/new_spec.rb +vendor/spec/core/range/shared/cover.rb +vendor/spec/core/range/shared/cover_and_include.rb +vendor/spec/core/range/step_spec.rb +vendor/spec/core/range/to_a_spec.rb +vendor/spec/core/rational/comparison_spec.rb +vendor/spec/core/rational/divide_spec.rb +vendor/spec/core/rational/equal_value_spec.rb +vendor/spec/core/rational/exponent_spec.rb +vendor/spec/core/rational/minus_spec.rb +vendor/spec/core/rational/modulo_spec.rb +vendor/spec/core/rational/multiply_spec.rb +vendor/spec/core/rational/plus_spec.rb +vendor/spec/core/regexp/encoding_spec.rb +vendor/spec/core/regexp/equal_value_spec.rb +vendor/spec/core/regexp/fixed_encoding_spec.rb +vendor/spec/core/regexp/inspect_spec.rb +vendor/spec/core/regexp/match_spec.rb +vendor/spec/core/regexp/shared/new.rb +vendor/spec/core/regexp/source_spec.rb +vendor/spec/core/regexp/union_spec.rb +vendor/spec/core/signal/trap_spec.rb +vendor/spec/core/sizedqueue/append_spec.rb +vendor/spec/core/sizedqueue/enq_spec.rb +vendor/spec/core/sizedqueue/max_spec.rb +vendor/spec/core/sizedqueue/new_spec.rb +vendor/spec/core/sizedqueue/num_waiting_spec.rb +vendor/spec/core/sizedqueue/push_spec.rb +vendor/spec/core/string/append_spec.rb +vendor/spec/core/string/ascii_only_spec.rb +vendor/spec/core/string/b_spec.rb +vendor/spec/core/string/bytes_spec.rb +vendor/spec/core/string/capitalize_spec.rb +vendor/spec/core/string/case_compare_spec.rb +vendor/spec/core/string/casecmp_spec.rb +vendor/spec/core/string/center_spec.rb 
+vendor/spec/core/string/chomp_spec.rb +vendor/spec/core/string/chop_spec.rb +vendor/spec/core/string/comparison_spec.rb +vendor/spec/core/string/delete_spec.rb +vendor/spec/core/string/downcase_spec.rb +vendor/spec/core/string/dump_spec.rb +vendor/spec/core/string/element_reference_spec.rb +vendor/spec/core/string/element_set_spec.rb +vendor/spec/core/string/encode_spec.rb +vendor/spec/core/string/equal_value_spec.rb +vendor/spec/core/string/fixtures/classes.rb +vendor/spec/core/string/fixtures/iso-8859-9-encoding.rb +vendor/spec/core/string/fixtures/utf-8-encoding.rb +vendor/spec/core/string/force_encoding_spec.rb +vendor/spec/core/string/gsub_spec.rb +vendor/spec/core/string/include_spec.rb +vendor/spec/core/string/index_spec.rb +vendor/spec/core/string/insert_spec.rb +vendor/spec/core/string/inspect_spec.rb +vendor/spec/core/string/ljust_spec.rb +vendor/spec/core/string/lstrip_spec.rb +vendor/spec/core/string/match_spec.rb +vendor/spec/core/string/modulo_spec.rb +vendor/spec/core/string/multiply_spec.rb +vendor/spec/core/string/next_spec.rb +vendor/spec/core/string/ord_spec.rb +vendor/spec/core/string/plus_spec.rb +vendor/spec/core/string/reverse_spec.rb +vendor/spec/core/string/rindex_spec.rb +vendor/spec/core/string/rjust_spec.rb +vendor/spec/core/string/rpartition_spec.rb +vendor/spec/core/string/rstrip_spec.rb +vendor/spec/core/string/scan_spec.rb +vendor/spec/core/string/scrub_spec.rb +vendor/spec/core/string/shared/dedup.rb +vendor/spec/core/string/shared/each_line.rb +vendor/spec/core/string/shared/encode.rb +vendor/spec/core/string/shared/equal_value.rb +vendor/spec/core/string/shared/length.rb +vendor/spec/core/string/shared/slice.rb +vendor/spec/core/string/shared/to_sym.rb +vendor/spec/core/string/slice_spec.rb +vendor/spec/core/string/split_spec.rb +vendor/spec/core/string/sub_spec.rb +vendor/spec/core/string/succ_spec.rb +vendor/spec/core/string/swapcase_spec.rb +vendor/spec/core/string/to_f_spec.rb +vendor/spec/core/string/tr_s_spec.rb 
+vendor/spec/core/string/tr_spec.rb +vendor/spec/core/string/uminus_spec.rb +vendor/spec/core/string/undump_spec.rb +vendor/spec/core/string/unicode_normalize_spec.rb +vendor/spec/core/string/unpack/m_spec.rb +vendor/spec/core/string/unpack/u_spec.rb +vendor/spec/core/string/upcase_spec.rb +vendor/spec/core/string/upto_spec.rb +vendor/spec/core/struct/element_reference_spec.rb +vendor/spec/core/struct/equal_value_spec.rb +vendor/spec/core/struct/hash_spec.rb +vendor/spec/core/struct/instance_variable_get_spec.rb +vendor/spec/core/struct/instance_variables_spec.rb +vendor/spec/core/struct/new_spec.rb +vendor/spec/core/symbol/all_symbols_spec.rb +vendor/spec/core/symbol/capitalize_spec.rb +vendor/spec/core/symbol/casecmp_spec.rb +vendor/spec/core/symbol/downcase_spec.rb +vendor/spec/core/symbol/element_reference_spec.rb +vendor/spec/core/symbol/empty_spec.rb +vendor/spec/core/symbol/encoding_spec.rb +vendor/spec/core/symbol/equal_value_spec.rb +vendor/spec/core/symbol/inspect_spec.rb +vendor/spec/core/symbol/match_spec.rb +vendor/spec/core/symbol/name_spec.rb +vendor/spec/core/symbol/shared/id2name.rb +vendor/spec/core/symbol/shared/length.rb +vendor/spec/core/symbol/shared/succ.rb +vendor/spec/core/symbol/swapcase_spec.rb +vendor/spec/core/symbol/to_proc_spec.rb +vendor/spec/core/symbol/to_sym_spec.rb +vendor/spec/core/symbol/upcase_spec.rb +vendor/spec/core/thread/fixtures/classes.rb +vendor/spec/core/thread/raise_spec.rb +vendor/spec/core/thread/report_on_exception_spec.rb +vendor/spec/core/time/at_spec.rb +vendor/spec/core/time/comparison_spec.rb +vendor/spec/core/time/fixtures/classes.rb +vendor/spec/core/time/getlocal_spec.rb +vendor/spec/core/time/minus_spec.rb +vendor/spec/core/time/new_spec.rb +vendor/spec/core/time/plus_spec.rb +vendor/spec/core/time/shared/gm.rb +vendor/spec/core/time/shared/inspect.rb +vendor/spec/core/time/shared/now.rb +vendor/spec/core/time/strftime_spec.rb +vendor/spec/core/tracepoint/eval_script_spec.rb 
+vendor/spec/core/tracepoint/parameters_spec.rb +vendor/spec/core/unboundmethod/arity_spec.rb +vendor/spec/core/warning/warn_spec.rb +vendor/spec/fixtures/class.rb +vendor/spec/fixtures/class_variables.rb +vendor/spec/language/alias_spec.rb +vendor/spec/language/array_spec.rb +vendor/spec/language/block_spec.rb +vendor/spec/language/case_spec.rb +vendor/spec/language/class_spec.rb +vendor/spec/language/class_variable_spec.rb +vendor/spec/language/comment_spec.rb +vendor/spec/language/constants_spec.rb +vendor/spec/language/def_spec.rb +vendor/spec/language/defined_spec.rb +vendor/spec/language/delegation_spec.rb +vendor/spec/language/ensure_spec.rb +vendor/spec/language/fixtures/binary_symbol.rb +vendor/spec/language/fixtures/bytes_magic_comment.rb +vendor/spec/language/fixtures/classes.rb +vendor/spec/language/fixtures/freeze_magic_comment_required_diff_enc.rb +vendor/spec/language/fixtures/rescue_captures.rb +vendor/spec/language/fixtures/send.rb +vendor/spec/language/fixtures/squiggly_heredoc.rb +vendor/spec/language/fixtures/utf16-le-nobom.rb +vendor/spec/language/fixtures/utf8-bom.rb +vendor/spec/language/fixtures/variables.rb +vendor/spec/language/hash_spec.rb +vendor/spec/language/heredoc_spec.rb +vendor/spec/language/keyword_arguments_spec.rb +vendor/spec/language/lambda_spec.rb +vendor/spec/language/line_spec.rb +vendor/spec/language/magic_comment_spec.rb +vendor/spec/language/match_spec.rb +vendor/spec/language/method_spec.rb +vendor/spec/language/numbered_parameters_spec.rb +vendor/spec/language/numbers_spec.rb +vendor/spec/language/optional_assignments_spec.rb +vendor/spec/language/pattern_matching_spec.rb +vendor/spec/language/precedence_spec.rb +vendor/spec/language/predefined/data_spec.rb +vendor/spec/language/predefined/fixtures/data1.rb +vendor/spec/language/predefined/fixtures/data3.rb +vendor/spec/language/predefined/fixtures/data4.rb +vendor/spec/language/predefined/fixtures/data5.rb +vendor/spec/language/predefined/fixtures/data_offset.rb 
+vendor/spec/language/predefined/fixtures/data_only.rb +vendor/spec/language/predefined/fixtures/empty_data.rb +vendor/spec/language/predefined/toplevel_binding_spec.rb +vendor/spec/language/predefined_spec.rb +vendor/spec/language/proc_spec.rb +vendor/spec/language/regexp/character_classes_spec.rb +vendor/spec/language/regexp/encoding_spec.rb +vendor/spec/language/regexp/escapes_spec.rb +vendor/spec/language/regexp/interpolation_spec.rb +vendor/spec/language/regexp/modifiers_spec.rb +vendor/spec/language/regexp/repetition_spec.rb +vendor/spec/language/regexp_spec.rb +vendor/spec/language/rescue_spec.rb +vendor/spec/language/return_spec.rb +vendor/spec/language/safe_navigator_spec.rb +vendor/spec/language/send_spec.rb +vendor/spec/language/string_spec.rb +vendor/spec/language/super_spec.rb +vendor/spec/language/symbol_spec.rb +vendor/spec/language/undef_spec.rb +vendor/spec/language/variables_spec.rb +vendor/spec/language/yield_spec.rb +vendor/spec/library/abbrev/abbrev_spec.rb +vendor/spec/library/base64/decode64_spec.rb +vendor/spec/library/bigdecimal/BigDecimal_spec.rb +vendor/spec/library/bigdecimal/case_compare_spec.rb +vendor/spec/library/bigdecimal/divide_spec.rb +vendor/spec/library/bigdecimal/divmod_spec.rb +vendor/spec/library/bigdecimal/equal_value_spec.rb +vendor/spec/library/bigdecimal/exponent_spec.rb +vendor/spec/library/bigdecimal/modulo_spec.rb +vendor/spec/library/bigdecimal/multiply_spec.rb +vendor/spec/library/bigdecimal/precs_spec.rb +vendor/spec/library/bigdecimal/to_s_spec.rb +vendor/spec/library/bigdecimal/truncate_spec.rb +vendor/spec/library/bigdecimal/uminus_spec.rb +vendor/spec/library/bigdecimal/uplus_spec.rb +vendor/spec/library/bigmath/log_spec.rb +vendor/spec/library/cgi/cookie/initialize_spec.rb +vendor/spec/library/cgi/cookie/parse_spec.rb +vendor/spec/library/cgi/cookie/to_s_spec.rb +vendor/spec/library/cgi/cookie/value_spec.rb +vendor/spec/library/cgi/escapeHTML_spec.rb +vendor/spec/library/cgi/escape_spec.rb 
+vendor/spec/library/cgi/htmlextension/a_spec.rb +vendor/spec/library/cgi/out_spec.rb +vendor/spec/library/cgi/pretty_spec.rb +vendor/spec/library/cgi/queryextension/multipart_spec.rb +vendor/spec/library/cgi/shared/http_header.rb +vendor/spec/library/cgi/unescapeHTML_spec.rb +vendor/spec/library/cgi/unescape_spec.rb +vendor/spec/library/cmath/math/shared/asin.rb +vendor/spec/library/cmath/math/shared/atan.rb +vendor/spec/library/cmath/math/shared/cos.rb +vendor/spec/library/cmath/math/shared/sin.rb +vendor/spec/library/coverage/fixtures/eval_code.rb +vendor/spec/library/csv/generate_line_spec.rb +vendor/spec/library/csv/parse_spec.rb +vendor/spec/library/date/strftime_spec.rb +vendor/spec/library/datetime/strftime_spec.rb +vendor/spec/library/delegate/delegator/case_compare_spec.rb +vendor/spec/library/delegate/delegator/compare_spec.rb +vendor/spec/library/delegate/delegator/complement_spec.rb +vendor/spec/library/delegate/delegator/equal_value_spec.rb +vendor/spec/library/delegate/delegator/marshal_spec.rb +vendor/spec/library/delegate/delegator/not_equal_spec.rb +vendor/spec/library/delegate/delegator/not_spec.rb +vendor/spec/library/digest/instance/append_spec.rb +vendor/spec/library/digest/md5/append_spec.rb +vendor/spec/library/digest/sha256/append_spec.rb +vendor/spec/library/digest/sha384/append_spec.rb +vendor/spec/library/digest/sha384/shared/constants.rb +vendor/spec/library/digest/sha512/append_spec.rb +vendor/spec/library/erb/def_class_spec.rb +vendor/spec/library/erb/def_method_spec.rb +vendor/spec/library/erb/def_module_spec.rb +vendor/spec/library/erb/defmethod/def_erb_method_spec.rb +vendor/spec/library/erb/new_spec.rb +vendor/spec/library/erb/result_spec.rb +vendor/spec/library/erb/run_spec.rb +vendor/spec/library/erb/src_spec.rb +vendor/spec/library/erb/util/shared/html_escape.rb +vendor/spec/library/erb/util/shared/url_encode.rb +vendor/spec/library/find/fixtures/common.rb +vendor/spec/library/matrix/build_spec.rb 
+vendor/spec/library/matrix/coerce_spec.rb +vendor/spec/library/matrix/constructor_spec.rb +vendor/spec/library/matrix/divide_spec.rb +vendor/spec/library/matrix/equal_value_spec.rb +vendor/spec/library/net/ftp/close_spec.rb +vendor/spec/library/net/ftp/closed_spec.rb +vendor/spec/library/net/ftp/fixtures/server.rb +vendor/spec/library/net/ftp/initialize_spec.rb +vendor/spec/library/net/ftp/mkdir_spec.rb +vendor/spec/library/net/ftp/pwd_spec.rb +vendor/spec/library/net/http/httpgenericrequest/exec_spec.rb +vendor/spec/library/net/http/httpresponse/read_new_spec.rb +vendor/spec/library/objectspace/memsize_of_spec.rb +vendor/spec/library/openstruct/shared/inspect.rb +vendor/spec/library/pathname/divide_spec.rb +vendor/spec/library/pathname/plus_spec.rb +vendor/spec/library/rbconfig/rbconfig_spec.rb +vendor/spec/library/rexml/attributes/append_spec.rb +vendor/spec/library/rexml/document/add_spec.rb +vendor/spec/library/rexml/document/write_spec.rb +vendor/spec/library/rexml/text/normalize_spec.rb +vendor/spec/library/rexml/text/read_with_substitution_spec.rb +vendor/spec/library/rexml/text/unnormalize_spec.rb +vendor/spec/library/rexml/text/value_spec.rb +vendor/spec/library/ripper/sexp_spec.rb +vendor/spec/library/securerandom/base64_spec.rb +vendor/spec/library/set/append_spec.rb +vendor/spec/library/set/case_compare_spec.rb +vendor/spec/library/set/case_equality_spec.rb +vendor/spec/library/set/comparison_spec.rb +vendor/spec/library/set/intersection_spec.rb +vendor/spec/library/set/minus_spec.rb +vendor/spec/library/set/plus_spec.rb +vendor/spec/library/set/shared/include.rb +vendor/spec/library/set/sortedset/add_spec.rb +vendor/spec/library/set/sortedset/append_spec.rb +vendor/spec/library/set/sortedset/case_equality_spec.rb +vendor/spec/library/set/sortedset/intersection_spec.rb +vendor/spec/library/set/sortedset/minus_spec.rb +vendor/spec/library/set/sortedset/plus_spec.rb +vendor/spec/library/set/sortedset/union_spec.rb +vendor/spec/library/set/union_spec.rb 
+vendor/spec/library/shellwords/shellwords_spec.rb +vendor/spec/library/socket/addrinfo/getaddrinfo_spec.rb +vendor/spec/library/socket/addrinfo/marshal_dump_spec.rb +vendor/spec/library/socket/ancillarydata/initialize_spec.rb +vendor/spec/library/socket/ancillarydata/unix_rights_spec.rb +vendor/spec/library/socket/basicsocket/local_address_spec.rb +vendor/spec/library/socket/basicsocket/remote_address_spec.rb +vendor/spec/library/socket/basicsocket/sendmsg_nonblock_spec.rb +vendor/spec/library/socket/basicsocket/sendmsg_spec.rb +vendor/spec/library/socket/shared/pack_sockaddr.rb +vendor/spec/library/socket/socket/gethostbyaddr_spec.rb +vendor/spec/library/socket/socket/getifaddrs_spec.rb +vendor/spec/library/stringio/fixtures/classes.rb +vendor/spec/library/stringio/getch_spec.rb +vendor/spec/library/stringio/printf_spec.rb +vendor/spec/library/stringio/puts_spec.rb +vendor/spec/library/stringio/shared/codepoints.rb +vendor/spec/library/stringio/shared/each_char.rb +vendor/spec/library/stringscanner/append_spec.rb +vendor/spec/library/stringscanner/inspect_spec.rb +vendor/spec/library/stringscanner/shared/peek.rb +vendor/spec/library/stringscanner/unscan_spec.rb +vendor/spec/library/syslog/inspect_spec.rb +vendor/spec/library/time/to_date_spec.rb +vendor/spec/library/tmpdir/dir/mktmpdir_spec.rb +vendor/spec/library/uri/equality_spec.rb +vendor/spec/library/yaml/fixtures/strings.rb +vendor/spec/library/yaml/shared/load.rb +vendor/spec/library/yaml/to_yaml_spec.rb +vendor/spec/library/zlib/gzipfile/close_spec.rb +vendor/spec/library/zlib/gzipfile/comment_spec.rb +vendor/spec/library/zlib/gzipfile/orig_name_spec.rb +vendor/spec/library/zlib/gzipreader/ungetc_spec.rb +vendor/spec/library/zlib/gzipwriter/mtime_spec.rb +vendor/spec/optional/capi/bignum_spec.rb +vendor/spec/optional/capi/class_spec.rb +vendor/spec/optional/capi/encoding_spec.rb +vendor/spec/optional/capi/hash_spec.rb +vendor/spec/optional/capi/io_spec.rb +vendor/spec/optional/capi/kernel_spec.rb 
+vendor/spec/optional/capi/numeric_spec.rb +vendor/spec/optional/capi/object_spec.rb +vendor/spec/optional/capi/range_spec.rb +vendor/spec/optional/capi/spec_helper.rb +vendor/spec/optional/capi/string_spec.rb +vendor/spec/optional/capi/symbol_spec.rb +vendor/spec/optional/capi/thread_spec.rb +vendor/spec/security/cve_2014_8080_spec.rb +vendor/spec/security/cve_2019_8322_spec.rb +vendor/spec/shared/process/exit.rb +vendor/spec/shared/rational/comparison.rb +vendor/spec/shared/rational/divide.rb +vendor/spec/shared/rational/equal_value.rb +vendor/spec/shared/rational/exponent.rb +vendor/spec/shared/rational/multiply.rb +vendor/spec/shared/rational/plus.rb +vendor/spec/shared/rational/round.rb +vendor/spec/shared/string/end_with.rb +vendor/spec/shared/string/start_with.rb +vendor/spec/shared/time/strftime_for_date.rb +vendor/spec/shared/time/strftime_for_time.rb +vendor/spec/spec_helper.rb diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..a5efe60fc --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2022 Kevin Newton + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 000000000..0f4d5496a --- /dev/null +++ b/README.md @@ -0,0 +1,25 @@ +# Yet Another Ruby Parser + +## Installation + +Add this line to your application's Gemfile: + +```ruby +gem "yarp" +``` + +And then execute: + + $ bundle install + +Or install it yourself as: + + $ gem install yarp + +## Contributing + +Bug reports and pull requests are welcome on GitHub at https://github.com/ruby-syntax-tree/yarp. + +## License + +The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT). diff --git a/Rakefile b/Rakefile new file mode 100644 index 000000000..04ffe10b5 --- /dev/null +++ b/Rakefile @@ -0,0 +1,64 @@ +# frozen_string_literal: true + +require "bundler/gem_tasks" +require "rake/extensiontask" +require "rake/testtask" + +Rake::ExtensionTask.new(:compile) do |ext| + ext.name = "yarp" + ext.ext_dir = "ext/yarp" + ext.lib_dir = "lib/yarp" + ext.gem_spec = Gem::Specification.load("yarp.gemspec") +end + +Rake::TestTask.new(test: :compile) do |t| + t.libs << "test" + t.libs << "lib" + t.test_files = FileList["test/**/*_test.rb"] +end + +desc "Lex ruby/spec files and compare with compat_lex" +task lex: :compile do + require "bundler/setup" + require "yarp" + require "ripper" + + filepath = File.expand_path("KNOWN_FAILURES", __dir__) + known_failures = File.readlines(filepath, chomp: true) + + results = { passing: 0, failing: 0 } + colorize = ->(code, string) { "\033[#{code}m#{string}\033[0m" } + + passing = 0 + failing = 0 + + filepaths = + if ENV["FILEPATHS"] + Dir[ENV["FILEPATHS"]] + else + Dir["vendor/spec/**/*.rb"] + end + + filepaths.each do |filepath| + result = + 
YARP.ripper_lex(filepath).zip(YARP.compat_lex(filepath)).all? do |(ripper, yarp)| + break false if yarp.nil? + ripper[0...-1] == yarp[0...-1] + end + + print result ? colorize.call(32, ".") : colorize.call(31, "E") + + if result + known_failures.delete(filepath) if known_failures.include?(filepath) + passing += 1 + else + known_failures << filepath unless known_failures.include?(filepath) + failing += 1 + end + end + + File.write(filepath, known_failures.sort.join("\n") + "\n") unless ENV["FILEPATHS"] + puts "\n\nPASS=#{passing}\nFAIL=#{failing}" +end + +task default: :test diff --git a/bin/console b/bin/console new file mode 100755 index 000000000..84f54072d --- /dev/null +++ b/bin/console @@ -0,0 +1,8 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require "bundler/setup" +require "yarp" + +require "irb" +IRB.start(__FILE__) diff --git a/bin/lex b/bin/lex new file mode 100755 index 000000000..9f5d44877 --- /dev/null +++ b/bin/lex @@ -0,0 +1,20 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require "bundler/setup" +require "ripper" +require "yarp" + +filepath = ARGV.first +pattern = "%-70s %-70s" + +puts pattern % ["Ripper lex", "YARP lex"] +puts pattern % ["-" * 70, "-" * 70] + +YARP.ripper_lex(filepath).zip(YARP.compat_lex(filepath)).each do |(ripper, yarp)| + left = ripper[...-1].inspect + right = (yarp || [])[...-1].inspect + + color = left == right ? 
"38;5;102" : "1;31" + puts "\033[#{color}m#{pattern}\033[0m" % [ripper.inspect, yarp.inspect] +end diff --git a/ext/yarp/extconf.rb b/ext/yarp/extconf.rb new file mode 100644 index 000000000..132fe710a --- /dev/null +++ b/ext/yarp/extconf.rb @@ -0,0 +1,4 @@ +# frozen_string_literal: true + +require "mkmf" +create_makefile "yarp/yarp" diff --git a/ext/yarp/yarp.c b/ext/yarp/yarp.c new file mode 100644 index 000000000..b5f03adb7 --- /dev/null +++ b/ext/yarp/yarp.c @@ -0,0 +1,1106 @@ +#include "yarp.h" + +/******************************************************************************/ +/* Basic character checks */ +/******************************************************************************/ + +static inline bool +is_binary_number_char(const char *c) { + return *c == '0' || *c == '1'; +} + +static inline bool +is_octal_number_char(const char *c) { + return *c >= '0' && *c <= '7'; +} + +static inline bool +is_decimal_number_char(const char *c) { + return *c >= '0' && *c <= '9'; +} + +static inline bool +is_hexadecimal_number_char(const char *c) { + return (*c >= '0' && *c <= '9') || (*c >= 'a' && *c <= 'f') || (*c >= 'A' && *c <= 'F'); +} + +static inline bool +is_identifier_start_char(const char *c) { + return (*c >= 'a' && *c <= 'z') || (*c >= 'A' && *c <= 'Z') || (*c == '_'); +} + +static inline bool +is_identifier_char(const char *c) { + return is_identifier_start_char(c) || is_decimal_number_char(c); +} + +static inline bool +is_non_newline_whitespace_char(const char *c) { + return *c == ' ' || *c == '\t' || *c == '\f' || *c == '\r' || *c == '\v'; +} + +static inline bool +is_whitespace_char(const char *c) { + return is_non_newline_whitespace_char(c) || *c == '\n'; +} + +/******************************************************************************/ +/* Lexer check helpers */ +/******************************************************************************/ + +// If the character to be read matches the given value, then returns true and +// advanced the current 
pointer. +static inline bool +match(yp_parser_t *parser, char value) { + if (*parser->current.end == value) { + parser->current.end++; + return true; + } + return false; +} + +// Returns the matching character that should be used to terminate a list +// beginning with the given character. +static char +terminator(const char start) { + switch (start) { + case '(': return ')'; + case '[': return ']'; + case '{': return '}'; + case '<': return '>'; + default: return start; + } +} + +/******************************************************************************/ +/* Lex mode manipulations */ +/******************************************************************************/ + +// Push a new lex state onto the stack. If we're still within the pre-allocated +// space of the lex state stack, then we'll just use a new slot. Otherwise we'll +// allocate a new pointer and use that. Either way lex_mode is copied into it. +static void +push_lex_mode(yp_parser_t *parser, yp_lex_mode_t lex_mode) { + lex_mode.prev = parser->lex_modes.current; + parser->lex_modes.index++; + + if (parser->lex_modes.index > YP_LEX_STACK_SIZE - 1) { + parser->lex_modes.current = (yp_lex_mode_t *) malloc(sizeof(yp_lex_mode_t)); + } else { + parser->lex_modes.current = &parser->lex_modes.stack[parser->lex_modes.index]; + } + *parser->lex_modes.current = lex_mode; +} + +// Pop the current lex state off the stack. If we're within the pre-allocated +// space of the lex state stack, then we'll just decrement the index. Otherwise +// we'll free the current pointer and use the previous pointer.
+static void +pop_lex_mode(yp_parser_t *parser) { + if (parser->lex_modes.index == 0) { + parser->lex_modes.current->mode = YP_LEX_DEFAULT; + } else if (parser->lex_modes.index < YP_LEX_STACK_SIZE) { + parser->lex_modes.index--; + parser->lex_modes.current = &parser->lex_modes.stack[parser->lex_modes.index]; + } else { + parser->lex_modes.index--; + yp_lex_mode_t *prev = parser->lex_modes.current->prev; + free(parser->lex_modes.current); + parser->lex_modes.current = prev; + } +} + +/******************************************************************************/ +/* Specific token lexers */ +/******************************************************************************/ + +static yp_token_type_t +lex_optional_float_suffix(yp_parser_t *parser) { + yp_token_type_t type = YP_TOKEN_INTEGER; + + // Here we're going to attempt to parse the optional decimal portion of a + // float. If it's not there, then it's okay and we'll just continue on. + if (*parser->current.end == '.') { + if ((parser->current.end + 1 < parser->end) && is_decimal_number_char(parser->current.end + 1)) { + parser->current.end += 2; + while (is_decimal_number_char(parser->current.end)) { + parser->current.end++; + match(parser, '_'); + } + + type = YP_TOKEN_FLOAT; + } else { + // If we had a . and then something else, then it's not a float suffix on + // a number it's a method call or something else. + return type; + } + } + + // Here we're going to attempt to parse the optional exponent portion of a + // float. If it's not there, it's okay and we'll just continue on. 
+ if (match(parser, 'e') || match(parser, 'E')) { + (void) (match(parser, '+') || match(parser, '-')); + + if (is_decimal_number_char(parser->current.end)) { + parser->current.end++; + while (is_decimal_number_char(parser->current.end)) { + parser->current.end++; + match(parser, '_'); + } + + type = YP_TOKEN_FLOAT; + } else { + return YP_TOKEN_INVALID; + } + } + + return type; +} + +static yp_token_type_t +lex_numeric_prefix(yp_parser_t *parser) { + yp_token_type_t type = YP_TOKEN_INTEGER; + + if (parser->current.end[-1] == '0') { + switch (*parser->current.end) { + // 0d1111 is a decimal number + case 'd': case 'D': + if (!is_decimal_number_char(++parser->current.end)) return YP_TOKEN_INVALID; + while (is_decimal_number_char(parser->current.end)) { + parser->current.end++; + match(parser, '_'); + } + break; + + // 0b1111 is a binary number + case 'b': case 'B': + if (!is_binary_number_char(++parser->current.end)) return YP_TOKEN_INVALID; + while (is_binary_number_char(parser->current.end)) { + parser->current.end++; + match(parser, '_'); + } + break; + + // 0o1111 is an octal number + case 'o': case 'O': + if (!is_octal_number_char(++parser->current.end)) return YP_TOKEN_INVALID; + // fall through + + // 01111 is an octal number + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': + while (is_octal_number_char(parser->current.end)) { + parser->current.end++; + match(parser, '_'); + } + break; + + // 0x1111 is a hexadecimal number + case 'x': case 'X': + if (!is_hexadecimal_number_char(++parser->current.end)) return YP_TOKEN_INVALID; + while (is_hexadecimal_number_char(parser->current.end)) { + parser->current.end++; + match(parser, '_'); + } + break; + + // 0.xxx is a float + case '.': { + type = lex_optional_float_suffix(parser); + break; + } + + // 0exxx is a float + case 'e': case 'E': { + type = lex_optional_float_suffix(parser); + break; + } + } + } else { + // If it didn't start with a 0, then we'll lex as far as we can into a + 
// decimal number. + while (is_decimal_number_char(parser->current.end)) { + parser->current.end++; + match(parser, '_'); + } + + // Afterward, we'll lex as far as we can into an optional float suffix. + type = lex_optional_float_suffix(parser); + } + + // If the last character that we consumed was an underscore, then this is + // actually an invalid integer value, and we should return an invalid token. + if (parser->current.end[-1] == '_') return YP_TOKEN_INVALID; + return type; +} + +static yp_token_type_t +lex_numeric(yp_parser_t *parser) { + yp_token_type_t type = lex_numeric_prefix(parser); + + if (type != YP_TOKEN_INVALID) { + if (match(parser, 'r')) type = YP_TOKEN_RATIONAL_NUMBER; + if (match(parser, 'i')) type = YP_TOKEN_IMAGINARY_NUMBER; + } + + return type; +} + +static yp_token_type_t +lex_global_variable(yp_parser_t *parser) { + switch (*parser->current.end) { + case '~': // $~: match-data + case '*': // $*: argv + case '$': // $$: pid + case '?': // $?: last status + case '!': // $!: error string + case '@': // $@: error position + case '/': // $/: input record separator + case '\\': // $\: output record separator + case ';': // $;: field separator + case ',': // $,: output field separator + case '.': // $.: last read line number + case '=': // $=: ignorecase + case ':': // $:: load path + case '<': // $<: reading filename + case '>': // $>: default output handle + case '\"': // $": already loaded files + parser->current.end++; + return YP_TOKEN_GLOBAL_VARIABLE; + + case '&': // $&: last match + case '`': // $`: string before last match + case '\'': // $': string after last match + case '+': // $+: string matches last paren. 
+ parser->current.end++; + return YP_TOKEN_BACK_REFERENCE; + + case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + do { parser->current.end++; } while (is_decimal_number_char(parser->current.end)); + return YP_TOKEN_NTH_REFERENCE; + + default: + if (is_identifier_char(parser->current.end)) { + do { parser->current.end++; } while (is_identifier_char(parser->current.end)); + return YP_TOKEN_GLOBAL_VARIABLE; + } + + // If we get here, then we have a $ followed by something that isn't + // recognized as a global variable. + return YP_TOKEN_INVALID; + } +} + +static yp_token_type_t +lex_identifier(yp_parser_t *parser) { + // Lex as far as we can into the current identifier. + while (is_identifier_char(parser->current.end)) { + parser->current.end++; + } + + off_t width = parser->current.end - parser->current.start; + +#define KEYWORD(value, size, token) if (width == size && strncmp(parser->current.start, value, size) == 0) return YP_TOKEN_KEYWORD_##token; + + if ((parser->current.end + 1 < parser->end) && (parser->current.end[1] != '=') && (match(parser, '!') || match(parser, '?'))) { + width++; + if (parser->previous.type != YP_TOKEN_DOT) { + KEYWORD("defined?", 8, DEFINED) + } + return YP_TOKEN_IDENTIFIER; + } + + if (parser->previous.type != YP_TOKEN_DOT) { + KEYWORD("__ENCODING__", 12, __ENCODING__) + KEYWORD("__LINE__", 8, __LINE__) + KEYWORD("__FILE__", 8, __FILE__) + KEYWORD("alias", 5, ALIAS) + KEYWORD("and", 3, AND) + KEYWORD("begin", 5, BEGIN) + KEYWORD("BEGIN", 5, BEGIN_UPCASE) + KEYWORD("break", 5, BREAK) + KEYWORD("case", 4, CASE) + KEYWORD("class", 5, CLASS) + KEYWORD("def", 3, DEF) + KEYWORD("do", 2, DO) + KEYWORD("else", 4, ELSE) + KEYWORD("elsif", 5, ELSIF) + KEYWORD("end", 3, END) + KEYWORD("END", 3, END_UPCASE) + KEYWORD("ensure", 6, ENSURE) + KEYWORD("false", 5, FALSE) + KEYWORD("for", 3, FOR) + KEYWORD("if", 2, IF) + KEYWORD("in", 2, IN) + KEYWORD("module", 6, MODULE) + KEYWORD("next", 4, NEXT) + 
KEYWORD("nil", 3, NIL) + KEYWORD("not", 3, NOT) + KEYWORD("or", 2, OR) + KEYWORD("redo", 4, REDO) + KEYWORD("rescue", 6, RESCUE) + KEYWORD("retry", 5, RETRY) + KEYWORD("return", 6, RETURN) + KEYWORD("self", 4, SELF) + KEYWORD("super", 5, SUPER) + KEYWORD("then", 4, THEN) + KEYWORD("true", 4, TRUE) + KEYWORD("undef", 5, UNDEF) + KEYWORD("unless", 6, UNLESS) + KEYWORD("until", 5, UNTIL) + KEYWORD("when", 4, WHEN) + KEYWORD("while", 5, WHILE) + KEYWORD("yield", 5, YIELD) + } + +#undef KEYWORD + + char start = parser->current.start[0]; + return start >= 'A' && start <= 'Z' ? YP_TOKEN_CONSTANT : YP_TOKEN_IDENTIFIER; +} + +// This is the overall lexer function. It is responsible for advancing both +// parser->current.start and parser->current.end such that they point to the +// beginning and end of the next token. It should return the type of token that +// was found. +static yp_token_type_t +lex_token_type(yp_parser_t *parser) { + switch (parser->lex_modes.current->mode) { + case YP_LEX_DEFAULT: + case YP_LEX_EMBEXPR: { + // First, we're going to skip past any whitespace at the front of the next + // token. + while (is_non_newline_whitespace_char(parser->current.end)) { + parser->current.end++; + } + + // Next, we'll set to start of this token to be the current end. + parser->current.start = parser->current.end; + + // Finally, we'll check the current character to determine the next token. 
+ switch (*parser->current.end++) { + case '\0': // NUL or end of script + case '\004': // ^D + case '\032': // ^Z + return YP_TOKEN_EOF; + + case '#': // comments + while (*parser->current.end != '\n' && *parser->current.end != '\0') { + parser->current.end++; + } + (void) match(parser, '\n'); + return YP_TOKEN_COMMENT; + + case '\n': { + parser->lineno++; + return YP_TOKEN_NEWLINE; + } + + // , ( ) ; + case ',': return YP_TOKEN_COMMA; + case '(': return YP_TOKEN_PARENTHESIS_LEFT; + case ')': return YP_TOKEN_PARENTHESIS_RIGHT; + case ';': return YP_TOKEN_SEMICOLON; + + // [ [] + case '[': + if (parser->previous.type == YP_TOKEN_DOT && match(parser, ']')) { + return YP_TOKEN_BRACKET_LEFT_RIGHT; + } + return YP_TOKEN_BRACKET_LEFT; + + // ] + case ']': return YP_TOKEN_BRACKET_RIGHT; + + // { + case '{': + if (parser->previous.type == YP_TOKEN_MINUS_GREATER) return YP_TOKEN_LAMBDA_BEGIN; + return YP_TOKEN_BRACE_LEFT; + + // } + case '}': + if (parser->lex_modes.current->mode == YP_LEX_EMBEXPR) { + pop_lex_mode(parser); + return YP_TOKEN_EMBEXPR_END; + } + return YP_TOKEN_BRACE_RIGHT; + + // * ** **= *= + case '*': + if (match(parser, '*')) return match(parser, '=') ? YP_TOKEN_STAR_STAR_EQUAL : YP_TOKEN_STAR_STAR; + return match(parser, '=') ? YP_TOKEN_STAR_EQUAL : YP_TOKEN_STAR; + + // ! 
!= !~ !@ + case '!': + if (match(parser, '=')) return YP_TOKEN_BANG_EQUAL; + if (match(parser, '~')) return YP_TOKEN_BANG_TILDE; + if ((parser->previous.type == YP_TOKEN_KEYWORD_DEF || parser->previous.type == YP_TOKEN_DOT) && match(parser, '@')) return YP_TOKEN_BANG_AT; + return YP_TOKEN_BANG; + + // = => =~ == === =begin (=begin only at the start of a line or of the source) + case '=': + if ((parser->current.start == parser->start || parser->current.end[-2] == '\n') && (strncmp(parser->current.end, "begin\n", 6) == 0)) { + parser->current.end += 6; + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_EMBDOC, .term = '\0', .interp = false }); + return YP_TOKEN_EMBDOC_BEGIN; + } + + if (match(parser, '>')) return YP_TOKEN_EQUAL_GREATER; + if (match(parser, '~')) return YP_TOKEN_EQUAL_TILDE; + if (match(parser, '=')) return match(parser, '=') ? YP_TOKEN_EQUAL_EQUAL_EQUAL : YP_TOKEN_EQUAL_EQUAL; + return YP_TOKEN_EQUAL; + + // < << <<= <= <=> + case '<': + if (match(parser, '<')) { + if (match(parser, '=')) return YP_TOKEN_LESS_LESS_EQUAL; + + // We don't yet handle heredocs. + if (match(parser, '-') || match(parser, '~')) return YP_TOKEN_EOF; + + return YP_TOKEN_LESS_LESS; + } + if (match(parser, '=')) return match(parser, '>') ? YP_TOKEN_LESS_EQUAL_GREATER : YP_TOKEN_LESS_EQUAL; + return YP_TOKEN_LESS; + + // > >> >>= >= + case '>': + if (match(parser, '>')) return match(parser, '=') ? YP_TOKEN_GREATER_GREATER_EQUAL : YP_TOKEN_GREATER_GREATER; + return match(parser, '=') ? YP_TOKEN_GREATER_EQUAL : YP_TOKEN_GREATER; + + // double-quoted string literal + case '"': + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_STRING, .term = '"', .interp = true }); + return YP_TOKEN_STRING_BEGIN; + + // xstring literal + case '`': + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_STRING, .term = '`', .interp = true }); + return YP_TOKEN_BACKTICK; + + // single-quoted string literal + case '\'': + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_STRING, .term = '\'', .interp = false }); + return YP_TOKEN_STRING_BEGIN; + + // ?
character literal + case '?': + if (is_identifier_char(parser->current.end)) { + parser->current.end++; + return YP_TOKEN_CHARACTER_LITERAL; + } + return YP_TOKEN_QUESTION_MARK; + + // & && &&= &= + case '&': + if (match(parser, '&')) return match(parser, '=') ? YP_TOKEN_AMPERSAND_AMPERSAND_EQUAL : YP_TOKEN_AMPERSAND_AMPERSAND; + return match(parser, '=') ? YP_TOKEN_AMPERSAND_EQUAL : YP_TOKEN_AMPERSAND; + + // | || ||= |= + case '|': + if (match(parser, '|')) return match(parser, '=') ? YP_TOKEN_PIPE_PIPE_EQUAL : YP_TOKEN_PIPE_PIPE; + return match(parser, '=') ? YP_TOKEN_PIPE_EQUAL : YP_TOKEN_PIPE; + + // + += +@ + case '+': + if (match(parser, '=')) return YP_TOKEN_PLUS_EQUAL; + if ((parser->previous.type == YP_TOKEN_KEYWORD_DEF || parser->previous.type == YP_TOKEN_DOT) && match(parser, '@')) return YP_TOKEN_PLUS_AT; + return YP_TOKEN_PLUS; + + // - -= -@ + case '-': + if (match(parser, '>')) return YP_TOKEN_MINUS_GREATER; + if (match(parser, '=')) return YP_TOKEN_MINUS_EQUAL; + if ((parser->previous.type == YP_TOKEN_KEYWORD_DEF || parser->previous.type == YP_TOKEN_DOT) && match(parser, '@')) return YP_TOKEN_MINUS_AT; + return YP_TOKEN_MINUS; + + // . .. ... + case '.': + if (!match(parser, '.')) return YP_TOKEN_DOT; + return match(parser, '.') ? 
YP_TOKEN_DOT_DOT_DOT : YP_TOKEN_DOT_DOT; + + // integer + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return lex_numeric(parser); + + // :: symbol + case ':': + if (match(parser, ':')) return YP_TOKEN_COLON_COLON; + if (is_identifier_char(parser->current.end)) { + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_SYMBOL, .term = '\0' }); + return YP_TOKEN_SYMBOL_BEGIN; + } + return YP_TOKEN_COLON; + + // / /= + case '/': + if (match(parser, '=')) return YP_TOKEN_SLASH_EQUAL; + if (*parser->current.end == ' ') return YP_TOKEN_SLASH; + + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_REGEXP, .term = '/' }); + return YP_TOKEN_REGEXP_BEGIN; + + // ^ ^= + case '^': return match(parser, '=') ? YP_TOKEN_CARET_EQUAL : YP_TOKEN_CARET; + + // ~ ~@ + case '~': + if ((parser->previous.type == YP_TOKEN_KEYWORD_DEF || parser->previous.type == YP_TOKEN_DOT) && match(parser, '@')) return YP_TOKEN_TILDE_AT; + return YP_TOKEN_TILDE; + + // TODO + case '\\': + return YP_TOKEN_INVALID; + + // % %= %i %I %q %Q %w %W + case '%': + switch (*parser->current.end) { + case '=': + parser->current.end++; + return YP_TOKEN_PERCENT_EQUAL; + case 'i': + parser->current.end++; + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_LIST, .term = terminator(*parser->current.end++), .interp = false }); + return YP_TOKEN_PERCENT_LOWER_I; + case 'I': + parser->current.end++; + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_LIST, .term = terminator(*parser->current.end++), .interp = true }); + return YP_TOKEN_PERCENT_UPPER_I; + case 'r': + parser->current.end++; + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_REGEXP, .term = terminator(*parser->current.end++), .interp = true }); + return YP_TOKEN_REGEXP_BEGIN; + case 'q': + parser->current.end++; + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_STRING, .term = terminator(*parser->current.end++), .interp = false }); + return YP_TOKEN_STRING_BEGIN; + case 
'Q': + parser->current.end++; + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_STRING, .term = terminator(*parser->current.end++), .interp = true }); + return YP_TOKEN_STRING_BEGIN; + case 'w': + parser->current.end++; + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_LIST, .term = terminator(*parser->current.end++), .interp = false }); + return YP_TOKEN_PERCENT_LOWER_W; + case 'W': + parser->current.end++; + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_LIST, .term = terminator(*parser->current.end++), .interp = true }); + return YP_TOKEN_PERCENT_UPPER_W; + case 'x': + parser->current.end++; + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_STRING, .term = terminator(*parser->current.end++), .interp = true }); + return YP_TOKEN_PERCENT_LOWER_X; + default: + return YP_TOKEN_PERCENT; + } + + // global variable + case '$': return lex_global_variable(parser); + + // instance variable, class variable + case '@': { + yp_token_type_t type = match(parser, '@') ? YP_TOKEN_CLASS_VARIABLE : YP_TOKEN_INSTANCE_VARIABLE; + + if (is_identifier_start_char(parser->current.end)) { + do { parser->current.end++; } while (is_identifier_char(parser->current.end)); + return type; + } + + return YP_TOKEN_INVALID; + } + + default: { + // If this isn't the beginning of an identifier, then it's an invalid + // token as we've exhausted all of the other options. + if (!is_identifier_start_char(parser->current.start)) { + return YP_TOKEN_INVALID; + } + + yp_token_type_t type = lex_identifier(parser); + + // If we're lexing in a place that allows labels and we've hit a + // colon, then we can return a label token. + if ((parser->current.end[0] == ':') && (parser->current.end[1] != ':')) { + parser->current.end++; + return YP_TOKEN_LABEL; + } + + return type; + } + } + } + case YP_LEX_EMBDOC: { + parser->current.start = parser->current.end; + + // If we've hit the end of the embedded documentation then we'll return that token here. 
+ if (strncmp(parser->current.end, "=end\n", 5) == 0) { + parser->current.end += 5; + pop_lex_mode(parser); + return YP_TOKEN_EMBDOC_END; + } + + // Otherwise, we'll parse until the end of the line and return a line of + // embedded documentation. + while ((parser->current.end < parser->end) && (*parser->current.end++ != '\n')); + + // If we've still got content, then we'll return a line of embedded + // documentation. + if (parser->current.end < parser->end) { + parser->lineno++; + return YP_TOKEN_EMBDOC_LINE; + } + + // Otherwise, fall back to error recovery. + return parser->error_handler->unterminated_embdoc(parser); + } + case YP_LEX_LIST: { + // If there's any whitespace at the start of the list, then we're going to + // trim it off the beginning and create a new token. + if (is_whitespace_char(parser->current.end)) { + parser->current.start = parser->current.end; + + do { + if (*parser->current.end == '\n') parser->lineno++; + parser->current.end++; + } while (is_whitespace_char(parser->current.end)); + + return YP_TOKEN_WORDS_SEP; + } + + // Next, we'll set to start of this token to be the current end. + parser->current.start = parser->current.end; + + // Lex as far as we can into the word. + while (parser->current.end < parser->end) { + // If we've hit whitespace, then we must have received content by now, + // so we can return an element of the list. + if (is_whitespace_char(parser->current.end)) { + return YP_TOKEN_STRING_CONTENT; + } + + if (*parser->current.end == parser->lex_modes.current->term) { + // If we've hit the terminator and we've already skipped past content, + // then we can return a list node. + if (parser->current.start < parser->current.end) { + return YP_TOKEN_STRING_CONTENT; + } + + // Otherwise, switch back to the default state and return the end of + // the list. + parser->current.end++; + pop_lex_mode(parser); + return YP_TOKEN_STRING_END; + } + + // Otherwise, just skip past the content as it's part of an element of + // the list. 
+ parser->current.end++; + } + + // Otherwise, fall back to error recovery. + return parser->error_handler->unterminated_list(parser); + } + case YP_LEX_REGEXP: { + // First, we'll set to start of this token to be the current end. + parser->current.start = parser->current.end; + + // If we've hit the end of the string, then we can return to the default + // state of the lexer and return a string ending token. + if (match(parser, parser->lex_modes.current->term)) { + // Since we've hit the terminator of the regular expression, we now need + // to parse the options. + bool options = true; + while (options) { + switch (*parser->current.end) { + case 'e': case 'i': case 'm': case 'n': case 's': case 'u': case 'x': + parser->current.end++; + break; + default: + options = false; + break; + } + } + + pop_lex_mode(parser); + return YP_TOKEN_REGEXP_END; + } + + // Otherwise, we'll lex as far as we can into the regular expression. If + // we hit the end of the regular expression, then we'll return everything + // up to that point. + while (parser->current.end < parser->end) { + // If we hit the terminator, then return this element of the string. + if (*parser->current.end == parser->lex_modes.current->term) { + return YP_TOKEN_STRING_CONTENT; + } + + // If we hit a newline, make sure to do the required bookkeeping. + if (*parser->current.end == '\n') parser->lineno++; + + // If we've hit a #, then check if it's used as the beginning of either + // an embedded variable or an embedded expression. + if (*parser->current.end == '#') { + switch (parser->current.end[1]) { + case '{': + // In this case it's the start of an embedded expression. + + // If we have already consumed content, then we need to return + // that content as string content first. 
+ if (parser->current.end > parser->current.start) { + return YP_TOKEN_STRING_CONTENT; + } + + parser->current.end += 2; + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_EMBEXPR }); + return YP_TOKEN_EMBEXPR_BEGIN; + } + } + + parser->current.end++; + } + + // Otherwise, fall back to error recovery. + return parser->error_handler->unterminated_regexp(parser); + } + case YP_LEX_STRING: { + // First, we'll set to start of this token to be the current end. + parser->current.start = parser->current.end; + + // If we've hit the end of the string, then we can return to the default + // state of the lexer and return a string ending token. + if (match(parser, parser->lex_modes.current->term)) { + pop_lex_mode(parser); + return YP_TOKEN_STRING_END; + } + + // Otherwise, we'll lex as far as we can into the string. If we hit the + // end of the string, then we'll return everything up to that point. + while (parser->current.end < parser->end) { + // If we hit the terminator, then return this element of the string. + if (*parser->current.end == parser->lex_modes.current->term) { + return YP_TOKEN_STRING_CONTENT; + } + + // If we hit a newline, make sure to do the required bookkeeping. + if (*parser->current.end == '\n') parser->lineno++; + + // If our current lex state allows interpolation and we've hit a #, then + // check if it's used as the beginning of either an embedded variable or + // an embedded expression. + if (parser->lex_modes.current->interp && *parser->current.end == '#') { + switch (parser->current.end[1]) { + case '@': + // In this case it could be an embedded instance or class + // variable. + break; + case '$': + // In this case it could be an embedded global variable. + break; + case '{': + // In this case it's the start of an embedded expression. + + // If we have already consumed content, then we need to return + // that content as string content first. 
+ if (parser->current.end > parser->current.start) { + return YP_TOKEN_STRING_CONTENT; + } + + parser->current.end += 2; + push_lex_mode(parser, (yp_lex_mode_t) { .mode = YP_LEX_EMBEXPR }); + return YP_TOKEN_EMBEXPR_BEGIN; + } + } + + parser->current.end++; + } + + // Otherwise, fall back to error recovery. + return parser->error_handler->unterminated_string(parser); + } + case YP_LEX_SYMBOL: { + // First, we'll set to start of this token to be the current end. + parser->current.start = parser->current.end; + + // Lex as far as we can into the symbol. + if (parser->current.end < parser->end && is_identifier_start_char(parser->current.end++)) { + pop_lex_mode(parser); + + yp_token_type_t type = lex_identifier(parser); + return match(parser, '=') ? YP_TOKEN_IDENTIFIER : type; + } + + // If we get here then we have the start of a symbol with no content. In + // that case return an invalid token. + return YP_TOKEN_INVALID; + } + } + + // We shouldn't be able to get here at all, but some compilers can't figure + // that out, so just returning a value here to make them happy. + return YP_TOKEN_INVALID; +} + +/******************************************************************************/ +/* External functions */ +/******************************************************************************/ + +// Initialize a parser with the given start and end pointers. +void +yp_parser_init(yp_parser_t *parser, const char *source, off_t size, yp_error_handler_t *error_handler) { + *parser = (yp_parser_t) { + .lex_modes = { + .index = 0, + .stack = {{ .mode = YP_LEX_DEFAULT }}, + .current = &parser->lex_modes.stack[0] + }, + .start = source, + .end = source + size, + .current = { .start = source, .end = source }, + .lineno = 1, + .error_handler = error_handler + }; +} + +// Get the next token type and set its value on the current pointer. 
+void +yp_lex_token(yp_parser_t *parser) { + parser->previous = parser->current; + parser->current.type = lex_token_type(parser); +} + +/******************************************************************************/ +/* C-extension functions */ +/******************************************************************************/ + +// By default, the lexer won't attempt to recover from lexer errors at all. This +// function provides that implementation. +static yp_token_type_t +unrecoverable(yp_parser_t *parser) { + return YP_TOKEN_EOF; +} + +static VALUE +token_inspect(yp_parser_t *parser) { + yp_token_t token = parser->current; + VALUE parts = rb_ary_new(); + + // First, we're going to push on the location information. + VALUE location = rb_ary_new(); + rb_ary_push(location, LONG2FIX(token.start - parser->start)); + rb_ary_push(location, LONG2FIX(token.end - parser->start)); + rb_ary_push(parts, location); + + // Next, we're going to push on a symbol that represents the type of token. + switch (token.type) { + // We're going to special-case the invalid token here since that doesn't + // actually exist in Ripper. This is going to give us a little more + // information when our tests fail. 
+ case YP_TOKEN_INVALID: + rb_ary_push(parts, ID2SYM(rb_intern("INVALID"))); + // fprintf(stderr, "Invalid token: %.*s\n", (int) (token.end - token.start), token.start); + break; + +#define CASE(type) case YP_TOKEN_##type: rb_ary_push(parts, ID2SYM(rb_intern(#type))); break; + + CASE(AMPERSAND) + CASE(AMPERSAND_AMPERSAND) + CASE(AMPERSAND_AMPERSAND_EQUAL) + CASE(AMPERSAND_EQUAL) + CASE(BACK_REFERENCE) + CASE(BACKTICK) + CASE(BANG) + CASE(BANG_AT) + CASE(BANG_EQUAL) + CASE(BANG_TILDE) + CASE(BRACE_LEFT) + CASE(BRACE_RIGHT) + CASE(BRACKET_LEFT) + CASE(BRACKET_LEFT_RIGHT) + CASE(BRACKET_RIGHT) + CASE(CARET) + CASE(CARET_EQUAL) + CASE(CHARACTER_LITERAL) + CASE(CLASS_VARIABLE) + CASE(COLON) + CASE(COLON_COLON) + CASE(COMMA) + CASE(COMMENT) + CASE(CONSTANT) + CASE(DOT) + CASE(DOT_DOT) + CASE(DOT_DOT_DOT) + CASE(EMBDOC_BEGIN) + CASE(EMBDOC_END) + CASE(EMBDOC_LINE) + CASE(EMBEXPR_BEGIN) + CASE(EMBEXPR_END) + CASE(EQUAL) + CASE(EQUAL_EQUAL) + CASE(EQUAL_EQUAL_EQUAL) + CASE(EQUAL_GREATER) + CASE(EQUAL_TILDE) + CASE(FLOAT) + CASE(GREATER) + CASE(GREATER_EQUAL) + CASE(GREATER_GREATER) + CASE(GREATER_GREATER_EQUAL) + CASE(GLOBAL_VARIABLE) + CASE(IDENTIFIER) + CASE(IMAGINARY_NUMBER) + CASE(INTEGER) + CASE(INSTANCE_VARIABLE) + CASE(KEYWORD___ENCODING__) + CASE(KEYWORD___LINE__) + CASE(KEYWORD___FILE__) + CASE(KEYWORD_ALIAS) + CASE(KEYWORD_AND) + CASE(KEYWORD_BEGIN) + CASE(KEYWORD_BEGIN_UPCASE) + CASE(KEYWORD_BREAK) + CASE(KEYWORD_CASE) + CASE(KEYWORD_CLASS) + CASE(KEYWORD_DEF) + CASE(KEYWORD_DEFINED) + CASE(KEYWORD_DO) + CASE(KEYWORD_ELSE) + CASE(KEYWORD_ELSIF) + CASE(KEYWORD_END) + CASE(KEYWORD_END_UPCASE) + CASE(KEYWORD_ENSURE) + CASE(KEYWORD_FALSE) + CASE(KEYWORD_FOR) + CASE(KEYWORD_IF) + CASE(KEYWORD_IN) + CASE(KEYWORD_MODULE) + CASE(KEYWORD_NEXT) + CASE(KEYWORD_NIL) + CASE(KEYWORD_NOT) + CASE(KEYWORD_OR) + CASE(KEYWORD_REDO) + CASE(KEYWORD_RESCUE) + CASE(KEYWORD_RETRY) + CASE(KEYWORD_RETURN) + CASE(KEYWORD_SELF) + CASE(KEYWORD_SUPER) + CASE(KEYWORD_THEN) + CASE(KEYWORD_TRUE) 
+ CASE(KEYWORD_UNDEF) + CASE(KEYWORD_UNLESS) + CASE(KEYWORD_UNTIL) + CASE(KEYWORD_WHEN) + CASE(KEYWORD_WHILE) + CASE(KEYWORD_YIELD) + CASE(LABEL) + CASE(LAMBDA_BEGIN) + CASE(LESS) + CASE(LESS_EQUAL) + CASE(LESS_EQUAL_GREATER) + CASE(LESS_LESS) + CASE(LESS_LESS_EQUAL) + CASE(MINUS) + CASE(MINUS_AT) + CASE(MINUS_EQUAL) + CASE(MINUS_GREATER) + CASE(NEWLINE) + CASE(NTH_REFERENCE) + CASE(PARENTHESIS_LEFT) + CASE(PARENTHESIS_RIGHT) + CASE(PERCENT) + CASE(PERCENT_EQUAL) + CASE(PERCENT_LOWER_I) + CASE(PERCENT_LOWER_W) + CASE(PERCENT_LOWER_X) + CASE(PERCENT_UPPER_I) + CASE(PERCENT_UPPER_W) + CASE(PIPE) + CASE(PIPE_EQUAL) + CASE(PIPE_PIPE) + CASE(PIPE_PIPE_EQUAL) + CASE(PLUS) + CASE(PLUS_AT) + CASE(PLUS_EQUAL) + CASE(QUESTION_MARK) + CASE(RATIONAL_NUMBER) + CASE(REGEXP_BEGIN) + CASE(REGEXP_END) + CASE(SEMICOLON) + CASE(SLASH) + CASE(SLASH_EQUAL) + CASE(STAR) + CASE(STAR_EQUAL) + CASE(STAR_STAR) + CASE(STAR_STAR_EQUAL) + CASE(STRING_BEGIN) + CASE(STRING_CONTENT) + CASE(STRING_END) + CASE(SYMBOL_BEGIN) + CASE(TILDE) + CASE(TILDE_AT) + CASE(WORDS_SEP) + +#undef CASE + + default: + rb_bug("Unknown token type: %d", token.type); + } + + rb_ary_push(parts, rb_str_new(token.start, token.end - token.start)); + return parts; +} + +static VALUE +each_token(VALUE self, VALUE rb_filepath) { + char *filepath = StringValueCStr(rb_filepath); + + // Open the file for reading + int fd = open(filepath, O_RDONLY); + if (fd == -1) { + perror("open"); + return Qnil; + } + + // Stat the file to get the file size + struct stat sb; + if (fstat(fd, &sb) == -1) { + close(fd); + perror("fstat"); + return Qnil; + } + + // mmap the file descriptor to virtually get the contents + off_t size = sb.st_size; + const char *source = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); + + close(fd); + if (source == MAP_FAILED) { + perror("mmap"); + return Qnil; + } + + yp_error_handler_t default_error_handler = { + .unterminated_embdoc = unrecoverable, + .unterminated_list = unrecoverable, + .unterminated_regexp = 
unrecoverable, + .unterminated_string = unrecoverable + }; + + // Instantiate the parser struct with all of the necessary information + yp_parser_t parser; + yp_parser_init(&parser, source, size, &default_error_handler); + + // Create an array and populate it with the tokens from the filepath + for (yp_lex_token(&parser); parser.current.type != YP_TOKEN_EOF; yp_lex_token(&parser)) { + rb_yield(token_inspect(&parser)); + } + + // Clean up and free + munmap((void *) source, size); + return Qnil; +} + +void +Init_yarp(void) { + VALUE rb_cYARP = rb_define_module("YARP"); + rb_define_singleton_method(rb_cYARP, "each_token", each_token, 1); +} diff --git a/ext/yarp/yarp.h b/ext/yarp/yarp.h new file mode 100644 index 000000000..ec8eb055b --- /dev/null +++ b/ext/yarp/yarp.h @@ -0,0 +1,260 @@ +#ifndef YARP_H +#define YARP_H + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef enum { + YP_TOKEN_EOF = 0, // final token in the file + YP_TOKEN_INVALID, // an invalid token + YP_TOKEN_AMPERSAND, // & + YP_TOKEN_AMPERSAND_AMPERSAND, // && + YP_TOKEN_AMPERSAND_AMPERSAND_EQUAL, // &&= + YP_TOKEN_AMPERSAND_EQUAL, // &= + YP_TOKEN_BACK_REFERENCE, // a back reference + YP_TOKEN_BACKTICK, // ` + YP_TOKEN_BANG, // ! + YP_TOKEN_BANG_AT, // !@ + YP_TOKEN_BANG_EQUAL, // != + YP_TOKEN_BANG_TILDE, // !~ + YP_TOKEN_BRACE_LEFT, // { + YP_TOKEN_BRACE_RIGHT, // } + YP_TOKEN_BRACKET_LEFT, // [ + YP_TOKEN_BRACKET_LEFT_RIGHT, // [] + YP_TOKEN_BRACKET_RIGHT, // ] + YP_TOKEN_CARET, // ^ + YP_TOKEN_CARET_EQUAL, // ^= + YP_TOKEN_CHARACTER_LITERAL, // a character literal + YP_TOKEN_CLASS_VARIABLE, // a class variable + YP_TOKEN_COLON, // : + YP_TOKEN_COLON_COLON, // :: + YP_TOKEN_COMMA, // , + YP_TOKEN_COMMENT, // a comment + YP_TOKEN_CONSTANT, // a constant + YP_TOKEN_DOT, // . + YP_TOKEN_DOT_DOT, // .. + YP_TOKEN_DOT_DOT_DOT, // ... 
+ YP_TOKEN_EMBDOC_BEGIN, // =begin + YP_TOKEN_EMBDOC_END, // =end + YP_TOKEN_EMBDOC_LINE, // a line inside of embedded documentation + YP_TOKEN_EMBEXPR_BEGIN, // #{ + YP_TOKEN_EMBEXPR_END, // } + YP_TOKEN_EQUAL, // = + YP_TOKEN_EQUAL_EQUAL, // == + YP_TOKEN_EQUAL_EQUAL_EQUAL, // === + YP_TOKEN_EQUAL_GREATER, // => + YP_TOKEN_EQUAL_TILDE, // =~ + YP_TOKEN_FLOAT, // a floating point number + YP_TOKEN_GREATER, // > + YP_TOKEN_GREATER_EQUAL, // >= + YP_TOKEN_GREATER_GREATER, // >> + YP_TOKEN_GREATER_GREATER_EQUAL, // >>= + YP_TOKEN_GLOBAL_VARIABLE, // a global variable + YP_TOKEN_IDENTIFIER, // an identifier + YP_TOKEN_IMAGINARY_NUMBER, // an imaginary number literal + YP_TOKEN_INSTANCE_VARIABLE, // an instance variable + YP_TOKEN_INTEGER, // an integer (any base) + YP_TOKEN_KEYWORD___ENCODING__, // __ENCODING__ + YP_TOKEN_KEYWORD___LINE__, // __LINE__ + YP_TOKEN_KEYWORD___FILE__, // __FILE__ + YP_TOKEN_KEYWORD_ALIAS, // alias + YP_TOKEN_KEYWORD_AND, // and + YP_TOKEN_KEYWORD_BEGIN, // begin + YP_TOKEN_KEYWORD_BEGIN_UPCASE, // BEGIN + YP_TOKEN_KEYWORD_BREAK, // break + YP_TOKEN_KEYWORD_CASE, // case + YP_TOKEN_KEYWORD_CLASS, // class + YP_TOKEN_KEYWORD_DEF, // def + YP_TOKEN_KEYWORD_DEFINED, // defined? 
+ YP_TOKEN_KEYWORD_DO, // do + YP_TOKEN_KEYWORD_ELSE, // else + YP_TOKEN_KEYWORD_ELSIF, // elsif + YP_TOKEN_KEYWORD_END, // end + YP_TOKEN_KEYWORD_END_UPCASE, // END + YP_TOKEN_KEYWORD_ENSURE, // ensure + YP_TOKEN_KEYWORD_FALSE, // false + YP_TOKEN_KEYWORD_FOR, // for + YP_TOKEN_KEYWORD_IF, // if + YP_TOKEN_KEYWORD_IN, // in + YP_TOKEN_KEYWORD_MODULE, // module + YP_TOKEN_KEYWORD_NEXT, // next + YP_TOKEN_KEYWORD_NIL, // nil + YP_TOKEN_KEYWORD_NOT, // not + YP_TOKEN_KEYWORD_OR, // or + YP_TOKEN_KEYWORD_REDO, // redo + YP_TOKEN_KEYWORD_RESCUE, // rescue + YP_TOKEN_KEYWORD_RETRY, // retry + YP_TOKEN_KEYWORD_RETURN, // return + YP_TOKEN_KEYWORD_SELF, // self + YP_TOKEN_KEYWORD_SUPER, // super + YP_TOKEN_KEYWORD_THEN, // then + YP_TOKEN_KEYWORD_TRUE, // true + YP_TOKEN_KEYWORD_UNDEF, // undef + YP_TOKEN_KEYWORD_UNLESS, // unless + YP_TOKEN_KEYWORD_UNTIL, // until + YP_TOKEN_KEYWORD_WHEN, // when + YP_TOKEN_KEYWORD_WHILE, // while + YP_TOKEN_KEYWORD_YIELD, // yield + YP_TOKEN_LABEL, // a label + YP_TOKEN_LAMBDA_BEGIN, // { + YP_TOKEN_LESS, // < + YP_TOKEN_LESS_EQUAL, // <= + YP_TOKEN_LESS_EQUAL_GREATER, // <=> + YP_TOKEN_LESS_LESS, // << + YP_TOKEN_LESS_LESS_EQUAL, // <<= + YP_TOKEN_MINUS, // - + YP_TOKEN_MINUS_AT, // -@ + YP_TOKEN_MINUS_EQUAL, // -= + YP_TOKEN_MINUS_GREATER, // -> + YP_TOKEN_NEWLINE, // a newline character outside of other tokens + YP_TOKEN_NTH_REFERENCE, // an nth global variable reference + YP_TOKEN_PARENTHESIS_LEFT, // ( + YP_TOKEN_PARENTHESIS_RIGHT, // ) + YP_TOKEN_PERCENT, // % + YP_TOKEN_PERCENT_EQUAL, // %= + YP_TOKEN_PERCENT_LOWER_I, // %i + YP_TOKEN_PERCENT_LOWER_W, // %w + YP_TOKEN_PERCENT_LOWER_X, // %x + YP_TOKEN_PERCENT_UPPER_I, // %I + YP_TOKEN_PERCENT_UPPER_W, // %W + YP_TOKEN_PIPE, // | + YP_TOKEN_PIPE_EQUAL, // |= + YP_TOKEN_PIPE_PIPE, // || + YP_TOKEN_PIPE_PIPE_EQUAL, // ||= + YP_TOKEN_PLUS, // + + YP_TOKEN_PLUS_AT, // +@ + YP_TOKEN_PLUS_EQUAL, // += + YP_TOKEN_QUESTION_MARK, // ? 
+ YP_TOKEN_RATIONAL_NUMBER, // a rational number literal + YP_TOKEN_REGEXP_BEGIN, // the beginning of a regular expression + YP_TOKEN_REGEXP_END, // the end of a regular expression + YP_TOKEN_SEMICOLON, // ; + YP_TOKEN_SLASH, // / + YP_TOKEN_SLASH_EQUAL, // /= + YP_TOKEN_STAR, // * + YP_TOKEN_STAR_EQUAL, // *= + YP_TOKEN_STAR_STAR, // ** + YP_TOKEN_STAR_STAR_EQUAL, // **= + YP_TOKEN_STRING_BEGIN, // the beginning of a string + YP_TOKEN_STRING_CONTENT, // the contents of a string + YP_TOKEN_STRING_END, // the end of a string + YP_TOKEN_SYMBOL_BEGIN, // the beginning of a symbol + YP_TOKEN_TILDE, // ~ + YP_TOKEN_TILDE_AT, // ~@ + YP_TOKEN_WORDS_SEP, // a separator between words in a list +} yp_token_type_t; + +// This struct represents a token in the Ruby source. We use it to track both +// type and location information. +typedef struct { + yp_token_type_t type; + const char *start; + const char *end; +} yp_token_t; + +// When lexing Ruby source, the lexer has a small amount of state to tell which +// kind of token it is currently lexing. For example, when we find the start of +// a string, the first token that we return is a TOKEN_STRING_BEGIN token. After +// that the lexer is now in the YP_LEX_STRING mode, and will return tokens that +// are found as part of a string. +typedef struct yp_lex_mode { + enum { + // This state is used when any given token is being lexed. + YP_LEX_DEFAULT, + + // This state is used when we're lexing an embdoc (a =begin..=end comment). + YP_LEX_EMBDOC, + + // This state is used when we're lexing as normal but inside an embedded + // expression of a string. + YP_LEX_EMBEXPR, + + // This state is used when we are lexing a list of tokens, as in a %w word + // list literal or a %i symbol list literal. + YP_LEX_LIST, + + // This state is used when a regular expression has been begun and we are + // looking for the terminator. 
+ YP_LEX_REGEXP, + + // This state is used when we are lexing a string or a string-like token, as + // in string content with either quote or an xstring. + YP_LEX_STRING, + + // This state is used when a symbol has already been begun (e.g., by a + // colon) and we still need to lex the rest of the symbol. + YP_LEX_SYMBOL, + } mode; + + // This is the terminator of the current state. It is used when lexing a + // string (either single or double quoted) and an xstring. + char term; + + // Whether or not interpolation is allowed in this lex state. This corresponds + // to some LEX_LIST states (e.g., %W) and LEX_STRING states (e.g., double + // quotes). + bool interp; + + // The previous lex state so that it knows how to pop. + struct yp_lex_mode *prev; +} yp_lex_mode_t; + +// We pre-allocate a certain number of lex states in order to avoid having to +// call malloc too many times while parsing. You really shouldn't need more than +// this because you only really nest deeply when doing string interpolation. +#define YP_LEX_STACK_SIZE 4 + +// A forward declaration since our error handler struct accepts a parser for +// each of its function calls. +typedef struct yp_parser yp_parser_t; + +// This struct is for handling error recovery. We're going to provide our own +// implementation for default, but this is an extension point if folks want to +// provide their own. +// +// Each function is going to be provided with a pointer to the struct itself, at +// which point it is expected to set the parsers state to whatever it should be +// in order to recover from the error. If it can't recover, it should return +// TOKEN_INVALID. +typedef struct { + yp_token_type_t (*unterminated_embdoc)(yp_parser_t *parser); + yp_token_type_t (*unterminated_list)(yp_parser_t *parser); + yp_token_type_t (*unterminated_regexp)(yp_parser_t *parser); + yp_token_type_t (*unterminated_string)(yp_parser_t *parser); +} yp_error_handler_t; + +// This struct represents the overall parser. 
It contains a reference to the +// source file, as well as pointers that indicate where in the source it's +// currently parsing. It also contains the most recent and current token that +// it's considering. +struct yp_parser { + struct { + yp_lex_mode_t *current; // the current state of the lexer + yp_lex_mode_t stack[YP_LEX_STACK_SIZE]; // the stack of lexer states + size_t index; // the current index into the lexer state stack + } lex_modes; + + const char *start; // the pointer to the start of the source + const char *end; // the pointer to the end of the source + yp_token_t previous; // the previous token we were considering + yp_token_t current; // the current token we're considering + int lineno; // the current line number we're looking at + + yp_error_handler_t *error_handler; // the error handler +}; + +// Initialize a parser with the given start and end pointers. +void +yp_parser_init(yp_parser_t *parser, const char *source, off_t size, yp_error_handler_t *error_handler); + +// Get the next token type and set its value on the current pointer. +void +yp_lex_token(yp_parser_t *parser); + +#endif diff --git a/lib/yarp.rb b/lib/yarp.rb new file mode 100644 index 000000000..35220bd92 --- /dev/null +++ b/lib/yarp.rb @@ -0,0 +1,199 @@ +# frozen_string_literal: true + +require_relative "yarp/yarp" +require_relative "yarp/version" + +module YARP + # This lexes with the Ripper lex. It drops any space events and normalizes all + # ignored newlines into regular newlines. + def self.ripper_lex(filepath) + Ripper.lex(File.read(filepath)).each_with_object([]) do |token, tokens| + case token[1] + when :on_ignored_nl + tokens << [token[0], :on_nl, token[2], token[3]] + when :on_sp + # skip + else + tokens << token + end + end + end + + # Returns an array of tokens that closely resembles that of the Ripper lexer. + # The only difference is that since we don't keep track of lexer state in the + # same way, it's going to always return the NONE state. 
+ def self.compat_lex(filepath) + offsets = [0] + File.foreach(filepath) { |line| offsets << offsets.last + line.bytesize } + + lexer_state = Ripper::Lexer::State.new(0) + tokens = [] + + each_token(filepath) do |((start_char, _), type, value)| + line_number, line_offset = + offsets.each_with_index.detect do |(offset, line)| + break [line, offsets[line - 1]] if start_char < offset + end + + line_number ||= offsets.length + 1 + line_offset ||= offsets.last + + line_byte = start_char - line_offset + event = RIPPER.fetch(type) + + unescaped = + if %i[on_comment on_tstring_content].include?(event) && value.include?("\\") + # Ripper unescapes string content and comments, so we need to do the + # same here. + value.force_encoding("UTF-8").unicode_normalize + else + value + end + + tokens << [[line_number, line_byte], event, unescaped, lexer_state] + end + + tokens + end + + RIPPER = { + AMPERSAND: :on_op, + AMPERSAND_AMPERSAND: :on_op, + AMPERSAND_AMPERSAND_EQUAL: :on_op, + AMPERSAND_EQUAL: :on_op, + BACK_REFERENCE: :on_backref, + BACKTICK: :on_backtick, + BANG: :on_op, + BANG_AT: :on_op, + BANG_EQUAL: :on_op, + BANG_TILDE: :on_op, + BRACE_LEFT: :on_lbrace, + BRACE_RIGHT: :on_rbrace, + BRACKET_LEFT: :on_lbracket, + BRACKET_LEFT_RIGHT: :on_op, + BRACKET_RIGHT: :on_rbracket, + CARET: :on_op, + CARET_EQUAL: :on_op, + CHARACTER_LITERAL: :on_CHAR, + CLASS_VARIABLE: :on_cvar, + COLON: :on_op, + COLON_COLON: :on_op, + COMMA: :on_comma, + COMMENT: :on_comment, + CONSTANT: :on_const, + DOT: :on_period, + DOT_DOT: :on_op, + DOT_DOT_DOT: :on_op, + EMBDOC_BEGIN: :on_embdoc_beg, + EMBDOC_END: :on_embdoc_end, + EMBDOC_LINE: :on_embdoc, + EMBEXPR_BEGIN: :on_embexpr_beg, + EMBEXPR_END: :on_embexpr_end, + EQUAL: :on_op, + EQUAL_EQUAL: :on_op, + EQUAL_EQUAL_EQUAL: :on_op, + EQUAL_GREATER: :on_op, + EQUAL_TILDE: :on_op, + FLOAT: :on_float, + GREATER: :on_op, + GREATER_EQUAL: :on_op, + GREATER_GREATER: :on_op, + GREATER_GREATER_EQUAL: :on_op, + GLOBAL_VARIABLE: :on_gvar, + IDENTIFIER: 
:on_ident, + IMAGINARY_NUMBER: :on_imaginary, + INTEGER: :on_int, + INSTANCE_VARIABLE: :on_ivar, + INVALID: :INVALID, + KEYWORD___ENCODING__: :on_kw, + KEYWORD___LINE__: :on_kw, + KEYWORD___FILE__: :on_kw, + KEYWORD_ALIAS: :on_kw, + KEYWORD_AND: :on_kw, + KEYWORD_BEGIN: :on_kw, + KEYWORD_BEGIN_UPCASE: :on_kw, + KEYWORD_BREAK: :on_kw, + KEYWORD_CASE: :on_kw, + KEYWORD_CLASS: :on_kw, + KEYWORD_DEF: :on_kw, + KEYWORD_DEFINED: :on_kw, + KEYWORD_DO: :on_kw, + KEYWORD_ELSE: :on_kw, + KEYWORD_ELSIF: :on_kw, + KEYWORD_END: :on_kw, + KEYWORD_END_UPCASE: :on_kw, + KEYWORD_ENSURE: :on_kw, + KEYWORD_FALSE: :on_kw, + KEYWORD_FOR: :on_kw, + KEYWORD_IF: :on_kw, + KEYWORD_IN: :on_kw, + KEYWORD_MODULE: :on_kw, + KEYWORD_NEXT: :on_kw, + KEYWORD_NIL: :on_kw, + KEYWORD_NOT: :on_kw, + KEYWORD_OR: :on_kw, + KEYWORD_REDO: :on_kw, + KEYWORD_RESCUE: :on_kw, + KEYWORD_RETRY: :on_kw, + KEYWORD_RETURN: :on_kw, + KEYWORD_SELF: :on_kw, + KEYWORD_SUPER: :on_kw, + KEYWORD_THEN: :on_kw, + KEYWORD_TRUE: :on_kw, + KEYWORD_UNDEF: :on_kw, + KEYWORD_UNLESS: :on_kw, + KEYWORD_UNTIL: :on_kw, + KEYWORD_WHEN: :on_kw, + KEYWORD_WHILE: :on_kw, + KEYWORD_YIELD: :on_kw, + LABEL: :on_label, + LAMBDA_BEGIN: :on_tlambeg, + LESS: :on_op, + LESS_EQUAL: :on_op, + LESS_EQUAL_GREATER: :on_op, + LESS_LESS: :on_op, + LESS_LESS_EQUAL: :on_op, + MINUS: :on_op, + MINUS_AT: :on_op, + MINUS_EQUAL: :on_op, + MINUS_GREATER: :on_tlambda, + NEWLINE: :on_nl, + NTH_REFERENCE: :on_backref, + PARENTHESIS_LEFT: :on_lparen, + PARENTHESIS_RIGHT: :on_rparen, + PERCENT: :on_op, + PERCENT_EQUAL: :on_op, + PERCENT_LOWER_I: :on_qsymbols_beg, + PERCENT_LOWER_W: :on_qwords_beg, + PERCENT_LOWER_X: :on_backtick, + PERCENT_UPPER_I: :on_symbols_beg, + PERCENT_UPPER_W: :on_words_beg, + PIPE: :on_op, + PIPE_EQUAL: :on_op, + PIPE_PIPE: :on_op, + PIPE_PIPE_EQUAL: :on_op, + PLUS: :on_op, + PLUS_AT: :on_op, + PLUS_EQUAL: :on_op, + QUESTION_MARK: :on_op, + RATIONAL_NUMBER: :on_rational, + REGEXP_BEGIN: :on_regexp_beg, + REGEXP_END: :on_regexp_end, + 
SEMICOLON: :on_semicolon, + SLASH: :on_op, + SLASH_EQUAL: :on_op, + STAR: :on_op, + STAR_EQUAL: :on_op, + STAR_STAR: :on_op, + STAR_STAR_EQUAL: :on_op, + STRING_BEGIN: :on_tstring_beg, + STRING_CONTENT: :on_tstring_content, + STRING_END: :on_tstring_end, + SYMBOL_BEGIN: :on_symbeg, + TILDE: :on_op, + TILDE_AT: :on_op, + WORDS_SEP: :on_words_sep, + }.freeze + + private_constant :RIPPER +end diff --git a/lib/yarp/version.rb b/lib/yarp/version.rb new file mode 100644 index 000000000..a1bdd04ae --- /dev/null +++ b/lib/yarp/version.rb @@ -0,0 +1,5 @@ +# frozen_string_literal: true + +module YARP + VERSION = "0.1.0" +end diff --git a/test/fixtures/lex.rb b/test/fixtures/lex.rb new file mode 100644 index 000000000..a1b0b38df --- /dev/null +++ b/test/fixtures/lex.rb @@ -0,0 +1,183 @@ +# This file isn't actually valid Ruby. It's used to exercise the lexer. + +& +&& +&&= +&= +! + +# If the lexer is in a state where it can accept a method name (either by +# defining a method or by calling a method), it will accept a !@. +def !@() end +foo.!@ + +!= +!~ +{} +[] +^ +^= +?a +@@abc +, +.. +... + +=begin +embdoc +content +=end + +"#{abc}" + += +== +=== +100 => 100 +=~ +> +>= +>> +>>= +$~ +$* +$$ +$? +$! +$@ +$/ +$\ +$; +$, +$. +$= +$: +$< +$> +$" +abc +1i +1ri +0 +0d100 +0d100_100 +0D100 +0D100_100 +0b100 +0b100_100 +0B100 +0B100_100 +0o100 +0o100_100 +0O100 +0O100_100 +0100 +0100_100 +0x100 +0x100_100 +0X100 +0X100_100 +@abc +__ENCODING__ +__LINE__ +__FILE__ +alias +and +begin +BEGIN +break +case +class +def +defined? +do +else +elsif +end +END +ensure +false +for +if +in +module +next +nil +not +or +redo +rescue +retry +return +self +super +then +true +undef +unless +until +when +while +yield +{ label: abc } +< +<= +<=> +<< +<<= +- + +# If the lexer is in a state where it can accept a method name (either by +# defining a method or by calling a method), it will accept a -@. 
+def -@() end +abc.-@ + +-= +() +100 % 100 +100 %= 100 +%i[abc def ghi] +%w[abc def ghi] +%I[abc def ghi] +%W[abc def ghi] +| +|= +100 || 100 +||= ++ ++= + +# If the lexer is in a state where it can accept a method name (either by +# defining a method or by calling a method), it will accept a +@. +def +@() end +abc.+@ + +? +1r +%r{abc} +; +100 / 100 +100 /= 100 +* +*= +** +**= + +# Lexing strings involves a whole state change. It ends up being at minimum +# three tokens: the beginning, the content, and then end. It gets more +# complicated if you have interpolation. +"abc" +%q[abc] +%Q[abc] + +:abc +:ABC + +~ + +# If the lexer is in a state where it can accept a method name (either by +# defining a method or by calling a method), it will accept a ~@. +def ~@() end +abc.~@ + +`abc` diff --git a/test/lex_test.rb b/test/lex_test.rb new file mode 100644 index 000000000..d149b1d1a --- /dev/null +++ b/test/lex_test.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +require "test_helper" + +class LexTest < Test::Unit::TestCase + test "lex ext/yarp/extconf.rb" do + assert_lex File.expand_path("../ext/yarp/extconf.rb", __dir__) + end + + test "lex test/fixtures/lex.rb" do + assert_lex File.expand_path("fixtures/lex.rb", __dir__) + end + + test "lex test/test_helper.rb" do + assert_lex File.expand_path("test_helper.rb", __dir__) + end + + test "lex test/yarp_test.rb" do + assert_lex __FILE__ + end + + test "lex yarp.gemspec" do + assert_lex File.expand_path("../yarp.gemspec", __dir__) + end + + private + + def assert_lex(filepath) + YARP.ripper_lex(filepath).zip(YARP.compat_lex(filepath)).each do |(ripper, yarp)| + assert_equal ripper[0...-1], yarp[0...-1] + end + end +end diff --git a/test/test_helper.rb b/test/test_helper.rb new file mode 100644 index 000000000..a178814fb --- /dev/null +++ b/test/test_helper.rb @@ -0,0 +1,6 @@ +# frozen_string_literal: true + +$LOAD_PATH.unshift File.expand_path("../lib", __dir__) +require "yarp" + +require "test-unit" diff --git 
a/vendor/spec b/vendor/spec new file mode 160000 index 000000000..b8a82400c --- /dev/null +++ b/vendor/spec @@ -0,0 +1 @@ +Subproject commit b8a82400c04d8badf1f455a4d36809592d2328d7 diff --git a/yarp.gemspec b/yarp.gemspec new file mode 100644 index 000000000..576a2c09f --- /dev/null +++ b/yarp.gemspec @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +require_relative "lib/yarp/version" + +Gem::Specification.new do |spec| + spec.name = "yarp" + spec.version = YARP::VERSION + spec.authors = ["Kevin Newton"] + spec.email = ["kddnewton@gmail.com"] + + spec.summary = "Yet Another Ruby Parser" + spec.homepage = "https://github.com/ruby-syntax-tree/yarp" + spec.license = "MIT" + spec.required_ruby_version = ">= 2.6.0" + + spec.files = Dir.chdir(File.expand_path(__dir__)) do + `git ls-files -z`.split("\x0").reject do |f| + (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)}) + end + end + + spec.bindir = "exe" + spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) } + spec.require_paths = ["lib"] + spec.extensions = ["ext/yarp/extconf.rb"] + + spec.add_development_dependency "bundler", "~> 2" + spec.add_development_dependency "minitest", "~> 5" + spec.add_development_dependency "rake", "~> 13" + spec.add_development_dependency "rake-compiler", "~> 1" + spec.add_development_dependency "test-unit", "~> 3" +end