Skip to content

Commit 8a60879

Browse files
kou and jinroq authored
[Feature #21943] Add StringScanner#integer_at (#205)
See also: https://bugs.ruby-lang.org/issues/21943

This is semantically equivalent to `scanner[specifier]&.to_i(base)` but this is faster than `scanner[specifier]&.to_i(base)` because `integer_at` doesn't create a temporary String when possible.

This PR also includes a benchmark for them:

```console
$ ruby -v -S benchmark-driver benchmark/integer_at.yaml
ruby 4.1.0dev (2026-05-01T19:25:51Z master f2845eab29) +PRISM [x86_64-linux]
Warming up --------------------------------------
[].to_i 24.272M i/s - 25.109M times in 1.034481s (41.20ns/i, 32clocks/i)
integer_at 61.188M i/s - 62.491M times in 1.021289s (16.34ns/i, 62clocks/i)
Calculating -------------------------------------
[].to_i 26.831M i/s - 72.816M times in 2.713883s (37.27ns/i, 169clocks/i)
integer_at 81.331M i/s - 183.564M times in 2.256998s (12.30ns/i, 43clocks/i)

Comparison:
integer_at: 81331225.5 i/s
[].to_i: 26831046.3 i/s - 3.03x slower
```

In this environment, `integer_at` is 3.03x faster than `[].to_i`.

Co-authored-by: jinroq <jinroq@gmail.com>
1 parent a78da3c commit 8a60879

7 files changed

Lines changed: 198 additions & 25 deletions

File tree

Rakefile

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,24 @@ task :test do
5858
ruby("run-test.rb", *ENV["TESTOPTS"]&.shellsplit)
5959
end
6060

61-
desc "Run benchmark"
62-
task :benchmark do
63-
ruby("-S",
64-
"benchmark-driver",
65-
"benchmark/scan.yaml")
61+
benchmark_tasks = []
62+
namespace :benchmark do
63+
Dir.glob("benchmark/*.yaml").sort.each do |yaml|
64+
name = File.basename(yaml, ".*")
65+
desc "Run #{name} benchmark"
66+
task name do
67+
puts("```console")
68+
print("$ ")
69+
sh(RbConfig.ruby, "-v", "-S", "benchmark-driver", yaml)
70+
puts("```")
71+
end
72+
benchmark_tasks << "benchmark:#{name}"
73+
end
6674
end
6775

76+
desc "Run all benchmarks"
77+
task :benchmark => benchmark_tasks
78+
6879
RDoc::Task.new
6980

7081
release_task = Rake.application["release"]

benchmark/integer_at.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
prelude: |
2+
$LOAD_PATH.unshift(File.expand_path("lib"))
3+
require "strscan"
4+
scanner = StringScanner.new("2026")
5+
scanner.scan(/(2026)/)
6+
benchmark:
7+
"[].to_i": |
8+
scanner[1].to_i
9+
integer_at: |
10+
scanner.integer_at(1)

ext/strscan/extconf.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
have_func("onig_region_memsize(NULL)")
66
have_func("rb_reg_onig_match", "ruby/re.h")
77
have_func("rb_deprecate_constant")
8+
have_func("rb_int_parse_cstr", "ruby.h") # RUBY_VERSION >= 2.5
89
have_func("rb_gc_location", "ruby.h") # RUBY_VERSION >= 2.7
910
have_const("RUBY_TYPED_EMBEDDABLE", "ruby.h") # RUBY_VERSION >= 3.3
1011
create_makefile 'strscan'

ext/strscan/strscan.c

Lines changed: 110 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1689,6 +1689,38 @@ name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name
16891689
rb_long2int(name_end - name), name);
16901690
}
16911691

1692+
/*
1693+
* Resolve capture group index from Integer, Symbol, or String.
1694+
* Returns the resolved register index, or -1 if unmatched/out of range.
1695+
* For Symbol/String specifiers, raises IndexError if the named group
1696+
* does not exist.
1697+
*/
1698+
static long
1699+
resolve_capture_index(struct strscanner *p, VALUE specifier)
1700+
{
1701+
const char *name;
1702+
long i;
1703+
if (! MATCHED_P(p)) return -1;
1704+
switch (TYPE(specifier)) {
1705+
case T_SYMBOL:
1706+
specifier = rb_sym2str(specifier);
1707+
/* fall through */
1708+
case T_STRING:
1709+
RSTRING_GETMEM(specifier, name, i);
1710+
i = name_to_backref_number(&(p->regs), p->regex, name, name + i,
1711+
rb_enc_get(specifier));
1712+
break;
1713+
default:
1714+
i = NUM2LONG(specifier);
1715+
}
1716+
if (i < 0)
1717+
i += p->regs.num_regs;
1718+
if (i < 0) return -1;
1719+
if (i >= p->regs.num_regs) return -1;
1720+
if (p->regs.beg[i] == -1) return -1;
1721+
return i;
1722+
}
1723+
16921724
/*
16931725
*
16941726
* :markup: markdown
@@ -1763,36 +1795,93 @@ name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name
17631795
static VALUE
17641796
strscan_aref(VALUE self, VALUE idx)
17651797
{
1766-
const char *name;
17671798
struct strscanner *p;
17681799
long i;
17691800

17701801
GET_SCANNER(self, p);
1771-
if (! MATCHED_P(p)) return Qnil;
1772-
1773-
switch (TYPE(idx)) {
1774-
case T_SYMBOL:
1775-
idx = rb_sym2str(idx);
1776-
/* fall through */
1777-
case T_STRING:
1778-
RSTRING_GETMEM(idx, name, i);
1779-
i = name_to_backref_number(&(p->regs), p->regex, name, name + i, rb_enc_get(idx));
1780-
break;
1781-
default:
1782-
i = NUM2LONG(idx);
1783-
}
1784-
1785-
if (i < 0)
1786-
i += p->regs.num_regs;
1787-
if (i < 0) return Qnil;
1788-
if (i >= p->regs.num_regs) return Qnil;
1789-
if (p->regs.beg[i] == -1) return Qnil;
1802+
i = resolve_capture_index(p, idx);
1803+
if (i < 0) return Qnil;
17901804

17911805
return extract_range(p,
17921806
adjust_register_position(p, p->regs.beg[i]),
17931807
adjust_register_position(p, p->regs.end[i]));
17941808
}
17951809

1810+
/*
1811+
* :markup: markdown
1812+
*
1813+
* call-seq:
1814+
* integer_at(specifier, base=10) -> integer or nil
1815+
*
1816+
* Returns the captured substring at the given `specifier` as an Integer,
1817+
* following the behavior of `String#to_i(base)`.
1818+
*
1819+
* `specifier` can be an Integer (positive, negative, or zero), a Symbol,
1820+
* or a String for named capture groups.
1821+
*
1822+
* Returns `nil` if:
1823+
* - No match has been performed or the last match failed
1824+
* - The `specifier` is an Integer and is out of range
1825+
* - The group at `specifier` did not participate in the match
1826+
*
1827+
* Raises IndexError if `specifier` is a Symbol or String that does not
1828+
* correspond to a named capture group, consistent with
1829+
* `StringScanner#[]`.
1830+
*
1831+
* This is semantically equivalent to `self[specifier]&.to_i(base)`
1832+
* but avoids the allocation of a temporary String when possible.
1833+
*
1834+
* ```rb
1835+
* scanner = StringScanner.new("2024-06-15")
1836+
* scanner.scan(/(\d{4})-(\d{2})-(\d{2})/)
1837+
* scanner.integer_at(1) # => 2024
1838+
* scanner.integer_at(1, 16) # => 8228
1839+
* ```
1840+
*/
1841+
static VALUE
1842+
strscan_integer_at(int argc, VALUE *argv, VALUE self)
1843+
{
1844+
struct strscanner *p;
1845+
long i;
1846+
long beg, end, len;
1847+
const char *ptr;
1848+
VALUE rb_specifier;
1849+
VALUE rb_base;
1850+
int base = 10;
1851+
1852+
GET_SCANNER(self, p);
1853+
rb_scan_args(argc, argv, "11", &rb_specifier, &rb_base);
1854+
if (argc > 1)
1855+
base = NUM2INT(rb_base);
1856+
i = resolve_capture_index(p, rb_specifier);
1857+
if (i < 0)
1858+
return Qnil;
1859+
1860+
beg = adjust_register_position(p, p->regs.beg[i]);
1861+
end = adjust_register_position(p, p->regs.end[i]);
1862+
len = end - beg;
1863+
ptr = S_PBEG(p) + beg;
1864+
#ifdef HAVE_RB_INT_PARSE_CSTR
1865+
{
1866+
/*
1867+
* Ruby 2.5 and later export the rb_int_parse_cstr() symbol but
1868+
* don't provide a prototype for it. Ruby 4.1 and later
1869+
* provide the prototype.
1870+
*/
1871+
# ifndef RB_INT_PARSE_DEFAULT
1872+
VALUE rb_int_parse_cstr(const char *str, ssize_t len, char **endp,
1873+
size_t *ndigits, int base, int flags);
1874+
# define RB_INT_PARSE_DEFAULT 0x07
1875+
# endif
1876+
char *endp;
1877+
return rb_int_parse_cstr(ptr, len, &endp, NULL, base,
1878+
RB_INT_PARSE_DEFAULT);
1879+
}
1880+
#else
1881+
return rb_str_to_inum(rb_str_new(ptr, len), base, 0);
1882+
#endif
1883+
}
1884+
17961885
/*
17971886
* :markup: markdown
17981887
* :include: strscan/link_refs.txt
@@ -2353,6 +2442,7 @@ Init_strscan(void)
23532442
rb_define_method(StringScanner, "matched", strscan_matched, 0);
23542443
rb_define_method(StringScanner, "matched_size", strscan_matched_size, 0);
23552444
rb_define_method(StringScanner, "[]", strscan_aref, 1);
2445+
rb_define_method(StringScanner, "integer_at", strscan_integer_at, -1);
23562446
rb_define_method(StringScanner, "pre_match", strscan_pre_match, 0);
23572447
rb_define_method(StringScanner, "post_match", strscan_post_match, 0);
23582448
rb_define_method(StringScanner, "size", strscan_size, 0);

lib/strscan/strscan.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
# frozen_string_literal: true
22

33
class StringScanner
4+
unless method_defined?(:integer_at) # For JRuby
5+
def integer_at(specifier, *to_i_args)
6+
self[specifier]&.to_i(*to_i_args)
7+
end
8+
end
9+
410
# :markup: markdown
511
#
612
# call-seq:

lib/strscan/truffleruby.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,8 @@ def [](group)
133133
end
134134
end
135135

136+
def integer_at(group, *to_i_args) = self[group]&.to_i(*to_i_args)
137+
136138
def values_at(*groups) = @last_match&.values_at(*groups)
137139

138140
def captures = @last_match&.captures

test/strscan/test_stringscanner.rb

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,59 @@ def test_AREF
525525
end
526526
end
527527

528+
def assert_integer_at(s, specifier, *to_i_args)
529+
assert_equal(s[specifier]&.to_i(*to_i_args),
530+
s.integer_at(specifier, *to_i_args))
531+
end
532+
533+
def test_integer_at
534+
s = create_string_scanner("before 20260514 after")
535+
s.skip_until(" ")
536+
assert_equal("20260514", s.scan(/(\d{4})(\d{2})(\d{2})/))
537+
assert_integer_at(s, 0) # 20260514
538+
assert_integer_at(s, 1) # 2026
539+
assert_integer_at(s, 2) # 5
540+
assert_integer_at(s, 3) # 14
541+
assert_integer_at(s, 4) # nil
542+
assert_integer_at(s, -1) # 14
543+
assert_integer_at(s, -2) # 5
544+
assert_integer_at(s, -3) # 2026
545+
assert_integer_at(s, -4) # 20260514
546+
assert_integer_at(s, -5) # nil
547+
end
548+
549+
def test_integer_at_name_string
550+
s = create_string_scanner("before 20260514 after")
551+
s.skip_until(" ")
552+
assert_equal("20260514", s.scan(/(?<y>\d{4})(?<m>\d{2})(?<d>\d{2})/))
553+
assert_integer_at(s, "y")
554+
assert_integer_at(s, "m")
555+
assert_integer_at(s, "d")
556+
end
557+
558+
def test_integer_at_name_symbol
559+
s = create_string_scanner("before 20260514 after")
560+
s.skip_until(" ")
561+
assert_equal("20260514", s.scan(/(?<y>\d{4})(?<m>\d{2})(?<d>\d{2})/))
562+
assert_integer_at(s, :y)
563+
assert_integer_at(s, :m)
564+
assert_integer_at(s, :d)
565+
end
566+
567+
def test_integer_at_base
568+
s = create_string_scanner("before 111 after")
569+
s.skip_until(" ")
570+
assert_equal("111", s.scan(/\d+/))
571+
assert_integer_at(s, 0, 2)
572+
end
573+
574+
def test_integer_at_base_auto
575+
s = create_string_scanner("before 0xa_f after")
576+
s.skip_until(" ")
577+
assert_equal("0xa_f", s.scan(/0x[\h_]+/))
578+
assert_integer_at(s, 0, 0) # 0xaf
579+
end
580+
528581
def test_pre_match
529582
s = create_string_scanner('a b c d e')
530583
s.scan(/\w/)

0 commit comments

Comments
 (0)