Skip to content

Commit 519a4bd

Browse files
committed
Optimize File.basename
The actual algorithm is largely unchanged, just allowed to use single-byte checks for common encodings. It could certainly be optimized much further, as here again it often scans from the front of the string when we're interested in the back of it. But the algorithm has many Windows-only corner cases, so I'd rather ship a good improvement now and come back to it later. Most of the improvement here is from the reduced setup cost (avoid double null checks, avoid duping the argument, etc.), and skipping the multi-byte checks.

```
compare-ruby: ruby 4.1.0dev (2026-01-19T03:51:30Z master 631bf19) +PRISM [arm64-darwin25]
built-ruby: ruby 4.1.0dev (2026-01-21T08:21:05Z opt-basename 7eb1174) +PRISM [arm64-darwin25]
```

| |compare-ruby|built-ruby|
|:----------|-----------:|---------:|
|long | 3.412M| 18.158M|
| | -| 5.32x|
|long_name | 1.981M| 8.580M|
| | -| 4.33x|
|withext | 3.200M| 12.986M|
| | -| 4.06x|
1 parent 0f1eea0 commit 519a4bd

File tree

4 files changed

+97
-31
lines changed

4 files changed

+97
-31
lines changed

benchmark/file_basename.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
prelude: |
2+
# frozen_string_literal: true
3+
benchmark:
4+
long: File.basename("/Users/george/src/github.com/ruby/ruby/benchmark/file_dirname.yml")
5+
long_name: File.basename("Users_george_src_github.com_ruby_ruby_benchmark_file_dirname.yml")
6+
withext: File.basename("/Users/george/src/github.com/ruby/ruby/benchmark/file_dirname.yml", ".yml")

file.c

Lines changed: 58 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3749,7 +3749,7 @@ strrdirsep(const char *path, const char *end, bool mb_enc, rb_encoding *enc)
37493749
}
37503750

37513751
static char *
3752-
chompdirsep(const char *path, const char *end, rb_encoding *enc)
3752+
chompdirsep(const char *path, const char *end, bool mb_enc, rb_encoding *enc)
37533753
{
37543754
while (path < end) {
37553755
if (isdirsep(*path)) {
@@ -3758,7 +3758,7 @@ chompdirsep(const char *path, const char *end, rb_encoding *enc)
37583758
if (path >= end) return (char *)last;
37593759
}
37603760
else {
3761-
Inc(path, end, true, enc);
3761+
Inc(path, end, mb_enc, enc);
37623762
}
37633763
}
37643764
return (char *)path;
@@ -3768,7 +3768,7 @@ char *
37683768
rb_enc_path_end(const char *path, const char *end, rb_encoding *enc)
37693769
{
37703770
if (path < end && isdirsep(*path)) path++;
3771-
return chompdirsep(path, end, enc);
3771+
return chompdirsep(path, end, true, enc);
37723772
}
37733773

37743774
static rb_encoding *
@@ -4088,7 +4088,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na
40884088
rb_enc_associate(result, enc = fs_enc_check(result, fname));
40894089
p = pend;
40904090
}
4091-
p = chompdirsep(skiproot(buf, p), p, enc);
4091+
p = chompdirsep(skiproot(buf, p), p, true, enc);
40924092
s += 2;
40934093
}
40944094
}
@@ -4113,7 +4113,7 @@ rb_file_expand_path_internal(VALUE fname, VALUE dname, int abs_mode, int long_na
41134113
}
41144114
else
41154115
#endif /* defined DOSISH || defined __CYGWIN__ */
4116-
p = chompdirsep(skiproot(buf, p), p, enc);
4116+
p = chompdirsep(skiproot(buf, p), p, true, enc);
41174117
}
41184118
else {
41194119
size_t len;
@@ -4656,7 +4656,7 @@ rb_check_realpath_emulate(VALUE basedir, VALUE path, rb_encoding *origenc, enum
46564656
root_found:
46574657
RSTRING_GETMEM(resolved, prefixptr, prefixlen);
46584658
pend = prefixptr + prefixlen;
4659-
ptr = chompdirsep(prefixptr, pend, enc);
4659+
ptr = chompdirsep(prefixptr, pend, true, enc);
46604660
if (ptr < pend) {
46614661
prefixlen = ++ptr - prefixptr;
46624662
rb_str_set_len(resolved, prefixlen);
@@ -4910,22 +4910,31 @@ rmext(const char *p, long l0, long l1, const char *e, long l2, rb_encoding *enc)
49104910
return 0;
49114911
}
49124912

4913-
const char *
4914-
ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encoding *enc)
4913+
static inline const char *
4914+
enc_find_basename(const char *name, long *baselen, long *alllen, bool mb_enc, rb_encoding *enc)
49154915
{
49164916
const char *p, *q, *e, *end;
49174917
#if defined DOSISH_DRIVE_LETTER || defined DOSISH_UNC
49184918
const char *root;
49194919
#endif
49204920
long f = 0, n = -1;
49214921

4922-
end = name + (alllen ? (size_t)*alllen : strlen(name));
4923-
name = skipprefix(name, end, true, enc);
4922+
long len = (alllen ? (size_t)*alllen : strlen(name));
4923+
4924+
if (len <= 0) {
4925+
return name;
4926+
}
4927+
4928+
end = name + len;
4929+
name = skipprefix(name, end, mb_enc, enc);
49244930
#if defined DOSISH_DRIVE_LETTER || defined DOSISH_UNC
49254931
root = name;
49264932
#endif
4927-
while (isdirsep(*name))
4933+
4934+
while (isdirsep(*name)) {
49284935
name++;
4936+
}
4937+
49294938
if (!*name) {
49304939
p = name - 1;
49314940
f = 1;
@@ -4947,32 +4956,47 @@ ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encodin
49474956
#endif /* defined DOSISH_DRIVE_LETTER || defined DOSISH_UNC */
49484957
}
49494958
else {
4950-
if (!(p = strrdirsep(name, end, true, enc))) {
4959+
p = strrdirsep(name, end, mb_enc, enc);
4960+
if (!p) {
49514961
p = name;
49524962
}
49534963
else {
4954-
while (isdirsep(*p)) p++; /* skip last / */
4964+
while (isdirsep(*p)) {
4965+
p++; /* skip last / */
4966+
}
49554967
}
49564968
#if USE_NTFS
49574969
n = ntfs_tail(p, end, enc) - p;
49584970
#else
4959-
n = chompdirsep(p, end, enc) - p;
4971+
n = chompdirsep(p, end, mb_enc, enc) - p;
49604972
#endif
49614973
for (q = p; q - p < n && *q == '.'; q++);
4962-
for (e = 0; q - p < n; Inc(q, end, true, enc)) {
4974+
for (e = 0; q - p < n; Inc(q, end, mb_enc, enc)) {
49634975
if (*q == '.') e = q;
49644976
}
4965-
if (e) f = e - p;
4966-
else f = n;
4977+
if (e) {
4978+
f = e - p;
4979+
}
4980+
else {
4981+
f = n;
4982+
}
49674983
}
49684984

4969-
if (baselen)
4985+
if (baselen) {
49704986
*baselen = f;
4971-
if (alllen)
4987+
}
4988+
if (alllen) {
49724989
*alllen = n;
4990+
}
49734991
return p;
49744992
}
49754993

4994+
const char *
4995+
ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encoding *enc)
4996+
{
4997+
return enc_find_basename(name, baselen, alllen, true, enc);
4998+
}
4999+
49765000
/*
49775001
* call-seq:
49785002
* File.basename(file_name [, suffix] ) -> base_name
@@ -4993,7 +5017,7 @@ ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encodin
49935017
static VALUE
49945018
rb_file_s_basename(int argc, VALUE *argv, VALUE _)
49955019
{
4996-
VALUE fname, fext, basename;
5020+
VALUE fname, fext;
49975021
const char *name, *p;
49985022
long f, n;
49995023
rb_encoding *enc;
@@ -5006,15 +5030,19 @@ rb_file_s_basename(int argc, VALUE *argv, VALUE _)
50065030
enc = rb_str_enc_get(fext);
50075031
}
50085032
fname = argv[0];
5009-
FilePathStringValue(fname);
5033+
CheckPath(fname, name);
50105034
if (NIL_P(fext) || !(enc = rb_enc_compatible(fname, fext))) {
5011-
enc = rb_enc_get(fname);
5035+
enc = rb_str_enc_get(fname);
50125036
fext = Qnil;
50135037
}
5014-
if ((n = RSTRING_LEN(fname)) == 0 || !*(name = RSTRING_PTR(fname)))
5015-
return rb_str_new_shared(fname);
50165038

5017-
p = ruby_enc_find_basename(name, &f, &n, enc);
5039+
n = RSTRING_LEN(fname);
5040+
if (n == 0 || !*name) {
5041+
rb_enc_str_new(0, 0, enc);
5042+
}
5043+
5044+
bool mb_enc = !rb_str_encindex_fastpath(rb_enc_to_index(enc));
5045+
p = enc_find_basename(name, &f, &n, mb_enc, enc);
50185046
if (n >= 0) {
50195047
if (NIL_P(fext)) {
50205048
f = n;
@@ -5027,12 +5055,12 @@ rb_file_s_basename(int argc, VALUE *argv, VALUE _)
50275055
}
50285056
RB_GC_GUARD(fext);
50295057
}
5030-
if (f == RSTRING_LEN(fname)) return rb_str_new_shared(fname);
5058+
if (f == RSTRING_LEN(fname)) {
5059+
return rb_str_new_shared(fname);
5060+
}
50315061
}
50325062

5033-
basename = rb_str_new(p, f);
5034-
rb_enc_copy(basename, fname);
5035-
return basename;
5063+
return rb_enc_str_new(p, f, enc);
50365064
}
50375065

50385066
static VALUE rb_file_dirname_n(VALUE fname, int n);
@@ -5350,7 +5378,7 @@ rb_file_join_ary(VALUE ary)
53505378
rb_enc_copy(result, tmp);
53515379
}
53525380
else {
5353-
tail = chompdirsep(name, name + len, rb_enc_get(result));
5381+
tail = chompdirsep(name, name + len, true, rb_enc_get(result));
53545382
if (RSTRING_PTR(tmp) && isdirsep(RSTRING_PTR(tmp)[0])) {
53555383
rb_str_set_len(result, tail - name);
53565384
}

internal/string.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,13 @@ enum ruby_rstring_private_flags {
3333
static inline bool
3434
rb_str_encindex_fastpath(int encindex)
3535
{
36-
// The overwhelming majority of strings are in one of these 3 encodings.
36+
// The overwhelming majority of strings are in one of these 3 encodings,
37+
// which are all either ASCII or perfect ASCII supersets.
38+
// Hence you can use fast, single byte algorithms on them, such as `memchr` etc,
39+
// without all the overhead of fetching the rb_encoding and using functions such as
40+
// rb_enc_mbminlen etc.
41+
// Many other encodings could qualify, but they are expected to be rare occurrences,
42+
// so it's better to keep that list small.
3743
switch (encindex) {
3844
case ENCINDEX_ASCII_8BIT:
3945
case ENCINDEX_UTF_8:

spec/ruby/core/file/basename_spec.rb

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,34 @@
151151
File.basename("c:\\bar.txt", ".*").should == "bar"
152152
File.basename("c:\\bar.txt.exe", ".*").should == "bar.txt"
153153
end
154+
155+
it "handles Shift JIS 0x5C (\\) as second byte of a multi-byte sequence" do
156+
# dir\fileソname.txt
157+
path = "dir\\file\x83\x5cname.txt".b.force_encoding(Encoding::SHIFT_JIS)
158+
path.valid_encoding?.should be_true
159+
File.basename(path).should == "file\x83\x5cname.txt".b.force_encoding(Encoding::SHIFT_JIS)
160+
end
154161
end
155162

163+
it "rejects strings encoded with non ASCII-compatible encodings" do
164+
Encoding.list.reject(&:ascii_compatible?).reject(&:dummy?).each do |enc|
165+
begin
166+
path = "/foo/bar".encode(enc)
167+
rescue Encoding::ConverterNotFoundError
168+
next
169+
end
170+
171+
-> {
172+
File.basename(path)
173+
}.should raise_error(Encoding::CompatibilityError)
174+
end
175+
end
176+
177+
it "works with all ASCII-compatible encodings" do
178+
Encoding.list.select(&:ascii_compatible?).each do |enc|
179+
File.basename("/foo/bar".encode(enc)).should == "bar".encode(enc)
180+
end
181+
end
156182

157183
it "returns the extension for a multibyte filename" do
158184
File.basename('/path/Офис.m4a').should == "Офис.m4a"

0 commit comments

Comments
 (0)