diff --git a/kernel/common/string.rb b/kernel/common/string.rb
index fb252edbe3..481bbf1363 100644
--- a/kernel/common/string.rb
+++ b/kernel/common/string.rb
@@ -1910,6 +1910,73 @@ def match(pattern, pos=0)
     result
   end
 
+  # Returns a copy of the String with every invalid byte sequence replaced,
+  # available since Ruby 2.1.
+  #
+  # replace - Optional String substituted for each invalid sequence. When it
+  #           is omitted and no block is given, U+FFFD encoded in the
+  #           receiver's encoding is used ("?" where it has no mapping).
+  #
+  # When a block is given it is called with the offending bytes and its
+  # result is inserted instead of +replace+.
+  #
+  # Raises TypeError if +replace+ is not a String and ArgumentError if it is
+  # not validly encoded.
+  def scrub(replace = nil)
+    output = ''
+    input = dup # primitive_convert consumes its source buffer
+
+    # The default replacement character is the "Unicode replacement" character
+    # (U+FFFD).
+    if !replace && !block_given?
+      replace = "\xEF\xBF\xBD".force_encoding("UTF-8")
+        .encode(self.encoding, :undef => :replace, :replace => '?')
+    end
+
+    if replace
+      unless replace.is_a?(String)
+        raise(
+          TypeError,
+          "no implicit conversion of #{replace.class} into String"
+        )
+      end
+
+      unless replace.valid_encoding?
+        raise(
+          ArgumentError,
+          "replacement must be a valid byte sequence '#{replace.inspect}'"
+        )
+      end
+
+      # Work on a copy: force_encoding mutates its receiver, and the caller's
+      # String must keep its encoding (it may also be frozen).
+      replace = replace.dup.force_encoding(Encoding::BINARY)
+    end
+
+    converter = Encoding::Converter.new(input.encoding, Encoding::BINARY)
+
+    while input.length > 0
+      # The destination offset is counted in bytes, not characters.
+      result = converter.primitive_convert(input, output, output.bytesize)
+
+      if result == :finished
+        break
+      elsif result == :undefined_conversion
+        # Valid character without a BINARY mapping: keep its bytes verbatim.
+        output << converter.primitive_errinfo[3]
+      elsif block_given?
+        # Append the block's result as raw bytes so that a non-ASCII String
+        # cannot clash with, or re-tag, the BINARY output buffer.
+        substitute = yield(converter.primitive_errinfo[3])
+        output << substitute.dup.force_encoding(Encoding::BINARY)
+      else
+        output << replace
+      end
+    end
+
+    output.force_encoding(encoding)
+  end
+
   def []=(index, count_or_replacement, replacement=undefined)
     if undefined.equal?(replacement)
       replacement = count_or_replacement
diff --git a/spec/ruby/core/string/scrub_spec.rb b/spec/ruby/core/string/scrub_spec.rb
new file mode 100644
index 0000000000..00901f043e
--- /dev/null
+++ b/spec/ruby/core/string/scrub_spec.rb
@@ -0,0 +1,67 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path("../../../spec_helper", __FILE__)
+
+ruby_version_is "2.1" do
+  describe "String#scrub with a default replacement" do
+    it "returns self for valid strings" do
+      input = "foo"
+
+      input.scrub.should == input
+    end
+
+    it "replaces invalid byte sequences" do
+      "abc\u3042\x81".scrub.should == "abc\u3042\uFFFD"
+    end
+  end
+
+  describe "String#scrub with a custom replacement" do
+    it "returns self for valid strings" do
+      input = "foo"
+
+      input.scrub("*").should == input
+    end
+
+    it "replaces invalid byte sequences" do
+      "abc\u3042\x81".scrub("*").should == "abc\u3042*"
+    end
+
+    it "replaces groups of sequences together with a single replacement" do
+      "\xE3\x80".scrub("*").should == "*"
+    end
+
+    it "raises ArgumentError for replacements with an invalid encoding" do
+      block = lambda { "foo".scrub("\xE4") }
+
+      block.should raise_error(ArgumentError)
+    end
+
+    it "raises TypeError when a non String replacement is given" do
+      block = lambda { "foo".scrub(1) }
+
+      block.should raise_error(TypeError)
+    end
+
+    it "does not change the encoding of the given replacement" do
+      replacement = "*"
+      original_encoding = replacement.encoding
+
+      "abc\x81".scrub(replacement)
+
+      replacement.encoding.should == original_encoding
+    end
+  end
+
+  describe "String#scrub with a block" do
+    it "returns self for valid strings" do
+      input = "foo"
+
+      input.scrub { |b| "*" }.should == input
+    end
+
+    it "replaces invalid byte sequences" do
+      replaced = "abc\u3042\xE3\x80".scrub { |b| "<#{b.unpack("H*")[0]}>" }
+
+      replaced.should == "abc\u3042<e380>"
+    end
+  end
+end