1818use mem:: transmute;
1919use option:: { None , Option , Some } ;
2020use iter:: range_step;
21+ use collections:: Collection ;
2122
2223// UTF-8 ranges and tags for encoding characters
2324static TAG_CONT : u8 = 0b1000_0000u8 ;
@@ -27,7 +28,6 @@ static TAG_FOUR_B: u8 = 0b1111_0000u8;
2728static MAX_ONE_B : u32 = 0x80u32 ;
2829static MAX_TWO_B : u32 = 0x800u32 ;
2930static MAX_THREE_B : u32 = 0x10000u32 ;
30- static MAX_FOUR_B : u32 = 0x200000u32 ;
3131
3232/*
3333 Lu Uppercase_Letter an uppercase letter
@@ -217,14 +217,14 @@ pub fn escape_default(c: char, f: |char|) {
217217}
218218
219219/// Returns the amount of bytes this `char` would need if encoded in UTF-8
220+ #[ inline]
220221pub fn len_utf8_bytes ( c : char ) -> uint {
221222 let code = c as u32 ;
222223 match ( ) {
223224 _ if code < MAX_ONE_B => 1 u,
224225 _ if code < MAX_TWO_B => 2 u,
225226 _ if code < MAX_THREE_B => 3 u,
226- _ if code < MAX_FOUR_B => 4 u,
227- _ => fail ! ( "invalid character!" ) ,
227+ _ => 4 u,
228228 }
229229}
230230
@@ -297,21 +297,19 @@ pub trait Char {
297297 /// UTF-8.
298298 fn len_utf8_bytes ( & self ) -> uint ;
299299
/// Encodes this character as UTF-8 into the provided byte buffer,
/// and then returns the number of bytes written.
///
/// If the buffer is not large enough, nothing is written into it
/// and `None` is returned.
fn encode_utf8(&self, dst: &mut [u8]) -> Option<uint>;
307306
/// Encodes this character as UTF-16 into the provided `u16` buffer,
/// and then returns the number of `u16`s written.
///
/// If the buffer is not large enough, nothing is written into it
/// and `None` is returned.
fn encode_utf16(&self, dst: &mut [u16]) -> Option<uint>;
315313}
316314
317315impl Char for char {
@@ -325,45 +323,52 @@ impl Char for char {
325323
326324 fn escape_default ( & self , f: |char|) { escape_default ( * self , f) }
327325
326+ #[ inline]
328327 fn len_utf8_bytes ( & self ) -> uint { len_utf8_bytes ( * self ) }
329328
330- fn encode_utf8 < ' a > ( & self , dst : & ' a mut [ u8 ] ) -> uint {
329+ #[ inline]
330+ fn encode_utf8 < ' a > ( & self , dst : & ' a mut [ u8 ] ) -> Option < uint > {
331+ // Marked #[inline] to allow llvm optimizing it away
331332 let code = * self as u32 ;
332- if code < MAX_ONE_B {
333+ if code < MAX_ONE_B && dst . len ( ) >= 1 {
333334 dst[ 0 ] = code as u8 ;
334- 1
335- } else if code < MAX_TWO_B {
335+ Some ( 1 )
336+ } else if code < MAX_TWO_B && dst . len ( ) >= 2 {
336337 dst[ 0 ] = ( code >> 6 u & 0x1F_u32 ) as u8 | TAG_TWO_B ;
337338 dst[ 1 ] = ( code & 0x3F_u32 ) as u8 | TAG_CONT ;
338- 2
339- } else if code < MAX_THREE_B {
339+ Some ( 2 )
340+ } else if code < MAX_THREE_B && dst . len ( ) >= 3 {
340341 dst[ 0 ] = ( code >> 12 u & 0x0F_u32 ) as u8 | TAG_THREE_B ;
341342 dst[ 1 ] = ( code >> 6 u & 0x3F_u32 ) as u8 | TAG_CONT ;
342343 dst[ 2 ] = ( code & 0x3F_u32 ) as u8 | TAG_CONT ;
343- 3
344- } else {
344+ Some ( 3 )
345+ } else if dst . len ( ) >= 4 {
345346 dst[ 0 ] = ( code >> 18 u & 0x07_u32 ) as u8 | TAG_FOUR_B ;
346347 dst[ 1 ] = ( code >> 12 u & 0x3F_u32 ) as u8 | TAG_CONT ;
347348 dst[ 2 ] = ( code >> 6 u & 0x3F_u32 ) as u8 | TAG_CONT ;
348349 dst[ 3 ] = ( code & 0x3F_u32 ) as u8 | TAG_CONT ;
349- 4
350+ Some ( 4 )
351+ } else {
352+ None
350353 }
351354 }
352355
353- fn encode_utf16 ( & self , dst : & mut [ u16 ] ) -> uint {
356+ #[ inline]
357+ fn encode_utf16 ( & self , dst : & mut [ u16 ] ) -> Option < uint > {
358+ // Marked #[inline] to allow llvm optimizing it away
354359 let mut ch = * self as u32 ;
355- if ( ch & 0xFFFF_u32 ) == ch {
360+ if ( ch & 0xFFFF_u32 ) == ch && dst . len ( ) >= 1 {
356361 // The BMP falls through (assuming non-surrogate, as it should)
357- assert ! ( ch <= 0xD7FF_u32 || ch >= 0xE000_u32 ) ;
358362 dst[ 0 ] = ch as u16 ;
359- 1
360- } else {
363+ Some ( 1 )
364+ } else if dst . len ( ) >= 2 {
361365 // Supplementary planes break into surrogates.
362- assert ! ( ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32 ) ;
363366 ch -= 0x1_0000_u32 ;
364367 dst[ 0 ] = 0xD800_u16 | ( ( ch >> 10 ) as u16 ) ;
365368 dst[ 1 ] = 0xDC00_u16 | ( ( ch as u16 ) & 0x3FF_u16 ) ;
366- 2
369+ Some ( 2 )
370+ } else {
371+ None
367372 }
368373 }
369374}
0 commit comments