From 5f385829e2a6aa681505af4b3d5d595c9338f1cf Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sat, 15 Nov 2025 08:07:16 -0500 Subject: [PATCH 1/2] refactor(span): rename source_len to normalized_source_len This is a preparation for introducing an unnormalized source length field --- compiler/rustc_interface/src/passes.rs | 2 +- compiler/rustc_metadata/src/rmeta/decoder.rs | 8 +++---- .../src/ich/impls_syntax.rs | 2 +- compiler/rustc_span/src/lib.rs | 23 +++++++++---------- compiler/rustc_span/src/source_map.rs | 12 +++++----- compiler/rustc_span/src/source_map/tests.rs | 4 ++-- src/tools/clippy/clippy_config/src/conf.rs | 2 +- 7 files changed, 26 insertions(+), 27 deletions(-) diff --git a/compiler/rustc_interface/src/passes.rs b/compiler/rustc_interface/src/passes.rs index e68db4f44ca48..5eb92ddb26059 100644 --- a/compiler/rustc_interface/src/passes.rs +++ b/compiler/rustc_interface/src/passes.rs @@ -596,7 +596,7 @@ fn write_out_deps(tcx: TyCtxt<'_>, outputs: &OutputFilenames, out_filenames: &[P .map(|fmap| { ( escape_dep_filename(&fmap.name.prefer_local().to_string()), - fmap.source_len.0 as u64, + fmap.normalized_source_len.0 as u64, fmap.checksum_hash, ) }) diff --git a/compiler/rustc_metadata/src/rmeta/decoder.rs b/compiler/rustc_metadata/src/rmeta/decoder.rs index 808d9fbbc2cef..87853ccd2b47a 100644 --- a/compiler/rustc_metadata/src/rmeta/decoder.rs +++ b/compiler/rustc_metadata/src/rmeta/decoder.rs @@ -1744,7 +1744,7 @@ impl<'a> CrateMetadataRef<'a> { src_hash, checksum_hash, start_pos: original_start_pos, - source_len, + normalized_source_len, lines, multibyte_chars, normalized_pos, @@ -1804,7 +1804,7 @@ impl<'a> CrateMetadataRef<'a> { src_hash, checksum_hash, stable_id, - source_len.to_u32(), + normalized_source_len.to_u32(), self.cnum, lines, multibyte_chars, @@ -1817,9 +1817,9 @@ impl<'a> CrateMetadataRef<'a> { translated (start_pos {:?} source_len {:?})", local_version.name, original_start_pos, - source_len, + normalized_source_len, 
local_version.start_pos, - local_version.source_len + local_version.normalized_source_len ); ImportedSourceFile { diff --git a/compiler/rustc_query_system/src/ich/impls_syntax.rs b/compiler/rustc_query_system/src/ich/impls_syntax.rs index 044b97c2fea19..977ec6eb7f38a 100644 --- a/compiler/rustc_query_system/src/ich/impls_syntax.rs +++ b/compiler/rustc_query_system/src/ich/impls_syntax.rs @@ -54,7 +54,7 @@ impl<'a> HashStable> for SourceFile { checksum_hash: _, external_src: _, start_pos: _, - source_len: _, + normalized_source_len: _, lines: _, ref multibyte_chars, ref normalized_pos, diff --git a/compiler/rustc_span/src/lib.rs b/compiler/rustc_span/src/lib.rs index afd4564f1b6fd..c400f93845277 100644 --- a/compiler/rustc_span/src/lib.rs +++ b/compiler/rustc_span/src/lib.rs @@ -1723,8 +1723,8 @@ pub struct SourceFile { pub external_src: FreezeLock, /// The start position of this source in the `SourceMap`. pub start_pos: BytePos, - /// The byte length of this source. - pub source_len: RelativeBytePos, + /// The byte length of this source after normalization. + pub normalized_source_len: RelativeBytePos, /// Locations of lines beginnings in the source code. pub lines: FreezeLock, /// Locations of multi-byte characters in the source code. @@ -1748,7 +1748,7 @@ impl Clone for SourceFile { checksum_hash: self.checksum_hash, external_src: self.external_src.clone(), start_pos: self.start_pos, - source_len: self.source_len, + normalized_source_len: self.normalized_source_len, lines: self.lines.clone(), multibyte_chars: self.multibyte_chars.clone(), normalized_pos: self.normalized_pos.clone(), @@ -1764,7 +1764,7 @@ impl Encodable for SourceFile { self.src_hash.encode(s); self.checksum_hash.encode(s); // Do not encode `start_pos` as it's global state for this session. - self.source_len.encode(s); + self.normalized_source_len.encode(s); // We are always in `Lines` form by the time we reach here. 
assert!(self.lines.read().is_lines()); @@ -1837,7 +1837,7 @@ impl Decodable for SourceFile { let name: FileName = Decodable::decode(d); let src_hash: SourceFileHash = Decodable::decode(d); let checksum_hash: Option = Decodable::decode(d); - let source_len: RelativeBytePos = Decodable::decode(d); + let normalized_source_len: RelativeBytePos = Decodable::decode(d); let lines = { let num_lines: u32 = Decodable::decode(d); if num_lines > 0 { @@ -1859,7 +1859,7 @@ impl Decodable for SourceFile { SourceFile { name, start_pos: BytePos::from_u32(0), - source_len, + normalized_source_len, src: None, src_hash, checksum_hash, @@ -1962,9 +1962,8 @@ impl SourceFile { let normalized_pos = normalize_src(&mut src); let stable_id = StableSourceFileId::from_filename_in_current_crate(&name); - let source_len = src.len(); - let source_len = u32::try_from(source_len).map_err(|_| OffsetOverflowError)?; - if source_len > Self::MAX_FILE_SIZE { + let normalized_source_len = u32::try_from(src.len()).map_err(|_| OffsetOverflowError)?; + if normalized_source_len > Self::MAX_FILE_SIZE { return Err(OffsetOverflowError); } @@ -1977,7 +1976,7 @@ impl SourceFile { checksum_hash, external_src: FreezeLock::frozen(ExternalSource::Unneeded), start_pos: BytePos::from_u32(0), - source_len: RelativeBytePos::from_u32(source_len), + normalized_source_len: RelativeBytePos::from_u32(normalized_source_len), lines: FreezeLock::frozen(SourceFileLines::Lines(lines)), multibyte_chars, normalized_pos, @@ -2161,7 +2160,7 @@ impl SourceFile { #[inline] pub fn end_position(&self) -> BytePos { - self.absolute_position(self.source_len) + self.absolute_position(self.normalized_source_len) } /// Finds the line containing the given position. 
The return value is the @@ -2197,7 +2196,7 @@ impl SourceFile { #[inline] pub fn is_empty(&self) -> bool { - self.source_len.to_u32() == 0 + self.normalized_source_len.to_u32() == 0 } /// Calculates the original byte position relative to the start of the file diff --git a/compiler/rustc_span/src/source_map.rs b/compiler/rustc_span/src/source_map.rs index 166842e374b66..0a99fb3aa34f8 100644 --- a/compiler/rustc_span/src/source_map.rs +++ b/compiler/rustc_span/src/source_map.rs @@ -262,7 +262,7 @@ impl SourceMap { bytes, Span::new( file.start_pos, - BytePos(file.start_pos.0 + file.source_len.0), + BytePos(file.start_pos.0 + file.normalized_source_len.0), SyntaxContext::root(), None, ), @@ -353,14 +353,14 @@ impl SourceMap { src_hash: SourceFileHash, checksum_hash: Option, stable_id: StableSourceFileId, - source_len: u32, + normalized_source_len: u32, cnum: CrateNum, file_local_lines: FreezeLock, multibyte_chars: Vec, normalized_pos: Vec, metadata_index: u32, ) -> Arc { - let source_len = RelativeBytePos::from_u32(source_len); + let normalized_source_len = RelativeBytePos::from_u32(normalized_source_len); let source_file = SourceFile { name: filename, @@ -372,7 +372,7 @@ impl SourceMap { metadata_index, }), start_pos: BytePos(0), - source_len, + normalized_source_len, lines: file_local_lines, multibyte_chars, normalized_pos, @@ -566,7 +566,7 @@ impl SourceMap { let start_index = local_begin.pos.to_usize(); let end_index = local_end.pos.to_usize(); - let source_len = local_begin.sf.source_len.to_usize(); + let source_len = local_begin.sf.normalized_source_len.to_usize(); if start_index > end_index || end_index > source_len { return Err(SpanSnippetError::MalformedForSourcemap(MalformedSourceMapPositions { @@ -997,7 +997,7 @@ impl SourceMap { return 1; } - let source_len = local_begin.sf.source_len.to_usize(); + let source_len = local_begin.sf.normalized_source_len.to_usize(); debug!("source_len=`{:?}`", source_len); // Ensure indexes are also not malformed. 
if start_index > end_index || end_index > source_len - 1 { diff --git a/compiler/rustc_span/src/source_map/tests.rs b/compiler/rustc_span/src/source_map/tests.rs index 589c2a3635481..bc802277e279b 100644 --- a/compiler/rustc_span/src/source_map/tests.rs +++ b/compiler/rustc_span/src/source_map/tests.rs @@ -230,7 +230,7 @@ fn t10() { name, src_hash, checksum_hash, - source_len, + normalized_source_len, lines, multibyte_chars, normalized_pos, @@ -243,7 +243,7 @@ fn t10() { src_hash, checksum_hash, stable_id, - source_len.to_u32(), + normalized_source_len.to_u32(), CrateNum::ZERO, FreezeLock::new(lines.read().clone()), multibyte_chars, diff --git a/src/tools/clippy/clippy_config/src/conf.rs b/src/tools/clippy/clippy_config/src/conf.rs index 2a042e6c3d853..8cdd99ac44a8e 100644 --- a/src/tools/clippy/clippy_config/src/conf.rs +++ b/src/tools/clippy/clippy_config/src/conf.rs @@ -108,7 +108,7 @@ struct ConfError { impl ConfError { fn from_toml(file: &SourceFile, error: &toml::de::Error) -> Self { - let span = error.span().unwrap_or(0..file.source_len.0 as usize); + let span = error.span().unwrap_or(0..file.normalized_source_len.0 as usize); Self::spanned(file, error.message(), None, span) } From cf57b9b292a47a3f44e9c5dc56d0e24e8b07a10f Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Fri, 14 Nov 2025 17:56:40 -0500 Subject: [PATCH 2/2] fix(span): track unnormalized source len for dep-info Add `unnormalized_source_len` field to track the byte length of source files before normalization (the original length). 
`unnormalized_source_len` is for writing the correct file length to dep-info for `-Zchecksum-hash-algorithm` --- compiler/rustc_interface/src/passes.rs | 4 +++- compiler/rustc_metadata/src/rmeta/decoder.rs | 2 ++ .../rustc_query_system/src/ich/impls_syntax.rs | 1 + compiler/rustc_span/src/lib.rs | 13 +++++++++++++ compiler/rustc_span/src/source_map.rs | 2 ++ compiler/rustc_span/src/source_map/tests.rs | 2 ++ compiler/rustc_span/src/tests.rs | 14 ++++++++++++++ 7 files changed, 37 insertions(+), 1 deletion(-) diff --git a/compiler/rustc_interface/src/passes.rs b/compiler/rustc_interface/src/passes.rs index 5eb92ddb26059..ddfec9f886a6a 100644 --- a/compiler/rustc_interface/src/passes.rs +++ b/compiler/rustc_interface/src/passes.rs @@ -596,7 +596,9 @@ fn write_out_deps(tcx: TyCtxt<'_>, outputs: &OutputFilenames, out_filenames: &[P .map(|fmap| { ( escape_dep_filename(&fmap.name.prefer_local().to_string()), - fmap.normalized_source_len.0 as u64, + // This needs to be unnormalized, + // as external tools wouldn't know how rustc normalizes them + fmap.unnormalized_source_len as u64, fmap.checksum_hash, ) }) diff --git a/compiler/rustc_metadata/src/rmeta/decoder.rs b/compiler/rustc_metadata/src/rmeta/decoder.rs index 87853ccd2b47a..6c796b3a9c8c5 100644 --- a/compiler/rustc_metadata/src/rmeta/decoder.rs +++ b/compiler/rustc_metadata/src/rmeta/decoder.rs @@ -1745,6 +1745,7 @@ impl<'a> CrateMetadataRef<'a> { checksum_hash, start_pos: original_start_pos, normalized_source_len, + unnormalized_source_len, lines, multibyte_chars, normalized_pos, @@ -1805,6 +1806,7 @@ impl<'a> CrateMetadataRef<'a> { checksum_hash, stable_id, normalized_source_len.to_u32(), + unnormalized_source_len, self.cnum, lines, multibyte_chars, diff --git a/compiler/rustc_query_system/src/ich/impls_syntax.rs b/compiler/rustc_query_system/src/ich/impls_syntax.rs index 977ec6eb7f38a..118229ffc9902 100644 --- a/compiler/rustc_query_system/src/ich/impls_syntax.rs +++ 
b/compiler/rustc_query_system/src/ich/impls_syntax.rs @@ -55,6 +55,7 @@ impl<'a> HashStable> for SourceFile { external_src: _, start_pos: _, normalized_source_len: _, + unnormalized_source_len: _, lines: _, ref multibyte_chars, ref normalized_pos, diff --git a/compiler/rustc_span/src/lib.rs b/compiler/rustc_span/src/lib.rs index c400f93845277..2e03ccb1aa1a3 100644 --- a/compiler/rustc_span/src/lib.rs +++ b/compiler/rustc_span/src/lib.rs @@ -1725,6 +1725,8 @@ pub struct SourceFile { pub start_pos: BytePos, /// The byte length of this source after normalization. pub normalized_source_len: RelativeBytePos, + /// The byte length of this source before normalization. + pub unnormalized_source_len: u32, /// Locations of lines beginnings in the source code. pub lines: FreezeLock, /// Locations of multi-byte characters in the source code. @@ -1749,6 +1751,7 @@ impl Clone for SourceFile { external_src: self.external_src.clone(), start_pos: self.start_pos, normalized_source_len: self.normalized_source_len, + unnormalized_source_len: self.unnormalized_source_len, lines: self.lines.clone(), multibyte_chars: self.multibyte_chars.clone(), normalized_pos: self.normalized_pos.clone(), @@ -1765,6 +1768,7 @@ impl Encodable for SourceFile { self.checksum_hash.encode(s); // Do not encode `start_pos` as it's global state for this session. self.normalized_source_len.encode(s); + self.unnormalized_source_len.encode(s); // We are always in `Lines` form by the time we reach here. 
assert!(self.lines.read().is_lines()); @@ -1838,6 +1842,7 @@ impl Decodable for SourceFile { let src_hash: SourceFileHash = Decodable::decode(d); let checksum_hash: Option = Decodable::decode(d); let normalized_source_len: RelativeBytePos = Decodable::decode(d); + let unnormalized_source_len = Decodable::decode(d); let lines = { let num_lines: u32 = Decodable::decode(d); if num_lines > 0 { @@ -1860,6 +1865,7 @@ impl Decodable for SourceFile { name, start_pos: BytePos::from_u32(0), normalized_source_len, + unnormalized_source_len, src: None, src_hash, checksum_hash, @@ -1959,6 +1965,12 @@ impl SourceFile { SourceFileHash::new_in_memory(checksum_hash_kind, src.as_bytes()) } }); + // Capture the original source length before normalization. + let unnormalized_source_len = u32::try_from(src.len()).map_err(|_| OffsetOverflowError)?; + if unnormalized_source_len > Self::MAX_FILE_SIZE { + return Err(OffsetOverflowError); + } + let normalized_pos = normalize_src(&mut src); let stable_id = StableSourceFileId::from_filename_in_current_crate(&name); @@ -1977,6 +1989,7 @@ impl SourceFile { external_src: FreezeLock::frozen(ExternalSource::Unneeded), start_pos: BytePos::from_u32(0), normalized_source_len: RelativeBytePos::from_u32(normalized_source_len), + unnormalized_source_len, lines: FreezeLock::frozen(SourceFileLines::Lines(lines)), multibyte_chars, normalized_pos, diff --git a/compiler/rustc_span/src/source_map.rs b/compiler/rustc_span/src/source_map.rs index 0a99fb3aa34f8..17de34c8436f1 100644 --- a/compiler/rustc_span/src/source_map.rs +++ b/compiler/rustc_span/src/source_map.rs @@ -354,6 +354,7 @@ impl SourceMap { checksum_hash: Option, stable_id: StableSourceFileId, normalized_source_len: u32, + unnormalized_source_len: u32, cnum: CrateNum, file_local_lines: FreezeLock, multibyte_chars: Vec, @@ -373,6 +374,7 @@ impl SourceMap { }), start_pos: BytePos(0), normalized_source_len, + unnormalized_source_len, lines: file_local_lines, multibyte_chars, normalized_pos, diff 
--git a/compiler/rustc_span/src/source_map/tests.rs b/compiler/rustc_span/src/source_map/tests.rs index bc802277e279b..c919aacf6b5f4 100644 --- a/compiler/rustc_span/src/source_map/tests.rs +++ b/compiler/rustc_span/src/source_map/tests.rs @@ -231,6 +231,7 @@ fn t10() { src_hash, checksum_hash, normalized_source_len, + unnormalized_source_len, lines, multibyte_chars, normalized_pos, @@ -244,6 +245,7 @@ fn t10() { checksum_hash, stable_id, normalized_source_len.to_u32(), + unnormalized_source_len, CrateNum::ZERO, FreezeLock::new(lines.read().clone()), multibyte_chars, diff --git a/compiler/rustc_span/src/tests.rs b/compiler/rustc_span/src/tests.rs index ed1db34463429..64c40e6116250 100644 --- a/compiler/rustc_span/src/tests.rs +++ b/compiler/rustc_span/src/tests.rs @@ -103,3 +103,17 @@ fn test_trim() { assert_eq!(span(well_before, before).trim_start(other), None); } + +#[test] +fn test_unnormalized_source_length() { + let source = "\u{feff}hello\r\nferries\r\n".to_owned(); + let sf = SourceFile::new( + FileName::Anon(Hash64::ZERO), + source, + SourceFileHashAlgorithm::Sha256, + Some(SourceFileHashAlgorithm::Sha256), + ) + .unwrap(); + assert_eq!(sf.unnormalized_source_len, 19); + assert_eq!(sf.normalized_source_len.0, 14); +}