diff --git a/crates/crates_io_cdn_logs/src/cloudfront.rs b/crates/crates_io_cdn_logs/src/cloudfront.rs index dfe1f454f00..2723dee1c8a 100644 --- a/crates/crates_io_cdn_logs/src/cloudfront.rs +++ b/crates/crates_io_cdn_logs/src/cloudfront.rs @@ -5,6 +5,7 @@ use crate::DownloadsMap; use crate::paths::parse_path; +use crate::user_agent::should_count_user_agent; use chrono::NaiveDate; use std::borrow::Cow; use tokio::io::{AsyncBufRead, AsyncBufReadExt}; @@ -18,6 +19,7 @@ const FIELD_DATE: &str = "date"; const FIELD_METHOD: &str = "cs-method"; const FIELD_PATH: &str = "cs-uri-stem"; const FIELD_STATUS: &str = "sc-status"; +const FIELD_USER_AGENT: &str = "cs(User-Agent)"; #[instrument(level = "debug", skip(reader))] pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Result { @@ -26,6 +28,7 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul let mut method_index = None; let mut path_index = None; let mut status_index = None; + let mut user_agent_index = None; let mut downloads = DownloadsMap::new(); @@ -47,6 +50,7 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul method_index = fields.iter().position(|f| f == &FIELD_METHOD); path_index = fields.iter().position(|f| f == &FIELD_PATH); status_index = fields.iter().position(|f| f == &FIELD_STATUS); + user_agent_index = fields.iter().position(|f| f == &FIELD_USER_AGENT); continue; } @@ -76,6 +80,12 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul continue; } + let user_agent = get_optional_value(&values, user_agent_index); + if user_agent.is_some_and(|ua| !should_count_user_agent(ua)) { + // Ignore requests from user agents that should not be counted. + continue; + } + let path = get_value(&values, path_index, FIELD_PATH); // Deal with paths like `/crates/tikv-jemalloc-sys/tikv-jemalloc-sys-0.5.4%252B5.3.0-patched.crate`. @@ -120,6 +130,10 @@ fn get_value<'a>(values: &'a [&'a str], index: Option, field_name: &'stat }) } +fn get_optional_value<'a>(values: &'a [&'a str], index: Option) -> Option<&'a str> { + index.and_then(|i| values.get(i)).copied() +} + #[cfg(test)] mod tests { use super::*; @@ -149,7 +163,6 @@ mod tests { 2024-01-17 flatbuffers@23.1.21 .. 1 2024-01-17 jemallocator@0.5.4 .. 1 2024-01-17 leveldb-sys@2.0.9 .. 1 - 2024-01-17 num_cpus@1.15.0 .. 1 2024-01-17 paste@1.0.12 .. 1 2024-01-17 quick-error@1.2.3 .. 1 2024-01-17 rand@0.8.5 .. 1 diff --git a/crates/crates_io_cdn_logs/src/fastly/json.rs b/crates/crates_io_cdn_logs/src/fastly/json.rs index 1c500312aba..8a7d5165a58 100644 --- a/crates/crates_io_cdn_logs/src/fastly/json.rs +++ b/crates/crates_io_cdn_logs/src/fastly/json.rs @@ -37,6 +37,15 @@ impl LogLine<'_> { LogLine::V1(line) => line.status, } } + + pub fn user_agent(&self) -> Option<&str> { + match self { + LogLine::V1(line) => line + .http + .as_ref() + .and_then(|http| http.useragent.as_deref()), + } + } } /// This struct corresponds to the `"version": "1"` variant of the [LogLine] enum. @@ -60,6 +69,14 @@ pub struct LogLineV1<'a> { #[serde(borrow)] pub url: Cow<'a, str>, pub status: u16, + #[serde(borrow)] + pub http: Option>, +} + +#[derive(Debug, Deserialize)] +pub struct Http<'a> { + #[serde(borrow)] + pub useragent: Option>, } #[cfg(test)] @@ -79,6 +96,7 @@ mod tests { method: "GET", url: "https://static.staging.crates.io/?1705420437", status: 403, + http: None, }, ) "#); @@ -90,6 +108,7 @@ mod tests { assert_eq!(output.method(), "GET"); assert_eq!(output.url(), "https://static.staging.crates.io/?1705420437"); assert_eq!(output.status(), 403); + assert_eq!(output.user_agent(), None); match output { LogLine::V1(l) => { @@ -99,6 +118,44 @@ mod tests { } } + #[test] + fn test_parse_with_user_agent() { + let input = r#"{"bytes":36308,"content_type":"application/gzip","date_time":"2025-10-26T23:57:34.867635728Z","http":{"protocol":"HTTP/2","referer":null,"useragent":"cargo/1.92.0-nightly (344c4567c 2025-10-21)"},"ip":"192.0.2.1","method":"GET","status":200,"url":"https://static.crates.io/crates/scale-info/2.11.3/download","version":"1"}"#; + let output = assert_ok!(serde_json::from_str::>(input)); + assert_debug_snapshot!(output, @r#" + V1( + LogLineV1 { + date_time: 2025-10-26T23:57:34.867635728Z, + method: "GET", + url: "https://static.crates.io/crates/scale-info/2.11.3/download", + status: 200, + http: Some( + Http { + useragent: Some( + "cargo/1.92.0-nightly (344c4567c 2025-10-21)", + ), + }, + ), + }, + ) + "#); + + assert_eq!( + output.date_time().to_string(), + "2025-10-26 23:57:34.867635728 UTC" + ); + assert_eq!(output.method(), "GET"); + assert_eq!( + output.url(), + "https://static.crates.io/crates/scale-info/2.11.3/download" + ); + assert_eq!(output.status(), 200); + assert_eq!( + output.user_agent(), + Some("cargo/1.92.0-nightly (344c4567c 2025-10-21)") + ); + } + #[allow(clippy::ptr_arg)] fn is_borrowed(s: &Cow<'_, str>) -> bool { match s { diff --git a/crates/crates_io_cdn_logs/src/fastly/mod.rs b/crates/crates_io_cdn_logs/src/fastly/mod.rs index ee7df780910..b0b75cc48c7 100644 --- a/crates/crates_io_cdn_logs/src/fastly/mod.rs +++ b/crates/crates_io_cdn_logs/src/fastly/mod.rs @@ -6,6 +6,7 @@ mod json; use crate::DownloadsMap; use crate::paths::parse_path; +use crate::user_agent::should_count_user_agent; use std::borrow::Cow; use tokio::io::{AsyncBufRead, AsyncBufReadExt}; use tracing::{debug_span, instrument, warn}; @@ -42,6 +43,14 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul continue; } + if json + .user_agent() + .is_some_and(|ua| !should_count_user_agent(ua)) + { + // Ignore requests from user agents that should not be counted. + continue; + } + let url = decode_url(json.url()); // We're avoiding parsing to `url::Url` here for performance reasons. @@ -191,15 +200,12 @@ mod tests { 2025-10-26 ipnet@2.11.0 .. 1 2025-10-26 libc@0.2.177 .. 1 2025-10-26 lru-slab@0.1.2 .. 1 - 2025-10-26 matrixmultiply@0.3.3 .. 1 2025-10-26 owo-colors@4.2.3 .. 1 2025-10-26 parking_lot@0.12.5 .. 1 2025-10-26 precis-profiles@0.1.11 .. 1 2025-10-26 precis-tools@0.1.8 .. 1 - 2025-10-26 rand@0.8.5 .. 1 2025-10-26 scale-info@2.11.3 .. 1 2025-10-26 tinyvec_macros@0.1.1 .. 1 - 2025-10-26 tower@0.5.2 .. 1 2025-10-26 unicode-normalization@0.1.22 .. 1 } "); diff --git a/crates/crates_io_cdn_logs/src/lib.rs b/crates/crates_io_cdn_logs/src/lib.rs index a3a766a609e..9af448a6dc1 100644 --- a/crates/crates_io_cdn_logs/src/lib.rs +++ b/crates/crates_io_cdn_logs/src/lib.rs @@ -7,6 +7,7 @@ pub mod fastly; mod paths; #[cfg(test)] mod test_utils; +pub mod user_agent; pub use crate::compression::Decompressor; pub use crate::download_map::DownloadsMap; @@ -74,7 +75,6 @@ mod tests { 2024-01-17 flatbuffers@23.1.21 .. 1 2024-01-17 jemallocator@0.5.4 .. 1 2024-01-17 leveldb-sys@2.0.9 .. 1 - 2024-01-17 num_cpus@1.15.0 .. 1 2024-01-17 paste@1.0.12 .. 1 2024-01-17 quick-error@1.2.3 .. 1 2024-01-17 rand@0.8.5 .. 1 diff --git a/crates/crates_io_cdn_logs/src/user_agent.rs b/crates/crates_io_cdn_logs/src/user_agent.rs new file mode 100644 index 00000000000..3deface9d0c --- /dev/null +++ b/crates/crates_io_cdn_logs/src/user_agent.rs @@ -0,0 +1,55 @@ +/// Determines if downloads from the given user agent should be counted. +/// +/// Returns `true` if the download should be counted, `false` otherwise. +pub fn should_count_user_agent(user_agent: &str) -> bool { + let Some(suffix) = user_agent.strip_prefix("cargo") else { + return false; + }; + + suffix.starts_with('/') + || suffix.starts_with(' ') + || suffix.starts_with("%2f") + || suffix.starts_with("%2F") + || suffix.starts_with("%20") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_should_count_user_agent() { + // Standard cargo user agents with forward slash + assert!(should_count_user_agent( + "cargo/1.92.0-nightly (344c4567c 2025-10-21)" + )); + assert!(should_count_user_agent( + "cargo/1.88.0 (873a06493 2025-05-10)" + )); + assert!(should_count_user_agent( + "cargo/1.90.0 (840b83a10 2025-07-30)" + )); + assert!(should_count_user_agent("cargo/")); + + // CloudFront: Percent-encoded forward slash (lowercase and uppercase) + assert!(should_count_user_agent("cargo%2f1.74.0")); + assert!(should_count_user_agent("cargo%2F1.74.0")); + + // Space character (legacy Cargo versions) + assert!(should_count_user_agent("cargo 1.74.0")); + + // CloudFront: Percent-encoded space (legacy Cargo versions) + assert!(should_count_user_agent( + "cargo%201.74.0%20(ecb9851af%202023-10-18)" + )); + assert!(should_count_user_agent("cargo%20")); + + // Non-cargo user agents + assert!(!should_count_user_agent("Mozilla/5.0")); + assert!(!should_count_user_agent("curl/7.64.1")); + assert!(!should_count_user_agent("")); + assert!(!should_count_user_agent("Cargo/1.0.0")); + assert!(!should_count_user_agent("cargo")); + assert!(!should_count_user_agent("cargo-")); + } +} diff --git a/crates/crates_io_cdn_logs/test_data/cloudfront/basic.log b/crates/crates_io_cdn_logs/test_data/cloudfront/basic.log index 9a961853c05..4027208d74b 100644 --- a/crates/crates_io_cdn_logs/test_data/cloudfront/basic.log +++ b/crates/crates_io_cdn_logs/test_data/cloudfront/basic.log @@ -12,7 +12,7 @@ 2024-01-16 23:57:42 CMH68-P2 15495 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/quick-error/quick-error-1.2.3.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit iDfI0scfFBqKcmnFlcQ32TJ-2QQRcZXOF2pT1fqPJ0SwYt8gvsTH3w== static.crates.io https 48 0.024 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.024 Hit application/x-tar 15066 - - 2024-01-17 00:41:12 HIO50-C2 24428 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/flatbuffers/flatbuffers-23.1.21.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit HFwag0aWj8eJgyO0eaTqQA_q0Jo091a2TPUYFl6MrBBaN3aFYMr5ug== static.crates.io https 49 0.043 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.043 Hit application/gzip 23944 - - 2024-01-17 00:41:12 HIO50-C2 13564 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/jemallocator/jemallocator-0.5.4.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit W_8eJKaKdIXNRhhlAVaZmjxcej-ibyT5XRWqPfi8Udjxv8KN-aA6ig== static.crates.io https 48 0.044 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.044 Hit application/gzip 13088 - - -2024-01-17 00:41:12 HIO50-C2 16153 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/num_cpus/num_cpus-1.15.0.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit MCnygi8IdF0r4cdgPb3rvlxubS2tmibodsAXEB3uD0KUghu-1h9k7A== static.crates.io https 44 0.045 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.045 Hit application/gzip 15680 - - +2024-01-17 00:41:12 HIO50-C2 16153 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/num_cpus/num_cpus-1.15.0.crate 200 - Bazel%2Frelease%207.6.2 - - Hit MCnygi8IdF0r4cdgPb3rvlxubS2tmibodsAXEB3uD0KUghu-1h9k7A== static.crates.io https 44 0.045 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.045 Hit application/gzip 15680 - - 2024-01-17 00:41:12 HIO50-C2 18642 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/paste/paste-1.0.12.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit J83xY2di3uOqjGBy49qk3M0nE0vvxNTIIsVkIvlEt5ZD7Cv54bDPug== static.crates.io https 39 0.056 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 18156 - - 2024-01-17 00:41:12 HIO50-C2 87644 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/rand/rand-0.8.5.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit ceSKUmY8GOILmHQ8W1UcW_pGErGTv1DH_aHDXIWNKe9G2IJslpzFYg== static.crates.io https 38 0.057 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/x-tar 87113 - - 2024-01-17 00:41:12 HIO50-C2 54969 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/serde_derive/serde_derive-1.0.163.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit 0H_zJOQvRFEEy-yrtgM7FusA3kEaxhOfMvWishmefb4pjt3EuNfn2w== static.crates.io https 50 0.057 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 54447 - -