From 8bce5b0e1609152c1440038bb84085ca813bbda9 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Mon, 27 Oct 2025 17:26:52 +0100 Subject: [PATCH 1/3] cdn_logs: Add user agent filtering module Add `user_agent` module with `should_count_user_agent()` fn to determine if downloads should be counted based on user agent. Currently filters to only count cargo client downloads. --- crates/crates_io_cdn_logs/src/lib.rs | 1 + crates/crates_io_cdn_logs/src/user_agent.rs | 55 +++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 crates/crates_io_cdn_logs/src/user_agent.rs diff --git a/crates/crates_io_cdn_logs/src/lib.rs b/crates/crates_io_cdn_logs/src/lib.rs index a3a766a609e..362b37d01ed 100644 --- a/crates/crates_io_cdn_logs/src/lib.rs +++ b/crates/crates_io_cdn_logs/src/lib.rs @@ -7,6 +7,7 @@ pub mod fastly; mod paths; #[cfg(test)] mod test_utils; +pub mod user_agent; pub use crate::compression::Decompressor; pub use crate::download_map::DownloadsMap; diff --git a/crates/crates_io_cdn_logs/src/user_agent.rs b/crates/crates_io_cdn_logs/src/user_agent.rs new file mode 100644 index 00000000000..3deface9d0c --- /dev/null +++ b/crates/crates_io_cdn_logs/src/user_agent.rs @@ -0,0 +1,55 @@ +/// Determines if downloads from the given user agent should be counted. +/// +/// Returns `true` if the download should be counted, `false` otherwise. 
+pub fn should_count_user_agent(user_agent: &str) -> bool { + let Some(suffix) = user_agent.strip_prefix("cargo") else { + return false; + }; + + suffix.starts_with('/') + || suffix.starts_with(' ') + || suffix.starts_with("%2f") + || suffix.starts_with("%2F") + || suffix.starts_with("%20") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_should_count_user_agent() { + // Standard cargo user agents with forward slash + assert!(should_count_user_agent( + "cargo/1.92.0-nightly (344c4567c 2025-10-21)" + )); + assert!(should_count_user_agent( + "cargo/1.88.0 (873a06493 2025-05-10)" + )); + assert!(should_count_user_agent( + "cargo/1.90.0 (840b83a10 2025-07-30)" + )); + assert!(should_count_user_agent("cargo/")); + + // CloudFront: Percent-encoded forward slash (lowercase and uppercase) + assert!(should_count_user_agent("cargo%2f1.74.0")); + assert!(should_count_user_agent("cargo%2F1.74.0")); + + // Space character (legacy Cargo versions) + assert!(should_count_user_agent("cargo 1.74.0")); + + // CloudFront: Percent-encoded space (legacy Cargo versions) + assert!(should_count_user_agent( + "cargo%201.74.0%20(ecb9851af%202023-10-18)" + )); + assert!(should_count_user_agent("cargo%20")); + + // Non-cargo user agents + assert!(!should_count_user_agent("Mozilla/5.0")); + assert!(!should_count_user_agent("curl/7.64.1")); + assert!(!should_count_user_agent("")); + assert!(!should_count_user_agent("Cargo/1.0.0")); + assert!(!should_count_user_agent("cargo")); + assert!(!should_count_user_agent("cargo-")); + } +} From a6b974f0cf4ff1b50ae8621bf811cd4d2084f0fa Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Mon, 27 Oct 2025 17:20:53 +0100 Subject: [PATCH 2/3] cdn_logs: Filter Fastly downloads by user agent Apply user agent filtering to Fastly log parsing. Downloads are counted if they have no user agent (for backwards compatibility with older logs) or if the user agent passes the `should_count_user_agent()` check. 
Currently filters out non-cargo user agents like Bazel. --- crates/crates_io_cdn_logs/src/fastly/json.rs | 57 ++++++++++++++++++++ crates/crates_io_cdn_logs/src/fastly/mod.rs | 12 +++-- 2 files changed, 66 insertions(+), 3 deletions(-) diff --git a/crates/crates_io_cdn_logs/src/fastly/json.rs b/crates/crates_io_cdn_logs/src/fastly/json.rs index 1c500312aba..8a7d5165a58 100644 --- a/crates/crates_io_cdn_logs/src/fastly/json.rs +++ b/crates/crates_io_cdn_logs/src/fastly/json.rs @@ -37,6 +37,15 @@ impl LogLine<'_> { LogLine::V1(line) => line.status, } } + + pub fn user_agent(&self) -> Option<&str> { + match self { + LogLine::V1(line) => line + .http + .as_ref() + .and_then(|http| http.useragent.as_deref()), + } + } } /// This struct corresponds to the `"version": "1"` variant of the [LogLine] enum. @@ -60,6 +69,14 @@ pub struct LogLineV1<'a> { #[serde(borrow)] pub url: Cow<'a, str>, pub status: u16, + #[serde(borrow)] + pub http: Option<Http<'a>>, +} + +#[derive(Debug, Deserialize)] +pub struct Http<'a> { + #[serde(borrow)] + pub useragent: Option<Cow<'a, str>>, } #[cfg(test)] @@ -79,6 +96,7 @@ mod tests { method: "GET", url: "https://static.staging.crates.io/?1705420437", status: 403, + http: None, }, ) "#); @@ -90,6 +108,7 @@ mod tests { assert_eq!(output.method(), "GET"); assert_eq!(output.url(), "https://static.staging.crates.io/?1705420437"); assert_eq!(output.status(), 403); + assert_eq!(output.user_agent(), None); match output { LogLine::V1(l) => { @@ -99,6 +118,44 @@ mod tests { } } + #[test] + fn test_parse_with_user_agent() { + let input = r#"{"bytes":36308,"content_type":"application/gzip","date_time":"2025-10-26T23:57:34.867635728Z","http":{"protocol":"HTTP/2","referer":null,"useragent":"cargo/1.92.0-nightly (344c4567c 2025-10-21)"},"ip":"192.0.2.1","method":"GET","status":200,"url":"https://static.crates.io/crates/scale-info/2.11.3/download","version":"1"}"#; + let output = assert_ok!(serde_json::from_str::<LogLine<'_>>(input)); + assert_debug_snapshot!(output, @r#" + V1( + LogLineV1 {
date_time: 2025-10-26T23:57:34.867635728Z, + method: "GET", + url: "https://static.crates.io/crates/scale-info/2.11.3/download", + status: 200, + http: Some( + Http { + useragent: Some( + "cargo/1.92.0-nightly (344c4567c 2025-10-21)", + ), + }, + ), + }, + ) + "#); + + assert_eq!( + output.date_time().to_string(), + "2025-10-26 23:57:34.867635728 UTC" + ); + assert_eq!(output.method(), "GET"); + assert_eq!( + output.url(), + "https://static.crates.io/crates/scale-info/2.11.3/download" + ); + assert_eq!(output.status(), 200); + assert_eq!( + output.user_agent(), + Some("cargo/1.92.0-nightly (344c4567c 2025-10-21)") + ); + } + #[allow(clippy::ptr_arg)] fn is_borrowed(s: &Cow<'_, str>) -> bool { match s { diff --git a/crates/crates_io_cdn_logs/src/fastly/mod.rs b/crates/crates_io_cdn_logs/src/fastly/mod.rs index ee7df780910..b0b75cc48c7 100644 --- a/crates/crates_io_cdn_logs/src/fastly/mod.rs +++ b/crates/crates_io_cdn_logs/src/fastly/mod.rs @@ -6,6 +6,7 @@ mod json; use crate::DownloadsMap; use crate::paths::parse_path; +use crate::user_agent::should_count_user_agent; use std::borrow::Cow; use tokio::io::{AsyncBufRead, AsyncBufReadExt}; use tracing::{debug_span, instrument, warn}; @@ -42,6 +43,14 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul continue; } + if json + .user_agent() + .is_some_and(|ua| !should_count_user_agent(ua)) + { + // Ignore requests from user agents that should not be counted. + continue; + } + let url = decode_url(json.url()); // We're avoiding parsing to `url::Url` here for performance reasons. @@ -191,15 +200,12 @@ mod tests { 2025-10-26 ipnet@2.11.0 .. 1 2025-10-26 libc@0.2.177 .. 1 2025-10-26 lru-slab@0.1.2 .. 1 - 2025-10-26 matrixmultiply@0.3.3 .. 1 2025-10-26 owo-colors@4.2.3 .. 1 2025-10-26 parking_lot@0.12.5 .. 1 2025-10-26 precis-profiles@0.1.11 .. 1 2025-10-26 precis-tools@0.1.8 .. 1 - 2025-10-26 rand@0.8.5 .. 1 2025-10-26 scale-info@2.11.3 .. 1 2025-10-26 tinyvec_macros@0.1.1 .. 
1 - 2025-10-26 tower@0.5.2 .. 1 2025-10-26 unicode-normalization@0.1.22 .. 1 } "); From 5a899955c439633a9117da57a69a95532bef77bb Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Mon, 27 Oct 2025 17:45:37 +0100 Subject: [PATCH 3/3] cdn_logs: Filter CloudFront downloads by user agent Apply user agent filtering to CloudFront log parsing. Downloads are counted if they have no user agent (for backwards compatibility) or if the user agent passes the `should_count_user_agent()` check. --- crates/crates_io_cdn_logs/src/cloudfront.rs | 15 ++++++++++++++- crates/crates_io_cdn_logs/src/lib.rs | 1 - .../test_data/cloudfront/basic.log | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/crates/crates_io_cdn_logs/src/cloudfront.rs b/crates/crates_io_cdn_logs/src/cloudfront.rs index dfe1f454f00..2723dee1c8a 100644 --- a/crates/crates_io_cdn_logs/src/cloudfront.rs +++ b/crates/crates_io_cdn_logs/src/cloudfront.rs @@ -5,6 +5,7 @@ use crate::DownloadsMap; use crate::paths::parse_path; +use crate::user_agent::should_count_user_agent; use chrono::NaiveDate; use std::borrow::Cow; use tokio::io::{AsyncBufRead, AsyncBufReadExt}; @@ -18,6 +19,7 @@ const FIELD_DATE: &str = "date"; const FIELD_METHOD: &str = "cs-method"; const FIELD_PATH: &str = "cs-uri-stem"; const FIELD_STATUS: &str = "sc-status"; +const FIELD_USER_AGENT: &str = "cs(User-Agent)"; #[instrument(level = "debug", skip(reader))] pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Result<DownloadsMap> { @@ -26,6 +28,7 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul let mut method_index = None; let mut path_index = None; let mut status_index = None; + let mut user_agent_index = None; let mut downloads = DownloadsMap::new(); @@ -47,6 +50,7 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul method_index = fields.iter().position(|f| f == &FIELD_METHOD); path_index = fields.iter().position(|f| f == &FIELD_PATH); status_index =
fields.iter().position(|f| f == &FIELD_STATUS); + user_agent_index = fields.iter().position(|f| f == &FIELD_USER_AGENT); continue; } @@ -76,6 +80,12 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul continue; } + let user_agent = get_optional_value(&values, user_agent_index); + if user_agent.is_some_and(|ua| !should_count_user_agent(ua)) { + // Ignore requests from user agents that should not be counted. + continue; + } + let path = get_value(&values, path_index, FIELD_PATH); // Deal with paths like `/crates/tikv-jemalloc-sys/tikv-jemalloc-sys-0.5.4%252B5.3.0-patched.crate`. @@ -120,6 +130,10 @@ fn get_value<'a>(values: &'a [&'a str], index: Option<usize>, field_name: &'stat }) } +fn get_optional_value<'a>(values: &'a [&'a str], index: Option<usize>) -> Option<&'a str> { + index.and_then(|i| values.get(i)).copied() +} + #[cfg(test)] mod tests { use super::*; @@ -149,7 +163,6 @@ mod tests { 2024-01-17 flatbuffers@23.1.21 .. 1 2024-01-17 jemallocator@0.5.4 .. 1 2024-01-17 leveldb-sys@2.0.9 .. 1 - 2024-01-17 num_cpus@1.15.0 .. 1 2024-01-17 paste@1.0.12 .. 1 2024-01-17 quick-error@1.2.3 .. 1 2024-01-17 rand@0.8.5 .. 1 diff --git a/crates/crates_io_cdn_logs/src/lib.rs b/crates/crates_io_cdn_logs/src/lib.rs index 362b37d01ed..9af448a6dc1 100644 --- a/crates/crates_io_cdn_logs/src/lib.rs +++ b/crates/crates_io_cdn_logs/src/lib.rs @@ -75,7 +75,6 @@ mod tests { 2024-01-17 flatbuffers@23.1.21 .. 1 2024-01-17 jemallocator@0.5.4 .. 1 2024-01-17 leveldb-sys@2.0.9 .. 1 - 2024-01-17 num_cpus@1.15.0 .. 1 2024-01-17 paste@1.0.12 .. 1 2024-01-17 quick-error@1.2.3 .. 1 2024-01-17 rand@0.8.5 ..
1 diff --git a/crates/crates_io_cdn_logs/test_data/cloudfront/basic.log b/crates/crates_io_cdn_logs/test_data/cloudfront/basic.log index 9a961853c05..4027208d74b 100644 --- a/crates/crates_io_cdn_logs/test_data/cloudfront/basic.log +++ b/crates/crates_io_cdn_logs/test_data/cloudfront/basic.log @@ -12,7 +12,7 @@ 2024-01-16 23:57:42 CMH68-P2 15495 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/quick-error/quick-error-1.2.3.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit iDfI0scfFBqKcmnFlcQ32TJ-2QQRcZXOF2pT1fqPJ0SwYt8gvsTH3w== static.crates.io https 48 0.024 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.024 Hit application/x-tar 15066 - - 2024-01-17 00:41:12 HIO50-C2 24428 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/flatbuffers/flatbuffers-23.1.21.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit HFwag0aWj8eJgyO0eaTqQA_q0Jo091a2TPUYFl6MrBBaN3aFYMr5ug== static.crates.io https 49 0.043 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.043 Hit application/gzip 23944 - - 2024-01-17 00:41:12 HIO50-C2 13564 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/jemallocator/jemallocator-0.5.4.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit W_8eJKaKdIXNRhhlAVaZmjxcej-ibyT5XRWqPfi8Udjxv8KN-aA6ig== static.crates.io https 48 0.044 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.044 Hit application/gzip 13088 - - -2024-01-17 00:41:12 HIO50-C2 16153 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/num_cpus/num_cpus-1.15.0.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit MCnygi8IdF0r4cdgPb3rvlxubS2tmibodsAXEB3uD0KUghu-1h9k7A== static.crates.io https 44 0.045 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.045 Hit application/gzip 15680 - - +2024-01-17 00:41:12 HIO50-C2 16153 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/num_cpus/num_cpus-1.15.0.crate 200 - Bazel%2Frelease%207.6.2 - - Hit MCnygi8IdF0r4cdgPb3rvlxubS2tmibodsAXEB3uD0KUghu-1h9k7A== static.crates.io https 44 0.045 - 
TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.045 Hit application/gzip 15680 - - 2024-01-17 00:41:12 HIO50-C2 18642 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/paste/paste-1.0.12.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit J83xY2di3uOqjGBy49qk3M0nE0vvxNTIIsVkIvlEt5ZD7Cv54bDPug== static.crates.io https 39 0.056 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 18156 - - 2024-01-17 00:41:12 HIO50-C2 87644 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/rand/rand-0.8.5.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit ceSKUmY8GOILmHQ8W1UcW_pGErGTv1DH_aHDXIWNKe9G2IJslpzFYg== static.crates.io https 38 0.057 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/x-tar 87113 - - 2024-01-17 00:41:12 HIO50-C2 54969 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/serde_derive/serde_derive-1.0.163.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit 0H_zJOQvRFEEy-yrtgM7FusA3kEaxhOfMvWishmefb4pjt3EuNfn2w== static.crates.io https 50 0.057 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 54447 - -