Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion crates/crates_io_cdn_logs/src/cloudfront.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

use crate::DownloadsMap;
use crate::paths::parse_path;
use crate::user_agent::should_count_user_agent;
use chrono::NaiveDate;
use std::borrow::Cow;
use tokio::io::{AsyncBufRead, AsyncBufReadExt};
Expand All @@ -18,6 +19,7 @@ const FIELD_DATE: &str = "date";
// Column names that `count_downloads` searches for in the parsed field list;
// the position of each name becomes the index used to read that value from a row.

/// Column name for the request method.
const FIELD_METHOD: &str = "cs-method";
/// Column name for the requested path.
const FIELD_PATH: &str = "cs-uri-stem";
/// Column name for the response status.
const FIELD_STATUS: &str = "sc-status";
/// Column name for the client's user agent; rows whose user agent should not be counted are skipped.
const FIELD_USER_AGENT: &str = "cs(User-Agent)";

#[instrument(level = "debug", skip(reader))]
pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Result<DownloadsMap> {
Expand All @@ -26,6 +28,7 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul
let mut method_index = None;
let mut path_index = None;
let mut status_index = None;
let mut user_agent_index = None;

let mut downloads = DownloadsMap::new();

Expand All @@ -47,6 +50,7 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul
method_index = fields.iter().position(|f| f == &FIELD_METHOD);
path_index = fields.iter().position(|f| f == &FIELD_PATH);
status_index = fields.iter().position(|f| f == &FIELD_STATUS);
user_agent_index = fields.iter().position(|f| f == &FIELD_USER_AGENT);

continue;
}
Expand Down Expand Up @@ -76,6 +80,12 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul
continue;
}

let user_agent = get_optional_value(&values, user_agent_index);
if user_agent.is_some_and(|ua| !should_count_user_agent(ua)) {
// Ignore requests from user agents that should not be counted.
continue;
}

let path = get_value(&values, path_index, FIELD_PATH);

// Deal with paths like `/crates/tikv-jemalloc-sys/tikv-jemalloc-sys-0.5.4%252B5.3.0-patched.crate`.
Expand Down Expand Up @@ -120,6 +130,10 @@ fn get_value<'a>(values: &'a [&'a str], index: Option<usize>, field_name: &'stat
})
}

/// Looks up the value stored at `index` in `values`.
///
/// Returns `None` when `index` is `None` or when it lies beyond the end of
/// `values`; otherwise returns the string slice found at that position.
fn get_optional_value<'a>(values: &'a [&'a str], index: Option<usize>) -> Option<&'a str> {
    // Bail out early if the column was never located.
    let position = index?;
    values.get(position).copied()
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -149,7 +163,6 @@ mod tests {
2024-01-17 flatbuffers@23.1.21 .. 1
2024-01-17 jemallocator@0.5.4 .. 1
2024-01-17 leveldb-sys@2.0.9 .. 1
2024-01-17 num_cpus@1.15.0 .. 1
2024-01-17 paste@1.0.12 .. 1
2024-01-17 quick-error@1.2.3 .. 1
2024-01-17 rand@0.8.5 .. 1
Expand Down
57 changes: 57 additions & 0 deletions crates/crates_io_cdn_logs/src/fastly/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,15 @@ impl LogLine<'_> {
LogLine::V1(line) => line.status,
}
}

/// Returns the user agent recorded for this log line.
///
/// Yields `None` when the line has no `http` object or when that object
/// carries no `useragent` value.
pub fn user_agent(&self) -> Option<&str> {
    match self {
        LogLine::V1(line) => {
            // A missing `http` object means there is no user agent to report.
            let http = line.http.as_ref()?;
            http.useragent.as_deref()
        }
    }
}
}

/// This struct corresponds to the `"version": "1"` variant of the [LogLine] enum.
Expand All @@ -60,6 +69,14 @@ pub struct LogLineV1<'a> {
#[serde(borrow)]
pub url: Cow<'a, str>,
pub status: u16,
#[serde(borrow)]
pub http: Option<Http<'a>>,
}

/// The nested `http` object of a log line.
///
/// Only the `useragent` key is deserialized into this struct.
#[derive(Debug, Deserialize)]
pub struct Http<'a> {
/// The `useragent` value of the `http` object, if any. With `#[serde(borrow)]`
/// the string may borrow from the deserializer input instead of being copied.
#[serde(borrow)]
pub useragent: Option<Cow<'a, str>>,
}

#[cfg(test)]
Expand All @@ -79,6 +96,7 @@ mod tests {
method: "GET",
url: "https://static.staging.crates.io/?1705420437",
status: 403,
http: None,
},
)
"#);
Expand All @@ -90,6 +108,7 @@ mod tests {
assert_eq!(output.method(), "GET");
assert_eq!(output.url(), "https://static.staging.crates.io/?1705420437");
assert_eq!(output.status(), 403);
assert_eq!(output.user_agent(), None);

match output {
LogLine::V1(l) => {
Expand All @@ -99,6 +118,44 @@ mod tests {
}
}

// Parses a version "1" log line that contains an `http` object and verifies
// that the user agent is deserialized and exposed through `user_agent()`.
#[test]
fn test_parse_with_user_agent() {
// The `http` object in this input also carries `protocol` and `referer`;
// the snapshot below shows that only `useragent` ends up in the parsed value.
let input = r#"{"bytes":36308,"content_type":"application/gzip","date_time":"2025-10-26T23:57:34.867635728Z","http":{"protocol":"HTTP/2","referer":null,"useragent":"cargo/1.92.0-nightly (344c4567c 2025-10-21)"},"ip":"192.0.2.1","method":"GET","status":200,"url":"https://static.crates.io/crates/scale-info/2.11.3/download","version":"1"}"#;
let output = assert_ok!(serde_json::from_str::<LogLine<'_>>(input));
// Inline snapshot of the `Debug` representation of the parsed log line.
assert_debug_snapshot!(output, @r#"
V1(
LogLineV1 {
date_time: 2025-10-26T23:57:34.867635728Z,
method: "GET",
url: "https://static.crates.io/crates/scale-info/2.11.3/download",
status: 200,
http: Some(
Http {
useragent: Some(
"cargo/1.92.0-nightly (344c4567c 2025-10-21)",
),
},
),
},
)
"#);

// The accessor methods must agree with the fields shown in the snapshot.
assert_eq!(
output.date_time().to_string(),
"2025-10-26 23:57:34.867635728 UTC"
);
assert_eq!(output.method(), "GET");
assert_eq!(
output.url(),
"https://static.crates.io/crates/scale-info/2.11.3/download"
);
assert_eq!(output.status(), 200);
// The user agent comes from the nested `http.useragent` key.
assert_eq!(
output.user_agent(),
Some("cargo/1.92.0-nightly (344c4567c 2025-10-21)")
);
}

#[allow(clippy::ptr_arg)]
fn is_borrowed(s: &Cow<'_, str>) -> bool {
match s {
Expand Down
12 changes: 9 additions & 3 deletions crates/crates_io_cdn_logs/src/fastly/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ mod json;

use crate::DownloadsMap;
use crate::paths::parse_path;
use crate::user_agent::should_count_user_agent;
use std::borrow::Cow;
use tokio::io::{AsyncBufRead, AsyncBufReadExt};
use tracing::{debug_span, instrument, warn};
Expand Down Expand Up @@ -42,6 +43,14 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul
continue;
}

if json
.user_agent()
.is_some_and(|ua| !should_count_user_agent(ua))
{
// Ignore requests from user agents that should not be counted.
continue;
}

let url = decode_url(json.url());

// We're avoiding parsing to `url::Url` here for performance reasons.
Expand Down Expand Up @@ -191,15 +200,12 @@ mod tests {
2025-10-26 ipnet@2.11.0 .. 1
2025-10-26 libc@0.2.177 .. 1
2025-10-26 lru-slab@0.1.2 .. 1
2025-10-26 matrixmultiply@0.3.3 .. 1
2025-10-26 owo-colors@4.2.3 .. 1
2025-10-26 parking_lot@0.12.5 .. 1
2025-10-26 precis-profiles@0.1.11 .. 1
2025-10-26 precis-tools@0.1.8 .. 1
2025-10-26 rand@0.8.5 .. 1
2025-10-26 scale-info@2.11.3 .. 1
2025-10-26 tinyvec_macros@0.1.1 .. 1
2025-10-26 tower@0.5.2 .. 1
2025-10-26 unicode-normalization@0.1.22 .. 1
}
");
Expand Down
2 changes: 1 addition & 1 deletion crates/crates_io_cdn_logs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ pub mod fastly;
mod paths;
#[cfg(test)]
mod test_utils;
pub mod user_agent;

pub use crate::compression::Decompressor;
pub use crate::download_map::DownloadsMap;
Expand Down Expand Up @@ -74,7 +75,6 @@ mod tests {
2024-01-17 flatbuffers@23.1.21 .. 1
2024-01-17 jemallocator@0.5.4 .. 1
2024-01-17 leveldb-sys@2.0.9 .. 1
2024-01-17 num_cpus@1.15.0 .. 1
2024-01-17 paste@1.0.12 .. 1
2024-01-17 quick-error@1.2.3 .. 1
2024-01-17 rand@0.8.5 .. 1
Expand Down
55 changes: 55 additions & 0 deletions crates/crates_io_cdn_logs/src/user_agent.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/// Decides whether a download made with the given user agent is counted.
///
/// A user agent is counted only when it begins with the lowercase literal
/// `cargo` immediately followed by one of: `/`, a space, or the
/// percent-encoded forms `%2f`, `%2F`, `%20`.
///
/// Returns `true` if the download should be counted, `false` otherwise.
pub fn should_count_user_agent(user_agent: &str) -> bool {
    // Separators accepted directly after the `cargo` prefix.
    const SEPARATORS: [&str; 5] = ["/", " ", "%2f", "%2F", "%20"];

    match user_agent.strip_prefix("cargo") {
        Some(rest) => SEPARATORS.iter().any(|&sep| rest.starts_with(sep)),
        None => false,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `should_count_user_agent` against user agents that must be
    /// counted and user agents that must be ignored.
    #[test]
    fn test_should_count_user_agent() {
        let counted = [
            // Standard cargo user agents with forward slash
            "cargo/1.92.0-nightly (344c4567c 2025-10-21)",
            "cargo/1.88.0 (873a06493 2025-05-10)",
            "cargo/1.90.0 (840b83a10 2025-07-30)",
            "cargo/",
            // CloudFront: Percent-encoded forward slash (lowercase and uppercase)
            "cargo%2f1.74.0",
            "cargo%2F1.74.0",
            // Space character (legacy Cargo versions)
            "cargo 1.74.0",
            // CloudFront: Percent-encoded space (legacy Cargo versions)
            "cargo%201.74.0%20(ecb9851af%202023-10-18)",
            "cargo%20",
        ];
        for user_agent in counted {
            assert!(
                should_count_user_agent(user_agent),
                "expected {user_agent:?} to be counted"
            );
        }

        // Non-cargo user agents
        let ignored = [
            "Mozilla/5.0",
            "curl/7.64.1",
            "",
            "Cargo/1.0.0",
            "cargo",
            "cargo-",
        ];
        for user_agent in ignored {
            assert!(
                !should_count_user_agent(user_agent),
                "expected {user_agent:?} to be ignored"
            );
        }
    }
}
2 changes: 1 addition & 1 deletion crates/crates_io_cdn_logs/test_data/cloudfront/basic.log
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
2024-01-16 23:57:42 CMH68-P2 15495 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/quick-error/quick-error-1.2.3.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit iDfI0scfFBqKcmnFlcQ32TJ-2QQRcZXOF2pT1fqPJ0SwYt8gvsTH3w== static.crates.io https 48 0.024 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.024 Hit application/x-tar 15066 - -
2024-01-17 00:41:12 HIO50-C2 24428 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/flatbuffers/flatbuffers-23.1.21.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit HFwag0aWj8eJgyO0eaTqQA_q0Jo091a2TPUYFl6MrBBaN3aFYMr5ug== static.crates.io https 49 0.043 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.043 Hit application/gzip 23944 - -
2024-01-17 00:41:12 HIO50-C2 13564 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/jemallocator/jemallocator-0.5.4.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit W_8eJKaKdIXNRhhlAVaZmjxcej-ibyT5XRWqPfi8Udjxv8KN-aA6ig== static.crates.io https 48 0.044 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.044 Hit application/gzip 13088 - -
2024-01-17 00:41:12 HIO50-C2 16153 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/num_cpus/num_cpus-1.15.0.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit MCnygi8IdF0r4cdgPb3rvlxubS2tmibodsAXEB3uD0KUghu-1h9k7A== static.crates.io https 44 0.045 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.045 Hit application/gzip 15680 - -
2024-01-17 00:41:12 HIO50-C2 16153 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/num_cpus/num_cpus-1.15.0.crate 200 - Bazel%2Frelease%207.6.2 - - Hit MCnygi8IdF0r4cdgPb3rvlxubS2tmibodsAXEB3uD0KUghu-1h9k7A== static.crates.io https 44 0.045 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.045 Hit application/gzip 15680 - -
2024-01-17 00:41:12 HIO50-C2 18642 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/paste/paste-1.0.12.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit J83xY2di3uOqjGBy49qk3M0nE0vvxNTIIsVkIvlEt5ZD7Cv54bDPug== static.crates.io https 39 0.056 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 18156 - -
2024-01-17 00:41:12 HIO50-C2 87644 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/rand/rand-0.8.5.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit ceSKUmY8GOILmHQ8W1UcW_pGErGTv1DH_aHDXIWNKe9G2IJslpzFYg== static.crates.io https 38 0.057 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/x-tar 87113 - -
2024-01-17 00:41:12 HIO50-C2 54969 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/serde_derive/serde_derive-1.0.163.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit 0H_zJOQvRFEEy-yrtgM7FusA3kEaxhOfMvWishmefb4pjt3EuNfn2w== static.crates.io https 50 0.057 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 54447 - -
Expand Down