Skip to content

Commit 5fb8de1

Browse files
authored
Merge pull request #12210 from Turbo87/cdn-log-user-agent-filtering
cdn_logs: Filter downloads by user agent
2 parents 99686c3 + 5a89995 commit 5fb8de1

File tree

6 files changed

+137
-6
lines changed

6 files changed

+137
-6
lines changed

crates/crates_io_cdn_logs/src/cloudfront.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
66
use crate::DownloadsMap;
77
use crate::paths::parse_path;
8+
use crate::user_agent::should_count_user_agent;
89
use chrono::NaiveDate;
910
use std::borrow::Cow;
1011
use tokio::io::{AsyncBufRead, AsyncBufReadExt};
@@ -18,6 +19,7 @@ const FIELD_DATE: &str = "date";
1819
const FIELD_METHOD: &str = "cs-method";
1920
const FIELD_PATH: &str = "cs-uri-stem";
2021
const FIELD_STATUS: &str = "sc-status";
22+
const FIELD_USER_AGENT: &str = "cs(User-Agent)";
2123

2224
#[instrument(level = "debug", skip(reader))]
2325
pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Result<DownloadsMap> {
@@ -26,6 +28,7 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul
2628
let mut method_index = None;
2729
let mut path_index = None;
2830
let mut status_index = None;
31+
let mut user_agent_index = None;
2932

3033
let mut downloads = DownloadsMap::new();
3134

@@ -47,6 +50,7 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul
4750
method_index = fields.iter().position(|f| f == &FIELD_METHOD);
4851
path_index = fields.iter().position(|f| f == &FIELD_PATH);
4952
status_index = fields.iter().position(|f| f == &FIELD_STATUS);
53+
user_agent_index = fields.iter().position(|f| f == &FIELD_USER_AGENT);
5054

5155
continue;
5256
}
@@ -76,6 +80,12 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul
7680
continue;
7781
}
7882

83+
let user_agent = get_optional_value(&values, user_agent_index);
84+
if user_agent.is_some_and(|ua| !should_count_user_agent(ua)) {
85+
// Ignore requests from user agents that should not be counted.
86+
continue;
87+
}
88+
7989
let path = get_value(&values, path_index, FIELD_PATH);
8090

8191
// Deal with paths like `/crates/tikv-jemalloc-sys/tikv-jemalloc-sys-0.5.4%252B5.3.0-patched.crate`.
@@ -120,6 +130,10 @@ fn get_value<'a>(values: &'a [&'a str], index: Option<usize>, field_name: &'stat
120130
})
121131
}
122132

133+
/// Looks up the value stored at `index` in `values`, if there is one.
///
/// Returns `None` when `index` itself is `None` or when it points past the
/// end of `values`. Otherwise the string slice at that position is returned
/// exactly as it appears in `values`; no normalisation is applied.
fn get_optional_value<'a>(values: &'a [&'a str], index: Option<usize>) -> Option<&'a str> {
    // A missing index means the field was never located, so there is nothing to look up.
    let position = index?;
    values.get(position).copied()
}
136+
123137
#[cfg(test)]
124138
mod tests {
125139
use super::*;
@@ -149,7 +163,6 @@ mod tests {
149163
2024-01-17 flatbuffers@23.1.21 .. 1
150164
2024-01-17 jemallocator@0.5.4 .. 1
151165
2024-01-17 leveldb-sys@2.0.9 .. 1
152-
2024-01-17 num_cpus@1.15.0 .. 1
153166
2024-01-17 paste@1.0.12 .. 1
154167
2024-01-17 quick-error@1.2.3 .. 1
155168
2024-01-17 rand@0.8.5 .. 1

crates/crates_io_cdn_logs/src/fastly/json.rs

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,15 @@ impl LogLine<'_> {
3737
LogLine::V1(line) => line.status,
3838
}
3939
}
40+
41+
pub fn user_agent(&self) -> Option<&str> {
42+
match self {
43+
LogLine::V1(line) => line
44+
.http
45+
.as_ref()
46+
.and_then(|http| http.useragent.as_deref()),
47+
}
48+
}
4049
}
4150

4251
/// This struct corresponds to the `"version": "1"` variant of the [LogLine] enum.
@@ -60,6 +69,14 @@ pub struct LogLineV1<'a> {
6069
#[serde(borrow)]
6170
pub url: Cow<'a, str>,
6271
pub status: u16,
72+
#[serde(borrow)]
73+
pub http: Option<Http<'a>>,
74+
}
75+
76+
/// The `http` object of a [LogLineV1].
///
/// Only the `useragent` key is deserialized here; this struct declares no
/// other fields.
#[derive(Debug, Deserialize)]
77+
pub struct Http<'a> {
78+
/// The user agent string of the request, or `None` if it is absent or `null`.
// `borrow` asks serde to borrow the string from the input where it can.
#[serde(borrow)]
79+
pub useragent: Option<Cow<'a, str>>,
6380
}
6481

6582
#[cfg(test)]
@@ -79,6 +96,7 @@ mod tests {
7996
method: "GET",
8097
url: "https://static.staging.crates.io/?1705420437",
8198
status: 403,
99+
http: None,
82100
},
83101
)
84102
"#);
@@ -90,6 +108,7 @@ mod tests {
90108
assert_eq!(output.method(), "GET");
91109
assert_eq!(output.url(), "https://static.staging.crates.io/?1705420437");
92110
assert_eq!(output.status(), 403);
111+
assert_eq!(output.user_agent(), None);
93112

94113
match output {
95114
LogLine::V1(l) => {
@@ -99,6 +118,44 @@ mod tests {
99118
}
100119
}
101120

121+
// Parses a version "1" log line whose `http` object contains a `useragent`
// value and checks that it is exposed through `LogLine::user_agent()`.
#[test]
122+
fn test_parse_with_user_agent() {
123+
let input = r#"{"bytes":36308,"content_type":"application/gzip","date_time":"2025-10-26T23:57:34.867635728Z","http":{"protocol":"HTTP/2","referer":null,"useragent":"cargo/1.92.0-nightly (344c4567c 2025-10-21)"},"ip":"192.0.2.1","method":"GET","status":200,"url":"https://static.crates.io/crates/scale-info/2.11.3/download","version":"1"}"#;
124+
let output = assert_ok!(serde_json::from_str::<LogLine<'_>>(input));
125+
// The snapshot pins the full parsed structure, including the nested `http` value.
assert_debug_snapshot!(output, @r#"
126+
V1(
127+
LogLineV1 {
128+
date_time: 2025-10-26T23:57:34.867635728Z,
129+
method: "GET",
130+
url: "https://static.crates.io/crates/scale-info/2.11.3/download",
131+
status: 200,
132+
http: Some(
133+
Http {
134+
useragent: Some(
135+
"cargo/1.92.0-nightly (344c4567c 2025-10-21)",
136+
),
137+
},
138+
),
139+
},
140+
)
141+
"#);
142+
143+
// The accessor methods must agree with the parsed fields.
assert_eq!(
144+
output.date_time().to_string(),
145+
"2025-10-26 23:57:34.867635728 UTC"
146+
);
147+
assert_eq!(output.method(), "GET");
148+
assert_eq!(
149+
output.url(),
150+
"https://static.crates.io/crates/scale-info/2.11.3/download"
151+
);
152+
assert_eq!(output.status(), 200);
153+
assert_eq!(
154+
output.user_agent(),
155+
Some("cargo/1.92.0-nightly (344c4567c 2025-10-21)")
156+
);
157+
}
158+
102159
#[allow(clippy::ptr_arg)]
103160
fn is_borrowed(s: &Cow<'_, str>) -> bool {
104161
match s {

crates/crates_io_cdn_logs/src/fastly/mod.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ mod json;
66

77
use crate::DownloadsMap;
88
use crate::paths::parse_path;
9+
use crate::user_agent::should_count_user_agent;
910
use std::borrow::Cow;
1011
use tokio::io::{AsyncBufRead, AsyncBufReadExt};
1112
use tracing::{debug_span, instrument, warn};
@@ -42,6 +43,14 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul
4243
continue;
4344
}
4445

46+
if json
47+
.user_agent()
48+
.is_some_and(|ua| !should_count_user_agent(ua))
49+
{
50+
// Ignore requests from user agents that should not be counted.
51+
continue;
52+
}
53+
4554
let url = decode_url(json.url());
4655

4756
// We're avoiding parsing to `url::Url` here for performance reasons.
@@ -191,15 +200,12 @@ mod tests {
191200
2025-10-26 ipnet@2.11.0 .. 1
192201
2025-10-26 libc@0.2.177 .. 1
193202
2025-10-26 lru-slab@0.1.2 .. 1
194-
2025-10-26 matrixmultiply@0.3.3 .. 1
195203
2025-10-26 owo-colors@4.2.3 .. 1
196204
2025-10-26 parking_lot@0.12.5 .. 1
197205
2025-10-26 precis-profiles@0.1.11 .. 1
198206
2025-10-26 precis-tools@0.1.8 .. 1
199-
2025-10-26 rand@0.8.5 .. 1
200207
2025-10-26 scale-info@2.11.3 .. 1
201208
2025-10-26 tinyvec_macros@0.1.1 .. 1
202-
2025-10-26 tower@0.5.2 .. 1
203209
2025-10-26 unicode-normalization@0.1.22 .. 1
204210
}
205211
");

crates/crates_io_cdn_logs/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ pub mod fastly;
77
mod paths;
88
#[cfg(test)]
99
mod test_utils;
10+
pub mod user_agent;
1011

1112
pub use crate::compression::Decompressor;
1213
pub use crate::download_map::DownloadsMap;
@@ -74,7 +75,6 @@ mod tests {
7475
2024-01-17 flatbuffers@23.1.21 .. 1
7576
2024-01-17 jemallocator@0.5.4 .. 1
7677
2024-01-17 leveldb-sys@2.0.9 .. 1
77-
2024-01-17 num_cpus@1.15.0 .. 1
7878
2024-01-17 paste@1.0.12 .. 1
7979
2024-01-17 quick-error@1.2.3 .. 1
8080
2024-01-17 rand@0.8.5 .. 1
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/// Decides whether a download made with the given user agent is counted.
///
/// A user agent is counted only if it starts with the exact, lowercase
/// token `cargo` immediately followed by one of these separators:
///
/// - `/` or a space,
/// - the percent-encoded slash `%2f` / `%2F`,
/// - the percent-encoded space `%20`.
///
/// Returns `true` if the download should be counted, `false` otherwise.
pub fn should_count_user_agent(user_agent: &str) -> bool {
    /// Separators accepted directly after the `cargo` token.
    const SEPARATORS: [&str; 5] = ["/", " ", "%2f", "%2F", "%20"];

    user_agent
        .strip_prefix("cargo")
        .is_some_and(|rest| SEPARATORS.iter().any(|&separator| rest.starts_with(separator)))
}
15+
16+
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `should_count_user_agent` against user agents that must be
    /// counted and user agents that must be ignored.
    #[test]
    fn test_should_count_user_agent() {
        let counted = [
            // Standard cargo user agents with forward slash
            "cargo/1.92.0-nightly (344c4567c 2025-10-21)",
            "cargo/1.88.0 (873a06493 2025-05-10)",
            "cargo/1.90.0 (840b83a10 2025-07-30)",
            "cargo/",
            // CloudFront: Percent-encoded forward slash (lowercase and uppercase)
            "cargo%2f1.74.0",
            "cargo%2F1.74.0",
            // Space character (legacy Cargo versions)
            "cargo 1.74.0",
            // CloudFront: Percent-encoded space (legacy Cargo versions)
            "cargo%201.74.0%20(ecb9851af%202023-10-18)",
            "cargo%20",
        ];
        for user_agent in counted {
            assert!(
                should_count_user_agent(user_agent),
                "expected {user_agent:?} to be counted"
            );
        }

        // Non-cargo user agents
        let ignored = [
            "Mozilla/5.0",
            "curl/7.64.1",
            "",
            "Cargo/1.0.0",
            "cargo",
            "cargo-",
        ];
        for user_agent in ignored {
            assert!(
                !should_count_user_agent(user_agent),
                "expected {user_agent:?} to be ignored"
            );
        }
    }
}

crates/crates_io_cdn_logs/test_data/cloudfront/basic.log

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
2024-01-16 23:57:42 CMH68-P2 15495 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/quick-error/quick-error-1.2.3.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit iDfI0scfFBqKcmnFlcQ32TJ-2QQRcZXOF2pT1fqPJ0SwYt8gvsTH3w== static.crates.io https 48 0.024 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.024 Hit application/x-tar 15066 - -
1313
2024-01-17 00:41:12 HIO50-C2 24428 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/flatbuffers/flatbuffers-23.1.21.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit HFwag0aWj8eJgyO0eaTqQA_q0Jo091a2TPUYFl6MrBBaN3aFYMr5ug== static.crates.io https 49 0.043 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.043 Hit application/gzip 23944 - -
1414
2024-01-17 00:41:12 HIO50-C2 13564 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/jemallocator/jemallocator-0.5.4.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit W_8eJKaKdIXNRhhlAVaZmjxcej-ibyT5XRWqPfi8Udjxv8KN-aA6ig== static.crates.io https 48 0.044 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.044 Hit application/gzip 13088 - -
15-
2024-01-17 00:41:12 HIO50-C2 16153 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/num_cpus/num_cpus-1.15.0.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit MCnygi8IdF0r4cdgPb3rvlxubS2tmibodsAXEB3uD0KUghu-1h9k7A== static.crates.io https 44 0.045 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.045 Hit application/gzip 15680 - -
15+
2024-01-17 00:41:12 HIO50-C2 16153 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/num_cpus/num_cpus-1.15.0.crate 200 - Bazel%2Frelease%207.6.2 - - Hit MCnygi8IdF0r4cdgPb3rvlxubS2tmibodsAXEB3uD0KUghu-1h9k7A== static.crates.io https 44 0.045 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.045 Hit application/gzip 15680 - -
1616
2024-01-17 00:41:12 HIO50-C2 18642 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/paste/paste-1.0.12.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit J83xY2di3uOqjGBy49qk3M0nE0vvxNTIIsVkIvlEt5ZD7Cv54bDPug== static.crates.io https 39 0.056 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 18156 - -
1717
2024-01-17 00:41:12 HIO50-C2 87644 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/rand/rand-0.8.5.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit ceSKUmY8GOILmHQ8W1UcW_pGErGTv1DH_aHDXIWNKe9G2IJslpzFYg== static.crates.io https 38 0.057 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/x-tar 87113 - -
1818
2024-01-17 00:41:12 HIO50-C2 54969 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/serde_derive/serde_derive-1.0.163.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit 0H_zJOQvRFEEy-yrtgM7FusA3kEaxhOfMvWishmefb4pjt3EuNfn2w== static.crates.io https 50 0.057 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 54447 - -

0 commit comments

Comments
 (0)