Skip to content

Commit 5a89995

Browse files
committed
cdn_logs: Filter CloudFront downloads by user agent
Apply user agent filtering to CloudFront log parsing. Downloads are counted if they have no user agent (for backwards compatibility) or if the user agent passes the `should_count_user_agent()` check.
1 parent a6b974f commit 5a89995

File tree

3 files changed

+15
-3
lines changed

3 files changed

+15
-3
lines changed

crates/crates_io_cdn_logs/src/cloudfront.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
66
use crate::DownloadsMap;
77
use crate::paths::parse_path;
8+
use crate::user_agent::should_count_user_agent;
89
use chrono::NaiveDate;
910
use std::borrow::Cow;
1011
use tokio::io::{AsyncBufRead, AsyncBufReadExt};
@@ -18,6 +19,7 @@ const FIELD_DATE: &str = "date";
1819
const FIELD_METHOD: &str = "cs-method";
1920
const FIELD_PATH: &str = "cs-uri-stem";
2021
const FIELD_STATUS: &str = "sc-status";
22+
const FIELD_USER_AGENT: &str = "cs(User-Agent)";
2123

2224
#[instrument(level = "debug", skip(reader))]
2325
pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Result<DownloadsMap> {
@@ -26,6 +28,7 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul
2628
let mut method_index = None;
2729
let mut path_index = None;
2830
let mut status_index = None;
31+
let mut user_agent_index = None;
2932

3033
let mut downloads = DownloadsMap::new();
3134

@@ -47,6 +50,7 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul
4750
method_index = fields.iter().position(|f| f == &FIELD_METHOD);
4851
path_index = fields.iter().position(|f| f == &FIELD_PATH);
4952
status_index = fields.iter().position(|f| f == &FIELD_STATUS);
53+
user_agent_index = fields.iter().position(|f| f == &FIELD_USER_AGENT);
5054

5155
continue;
5256
}
@@ -76,6 +80,12 @@ pub async fn count_downloads(reader: impl AsyncBufRead + Unpin) -> anyhow::Resul
7680
continue;
7781
}
7882

83+
let user_agent = get_optional_value(&values, user_agent_index);
84+
if user_agent.is_some_and(|ua| !should_count_user_agent(ua)) {
85+
// Ignore requests from user agents that should not be counted.
86+
continue;
87+
}
88+
7989
let path = get_value(&values, path_index, FIELD_PATH);
8090

8191
// Deal with paths like `/crates/tikv-jemalloc-sys/tikv-jemalloc-sys-0.5.4%252B5.3.0-patched.crate`.
@@ -120,6 +130,10 @@ fn get_value<'a>(values: &'a [&'a str], index: Option<usize>, field_name: &'stat
120130
})
121131
}
122132

133+
/// Looks up an optional field in a slice of field values.
///
/// Returns `None` when `index` is `None` or when `index` is past the end of
/// `values`; otherwise returns the string slice stored at that position.
/// A missing field is reported as `None`, so callers can treat it as optional.
fn get_optional_value<'a>(values: &'a [&'a str], index: Option<usize>) -> Option<&'a str> {
134+
// `slice::get` yields `Option<&&str>`; `copied()` turns that into `Option<&str>`.
index.and_then(|i| values.get(i)).copied()
135+
}
136+
123137
#[cfg(test)]
124138
mod tests {
125139
use super::*;
@@ -149,7 +163,6 @@ mod tests {
149163
2024-01-17 flatbuffers@23.1.21 .. 1
150164
2024-01-17 jemallocator@0.5.4 .. 1
151165
2024-01-17 leveldb-sys@2.0.9 .. 1
152-
2024-01-17 num_cpus@1.15.0 .. 1
153166
2024-01-17 paste@1.0.12 .. 1
154167
2024-01-17 quick-error@1.2.3 .. 1
155168
2024-01-17 rand@0.8.5 .. 1

crates/crates_io_cdn_logs/src/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@ mod tests {
7575
2024-01-17 flatbuffers@23.1.21 .. 1
7676
2024-01-17 jemallocator@0.5.4 .. 1
7777
2024-01-17 leveldb-sys@2.0.9 .. 1
78-
2024-01-17 num_cpus@1.15.0 .. 1
7978
2024-01-17 paste@1.0.12 .. 1
8079
2024-01-17 quick-error@1.2.3 .. 1
8180
2024-01-17 rand@0.8.5 .. 1

crates/crates_io_cdn_logs/test_data/cloudfront/basic.log

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
2024-01-16 23:57:42 CMH68-P2 15495 1.2.3.4 GET d19xqa3lc3clo8.cloudfront.net /crates/quick-error/quick-error-1.2.3.crate 200 - cargo%201.74.0%20(ecb9851af%202023-10-18) - - Hit iDfI0scfFBqKcmnFlcQ32TJ-2QQRcZXOF2pT1fqPJ0SwYt8gvsTH3w== static.crates.io https 48 0.024 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 54298 0.024 Hit application/x-tar 15066 - -
1313
2024-01-17 00:41:12 HIO50-C2 24428 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/flatbuffers/flatbuffers-23.1.21.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit HFwag0aWj8eJgyO0eaTqQA_q0Jo091a2TPUYFl6MrBBaN3aFYMr5ug== static.crates.io https 49 0.043 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.043 Hit application/gzip 23944 - -
1414
2024-01-17 00:41:12 HIO50-C2 13564 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/jemallocator/jemallocator-0.5.4.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit W_8eJKaKdIXNRhhlAVaZmjxcej-ibyT5XRWqPfi8Udjxv8KN-aA6ig== static.crates.io https 48 0.044 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.044 Hit application/gzip 13088 - -
15-
2024-01-17 00:41:12 HIO50-C2 16153 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/num_cpus/num_cpus-1.15.0.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit MCnygi8IdF0r4cdgPb3rvlxubS2tmibodsAXEB3uD0KUghu-1h9k7A== static.crates.io https 44 0.045 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.045 Hit application/gzip 15680 - -
15+
2024-01-17 00:41:12 HIO50-C2 16153 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/num_cpus/num_cpus-1.15.0.crate 200 - Bazel%2Frelease%207.6.2 - - Hit MCnygi8IdF0r4cdgPb3rvlxubS2tmibodsAXEB3uD0KUghu-1h9k7A== static.crates.io https 44 0.045 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.045 Hit application/gzip 15680 - -
1616
2024-01-17 00:41:12 HIO50-C2 18642 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/paste/paste-1.0.12.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit J83xY2di3uOqjGBy49qk3M0nE0vvxNTIIsVkIvlEt5ZD7Cv54bDPug== static.crates.io https 39 0.056 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 18156 - -
1717
2024-01-17 00:41:12 HIO50-C2 87644 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/rand/rand-0.8.5.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit ceSKUmY8GOILmHQ8W1UcW_pGErGTv1DH_aHDXIWNKe9G2IJslpzFYg== static.crates.io https 38 0.057 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/x-tar 87113 - -
1818
2024-01-17 00:41:12 HIO50-C2 54969 3.4.5.6 GET d19xqa3lc3clo8.cloudfront.net /crates/serde_derive/serde_derive-1.0.163.crate 200 - cargo%201.71.0%20(cfd3bbd8f%202023-06-08) - - Hit 0H_zJOQvRFEEy-yrtgM7FusA3kEaxhOfMvWishmefb4pjt3EuNfn2w== static.crates.io https 50 0.057 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 59518 0.055 Hit application/gzip 54447 - -

0 commit comments

Comments
 (0)