From b692584ac64d71a6a7e2fc8cc033b704e2a9f38d Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Thu, 6 Mar 2025 07:39:27 +0100 Subject: [PATCH 1/5] Add `semver_ord(num)` SQL function and `versions.semver_ord` column We currently rely on the Rust `semver` crate to implement our "sort by semantic versioning" functionality, which is used by the web interface, but also to determine the "default version". This has the downside that we need to load the full list of version numbers for a crate from the database to the API server, sort it and then throw away the ones that we don't need. This commit implements a `semver_ord(num)` pgSQL function that returns a JSONB array, which has the same ordering precedence as the Semantic Versioning spec (https://semver.org/#spec-item-11), with the small caveat that it only supports up to 15 prerelease parts. The maximum number of prerelease parts in our current dataset is 7, so 15 should be plenty. The database migration in this commit also adds a new `semver_ord` column to the `versions` table, and an on-insert trigger function that automatically derives the `semver_ord` column from the `num` column value. 
Once this migration has run, the existing versions can be backfilled by running the following SQL script, until all versions are processed: ```sql with versions_to_update as ( select id, num from versions where semver_ord = 'null'::jsonb limit 1000 ) update versions set semver_ord = semver_ord(num) where id in (select id from versions_to_update); ``` --- crates/crates_io_database/tests/semver_ord.rs | 66 ++++++++++++ .../crates_io_database_dump/src/dump-db.toml | 1 + .../2025-03-06-060640_semver_ord/down.sql | 4 + .../2025-03-06-060640_semver_ord/up.sql | 100 ++++++++++++++++++ 4 files changed, 171 insertions(+) create mode 100644 crates/crates_io_database/tests/semver_ord.rs create mode 100644 migrations/2025-03-06-060640_semver_ord/down.sql create mode 100644 migrations/2025-03-06-060640_semver_ord/up.sql diff --git a/crates/crates_io_database/tests/semver_ord.rs b/crates/crates_io_database/tests/semver_ord.rs new file mode 100644 index 00000000000..141f575442b --- /dev/null +++ b/crates/crates_io_database/tests/semver_ord.rs @@ -0,0 +1,66 @@ +use crates_io_test_db::TestDatabase; +use diesel::prelude::*; +use diesel::sql_types::Text; +use diesel_async::RunQueryDsl; +use std::fmt::Debug; + +/// This test checks that the `semver_ord` function orders versions correctly. +/// +/// The test data is a list of versions in a random order. The versions are then +/// ordered by the `semver_ord` function and the result is compared to the +/// expected order (see <https://semver.org/#spec-item-11>). +/// +/// The test data was imported from . 
+#[tokio::test] +async fn test_spec_order() { + let test_db = TestDatabase::new(); + let mut conn = test_db.async_connect().await; + + let query = r#" + with nums as ( + select unnest(array[ + '1.0.0-beta', + '1.0.0-alpha', + '1.0.0-rc.1', + '1.0.0', + '1.0.0-beta.2', + '1.0.0-alpha.1', + '1.0.0-alpha.beta', + '1.0.0-beta.11' + ]) as num + ) + select num + from nums + order by semver_ord(num); + "#; + + #[derive(QueryableByName)] + struct Row { + #[diesel(sql_type = Text)] + num: String, + } + + impl Debug for Row { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(&self.num) + } + } + + let nums = diesel::sql_query(query) + .load::(&mut conn) + .await + .unwrap(); + + insta::assert_debug_snapshot!(nums, @r" + [ + 1.0.0-alpha, + 1.0.0-alpha.1, + 1.0.0-alpha.beta, + 1.0.0-beta, + 1.0.0-beta.2, + 1.0.0-beta.11, + 1.0.0-rc.1, + 1.0.0, + ] + "); +} diff --git a/crates/crates_io_database_dump/src/dump-db.toml b/crates/crates_io_database_dump/src/dump-db.toml index 531a8132172..e42d45c59dd 100644 --- a/crates/crates_io_database_dump/src/dump-db.toml +++ b/crates/crates_io_database_dump/src/dump-db.toml @@ -233,6 +233,7 @@ id = "public" crate_id = "public" num = "public" num_no_build = "public" +semver_ord = "private" updated_at = "public" created_at = "public" downloads = "public" diff --git a/migrations/2025-03-06-060640_semver_ord/down.sql b/migrations/2025-03-06-060640_semver_ord/down.sql new file mode 100644 index 00000000000..749fd0fe3c5 --- /dev/null +++ b/migrations/2025-03-06-060640_semver_ord/down.sql @@ -0,0 +1,4 @@ +drop trigger trigger_set_semver_ord on versions; +drop function set_semver_ord(); +alter table versions drop column semver_ord; +drop function semver_ord; diff --git a/migrations/2025-03-06-060640_semver_ord/up.sql b/migrations/2025-03-06-060640_semver_ord/up.sql new file mode 100644 index 00000000000..8081be86e57 --- /dev/null +++ b/migrations/2025-03-06-060640_semver_ord/up.sql @@ -0,0 +1,100 @@ +-- Add 
`semver_ord(num)` function to convert a semver string into a JSONB array for version comparison purposes. + +create or replace function semver_ord(num varchar) returns jsonb as $$ +declare + -- We need to ensure that the array has the same length for all versions + -- since shorter arrays have lower precedence in JSONB. Since we also need + -- to add a boolean value for each part of the prerelease string, this + -- results in us supporting up to 15 parts in the prerelease string. + -- Everything beyond that will be ignored. + prerelease_array_length constant int := 30; + + -- We ignore the "build metadata" part of the semver string, since it has + -- no impact on the version ordering. + match_result text[] := regexp_match(num, '^(\d+).(\d+).(\d+)(?:-([0-9A-Za-z\-.]+))?'); + + prerelease jsonb; + prerelease_part text; + i int := 0; +begin + if match_result[4] is null then + -- A JSONB object has higher precedence than an array, and versions with + -- prerelease specifiers should have lower precedence than those without. + prerelease := json_build_object(); + else + prerelease := to_jsonb(array_fill(NULL::bool, ARRAY[prerelease_array_length])); + + -- Split prerelease string by `.` and "append" items to + -- the `prerelease` array. + foreach prerelease_part in array string_to_array(match_result[4], '.') + loop + -- Parse parts as numbers if they consist of only digits. + if regexp_like(prerelease_part, '^\d+$') then + -- In JSONB a number has higher precedence than a string but in + -- semver it is the other way around, so we use true/false to + -- work around this. 
+ prerelease := jsonb_set(prerelease, array[i::text], to_jsonb(false)); + prerelease := jsonb_set(prerelease, array[(i + 1)::text], to_jsonb(prerelease_part::numeric)); + else + prerelease := jsonb_set(prerelease, array[i::text], to_jsonb(true)); + prerelease := jsonb_set(prerelease, array[(i + 1)::text], to_jsonb(prerelease_part)); + end if; + + -- Exit the loop if we have reached the maximum number of parts. + i := i + 2; + exit when i > prerelease_array_length; + end loop; + end if; + + -- Return an array with the major, minor, patch, and prerelease parts. + return json_build_array( + match_result[1]::numeric, + match_result[2]::numeric, + match_result[3]::numeric, + prerelease + ); +end; +$$ language plpgsql immutable; + +comment on function semver_ord is 'Converts a semver string into a JSONB array for version comparison purposes. The array has the following format: [major, minor, patch, prerelease] and when used for sorting follow the precedence rules defined in the semver specification (https://semver.org/#spec-item-11).'; + + +-- Add corresponding column to the `versions` table. + +alter table versions + add semver_ord jsonb default 'null'::jsonb not null; + +comment on column versions.semver_ord is 'JSONB representation of the version number for sorting purposes.'; + + +-- Create a trigger to set the `semver_ord` column when inserting a new version. +-- Ideally, we would use a generated column for this, but introducing such a +-- column would require a full table rewrite, which is not feasible for large +-- tables. + +create or replace function set_semver_ord() returns trigger as $$ +begin + new.semver_ord := semver_ord(new.num); + return new; +end +$$ language plpgsql; + +create or replace trigger trigger_set_semver_ord + before insert on versions + for each row + execute procedure set_semver_ord(); + + +-- Populate the `semver_ord` column for existing versions. 
+-- This query should be run manually in small batches to avoid locking the +-- table for too long. + +-- with versions_to_update as ( +-- select id, num +-- from versions +-- where semver_ord = 'null'::jsonb +-- limit 1000 +-- ) +-- update versions +-- set semver_ord = semver_ord(num) +-- where id in (select id from versions_to_update); From 7012704b4c6b86bd8e62b38bf7acedf41c64ca72 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Thu, 6 Mar 2025 16:29:14 +0100 Subject: [PATCH 2/5] database/semver_ord: Add serialization test --- crates/crates_io_database/tests/semver_ord.rs | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/crates/crates_io_database/tests/semver_ord.rs b/crates/crates_io_database/tests/semver_ord.rs index 141f575442b..4c857ac1a58 100644 --- a/crates/crates_io_database/tests/semver_ord.rs +++ b/crates/crates_io_database/tests/semver_ord.rs @@ -4,6 +4,40 @@ use diesel::sql_types::Text; use diesel_async::RunQueryDsl; use std::fmt::Debug; +#[tokio::test] +async fn test_jsonb_output() { + let test_db = TestDatabase::new(); + let mut conn = test_db.async_connect().await; + + let mut check = async |num| { + let query = format!("select semver_ord('{num}') as output"); + + #[derive(QueryableByName)] + struct Row { + #[diesel(sql_type = Text)] + output: String, + } + + diesel::sql_query(query) + .get_result::(&mut conn) + .await + .unwrap() + .output + }; + + insta::assert_snapshot!(check("0.0.0").await, @r#"[0, 0, 0, {}]"#); + insta::assert_snapshot!(check("1.0.0-alpha.1").await, @r#"[1, 0, 0, [true, "alpha", false, 1, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null]]"#); + + // https://crates.io/crates/cursed-trying-to-break-cargo/1.0.0-0.HDTV-BluRay.1020p.YTSUB.L33TRip.mkv – thanks @Gankra! 
+ insta::assert_snapshot!(check("1.0.0-0.HDTV-BluRay.1020p.YTSUB.L33TRip.mkv").await, @r#"[1, 0, 0, [false, 0, true, "HDTV-BluRay", true, "1020p", true, "YTSUB", true, "L33TRip", true, "mkv", null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null]]"#); + + // Invalid version string + insta::assert_snapshot!(check("foo").await, @"[null, null, null, {}]"); + + // Version string with a lot of prerelease identifiers + insta::assert_snapshot!(check("1.2.3-1.2.3.4.5.6.7.8.9.10.11.12.13.14.15.16.17.end").await, @"[1, 2, 3, [false, 1, false, 2, false, 3, false, 4, false, 5, false, 6, false, 7, false, 8, false, 9, false, 10, false, 11, false, 12, false, 13, false, 14, false, 15, false, 16]]"); +} + /// This test checks that the `semver_ord` function orders versions correctly. /// /// The test data is a list of versions in a random order. The versions are then From e3ebad5b3c5047da2612a15597eb3055c60f6fe3 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Thu, 6 Mar 2025 16:46:13 +0100 Subject: [PATCH 3/5] database/semver_ord: Add remaining prerelease identifiers as final text item --- crates/crates_io_database/tests/semver_ord.rs | 8 +++---- .../2025-03-06-060640_semver_ord/up.sql | 23 +++++++++++-------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/crates/crates_io_database/tests/semver_ord.rs b/crates/crates_io_database/tests/semver_ord.rs index 4c857ac1a58..c9cf4edb35b 100644 --- a/crates/crates_io_database/tests/semver_ord.rs +++ b/crates/crates_io_database/tests/semver_ord.rs @@ -26,16 +26,16 @@ async fn test_jsonb_output() { }; insta::assert_snapshot!(check("0.0.0").await, @r#"[0, 0, 0, {}]"#); - insta::assert_snapshot!(check("1.0.0-alpha.1").await, @r#"[1, 0, 0, [true, "alpha", false, 1, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null]]"#); + 
insta::assert_snapshot!(check("1.0.0-alpha.1").await, @r#"[1, 0, 0, [true, "alpha", false, 1, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, ""]]"#); - // https://crates.io/crates/cursed-trying-to-break-cargo/1.0.0-0.HDTV-BluRay.1020p.YTSUB.L33TRip.mkv – thanks @Gankra! - insta::assert_snapshot!(check("1.0.0-0.HDTV-BluRay.1020p.YTSUB.L33TRip.mkv").await, @r#"[1, 0, 0, [false, 0, true, "HDTV-BluRay", true, "1020p", true, "YTSUB", true, "L33TRip", true, "mkv", null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null]]"#); + // see https://crates.io/crates/cursed-trying-to-break-cargo/1.0.0-0.HDTV-BluRay.1020p.YTSUB.L33TRip.mkv – thanks @Gankra! + insta::assert_snapshot!(check("1.0.0-0.HDTV-BluRay.1020p.YTSUB.L33TRip.mkv").await, @r#"[1, 0, 0, [false, 0, true, "HDTV-BluRay", true, "1020p", true, "YTSUB", true, "L33TRip", true, "mkv", null, null, null, null, null, null, null, null, ""]]"#); // Invalid version string insta::assert_snapshot!(check("foo").await, @"[null, null, null, {}]"); // Version string with a lot of prerelease identifiers - insta::assert_snapshot!(check("1.2.3-1.2.3.4.5.6.7.8.9.10.11.12.13.14.15.16.17.end").await, @"[1, 2, 3, [false, 1, false, 2, false, 3, false, 4, false, 5, false, 6, false, 7, false, 8, false, 9, false, 10, false, 11, false, 12, false, 13, false, 14, false, 15, false, 16]]"); + insta::assert_snapshot!(check("1.2.3-1.2.3.4.5.6.7.8.9.10.11.12.13.14.15.16.17.end").await, @r#"[1, 2, 3, [false, 1, false, 2, false, 3, false, 4, false, 5, false, 6, false, 7, false, 8, false, 9, false, 10, "11.12.13.14.15.16.17.end"]]"#); } /// This test checks that the `semver_ord` function orders versions correctly. 
diff --git a/migrations/2025-03-06-060640_semver_ord/up.sql b/migrations/2025-03-06-060640_semver_ord/up.sql index 8081be86e57..abc31a7cd0e 100644 --- a/migrations/2025-03-06-060640_semver_ord/up.sql +++ b/migrations/2025-03-06-060640_semver_ord/up.sql @@ -2,18 +2,19 @@ create or replace function semver_ord(num varchar) returns jsonb as $$ declare - -- We need to ensure that the array has the same length for all versions - -- since shorter arrays have lower precedence in JSONB. Since we also need - -- to add a boolean value for each part of the prerelease string, this - -- results in us supporting up to 15 parts in the prerelease string. - -- Everything beyond that will be ignored. - prerelease_array_length constant int := 30; + -- We need to ensure that the prerelease array has the same length for all + -- versions since shorter arrays have lower precedence in JSONB. We store + -- the first 10 parts of the prerelease string as pairs of booleans and + -- numbers or text values, and then a final text item for the remaining + -- parts. + max_prerelease_parts constant int := 10; -- We ignore the "build metadata" part of the semver string, since it has -- no impact on the version ordering. match_result text[] := regexp_match(num, '^(\d+).(\d+).(\d+)(?:-([0-9A-Za-z\-.]+))?'); prerelease jsonb; + prerelease_parts text[]; prerelease_part text; i int := 0; begin @@ -22,11 +23,13 @@ begin -- prerelease specifiers should have lower precedence than those without. prerelease := json_build_object(); else - prerelease := to_jsonb(array_fill(NULL::bool, ARRAY[prerelease_array_length])); + prerelease := to_jsonb(array_fill(NULL::bool, ARRAY[max_prerelease_parts * 2 + 1])); -- Split prerelease string by `.` and "append" items to -- the `prerelease` array. 
- foreach prerelease_part in array string_to_array(match_result[4], '.') + prerelease_parts := string_to_array(match_result[4], '.'); + + foreach prerelease_part in array prerelease_parts[1:max_prerelease_parts + 1] loop -- Parse parts as numbers if they consist of only digits. if regexp_like(prerelease_part, '^\d+$') then @@ -42,8 +45,10 @@ begin -- Exit the loop if we have reached the maximum number of parts. i := i + 2; - exit when i > prerelease_array_length; + exit when i >= max_prerelease_parts * 2; end loop; + + prerelease := jsonb_set(prerelease, array[(max_prerelease_parts * 2)::text], to_jsonb(array_to_string(prerelease_parts[max_prerelease_parts + 1:], '.'))); end if; -- Return an array with the major, minor, patch, and prerelease parts. From 7438bdfa193803c819d7126a4594685a8659f741 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Fri, 7 Mar 2025 10:34:45 +0100 Subject: [PATCH 4/5] database/semver_ord: Return `null` if version number can't be parsed --- crates/crates_io_database/tests/semver_ord.rs | 9 +++++---- migrations/2025-03-06-060640_semver_ord/up.sql | 6 +++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/crates/crates_io_database/tests/semver_ord.rs b/crates/crates_io_database/tests/semver_ord.rs index c9cf4edb35b..f9c288307d3 100644 --- a/crates/crates_io_database/tests/semver_ord.rs +++ b/crates/crates_io_database/tests/semver_ord.rs @@ -1,6 +1,6 @@ use crates_io_test_db::TestDatabase; use diesel::prelude::*; -use diesel::sql_types::Text; +use diesel::sql_types::{Nullable, Text}; use diesel_async::RunQueryDsl; use std::fmt::Debug; @@ -14,8 +14,8 @@ async fn test_jsonb_output() { #[derive(QueryableByName)] struct Row { - #[diesel(sql_type = Text)] - output: String, + #[diesel(sql_type = Nullable)] + output: Option, } diesel::sql_query(query) @@ -23,6 +23,7 @@ async fn test_jsonb_output() { .await .unwrap() .output + .unwrap_or_default() }; insta::assert_snapshot!(check("0.0.0").await, @r#"[0, 0, 0, {}]"#); @@ -32,7 +33,7 @@ 
async fn test_jsonb_output() { insta::assert_snapshot!(check("1.0.0-0.HDTV-BluRay.1020p.YTSUB.L33TRip.mkv").await, @r#"[1, 0, 0, [false, 0, true, "HDTV-BluRay", true, "1020p", true, "YTSUB", true, "L33TRip", true, "mkv", null, null, null, null, null, null, null, null, ""]]"#); // Invalid version string - insta::assert_snapshot!(check("foo").await, @"[null, null, null, {}]"); + insta::assert_snapshot!(check("foo").await, @""); // Version string with a lot of prerelease identifiers insta::assert_snapshot!(check("1.2.3-1.2.3.4.5.6.7.8.9.10.11.12.13.14.15.16.17.end").await, @r#"[1, 2, 3, [false, 1, false, 2, false, 3, false, 4, false, 5, false, 6, false, 7, false, 8, false, 9, false, 10, "11.12.13.14.15.16.17.end"]]"#); diff --git a/migrations/2025-03-06-060640_semver_ord/up.sql b/migrations/2025-03-06-060640_semver_ord/up.sql index abc31a7cd0e..233286319ab 100644 --- a/migrations/2025-03-06-060640_semver_ord/up.sql +++ b/migrations/2025-03-06-060640_semver_ord/up.sql @@ -18,6 +18,10 @@ declare prerelease_part text; i int := 0; begin + if match_result is null then + return null; + end if; + if match_result[4] is null then -- A JSONB object has higher precedence than an array, and versions with -- prerelease specifiers should have lower precedence than those without. @@ -67,7 +71,7 @@ comment on function semver_ord is 'Converts a semver string into a JSONB array f -- Add corresponding column to the `versions` table. 
alter table versions - add semver_ord jsonb default 'null'::jsonb not null; + add semver_ord jsonb; comment on column versions.semver_ord is 'JSONB representation of the version number for sorting purposes.'; From 2f6d121eab3f140c8f466783128772a6b10a081d Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Fri, 7 Mar 2025 10:39:23 +0100 Subject: [PATCH 5/5] database/semver_ord: Add descending order snapshot too --- crates/crates_io_database/tests/semver_ord.rs | 63 ++++++++++++------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/crates/crates_io_database/tests/semver_ord.rs b/crates/crates_io_database/tests/semver_ord.rs index f9c288307d3..4553baaeb4e 100644 --- a/crates/crates_io_database/tests/semver_ord.rs +++ b/crates/crates_io_database/tests/semver_ord.rs @@ -51,24 +51,6 @@ async fn test_spec_order() { let test_db = TestDatabase::new(); let mut conn = test_db.async_connect().await; - let query = r#" - with nums as ( - select unnest(array[ - '1.0.0-beta', - '1.0.0-alpha', - '1.0.0-rc.1', - '1.0.0', - '1.0.0-beta.2', - '1.0.0-alpha.1', - '1.0.0-alpha.beta', - '1.0.0-beta.11' - ]) as num - ) - select num - from nums - order by semver_ord(num); - "#; - #[derive(QueryableByName)] struct Row { #[diesel(sql_type = Text)] @@ -81,12 +63,34 @@ async fn test_spec_order() { } } - let nums = diesel::sql_query(query) - .load::(&mut conn) - .await - .unwrap(); + let mut check = async |order| { + let query = format!( + r#" + with nums as ( + select unnest(array[ + '1.0.0-beta', + '1.0.0-alpha', + '1.0.0-rc.1', + '1.0.0', + '1.0.0-beta.2', + '1.0.0-alpha.1', + '1.0.0-alpha.beta', + '1.0.0-beta.11' + ]) as num + ) + select num + from nums + order by semver_ord(num) {order}; + "# + ); - insta::assert_debug_snapshot!(nums, @r" + diesel::sql_query(query) + .load::(&mut conn) + .await + .unwrap() + }; + + insta::assert_debug_snapshot!(check("asc").await, @r" [ 1.0.0-alpha, 1.0.0-alpha.1, @@ -98,4 +102,17 @@ async fn test_spec_order() { 1.0.0, ] "); + + 
insta::assert_debug_snapshot!(check("desc").await, @r" + [ + 1.0.0, + 1.0.0-rc.1, + 1.0.0-beta.11, + 1.0.0-beta.2, + 1.0.0-beta, + 1.0.0-alpha.beta, + 1.0.0-alpha.1, + 1.0.0-alpha, + ] + "); }