Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions crates/crates_io_database/tests/semver_ord.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
use crates_io_test_db::TestDatabase;
use diesel::prelude::*;
use diesel::sql_types::{Nullable, Text};
use diesel_async::RunQueryDsl;
use std::fmt::Debug;

#[tokio::test]
async fn test_jsonb_output() {
let test_db = TestDatabase::new();
let mut conn = test_db.async_connect().await;

let mut check = async |num| {
let query = format!("select semver_ord('{num}') as output");

#[derive(QueryableByName)]
struct Row {
#[diesel(sql_type = Nullable<Text>)]
output: Option<String>,
}

diesel::sql_query(query)
.get_result::<Row>(&mut conn)
.await
.unwrap()
.output
.unwrap_or_default()
};

insta::assert_snapshot!(check("0.0.0").await, @r#"[0, 0, 0, {}]"#);
insta::assert_snapshot!(check("1.0.0-alpha.1").await, @r#"[1, 0, 0, [true, "alpha", false, 1, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, ""]]"#);

// see https://crates.io/crates/cursed-trying-to-break-cargo/1.0.0-0.HDTV-BluRay.1020p.YTSUB.L33TRip.mkv – thanks @Gankra!
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I love that this is represented as-is. 😆

insta::assert_snapshot!(check("1.0.0-0.HDTV-BluRay.1020p.YTSUB.L33TRip.mkv").await, @r#"[1, 0, 0, [false, 0, true, "HDTV-BluRay", true, "1020p", true, "YTSUB", true, "L33TRip", true, "mkv", null, null, null, null, null, null, null, null, ""]]"#);

// Invalid version string
insta::assert_snapshot!(check("foo").await, @"");

// Version string with a lot of prerelease identifiers
insta::assert_snapshot!(check("1.2.3-1.2.3.4.5.6.7.8.9.10.11.12.13.14.15.16.17.end").await, @r#"[1, 2, 3, [false, 1, false, 2, false, 3, false, 4, false, 5, false, 6, false, 7, false, 8, false, 9, false, 10, "11.12.13.14.15.16.17.end"]]"#);
}

/// This test checks that the `semver_ord` function orders versions correctly.
///
/// The test data is a list of versions in a random order. The versions are then
/// ordered by the `semver_ord` function and the result is compared to the
/// expected order (see <https://semver.org/#spec-item-11>).
///
/// The test data was imported from <https://github.com/dtolnay/semver/blob/1.0.26/tests/test_version.rs#L223-L242>.
#[tokio::test]
async fn test_spec_order() {
let test_db = TestDatabase::new();
let mut conn = test_db.async_connect().await;

#[derive(QueryableByName)]
struct Row {
#[diesel(sql_type = Text)]
num: String,
}

impl Debug for Row {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(&self.num)
}
}

let mut check = async |order| {
let query = format!(
r#"
with nums as (
select unnest(array[
'1.0.0-beta',
'1.0.0-alpha',
'1.0.0-rc.1',
'1.0.0',
'1.0.0-beta.2',
'1.0.0-alpha.1',
'1.0.0-alpha.beta',
'1.0.0-beta.11'
]) as num
)
select num
from nums
order by semver_ord(num) {order};
"#
);

diesel::sql_query(query)
.load::<Row>(&mut conn)
.await
.unwrap()
};

insta::assert_debug_snapshot!(check("asc").await, @r"
[
1.0.0-alpha,
1.0.0-alpha.1,
1.0.0-alpha.beta,
1.0.0-beta,
1.0.0-beta.2,
1.0.0-beta.11,
1.0.0-rc.1,
1.0.0,
]
");

insta::assert_debug_snapshot!(check("desc").await, @r"
[
1.0.0,
1.0.0-rc.1,
1.0.0-beta.11,
1.0.0-beta.2,
1.0.0-beta,
1.0.0-alpha.beta,
1.0.0-alpha.1,
1.0.0-alpha,
]
");
}
1 change: 1 addition & 0 deletions crates/crates_io_database_dump/src/dump-db.toml
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ id = "public"
crate_id = "public"
num = "public"
num_no_build = "public"
semver_ord = "private"
updated_at = "public"
created_at = "public"
downloads = "public"
Expand Down
4 changes: 4 additions & 0 deletions migrations/2025-03-06-060640_semver_ord/down.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
drop trigger trigger_set_semver_ord on versions;
drop function set_semver_ord();
alter table versions drop column semver_ord;
drop function semver_ord;
109 changes: 109 additions & 0 deletions migrations/2025-03-06-060640_semver_ord/up.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
-- Add `semver_ord(num)` function to convert a semver string into a JSONB array for version comparison purposes.

create or replace function semver_ord(num varchar) returns jsonb as $$
declare
-- We need to ensure that the prerelease array has the same length for all
-- versions since shorter arrays have lower precedence in JSONB. We store
-- the first 10 parts of the prerelease string as pairs of booleans and
-- numbers or text values, and then a final text item for the remaining
-- parts.
max_prerelease_parts constant int := 10;

-- We ignore the "build metadata" part of the semver string, since it has
-- no impact on the version ordering.
match_result text[] := regexp_match(num, '^(\d+).(\d+).(\d+)(?:-([0-9A-Za-z\-.]+))?');

prerelease jsonb;
prerelease_parts text[];
prerelease_part text;
i int := 0;
begin
if match_result is null then
return null;
end if;

if match_result[4] is null then
-- A JSONB object has higher precedence than an array, and versions with
-- prerelease specifiers should have lower precedence than those without.
Comment on lines +26 to +27
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I went to check that we're not relying on any undocumented behaviour, and found this gem:

The btree ordering for jsonb datums is seldom of great interest[...]

Little did they know!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep, see #10763 (comment) :)

prerelease := json_build_object();
else
prerelease := to_jsonb(array_fill(NULL::bool, ARRAY[max_prerelease_parts * 2 + 1]));

-- Split prerelease string by `.` and "append" items to
-- the `prerelease` array.
prerelease_parts := string_to_array(match_result[4], '.');

foreach prerelease_part in array prerelease_parts[1:max_prerelease_parts + 1]
loop
-- Parse parts as numbers if they consist of only digits.
if regexp_like(prerelease_part, '^\d+$') then
-- In JSONB a number has higher precedence than a string but in
-- semver it is the other way around, so we use true/false to
-- work around this.
Comment on lines +40 to +42
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here are some references to facilitate the review. The ordering for jsonb, as excerpted from the official doc1, is as follows:

Object > Array > Boolean > Number > String > null
Object with n pairs > object with n - 1 pairs
Array with n elements > array with n - 1 elements

The SemVer's pre-release ordering, as excerpted from the spec2, is as follows:

Precedence for two pre-release versions with the same major, minor, and patch version MUST be determined by comparing each dot separated identifier from left to right until a difference is found as follows:
    1. Identifiers consisting of only digits are compared numerically.
    2. Identifiers with letters or hyphens are compared lexically in ASCII sort order.
    3. Numeric identifiers always have lower precedence than non-numeric identifiers.
    4. A larger set of pre-release fields has a higher precedence than a smaller set, if all of the preceding identifiers are equal.

Footnotes

  1. https://www.postgresql.org/docs/current/datatype-json.html#JSON-INDEXING

  2. https://semver.org/#spec-item-11

prerelease := jsonb_set(prerelease, array[i::text], to_jsonb(false));
prerelease := jsonb_set(prerelease, array[(i + 1)::text], to_jsonb(prerelease_part::numeric));
else
prerelease := jsonb_set(prerelease, array[i::text], to_jsonb(true));
prerelease := jsonb_set(prerelease, array[(i + 1)::text], to_jsonb(prerelease_part));
end if;

-- Exit the loop if we have reached the maximum number of parts.
i := i + 2;
exit when i >= max_prerelease_parts * 2;
end loop;

prerelease := jsonb_set(prerelease, array[(max_prerelease_parts * 2)::text], to_jsonb(array_to_string(prerelease_parts[max_prerelease_parts + 1:], '.')));
end if;

-- Return an array with the major, minor, patch, and prerelease parts.
return json_build_array(
match_result[1]::numeric,
match_result[2]::numeric,
match_result[3]::numeric,
prerelease
);
end;
$$ language plpgsql immutable;

comment on function semver_ord is 'Converts a semver string into a JSONB array for version comparison purposes. The array has the following format: [major, minor, patch, prerelease] and when used for sorting follow the precedence rules defined in the semver specification (https://semver.org/#spec-item-11).';


-- Add corresponding column to the `versions` table.

alter table versions
add semver_ord jsonb;

comment on column versions.semver_ord is 'JSONB representation of the version number for sorting purposes.';


-- Create a trigger to set the `semver_ord` column when inserting a new version.
-- Ideally, we would use a generated column for this, but introducing such a
-- column would require a full table rewrite, which is not feasible for large
-- tables.

create or replace function set_semver_ord() returns trigger as $$
begin
new.semver_ord := semver_ord(new.num);
return new;
end
$$ language plpgsql;

create or replace trigger trigger_set_semver_ord
before insert on versions
for each row
execute procedure set_semver_ord();


-- Populate the `semver_ord` column for existing versions.
-- This query should be run manually in small batches to avoid locking the
-- table for too long.

-- with versions_to_update as (
-- select id, num
-- from versions
-- where semver_ord = 'null'::jsonb
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just noticed that I didn't update this correctly. this line should now be where semver_ord is null.

-- limit 1000
-- )
-- update versions
-- set semver_ord = semver_ord(num)
-- where id in (select id from versions_to_update);