Skip to content

Commit

Permalink
Implement file parsing for webassembly (#3047)
Browse files Browse the repository at this point in the history
Address
#1577 (comment)

This PR implements `Read` for `File` in browsers, which allows using
`niffler` + `needletail` to parse FASTA/Q, `.gz`-compressed or not, in
browsers.

I also added error handling, so the browser can print nicer error
messages instead of something cryptic to `console.log`.
  • Loading branch information
luizirber committed Mar 23, 2024
1 parent 24ab89c commit 2b1bf0d
Show file tree
Hide file tree
Showing 11 changed files with 254 additions and 31 deletions.
3 changes: 2 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ last-tag:
wasm:
wasm-pack build src/core -d ../../pkg

wasm-test:
wasm-pack test --node src/core

wasi:
cargo wasi build

Expand Down
1 change: 1 addition & 0 deletions flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@
cargo-outdated
cargo-udeps
cargo-deny
cargo-wasi
#cargo-semver-checks
nixpkgs-fmt
];
Expand Down
30 changes: 29 additions & 1 deletion src/core/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,29 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [unreleased]

## [0.13.1] - 2024-03-23

MSRV: 1.65

Changes/additions:

* Implement file parsing for webassembly (#3047)
* fix `calculate_gather_stats` `threshold=0` bug (#3052)
* fix clippy beta issues (#3088)

Updates:

* Bump wasm-bindgen-test from 0.3.41 to 0.3.42 (#3063)
* Bump web-sys from 0.3.68 to 0.3.69 (#3061)
* Bump log from 0.4.20 to 0.4.21 (#3062)
* Bump rayon from 1.8.1 to 1.9.0 (#3058)
* Bump tempfile from 3.10.0 to 3.10.1 (#3059)
* Bump serde_json from 1.0.113 to 1.0.114 (#3044)
* Bump serde from 1.0.196 to 1.0.197 (#3045)
* Bump itertools from 0.12.0 to 0.12.1 (#3043)

## [0.13.0] - 2024-02-23

MSRV: 1.65
Expand All @@ -17,6 +40,7 @@ Changes/additions:
* make core Manifest booleans python compatible (core) (#3007)

Updates:

* Bump roaring from 0.10.2 to 0.10.3 (#3014)
* Bump histogram from 0.9.0 to 0.9.1 (#3002)
* Bump chrono from 0.4.33 to 0.4.34 (#3000)
Expand Down Expand Up @@ -287,7 +311,11 @@ Fixed:
- Fix mem leak in get_mins (#807)
- Fixes for WASI and WASM compilation (#771) (#723)

[unreleased]: https://github.com/sourmash-bio/sourmash/compare/r0.11.0...HEAD
[unreleased]: https://github.com/sourmash-bio/sourmash/compare/r0.13.1...HEAD
[0.13.1]: https://github.com/sourmash-bio/sourmash/compare/r0.13.0...r0.13.1
[0.13.0]: https://github.com/sourmash-bio/sourmash/compare/r0.12.1...r0.13.0
[0.12.1]: https://github.com/sourmash-bio/sourmash/compare/r0.12.0...r0.12.1
[0.12.0]: https://github.com/sourmash-bio/sourmash/compare/r0.11.0...r0.12.0
[0.11.0]: https://github.com/sourmash-bio/sourmash/compare/r0.10.0...r0.11.0
[0.10.0]: https://github.com/sourmash-bio/sourmash/compare/r0.9.0...r0.10.0
[0.9.0]: https://github.com/sourmash-bio/sourmash/compare/r0.9.0...r0.10.0
Expand Down
27 changes: 12 additions & 15 deletions src/core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
[package]
name = "sourmash"
version = "0.13.0"
authors = ["Luiz Irber <luiz.irber@gmail.com>"]
description = "MinHash sketches for genomic data"
version = "0.13.1"
authors = ["Luiz Irber <luiz@sourmash.bio>", "N. Tessa Pierce-Ward <tessa@sourmash.bio>"]
description = "tools for comparing biological sequences with k-mer sketches"
repository = "https://github.com/sourmash-bio/sourmash"
keywords = ["minhash", "bioinformatics"]
categories = ["science", "algorithms", "data-structures"]
Expand Down Expand Up @@ -43,6 +43,7 @@ log = "0.4.21"
md5 = "0.7.0"
memmap2 = "0.9.4"
murmurhash3 = "0.0.5"
needletail = { version = "0.5.1", default-features = false }
niffler = { version = "2.3.1", default-features = false, features = [ "gz" ] }
nohash-hasher = "0.2.0"
num-iter = "0.1.44"
Expand All @@ -64,8 +65,6 @@ typed-builder = "0.18.0"
vec-collections = "0.4.3"

[dev-dependencies]
criterion = "0.5.1"
needletail = { version = "0.5.1", default-features = false }
proptest = { version = "1.4.0", default-features = false, features = ["std"]}
rand = "0.8.2"
tempfile = "3.10.1"
Expand Down Expand Up @@ -95,21 +94,19 @@ skip_feature_sets = [

## Wasm section. Crates only used for WASM, as well as specific configurations

[target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies.wasm-bindgen]
version = "0.2.89"
features = ["serde-serialize"]
[target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies]
js-sys = "0.3.68"
web-sys = { version = "0.3.69", features = ["console", "File", "FileReaderSync"] }
wasm-bindgen = { version = "0.2.89", features = ["serde-serialize"] }

[target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies.web-sys]
version = "0.3.69"
features = ["console", "File"]

[target.'cfg(all(target_arch = "wasm32"))'.dependencies.chrono]
version = "0.4.32"
features = ["wasmbind"]
[target.'cfg(all(target_arch = "wasm32"))'.dependencies]
chrono = { version = "0.4.32", features = ["wasmbind"] }

[target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dev-dependencies]
wasm-bindgen-test = "0.3.42"

### These crates don't compile on wasm
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
rocksdb = { version = "0.21.0", optional = true }
[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies]
criterion = "0.5.1"
193 changes: 179 additions & 14 deletions src/core/src/wasm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#[global_allocator]
static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT;

use needletail::parse_fastx_reader;
use wasm_bindgen::prelude::*;

use crate::cmd::ComputeParameters as _ComputeParameters;
Expand Down Expand Up @@ -57,15 +58,15 @@ impl KmerMinHash {
}

#[wasm_bindgen]
pub fn add_sequence_js(&mut self, buf: &str) {
self.0
.add_sequence(buf.as_bytes(), true)
.expect("Error adding sequence");
/// Adds the sequence in `buf` to the wrapped value (`self.0`).
///
/// The bytes of `buf` are passed to `add_sequence` with `true` as the
/// second argument.
///
/// # Errors
///
/// Returns a [`JsErrors`] if `add_sequence` fails.
pub fn add_sequence_js(&mut self, buf: &str) -> Result<(), JsErrors> {
self.0.add_sequence(buf.as_bytes(), true)?;
Ok(())
}

#[wasm_bindgen]
pub fn to_json(&mut self) -> String {
serde_json::to_string(&self.0).unwrap()
pub fn to_json(&mut self) -> Result<String, JsErrors> {
let json = serde_json::to_string(&self.0)?;
Ok(json)
}
}

Expand All @@ -81,6 +82,40 @@ impl ComputeParameters {
/// Forwards `ksizes` to `set_ksizes` on the wrapped value (`self.0`).
pub fn set_ksizes(&mut self, ksizes: Vec<u32>) {
self.0.set_ksizes(ksizes);
}

/// Forwards `scaled` to `set_scaled` on the wrapped value (`self.0`).
///
/// The `u32` received from JavaScript is widened to `u64` losslessly.
#[wasm_bindgen]
pub fn set_scaled(&mut self, scaled: u32) {
    self.0.set_scaled(u64::from(scaled));
}

/// Forwards `num` to `set_num_hashes` on the wrapped value (`self.0`).
#[wasm_bindgen]
pub fn set_num(&mut self, num: u32) {
    self.0.set_num_hashes(num);
}

/// Forwards `is_protein` to `set_protein` on the wrapped value (`self.0`).
#[wasm_bindgen]
pub fn set_protein(&mut self, is_protein: bool) {
    self.0.set_protein(is_protein);
}

/// Forwards `dayhoff` to `set_dayhoff` on the wrapped value (`self.0`).
#[wasm_bindgen]
pub fn set_dayhoff(&mut self, dayhoff: bool) {
    self.0.set_dayhoff(dayhoff);
}

/// Forwards `hp` to `set_hp` on the wrapped value (`self.0`).
#[wasm_bindgen]
pub fn set_hp(&mut self, hp: bool) {
    self.0.set_hp(hp);
}

/// Forwards `track` to `set_track_abundance` on the wrapped value (`self.0`).
#[wasm_bindgen]
pub fn set_track_abundance(&mut self, track: bool) {
    self.0.set_track_abundance(track);
}

/// Forwards `seed` to `set_seed` on the wrapped value (`self.0`).
///
/// The `u32` received from JavaScript is converted with `Into` to whatever
/// integer type the wrapped setter expects.
#[wasm_bindgen]
pub fn set_seed(&mut self, seed: u32) {
    self.0.set_seed(seed.into());
}
}

#[wasm_bindgen]
Expand All @@ -93,27 +128,68 @@ impl Signature {
}

#[wasm_bindgen]
pub fn add_sequence_js(&mut self, buf: &str) {
self.0
.add_sequence(buf.as_bytes(), true)
.expect("Error adding sequence");
/// Adds the sequence in `buf` to the wrapped value (`self.0`).
///
/// The bytes of `buf` are passed to `add_sequence` with `true` as the
/// second argument.
///
/// # Errors
///
/// Returns a [`JsErrors`] if `add_sequence` fails.
pub fn add_sequence_js(&mut self, buf: &str) -> Result<(), JsErrors> {
self.0.add_sequence(buf.as_bytes(), true)?;

Ok(())
}

#[wasm_bindgen]
pub fn add_from_file(&mut self, fp: web_sys::File) {
unimplemented!()
/// Reads sequence records from a browser `File` and adds each one to the
/// wrapped value (`self.0`).
///
/// `fp` is wrapped in a [`SyncFile`], handed to `niffler::send::get_reader`,
/// and the resulting reader is parsed with needletail's `parse_fastx_reader`
/// through a 16 MiB `BufReader`. `callback`, if given, is passed on to the
/// [`SyncFile`], which calls it with the read progress.
///
/// # Errors
///
/// Returns a [`JsErrors`] if niffler cannot build a reader, if the parser
/// cannot be created, if any record fails to parse, or if `add_sequence`
/// rejects a record.
pub fn add_from_file(
&mut self,
fp: web_sys::File,
callback: Option<js_sys::Function>,
) -> Result<(), JsErrors> {
// Synchronous `Read` adapter over the browser file.
let wf = SyncFile::new(fp, callback);

// The format niffler reports is not needed here; only the reader is used.
let (rdr, _format) = niffler::send::get_reader(Box::new(wf))?;

let mut parser = parse_fastx_reader(std::io::BufReader::with_capacity(
1024 << 14, // 16 MiB
rdr,
))?;

// `parser.next()` yields one `Result` per record; stop at the first error.
while let Some(record) = parser.next() {
let record = record?;
self.0.add_sequence(&record.seq(), true)?;
}

Ok(())
}

#[wasm_bindgen]
pub fn to_json(&mut self) -> String {
serde_json::to_string(&self.0).unwrap()
pub fn to_json(&mut self) -> Result<String, JsErrors> {
let json = serde_json::to_string(&self.0)?;
Ok(json)
}

/// Returns the value of `size()` on the wrapped value (`self.0`).
pub fn size(&self) -> usize {
self.0.size()
}
}

/// Errors returned to JavaScript by the wasm bindings in this module.
///
/// Each variant wraps one underlying error type. `#[error(transparent)]`
/// forwards `Display` and `source()` to the wrapped error, and `#[from]`
/// generates the `From` impl that lets `?` convert into this enum.
#[derive(thiserror::Error, Debug)]
pub enum JsErrors {
/// An error from this crate (`crate::Error`).
#[error(transparent)]
SourmashError(#[from] crate::Error),

/// A `serde_json` error.
#[error(transparent)]
SerdeError(#[from] serde_json::error::Error),

/// A `niffler` error.
#[error(transparent)]
NifflerError(#[from] niffler::Error),

/// A `needletail` parse error.
#[error(transparent)]
NeedletailError(#[from] needletail::errors::ParseError),
}

impl Into<JsValue> for JsErrors {
fn into(self) -> JsValue {
let error = js_sys::Error::new(&self.to_string());
error.into()
}
}

#[cfg(test)]
mod test {
use super::*;
Expand All @@ -127,3 +203,92 @@ mod test {
assert_eq!(sig.size(), 3);
}
}

// ==============================

use js_sys::Number;
use js_sys::Uint8Array;
use once_cell::sync::Lazy;
use web_sys::FileReaderSync;

// One `FileReaderSync` per thread, shared by every `SyncFile` read on that thread.
//
// Construction is deferred until first use; if `FileReaderSync::new()` fails
// the `expect` below panics with a hint about the web worker context.
//
// NOTE(review): `thread_local!` initializers already run lazily on first
// access, so the extra `Lazy` wrapper looks redundant — confirm before removing.
thread_local! {
static FILE_READER_SYNC: Lazy<FileReaderSync> = Lazy::new(|| {
FileReaderSync::new().expect("Failed to create FileReaderSync. Is it running in a web worker context?")
});
}

/// Wrapper around a `web_sys::File` that implements `std::io::Read`.
///
/// NOTE(review): this comment previously also promised `Seek`, but no `Seek`
/// impl is visible in this file — confirm before relying on it.
pub struct SyncFile {
/// The browser file being read.
file: web_sys::File,
/// Byte offset at which the next read starts.
pos: u64,
/// Optional JavaScript function called with the current progress each time
/// the position is updated.
cb: Option<js_sys::Function>,
}

/// Because this needs to be initialized in a Web Worker, it is safe to make it Send.
/// (hopefully. I don't think they can be sent across Web Workers, nor accessed from other WW)
// SAFETY: unverified. The argument above relies on a `SyncFile` never leaving
// the worker that created it; nothing in the type system enforces that.
// NOTE(review): `Send` is presumably required because `add_from_file` passes
// the boxed `SyncFile` to `niffler::send::get_reader` — confirm.
unsafe impl Send for SyncFile {}

impl SyncFile {
    /// Creates a reader positioned at the start of `file`.
    ///
    /// `cb`, when present, is called with one argument — the value of
    /// [`SyncFile::progress`] — every time the read position is updated.
    pub fn new(file: web_sys::File, cb: Option<js_sys::Function>) -> Self {
        Self { file, pos: 0, cb }
    }

    /// File size in bytes.
    ///
    /// # Panics
    ///
    /// Panics if the size reported by the browser is not less than or equal
    /// to `Number.MAX_SAFE_INTEGER`, because the `f64` could then no longer
    /// be converted to an integer exactly.
    pub fn size(&self) -> u64 {
        let size = self.file.size();
        if size <= Number::MAX_SAFE_INTEGER {
            size as u64
        } else {
            panic!("size is not safe to convert to integer from float")
        }
    }

    /// Records the new read position and reports progress to the callback.
    ///
    /// # Panics
    ///
    /// Panics if the JavaScript callback throws.
    fn set_pos(&mut self, pos: u64) {
        self.pos = pos;
        // `if let` rather than `Option::map`: the call is made purely for its
        // side effect, there is no value to map to.
        if let Some(f) = &self.cb {
            let arr = js_sys::Array::new_with_length(1);
            arr.set(0, self.progress().into());
            f.apply(&JsValue::null(), &arr)
                .expect("Error calling progress callback");
        }
    }

    /// Current progress on the file, as the fraction of bytes read so far.
    ///
    /// Returns `1.0` when the file size is not a positive number (for
    /// example an empty file): there is nothing left to read, and dividing
    /// by zero would hand `NaN` to the progress callback.
    pub fn progress(&self) -> f64 {
        let total = self.file.size();
        if total > 0.0 {
            self.pos as f64 / total
        } else {
            1.0
        }
    }
}

impl std::io::Read for SyncFile {
    /// Reads up to `buf.len()` bytes starting at the current position.
    ///
    /// The requested byte range is sliced out of the file as a `Blob`, read
    /// synchronously through the thread-local `FileReaderSync`, and copied
    /// into the front of `buf`. The position is then advanced by the number
    /// of bytes received, which also triggers the progress callback.
    ///
    /// # Errors
    ///
    /// Returns an `ErrorKind::Other` error if slicing or reading the file
    /// fails, or if an offset or length does not fit the integer type needed.
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
        // `std::io::Error::other` was only stabilized in Rust 1.74, which is
        // newer than the crate's documented MSRV (1.65). This builds the
        // same error with the long-standing constructor.
        fn other(msg: &'static str) -> std::io::Error {
            std::io::Error::new(std::io::ErrorKind::Other, msg)
        }

        let current_offset = self.pos;
        let requested = u64::try_from(buf.len()).map_err(|_| other("Can't convert to u64"))?;

        // `slice_with_f64_and_f64` takes its offsets as `f64`.
        let new_offset_f64 = current_offset as f64;
        let new_offset_end_f64 = current_offset.saturating_add(requested) as f64;

        let blob = self
            .file
            .slice_with_f64_and_f64(new_offset_f64, new_offset_end_f64)
            .map_err(|_| other("failed to slice file"))?;
        let array_buffer = FILE_READER_SYNC
            .with(|frs| frs.read_as_array_buffer(&blob))
            .map_err(|_| other("failed to read as array buffer"))?;

        let array = Uint8Array::new(&array_buffer);
        let read_bytes = usize::try_from(array.byte_length())
            .map_err(|_| other("read too many bytes at once"))?;

        // Copy to output buffer; only the first `read_bytes` bytes are written.
        array.copy_to(&mut buf[..read_bytes]);

        // Update position
        self.set_pos(
            current_offset
                .checked_add(read_bytes as u64)
                .ok_or_else(|| other("new position too large"))?,
        );

        Ok(read_bytes)
    }
}
5 changes: 5 additions & 0 deletions src/core/tests/dedicated_worker.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// Compile this test file only for `wasm32` targets whose OS is `unknown`.
#![cfg(all(target_arch = "wasm32", target_os = "unknown"))]

use wasm_bindgen_test::wasm_bindgen_test_configure;

// Configure wasm-bindgen-test with the `run_in_dedicated_worker` option.
// NOTE(review): presumably needed because `FileReaderSync` must be created in
// a worker context — confirm against the wasm-bindgen-test documentation.
wasm_bindgen_test_configure!(run_in_dedicated_worker);

0 comments on commit 2b1bf0d

Please sign in to comment.