Skip to content

Commit

Permalink
Bugfix skip bad audio (#492)
Browse files Browse the repository at this point in the history
- Use new whisper-rs method to access segment bytes when string conversion fails
- Fix spelling of method name (`transcibe_audio` → `transcribe_audio`)

---------

Co-authored-by: travolin <joel@spyglass.fyi>
  • Loading branch information
travolin and travolin committed Aug 18, 2023
1 parent 0ce3d65 commit 332f20b
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 18 deletions.
8 changes: 3 additions & 5 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion crates/spyglass-processor/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ strum = "0.24"
strum_macros = "0.24"
symphonia = { version = "0.5.2", features = ["aac", "isomp4", "mp3", "mpa"] }
uuid = { version = "1.0.0", features = ["serde", "v4"], default-features = false }
whisper-rs = "0.8.0"
whisper-rs = { git = "https://github.com/tazz4843/whisper-rs.git", rev = "24e6a0025e2714ff2bed5861b07af8735fb71d18"}

# Internal spyglass libs
shared = { path = "../shared" }
Expand Down
58 changes: 47 additions & 11 deletions crates/spyglass-processor/src/parser/audio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ pub struct TranscriptionResult {
}

/// Given a path to an audio file (e.g. wav, ogg, or mp3), transcribe it using our **shhhh** models.
pub fn transcibe_audio(
pub fn transcribe_audio(
path: PathBuf,
model_path: PathBuf,
segment_len: i32,
Expand Down Expand Up @@ -259,14 +259,50 @@ pub fn transcibe_audio(
state.full(params, &audio_file.samples)?;
let num_segments = state.full_n_segments()?;
log::debug!("Extracted {} segments", num_segments);
let mut token_buffer = Vec::new();
let mut start_time_stored: Option<i64> = None;
for i in 0..num_segments {
let segment = state
.full_get_segment_text(i)
.expect("failed to get segment");
let start_timestamp = state.full_get_segment_t0(i)?;
let segment = match state.full_get_segment_text(i) {
Ok(segment) => {
token_buffer.clear();
start_time_stored = None;
Some(segment)
}
Err(_error) => {
match state.full_get_segment_bytes(i) {
Ok(bytes) => {
if start_time_stored.is_none() {
start_time_stored = Some(state.full_get_segment_t0(i)?);
}
token_buffer.extend(bytes);
}
Err(error) => {
log::error!("Error accessing bytes for segment {:?}", error);
}
}

match std::str::from_utf8(&token_buffer.clone()) {
Ok(str) => {
token_buffer.clear();
Some(str.to_string())
}
Err(_error) => None,
}
}
};

let mut start_timestamp = state.full_get_segment_t0(i)?;
let end_timestamp = state.full_get_segment_t1(i)?;
res.segments
.push(Segment::new(start_timestamp, end_timestamp, &segment));
if let Some(seg) = segment {
// If we had to piece together several segments to get a valid
// UTF-8 string, the text may begin in an earlier segment, so use
// the stored start time instead of this segment's.
if let Some(start) = start_time_stored {
start_timestamp = start;
}
res.segments
.push(Segment::new(start_timestamp, end_timestamp, &seg));
}
}
}
Err(err) => {
Expand All @@ -282,14 +318,14 @@ pub fn transcibe_audio(
#[cfg(test)]
mod test {
const MODEL_PATH: &str = "../../assets/models/whisper.base.en.bin";
use super::transcibe_audio;
use super::transcribe_audio;

#[test]
fn test_wav_transcription() {
// Use the sample from whisper.cpp as a baseline test.
let expected = include_str!("../../../../fixtures/audio/jfk.txt");
let path = "../../fixtures/audio/jfk.wav".into();
let res = transcibe_audio(path, MODEL_PATH.into(), 1).expect("Unable to transcribe");
let res = transcribe_audio(path, MODEL_PATH.into(), 1).expect("Unable to transcribe");
let segments = res.segments;
assert!(segments.len() > 0);

Expand All @@ -305,7 +341,7 @@ mod test {
fn test_ogg_transcription() {
let expected = include_str!("../../../../fixtures/audio/armstrong.txt");
let path = "../../fixtures/audio/armstrong.ogg".into();
let res = transcibe_audio(path, MODEL_PATH.into(), 1).expect("Unable to transcribe");
let res = transcribe_audio(path, MODEL_PATH.into(), 1).expect("Unable to transcribe");
let segments = res.segments;
assert!(segments.len() > 0);
let combined = segments
Expand All @@ -322,7 +358,7 @@ mod test {
fn test_mp3_transcription() {
let expected = include_str!("../../../../fixtures/audio/count_of_monte_cristo.txt");
let path = "../../fixtures/audio/count_of_monte_cristo.mp3".into();
let res = transcibe_audio(path, MODEL_PATH.into(), 1).expect("Unable to transcribe");
let res = transcribe_audio(path, MODEL_PATH.into(), 1).expect("Unable to transcribe");
let segments = res.segments;
assert!(segments.len() > 0);
let combined = segments
Expand Down
2 changes: 1 addition & 1 deletion crates/spyglass/src/crawler/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,7 @@ async fn _process_file(
log::warn!("whisper model not installed, skipping transcription");
content = None;
} else {
match parser::audio::transcibe_audio(path.to_path_buf(), model_path, 0) {
match parser::audio::transcribe_audio(path.to_path_buf(), model_path, 0) {
Ok(result) => {
// Update crawl result with appropriate title/stuff
if let Some(metadata) = result.metadata {
Expand Down

0 comments on commit 332f20b

Please sign in to comment.