From 63cdaa39c2c5e1931924706ea38e331419e8bcb9 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 22 Jul 2025 21:16:58 -0700 Subject: [PATCH 1/7] Move subtitle to its own structure --- src/{subtitle.rs => subtitle/ass.rs} | 1 + src/subtitle/mod.rs | 3 +++ 2 files changed, 4 insertions(+) rename src/{subtitle.rs => subtitle/ass.rs} (99%) create mode 100644 src/subtitle/mod.rs diff --git a/src/subtitle.rs b/src/subtitle/ass.rs similarity index 99% rename from src/subtitle.rs rename to src/subtitle/ass.rs index e28468b..f84d897 100644 --- a/src/subtitle.rs +++ b/src/subtitle/ass.rs @@ -5,6 +5,7 @@ use log::info; use std::fs::File; use std::io::Write; use std::process::Command; + pub struct SubtitleGenerator; impl SubtitleGenerator { diff --git a/src/subtitle/mod.rs b/src/subtitle/mod.rs new file mode 100644 index 0000000..e978916 --- /dev/null +++ b/src/subtitle/mod.rs @@ -0,0 +1,3 @@ +mod ass; + +pub use ass::SubtitleGenerator; From 2472e23ff617a15b5737f942939e453779496960 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 22 Jul 2025 21:17:35 -0700 Subject: [PATCH 2/7] added .DS_Store to gitignore --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 83854d9..2069ea5 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,6 @@ build .venv .mypy_cache -target \ No newline at end of file +target + +.DS_Store \ No newline at end of file From f020d86a0fb4616db7e874e8dec6a1057619c9e2 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 22 Jul 2025 21:17:50 -0700 Subject: [PATCH 3/7] Move audio to its own structure --- src/audio/extractor.rs | 39 ++++++++++++++++++++++++++++++ src/audio/mod.rs | 16 ++++++++++++ src/{audio.rs => audio/whisper.rs} | 39 +++++------------------------- 3 files changed, 61 insertions(+), 33 deletions(-) create mode 100644 src/audio/extractor.rs create mode 100644 src/audio/mod.rs rename src/{audio.rs => audio/whisper.rs} (76%) diff --git a/src/audio/extractor.rs b/src/audio/extractor.rs new file mode 
100644 index 0000000..95afca6 --- /dev/null +++ b/src/audio/extractor.rs @@ -0,0 +1,39 @@ +use super::Extractor; +use anyhow::{Context, Result}; +use log::info; +use std::process::Command; + +pub struct FfmpegExtractor; + +impl Extractor for FfmpegExtractor { + // Function to extract audio from a video file + fn extract(input: &str, output: &str) -> Result<()> { + info!("Extracting audio from {} to {}", input, output); + + let status = Command::new("ffmpeg") + .args(&[ + "-y", + "-i", + input, + "-vn", + "-acodec", + "pcm_s16le", + "-ar", + "16000", + "-ac", + "1", + output, + ]) + .status() + .context("Failed to extract audio via FFmpeg")?; + + if !status.success() { + anyhow::bail!( + "ffmpeg failed to extract audio: exited with code {}", + status + ); + } + + Ok(()) + } +} diff --git a/src/audio/mod.rs b/src/audio/mod.rs new file mode 100644 index 0000000..f1186cb --- /dev/null +++ b/src/audio/mod.rs @@ -0,0 +1,16 @@ +mod extractor; +mod whisper; + +pub use extractor::FfmpegExtractor; +pub use whisper::WhisperCapski; + +use crate::types::Segment; +use anyhow::Result; + +pub trait Capski { + fn transcribe(model_path: &str, audio_path: &str) -> Result<Vec<Segment>>; +} + +pub trait Extractor { + fn extract(input: &str, output: &str) -> Result<()>; +} diff --git a/src/audio.rs b/src/audio/whisper.rs similarity index 76% rename from src/audio.rs rename to src/audio/whisper.rs index 3cef3ea..ad58aa0 100644 --- a/src/audio.rs +++ b/src/audio/whisper.rs @@ -1,18 +1,19 @@ use crate::types::Segment; +use super::Capski; use anyhow::{Context, Result}; use log::info; -use std::process::Command; use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters}; -pub struct Audio; +pub struct WhisperCapski; -impl Audio { +impl Capski for WhisperCapski { // Function to transcribe audio using the Whisper model - pub fn transcribe(model_path: &str, audio_path: &str) -> Result<Vec<Segment>> { + fn transcribe(model_path: &str, audio_path: &str) -> Result<Vec<Segment>> { info!("Transcribing with 
Whisper..."); - let reader = hound::WavReader::open(audio_path).expect("failed to open file"); + let reader = hound::WavReader::open(audio_path) + .with_context(|| format!("failed to open audio file: {}", audio_path))?; // Read WAV file and collect samples let samples: Vec<i16> = reader.into_samples::<i16>().map(|x| x.unwrap()).collect(); @@ -91,32 +92,4 @@ impl Audio { Ok(segments) } - - // Function to extract audio from a video file - pub fn extract(video_path: &str, audio_path: &str) -> Result<()> { - info!("Extracting audio from {} to {}", video_path, audio_path); - - let status = Command::new("ffmpeg") - .args(&[ - "-y", - "-i", - video_path, - "-vn", - "-acodec", - "pcm_s16le", - "-ar", - "16000", - "-ac", - "1", - audio_path, - ]) - .status() - .context("Failed to extract audio")?; - - if !status.success() { - return Err(anyhow::anyhow!("ffmpeg failed to extract audio")); - } - - Ok(()) - } } From 3fef627861a040a488bf2d3cf83d46d099e8bde9 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 22 Jul 2025 21:18:41 -0700 Subject: [PATCH 4/7] refactor the entry point of Capski --- src/lib.rs | 47 ++--------------------------------------------- src/main.rs | 23 +++++++++++++++++++++-- src/pipeline.rs | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 47 deletions(-) create mode 100644 src/pipeline.rs diff --git a/src/lib.rs b/src/lib.rs index ac1c9f0..bcab720 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,50 +1,7 @@ pub mod audio; pub mod cli; +pub mod pipeline; pub mod subtitle; pub mod types; -use crate::audio::Audio; -use crate::cli::Opts; -use crate::subtitle::SubtitleGenerator; -use crate::types::StyleConfig; - -use anyhow::{Context, Result}; -use clap::Parser; -use log::info; -use std::fs::File; -use std::path::Path; - -pub fn run_cli() -> Result<()> { - env_logger::init(); - let opts: Opts = Opts::try_parse()?; - - let input_video = opts.input; - let output_video = opts.output; - - let temp_dir = Path::new("build"); - 
std::fs::create_dir_all(temp_dir).context("Failed to create temp directory")?; - - let base = Path::new(&input_video) - .file_stem() - .and_then(|s| s.to_str()) - .context("Invalid input video path")?; - - let audio_path = temp_dir.join(format!("{}_audio.wav", base)); - let srt_path = temp_dir.join(format!("{}.ass", base)); - - Audio::extract(&input_video, audio_path.to_str().unwrap())?; - - let segments = Audio::transcribe("model/ggml-tiny.bin", audio_path.to_str().unwrap())?; - let style_config: StyleConfig = serde_json::from_reader(File::open("style.json")?)?; - - SubtitleGenerator::generate(segments, srt_path.to_str().unwrap(), &style_config)?; - SubtitleGenerator::burn( - &input_video, - srt_path.to_str().unwrap(), - &output_video, - &style_config, - )?; - - info!("Done! Video saved to: {}", output_video); - Ok(()) -} +pub use pipeline::CapskiApp; diff --git a/src/main.rs b/src/main.rs index 1b6ba03..feca465 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,22 @@ -fn main() -> anyhow::Result<()> { - transcriber::run_cli() +use capski::CapskiApp; +use capski::cli::Opts; + +use anyhow::Result; +use clap::Parser; +use std::fs::File; + +fn main() -> Result<()> { + env_logger::init(); + + let opts: Opts = Opts::parse(); + let style = serde_json::from_reader(File::open("style.json")?)?; + + let app = CapskiApp { + input: opts.input, + output: opts.output, + model_path: "model/ggml-tiny.bin".to_string(), + style, + }; + + app.run() } diff --git a/src/pipeline.rs b/src/pipeline.rs new file mode 100644 index 0000000..d12a2b9 --- /dev/null +++ b/src/pipeline.rs @@ -0,0 +1,43 @@ +use crate::audio::{Capski, Extractor, FfmpegExtractor, WhisperCapski}; +use crate::subtitle::SubtitleGenerator; +use crate::types::StyleConfig; + +use anyhow::{Context, Result}; +use log::info; +use std::path::Path; + +pub struct CapskiApp { + pub input: String, + pub output: String, + pub model_path: String, + pub style: StyleConfig, +} + +impl CapskiApp { + pub fn run(&self) -> Result<()> 
{ + let base = Path::new(&self.input) + .file_stem() + .and_then(|s| s.to_str()) + .context("Invalid input video path")?; + + let build_dir = Path::new("build"); + std::fs::create_dir_all(build_dir).context("Failed to create temp directory")?; + + let audio_path = build_dir.join(format!("{}_audio.wav", base)); + let subtitle_path = build_dir.join(format!("{}.ass", base)); + + FfmpegExtractor::extract(&self.input, audio_path.to_str().unwrap())?; + let segments = WhisperCapski::transcribe(&self.model_path, audio_path.to_str().unwrap())?; + + SubtitleGenerator::generate(segments, subtitle_path.to_str().unwrap(), &self.style)?; + SubtitleGenerator::burn( + &self.input, + subtitle_path.to_str().unwrap(), + &self.output, + &self.style, + )?; + + info!("Done! Video saved to: {}", self.output); + Ok(()) + } +} From 91ed073039a9957e74d9377321db3106f1d3acb5 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 22 Jul 2025 21:19:40 -0700 Subject: [PATCH 5/7] Rename transcriber to Capski --- Cargo.toml | 2 +- README.md | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index dab9faf..2d0dcb9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "transcriber" +name = "capski" version = "0.1.0" edition = "2024" diff --git a/README.md b/README.md index a58b4c4..30ec312 100644 --- a/README.md +++ b/README.md @@ -30,15 +30,15 @@ The Engineering Requirements Document (**ERD**) is available here : brew install ffmpeg ``` -### 2. Build Transcriber -You can now build the transcriber using Rust Cargo. +### 2. Install Capski +You can now install Capski using Rust Cargo. ```bash -cargo build +cargo install --path . ``` -### 3. Run Transcriber +### 3. 
Run Capski ```bash -cargo run -- --input "example/input_audio.wav" +capski --input "example/input_audio.wav" ``` This runs the pipeline end-to-end: From faaec8004879359654000734ef9ff0b15fc79f1d Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 22 Jul 2025 21:22:20 -0700 Subject: [PATCH 6/7] update lock file --- Cargo.lock | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 80df6f4..c968330 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -104,6 +104,20 @@ version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +[[package]] +name = "capski" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "env_logger", + "hound", + "log", + "serde", + "serde_json", + "whisper-rs", +] + [[package]] name = "cc" version = "1.2.27" @@ -456,20 +470,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "transcriber" -version = "0.1.0" -dependencies = [ - "anyhow", - "clap", - "env_logger", - "hound", - "log", - "serde", - "serde_json", - "whisper-rs", -] - [[package]] name = "unicode-ident" version = "1.0.18" From c49bf5766123e1bdae151f7b3f6f5a0bf93b3aef Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 22 Jul 2025 21:54:03 -0700 Subject: [PATCH 7/7] ran cargo update --- Cargo.lock | 104 ++++++++++++++++++++++++++++++++++++++++++----------- src/cli.rs | 9 ++++- 2 files changed, 92 insertions(+), 21 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c968330..25cbd83 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -120,9 +120,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.27" +version = "1.2.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d487aa071b5f64da6f19a3e848e3578944b726ee5a4854b82172f02aa876bfdc" +checksum = "deec109607ca693028562ed836a5f1c4b8bd77755c4e132fc5ce11b0b6211ae7" dependencies = [ "shlex", ] @@ -155,9 +155,9 @@ dependencies = [ [[package]] 
name = "clap" -version = "4.5.40" +version = "4.5.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40b6887a1d8685cebccf115538db5c0efe625ccac9696ad45c409d96566e910f" +checksum = "be92d32e80243a54711e5d7ce823c35c41c9d929dc4ab58e1276f625841aadf9" dependencies = [ "clap_builder", "clap_derive", @@ -165,9 +165,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.40" +version = "4.5.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0c66c08ce9f0c698cbce5c0279d0bb6ac936d8674174fe48f736533b964f59e" +checksum = "707eab41e9622f9139419d573eca0900137718000c517d47da73045f54331c3d" dependencies = [ "anstream", "anstyle", @@ -177,9 +177,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.40" +version = "4.5.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c7947ae4cc3d851207c1adb5b5e260ff0cca11446b1d6d1423788e442257ce" +checksum = "ef4f52386a59ca4c860f7393bcf8abd8dfd91ecccc0f774635ff68e92eeef491" dependencies = [ "heck", "proc-macro2", @@ -300,7 +300,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets", + "windows-targets 0.53.2", ] [[package]] @@ -428,9 +428,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.140" +version = "1.0.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +checksum = "30b9eff21ebe718216c6ec64e1d9ac57087aad11efc64e32002bce4a0d4c03d3" dependencies = [ "itoa", "memchr", @@ -540,7 +540,7 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -549,14 +549,30 @@ version = "0.52.6" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c66f69fcc9ce11da9966ddb31a40968cad001c5bedeb5c2b82ede4253ab48aef" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", ] [[package]] @@ -565,44 +581,92 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.52.6" source 
= "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + 
[[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" diff --git a/src/cli.rs b/src/cli.rs index b01efd0..899dfdc 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,7 +1,14 @@ use clap::Parser; #[derive(Parser, Debug)] -#[command(version = "1.0", author = "Chris Dedman")] +#[command( + name = "Capski", + version = "0.1.0", + author = "Chris Dedman", + about = "Create karaoke-style videos from audio or video", + disable_help_flag = false, + disable_version_flag = false +)] pub struct Opts { #[arg(short, long, required = true)] pub input: String,