sandbox-science · chrisdedman · Jul 23, 2025 · Jul 23, 2025 · Jul 23, 2025 · Jul 23, 2025
diff --git a/README.md b/README.md
@@ -37,9 +37,24 @@ cargo install --path .
 ```
 
 ### 3. Run Capski
+Basic transcription (auto-detects language):
 ```bash
 capski --input "example/input_audio.wav"
 ```
+You can also translate your non-English audio to English:
+```bash
+capski --input "japanese_audio.wav" --translate
+```
+You can also explicitly set the source language when translating to English:
+```bash
+capski --input "french_audio.wav" --language FR --translate
+```
+
+> [!NOTE]
+> Capski uses Whisper to transcribe audio.
+> If you want to translate non-English speech into English subtitles, use the `--translate` flag. You can optionally add the `--language` option to specify the source language (e.g., `fr` for French, `es` for Spanish); if it is omitted, the language is auto-detected.
+>
+> 📌 Whisper only supports translation into English. Translating English into other languages is not supported.
 
 This runs the pipeline end-to-end:
 - extracts or processes audio,

diff --git a/example/output_translated_sub.mp4 b/example/output_translated_sub.mp4
diff --git a/src/audio/mod.rs b/src/audio/mod.rs
@@ -8,7 +8,12 @@ use crate::types::Segment;
 use anyhow::Result;
 
 pub trait Capski {
-    fn transcribe(model_path: &str, audio_path: &str) -> Result<Vec<Segment>>;
+    fn transcribe(
+        model_path: &str,
+        audio_path: &str,
+        translate: bool,
+        language: &Option<String>,
+    ) -> Result<Vec<Segment>>;
 }
 
 pub trait Extractor {

diff --git a/src/audio/whisper.rs b/src/audio/whisper.rs
@@ -7,11 +7,130 @@ use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextPar
 
 pub struct WhisperCapski;
 
+const LANGUAGES: &[(&str, &str)] = &[
+    ("en", "english"),
+    ("zh", "chinese"),
+    ("de", "german"),
+    ("es", "spanish"),
+    ("ru", "russian"),
+    ("ko", "korean"),
+    ("fr", "french"),
+    ("ja", "japanese"),
+    ("pt", "portuguese"),
+    ("tr", "turkish"),
+    ("pl", "polish"),
+    ("ca", "catalan"),
+    ("nl", "dutch"),
+    ("ar", "arabic"),
+    ("sv", "swedish"),
+    ("it", "italian"),
+    ("id", "indonesian"),
+    ("hi", "hindi"),
+    ("fi", "finnish"),
+    ("vi", "vietnamese"),
+    ("he", "hebrew"),
+    ("uk", "ukrainian"),
+    ("el", "greek"),
+    ("ms", "malay"),
+    ("cs", "czech"),
+    ("ro", "romanian"),
+    ("da", "danish"),
+    ("hu", "hungarian"),
+    ("ta", "tamil"),
+    ("no", "norwegian"),
+    ("th", "thai"),
+    ("ur", "urdu"),
+    ("hr", "croatian"),
+    ("bg", "bulgarian"),
+    ("lt", "lithuanian"),
+    ("la", "latin"),
+    ("mi", "maori"),
+    ("ml", "malayalam"),
+    ("cy", "welsh"),
+    ("sk", "slovak"),
+    ("te", "telugu"),
+    ("fa", "persian"),
+    ("lv", "latvian"),
+    ("bn", "bengali"),
+    ("sr", "serbian"),
+    ("az", "azerbaijani"),
+    ("sl", "slovenian"),
+    ("kn", "kannada"),
+    ("et", "estonian"),
+    ("mk", "macedonian"),
+    ("br", "breton"),
+    ("eu", "basque"),
+    ("is", "icelandic"),
+    ("hy", "armenian"),
+    ("ne", "nepali"),
+    ("mn", "mongolian"),
+    ("bs", "bosnian"),
+    ("kk", "kazakh"),
+    ("sq", "albanian"),
+    ("sw", "swahili"),
+    ("gl", "galician"),
+    ("mr", "marathi"),
+    ("pa", "punjabi"),
+    ("si", "sinhala"),
+    ("km", "khmer"),
+    ("sn", "shona"),
+    ("yo", "yoruba"),
+    ("so", "somali"),
+    ("af", "afrikaans"),
+    ("oc", "occitan"),
+    ("ka", "georgian"),
+    ("be", "belarusian"),
+    ("tg", "tajik"),
+    ("sd", "sindhi"),
+    ("gu", "gujarati"),
+    ("am", "amharic"),
+    ("yi", "yiddish"),
+    ("lo", "lao"),
+    ("uz", "uzbek"),
+    ("fo", "faroese"),
+    ("ht", "haitian creole"),
+    ("ps", "pashto"),
+    ("tk", "turkmen"),
+    ("nn", "nynorsk"),
+    ("mt", "maltese"),
+    ("sa", "sanskrit"),
+    ("lb", "luxembourgish"),
+    ("my", "myanmar"),
+    ("bo", "tibetan"),
+    ("tl", "tagalog"),
+    ("mg", "malagasy"),
+    ("as", "assamese"),
+    ("tt", "tatar"),
+    ("haw", "hawaiian"),
+    ("ln", "lingala"),
+    ("ha", "hausa"),
+    ("ba", "bashkir"),
+    ("jw", "javanese"),
+    ("su", "sundanese"),
+];
+
 impl Capski for WhisperCapski {
     // Function to transcribe audio using the Whisper model
-    fn transcribe(model_path: &str, audio_path: &str) -> Result<Vec<Segment>> {
+    fn transcribe(
+        model_path: &str,
+        audio_path: &str,
+        translate: bool,
+        language: &Option<String>,
+    ) -> Result<Vec<Segment>> {
         info!("Transcribing with Whisper...");
 
+        let language_code = language
+            .as_ref()
+            .map(|s| s.to_lowercase())
+            .as_ref()
+            .and_then(|lang| {
+                LANGUAGES
+                    .iter()
+                    .find(|&&(code, name)| code == lang || name == lang)
+                    .map(|&(code, _)| code)
+            })
+            .unwrap_or("auto");
+
         let reader = hound::WavReader::open(audio_path)
             .with_context(|| format!("failed to open audio file: {}", audio_path))?;
 
@@ -24,8 +143,8 @@ impl Capski for WhisperCapski {
 
         // Set up parameters for the Whisper model
         let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 0 });
-        params.set_translate(false);
-        params.set_language(Some("auto"));
+        params.set_translate(translate);
+        params.set_language(Some(language_code));
         params.set_print_special(false);
         params.set_print_progress(false);
         params.set_print_realtime(false);

diff --git a/src/cli.rs b/src/cli.rs
@@ -3,9 +3,9 @@ use clap::Parser;
 #[derive(Parser, Debug)]
 #[command(
     name = "Capski",
-    version = "0.1.0",
+    version = "0.2.0",
     author = "Chris Dedman",
-    about = "Create karaoke-style videos from audio or video",
+    about = "Create karaoke-style videos from audio or video.",
     disable_help_flag = false,
     disable_version_flag = false
 )]
@@ -15,4 +15,18 @@ pub struct Opts {
 
     #[arg(short, long, default_value = "output.mp4")]
     pub output: String,
+
+    #[arg(
+        long,
+        default_value_t = false,
+        help = "Translate from the source language to English."
+    )]
+    pub translate: bool,
+
+    #[arg(
+        long,
+        default_value = "auto",
+        help = "Specify the source language ('fr', 'es', etc). Defaults to 'auto'."
+    )]
+    pub language: String,
 }
diff --git a/src/main.rs b/src/main.rs
@@ -16,6 +16,8 @@ fn main() -> Result<()> {
         output: opts.output,
         model_path: "model/ggml-tiny.bin".to_string(),
         style,
+        translate: opts.translate,
+        language: Some(opts.language),
     };
 
     app.run()

diff --git a/src/pipeline.rs b/src/pipeline.rs
@@ -9,6 +9,8 @@ use std::path::Path;
 pub struct CapskiApp {
     pub input: String,
     pub output: String,
+    pub translate: bool,
+    pub language: Option<String>,
     pub model_path: String,
     pub style: StyleConfig,
 }
@@ -27,7 +29,12 @@ impl CapskiApp {
         let subtitle_path = build_dir.join(format!("{}.ass", base));
 
         FfmpegExtractor::extract(&self.input, audio_path.to_str().unwrap())?;
-        let segments = WhisperCapski::transcribe(&self.model_path, audio_path.to_str().unwrap())?;
+        let segments = WhisperCapski::transcribe(
+            &self.model_path,
+            audio_path.to_str().unwrap(),
+            self.translate,
+            &self.language,
+        )?;
 
         SubtitleGenerator::generate(segments, subtitle_path.to_str().unwrap(), &self.style)?;
         SubtitleGenerator::burn(