Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,24 @@ cargo install --path .
```

### 3. Run Capski
Basic transcription (auto-detects language):
```bash
capski --input "example/input_audio.wav"
```
You can also translate your non-English audio to English:
```bash
capski --input "japanese_audio.wav" --translate
```
You can also explicitly set the source language when translating to English:
```bash
capski --input "french_audio.wav" --language FR --translate
```

> [!NOTE]
> Capski uses Whisper to transcribe audio.
> If you want to translate non-English speech into English subtitles, use the `--translate` flag. You can optionally add the `--language` option to specify the source language (e.g., `fr` for French, `es` for Spanish); if it is omitted, the language defaults to `auto`.
>
> 📌 Whisper only supports translation into English. Translating English into other languages is not supported.

This runs the pipeline end-to-end:
- extracts or processes audio,
Expand Down
Binary file added example/output_translated_sub.mp4
Binary file not shown.
7 changes: 6 additions & 1 deletion src/audio/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@ use crate::types::Segment;
use anyhow::Result;

/// Transcription backend interface.
///
/// `transcribe` takes the path of a model file and the path of an audio file
/// and returns the resulting [`Segment`]s, or an error. The `translate` flag
/// and the optional `language` are used by the implementation to configure
/// translation and the source language.
pub trait Capski {
fn transcribe(model_path: &str, audio_path: &str) -> Result<Vec<Segment>>;
fn transcribe(
model_path: &str,
audio_path: &str,
translate: bool,
language: &Option<String>,
) -> Result<Vec<Segment>>;
}

pub trait Extractor {
Expand Down
125 changes: 122 additions & 3 deletions src/audio/whisper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,130 @@ use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextPar

pub struct WhisperCapski;

/// Language lookup table: each entry is a `(code, name)` pair, with the name
/// written in lowercase English (e.g. `("fr", "french")`).
///
/// `WhisperCapski::transcribe` lowercases the requested language and compares
/// it against both elements of every pair, so either the code or the full name
/// is accepted. The code of the first matching pair is handed to Whisper;
/// a value that matches nothing here falls back to `"auto"`.
// NOTE(review): presumably mirrors the language list of the upstream Whisper
// model — confirm against upstream when adding or removing entries.
const LANGUAGES: &[(&str, &str)] = &[
("en", "english"),
("zh", "chinese"),
("de", "german"),
("es", "spanish"),
("ru", "russian"),
("ko", "korean"),
("fr", "french"),
("ja", "japanese"),
("pt", "portuguese"),
("tr", "turkish"),
("pl", "polish"),
("ca", "catalan"),
("nl", "dutch"),
("ar", "arabic"),
("sv", "swedish"),
("it", "italian"),
("id", "indonesian"),
("hi", "hindi"),
("fi", "finnish"),
("vi", "vietnamese"),
("he", "hebrew"),
("uk", "ukrainian"),
("el", "greek"),
("ms", "malay"),
("cs", "czech"),
("ro", "romanian"),
("da", "danish"),
("hu", "hungarian"),
("ta", "tamil"),
("no", "norwegian"),
("th", "thai"),
("ur", "urdu"),
("hr", "croatian"),
("bg", "bulgarian"),
("lt", "lithuanian"),
("la", "latin"),
("mi", "maori"),
("ml", "malayalam"),
("cy", "welsh"),
("sk", "slovak"),
("te", "telugu"),
("fa", "persian"),
("lv", "latvian"),
("bn", "bengali"),
("sr", "serbian"),
("az", "azerbaijani"),
("sl", "slovenian"),
("kn", "kannada"),
("et", "estonian"),
("mk", "macedonian"),
("br", "breton"),
("eu", "basque"),
("is", "icelandic"),
("hy", "armenian"),
("ne", "nepali"),
("mn", "mongolian"),
("bs", "bosnian"),
("kk", "kazakh"),
("sq", "albanian"),
("sw", "swahili"),
("gl", "galician"),
("mr", "marathi"),
("pa", "punjabi"),
("si", "sinhala"),
("km", "khmer"),
("sn", "shona"),
("yo", "yoruba"),
("so", "somali"),
("af", "afrikaans"),
("oc", "occitan"),
("ka", "georgian"),
("be", "belarusian"),
("tg", "tajik"),
("sd", "sindhi"),
("gu", "gujarati"),
("am", "amharic"),
("yi", "yiddish"),
("lo", "lao"),
("uz", "uzbek"),
("fo", "faroese"),
("ht", "haitian creole"),
("ps", "pashto"),
("tk", "turkmen"),
("nn", "nynorsk"),
("mt", "maltese"),
("sa", "sanskrit"),
("lb", "luxembourgish"),
("my", "myanmar"),
("bo", "tibetan"),
("tl", "tagalog"),
("mg", "malagasy"),
("as", "assamese"),
("tt", "tatar"),
// The only three-letter code in the table.
("haw", "hawaiian"),
("ln", "lingala"),
("ha", "hausa"),
("ba", "bashkir"),
("jw", "javanese"),
("su", "sundanese"),
];

impl Capski for WhisperCapski {
// Function to transcribe audio using the Whisper model
fn transcribe(model_path: &str, audio_path: &str) -> Result<Vec<Segment>> {
fn transcribe(
model_path: &str,
audio_path: &str,
translate: bool,
language: &Option<String>,
) -> Result<Vec<Segment>> {
info!("Transcribing with Whisper...");

let language_code = language
.as_ref()
.map(|s| s.to_lowercase())
.as_ref()
.and_then(|lang| {
LANGUAGES
.iter()
.find(|&&(code, name)| code == lang || name == lang)
.map(|&(code, _)| code)
})
.unwrap_or("auto");

let reader = hound::WavReader::open(audio_path)
.with_context(|| format!("failed to open audio file: {}", audio_path))?;

Expand All @@ -24,8 +143,8 @@ impl Capski for WhisperCapski {

// Set up parameters for the Whisper model
let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 0 });
params.set_translate(false);
params.set_language(Some("auto"));
params.set_translate(translate);
params.set_language(Some(language_code));
params.set_print_special(false);
params.set_print_progress(false);
params.set_print_realtime(false);
Expand Down
18 changes: 16 additions & 2 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ use clap::Parser;
#[derive(Parser, Debug)]
#[command(
name = "Capski",
version = "0.1.0",
version = "0.2.0",
author = "Chris Dedman",
about = "Create karaoke-style videos from audio or video",
about = "Create karaoke-style videos from audio or video.",
disable_help_flag = false,
disable_version_flag = false
)]
Expand All @@ -15,4 +15,18 @@ pub struct Opts {

#[arg(short, long, default_value = "output.mp4")]
pub output: String,

#[arg(
long,
default_value_t = false,
help = "Translate from the source language to English."
)]
pub translate: bool,

#[arg(
long,
default_value = "auto",
help = "Specify the source language ('fr', 'es', etc). Defaults to 'auto'."
)]
pub language: String,
}
2 changes: 2 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ fn main() -> Result<()> {
output: opts.output,
model_path: "model/ggml-tiny.bin".to_string(),
style,
translate: opts.translate,
language: Some(opts.language),
};

app.run()
Expand Down
9 changes: 8 additions & 1 deletion src/pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ use std::path::Path;
/// Settings for one run of the Capski pipeline (see `CapskiApp::run`).
pub struct CapskiApp {
/// Path of the input media; handed to `FfmpegExtractor::extract`.
pub input: String,
/// Path of the output file; `main` fills it from the `--output` option.
pub output: String,
/// Forwarded to `WhisperCapski::transcribe`; the CLI describes it as
/// translating from the source language to English.
pub translate: bool,
/// Source language forwarded to `WhisperCapski::transcribe`; `main` fills
/// it from the `--language` option.
pub language: Option<String>,
/// Path of the Whisper model file passed to `WhisperCapski::transcribe`.
pub model_path: String,
/// Style configuration passed to `SubtitleGenerator::generate`.
pub style: StyleConfig,
}
Expand All @@ -27,7 +29,12 @@ impl CapskiApp {
let subtitle_path = build_dir.join(format!("{}.ass", base));

FfmpegExtractor::extract(&self.input, audio_path.to_str().unwrap())?;
let segments = WhisperCapski::transcribe(&self.model_path, audio_path.to_str().unwrap())?;
let segments = WhisperCapski::transcribe(
&self.model_path,
audio_path.to_str().unwrap(),
self.translate,
&self.language,
)?;

SubtitleGenerator::generate(segments, subtitle_path.to_str().unwrap(), &self.style)?;
SubtitleGenerator::burn(
Expand Down