

### 🏷️ **Credits & License**

* 🔗 [VoxCPM GitHub Repository](https://github.com/OpenBMB/VoxCPM)
* 🤗 [VoxCPM-0.5B on Hugging Face](https://huggingface.co/openbmb/VoxCPM-0.5B)
* 📄 **License**: Provided under the [Apache License 2.0](https://github.com/OpenBMB/VoxCPM/blob/main/LICENSE)





### ⚠️ **Usage Disclaimer**

Use of this voice cloning model is subject to strict ethical and legal standards. By using this tool, you agree **not to** engage in any of the following prohibited activities:

* **Fraud or Deception**: Using cloned voices to create misleading or fraudulent content.
* **Impersonation**: Replicating someone’s voice without their explicit permission, especially for malicious, harmful, or deceptive purposes.
* **Illegal Activities**: Employing the model in any manner that violates local, national, or international laws and regulations.
* **Harmful Content Generation**: Creating offensive, defamatory, or unethical material, including content that spreads misinformation or causes harm.

> ⚖️ **Legal Responsibility**
> The developers of this tool disclaim all liability for misuse. **Users bear full responsibility** for ensuring that their usage complies with all applicable laws, regulations, and ethical guidelines.



In [1]:
#@title Install VoxCPM
%cd /content/
!git clone https://github.com/OpenBMB/VoxCPM.git

# If the original repository has been updated and causes errors, use the frozen fork version instead:
# !git clone https://github.com/NeuralFalconYT/VoxCPM.git

%cd ./VoxCPM
!wget https://raw.githubusercontent.com/NeuralFalconYT/Useful-Function/refs/heads/main/hf_downloader.py
!pip install -e .
!pip install pysrt
from IPython.display import clear_output
clear_output()

In [2]:
#@title Add gradio share on app.py
%%writefile /content/VoxCPM/app.py
import os
import numpy as np
import torch
import gradio as gr
import spaces
from typing import Optional, Tuple
from funasr import AutoModel
from pathlib import Path
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
if os.environ.get("HF_REPO_ID", "").strip() == "":
    os.environ["HF_REPO_ID"] = "openbmb/VoxCPM-0.5B"

import voxcpm


class VoxCPMDemo:
    """Backend for the VoxCPM Gradio demo.

    Holds an ASR model (used to transcribe the prompt audio) that is created
    eagerly, and a VoxCPM TTS model that is created lazily on first use.
    """

    def __init__(self) -> None:
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🚀 Running on device: {self.device}")

        # ASR model for prompt text recognition
        self.asr_model_id = "iic/SenseVoiceSmall"
        self.asr_model: Optional[AutoModel] = AutoModel(
            model=self.asr_model_id,
            disable_update=True,
            log_level='DEBUG',
            device="cuda:0" if self.device == "cuda" else "cpu",
        )

        # TTS model (lazy init)
        self.voxcpm_model: Optional[voxcpm.VoxCPM] = None
        self.default_local_model_dir = "./models/VoxCPM-0.5B"

    # ---------- Model helpers ----------
    def _resolve_model_dir(self) -> str:
        """
        Resolve model directory:
        1) Use local checkpoint directory if exists
        2) If HF_REPO_ID env is set, download into models/{repo}
        3) Fallback to 'models'

        Returns:
            Path of the directory to load the VoxCPM checkpoint from.
        """
        import shutil

        if os.path.isdir(self.default_local_model_dir):
            return self.default_local_model_dir

        repo_id = os.environ.get("HF_REPO_ID", "").strip()
        if not repo_id:
            return "models"

        target_dir = os.path.join("models", repo_id.replace("/", "__"))
        if not os.path.isdir(target_dir):
            try:
                from huggingface_hub import snapshot_download  # type: ignore
                os.makedirs(target_dir, exist_ok=True)
                print(f"Downloading model from HF repo '{repo_id}' to '{target_dir}' ...")
                snapshot_download(repo_id=repo_id, local_dir=target_dir, local_dir_use_symlinks=False)
            except Exception as e:
                # The directory was created by this call. Remove it so a later call
                # retries the download instead of mistaking the incomplete directory
                # for a finished checkpoint.
                shutil.rmtree(target_dir, ignore_errors=True)
                print(f"Warning: HF download failed: {e}. Falling back to 'models'.")
                return "models"
        return target_dir

    def get_or_load_voxcpm(self) -> voxcpm.VoxCPM:
        """Return the cached VoxCPM model, loading it on the first call."""
        if self.voxcpm_model is not None:
            return self.voxcpm_model
        print("Model not loaded, initializing...")
        model_dir = self._resolve_model_dir()
        print(f"Using model dir: {model_dir}")
        self.voxcpm_model = voxcpm.VoxCPM(voxcpm_model_path=model_dir)
        print("Model loaded successfully.")
        return self.voxcpm_model

    # ---------- Functional endpoints ----------
    def prompt_wav_recognition(self, prompt_wav: Optional[str]) -> str:
        """Transcribe the prompt audio with the ASR model.

        Args:
            prompt_wav: Path of the prompt audio file, or None when no audio is set.

        Returns:
            The recognized text, or "" when there is no audio or no ASR result.
        """
        if prompt_wav is None:
            return ""
        res = self.asr_model.generate(input=prompt_wav, language="auto", use_itn=True)
        if not res:
            # Nothing recognized: return an empty transcript rather than fail on res[0].
            return ""
        # Keep only what follows the last '|>' marker
        # (presumably the ASR output is prefixed with <|...|> tags; verify against funasr).
        return res[0]["text"].split('|>')[-1]

    def generate_tts_audio(
        self,
        text_input: str,
        prompt_wav_path_input: Optional[str] = None,
        prompt_text_input: Optional[str] = None,
        cfg_value_input: float = 2.0,
        inference_timesteps_input: int = 10,
        do_normalize: bool = True,
        denoise: bool = True,
    ) -> Tuple[int, np.ndarray]:
        """
        Generate speech from text using VoxCPM; optional reference audio for voice style guidance.
        Returns (sample_rate, waveform_numpy)

        Raises:
            ValueError: If ``text_input`` is empty or whitespace only.
        """
        current_model = self.get_or_load_voxcpm()

        text = (text_input or "").strip()
        if not text:
            raise ValueError("Please input text to synthesize.")

        # Empty strings coming from the UI are treated the same as "not provided".
        prompt_wav_path = prompt_wav_path_input if prompt_wav_path_input else None
        prompt_text = prompt_text_input if prompt_text_input else None

        print(f"Generating audio for text: '{text[:60]}...'")
        wav = current_model.generate(
            text=text,
            prompt_text=prompt_text,
            prompt_wav_path=prompt_wav_path,
            cfg_value=float(cfg_value_input),
            inference_timesteps=int(inference_timesteps_input),
            normalize=do_normalize,
            denoise=denoise,
        )
        # NOTE(review): the sample rate is hard-coded; presumably it matches the
        # model's output rate. Confirm against the VoxCPM checkpoint in use.
        return (16000, wav)


# ---------- UI Builders ----------

def create_demo_interface(demo: VoxCPMDemo):
    """Build the Gradio UI for VoxCPM demo.

    Args:
        demo: Backend object whose ``generate_tts_audio`` and
            ``prompt_wav_recognition`` methods are wired to the UI events.

    Returns:
        The ``gr.Blocks`` interface; launching it is left to the caller.
    """
    # static assets (logo path)
    gr.set_static_paths(paths=[Path.cwd().absolute()/"assets"])

    with gr.Blocks(
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="gray",
            neutral_hue="slate",
            font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"]
        ),
        # The selectors below target the elem_id values given to the accordions
        # and checkboxes further down (acc_quick, acc_tips, chk_denoise, chk_normalize).
        css="""
        .logo-container {
            text-align: center;
            margin: 0.5rem 0 1rem 0;
        }
        .logo-container img {
            height: 80px;
            width: auto;
            max-width: 200px;
            display: inline-block;
        }
        /* Bold accordion labels */
        #acc_quick details > summary,
        #acc_tips details > summary {
            font-weight: 600 !important;
            font-size: 1.1em !important;
        }
        /* Bold labels for specific checkboxes */
        #chk_denoise label,
        #chk_denoise span,
        #chk_normalize label,
        #chk_normalize span {
            font-weight: 600;
        }
        """
    ) as interface:
        # Header logo
        gr.HTML('<div class="logo-container"><img src="/gradio_api/file=assets/voxcpm_logo.png" alt="VoxCPM Logo"></div>')

        # Quick Start
        with gr.Accordion("📋 Quick Start Guide ｜快速入门", open=False, elem_id="acc_quick"):
            gr.Markdown("""
            ### How to Use ｜使用说明
            1. **(Optional) Provide a Voice Prompt** - Upload or record an audio clip to provide the desired voice characteristics for synthesis.
               **（可选）提供参考声音** - 上传或录制一段音频，为声音合成提供音色、语调和情感等个性化特征
            2. **(Optional) Enter prompt text** - If you provided a voice prompt, enter the corresponding transcript here (auto-recognition available).
               **（可选项）输入参考文本** - 如果提供了参考语音，请输入其对应的文本内容（支持自动识别）。
            3. **Enter target text** - Type the text you want the model to speak.
               **输入目标文本** - 输入您希望模型朗读的文字内容。
            4. **Generate Speech** - Click the "Generate" button to create your audio.
               **生成语音** - 点击"生成"按钮，即可为您创造出音频。
            """)

        # Pro Tips
        with gr.Accordion("💡 Pro Tips ｜使用建议", open=False, elem_id="acc_tips"):
            gr.Markdown("""
            ### Prompt Speech Enhancement｜参考语音降噪
            - **Enable** to remove background noise for a clean, studio-like voice, with an external ZipEnhancer component.
              **启用**：通过 ZipEnhancer 组件消除背景噪音，获得更好的音质。
            - **Disable** to preserve the original audio's background atmosphere.
              **禁用**：保留原始音频的背景环境声，如果想复刻相应声学环境。

            ### Text Normalization｜文本正则化
            - **Enable** to process general text with an external WeTextProcessing component.
              **启用**：使用 WeTextProcessing 组件，可处理常见文本。
            - **Disable** to use VoxCPM's native text understanding ability. For example, it supports phonemes input ({HH AH0 L OW1}), try it!
              **禁用**：将使用 VoxCPM 内置的文本理解能力。如，支持音素输入（如 {da4}{jia1}好）和公式符号合成，尝试一下！

            ### CFG Value｜CFG 值
            - **Lower CFG** if the voice prompt sounds strained or expressive.
              **调低**：如果提示语音听起来不自然或过于夸张。
            - **Higher CFG** for better adherence to the prompt speech style or input text.
              **调高**：为更好地贴合提示音频的风格或输入文本。

            ### Inference Timesteps｜推理时间步
            - **Lower** for faster synthesis speed.
              **调低**：合成速度更快。
            - **Higher** for better synthesis quality.
              **调高**：合成质量更佳。
            """)

        # Main controls
        with gr.Row():
            # Left column: prompt audio, its transcript and the generate button.
            with gr.Column():
                prompt_wav = gr.Audio(
                    sources=["upload", 'microphone'],
                    type="filepath",
                    label="Prompt Speech (Optional, or let VoxCPM improvise)",
                    value="./examples/example.wav",
                )
                DoDenoisePromptAudio = gr.Checkbox(
                    value=False,
                    label="Prompt Speech Enhancement",
                    elem_id="chk_denoise",
                    info="We use ZipEnhancer model to denoise the prompt audio."
                )
                with gr.Row():
                    prompt_text = gr.Textbox(
                        value="Just by listening a few minutes a day, you'll be able to eliminate negative thoughts by conditioning your mind to be more positive.",
                        label="Prompt Text",
                        placeholder="Please enter the prompt text. Automatic recognition is supported, and you can correct the results yourself..."
                    )
                run_btn = gr.Button("Generate Speech", variant="primary")

            # Right column: generation parameters, target text and the audio output.
            with gr.Column():
                cfg_value = gr.Slider(
                    minimum=1.0,
                    maximum=3.0,
                    value=2.0,
                    step=0.1,
                    label="CFG Value (Guidance Scale)",
                    info="Higher values increase adherence to prompt, lower values allow more creativity"
                )
                inference_timesteps = gr.Slider(
                    minimum=4,
                    maximum=30,
                    value=10,
                    step=1,
                    label="Inference Timesteps",
                    info="Number of inference timesteps for generation (higher values may improve quality but slower)"
                )
                with gr.Row():
                    text = gr.Textbox(
                        value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly realistic speech.",
                        label="Target Text",
                    )
                with gr.Row():
                    DoNormalizeText = gr.Checkbox(
                        value=False,
                        label="Text Normalization",
                        elem_id="chk_normalize",
                        info="We use wetext library to normalize the input text."
                    )
                audio_output = gr.Audio(label="Output Audio")

        # Wiring
        # The order of `inputs` must match the positional parameters of
        # VoxCPMDemo.generate_tts_audio: text_input, prompt_wav_path_input,
        # prompt_text_input, cfg_value_input, inference_timesteps_input,
        # do_normalize, denoise.
        run_btn.click(
            fn=demo.generate_tts_audio,
            inputs=[text, prompt_wav, prompt_text, cfg_value, inference_timesteps, DoNormalizeText, DoDenoisePromptAudio],
            outputs=[audio_output],
            show_progress=True,
            api_name="generate",
        )
        # Whenever the prompt audio changes, run ASR on it and put the transcript
        # into the prompt text box.
        prompt_wav.change(fn=demo.prompt_wav_recognition, inputs=[prompt_wav], outputs=[prompt_text])

    return interface


# def run_demo(server_name: str = "localhost", server_port: int = 7860, show_error: bool = True):
#     demo = VoxCPMDemo()
#     interface = create_demo_interface(demo)
#     # Recommended to enable queue on Spaces for better throughput
#     interface.queue(max_size=10).launch(server_name=server_name, server_port=server_port, show_error=show_error)


import click
@click.command()
@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
def run_demo(share, debug):
    # CLI entry point: build the backend and the UI, then serve it behind a
    # request queue capped at 10 pending jobs.
    # (Kept as comments rather than a docstring, because click would show a
    # docstring as the command's --help text.)
    backend = VoxCPMDemo()
    app = create_demo_interface(backend)
    queued_app = app.queue(max_size=10)
    queued_app.launch(share=share, debug=debug)
if __name__ == "__main__":
    run_demo()

Overwriting /content/VoxCPM/app.py


In [None]:
#@title Run Official Gradio APP
%cd /content/VoxCPM
!python app.py --share --debug

In [37]:
#@title VoxCPM Dubbing
%cd /content/VoxCPM
import os
import numpy as np
import torch
import gradio as gr
import spaces
from typing import Optional, Tuple
from funasr import AutoModel
from pathlib import Path
import soundfile as sf
import sys
from tqdm.auto import tqdm

sys.path.append("/content/VoxCPM/src")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TQDM_DISABLE"] = "1"
if os.environ.get("HF_REPO_ID", "").strip() == "":
    os.environ["HF_REPO_ID"] = "openbmb/VoxCPM-0.5B"

import voxcpm
from hf_downloader import download_model

class VoxCPMDemo:
    """Notebook backend for VoxCPM: ASR for the prompt audio plus lazy-loaded TTS.

    The ASR model is created eagerly; the VoxCPM TTS model is created on first use.
    """

    def __init__(self) -> None:
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🚀 Running on device: {self.device}")

        # ASR model for prompt text recognition
        self.asr_model_id = "iic/SenseVoiceSmall"
        self.asr_model: Optional[AutoModel] = AutoModel(
            model=self.asr_model_id,
            disable_update=True,
            log_level='DEBUG',
            device="cuda:0" if self.device == "cuda" else "cpu",
        )

        # TTS model (lazy init)
        self.voxcpm_model: Optional[voxcpm.VoxCPM] = None
        self.default_local_model_dir = "./models/VoxCPM-0.5B"
        # Fetch the checkpoint with the hf_downloader helper instead of relying on
        # snapshot_download: on Google Colab snapshot_download sometimes requires a
        # Hugging Face token, which is tedious to set up.
        # NOTE(review): presumably this populates ./models/VoxCPM-0.5B, i.e.
        # self.default_local_model_dir - confirm against hf_downloader.py.
        download_model("openbmb/VoxCPM-0.5B", download_folder="./models/")

    # ---------- Model helpers ----------
    def _resolve_model_dir(self) -> str:
        """
        Resolve model directory:
        1) Use local checkpoint directory if exists
        2) If HF_REPO_ID env is set, download into models/{repo}
        3) Fallback to 'models'

        Returns:
            Path of the directory to load the VoxCPM checkpoint from.
        """
        import shutil

        if os.path.isdir(self.default_local_model_dir):
            return self.default_local_model_dir

        repo_id = os.environ.get("HF_REPO_ID", "").strip()
        if not repo_id:
            return "models"

        target_dir = os.path.join("models", repo_id.replace("/", "__"))
        if not os.path.isdir(target_dir):
            try:
                from huggingface_hub import snapshot_download  # type: ignore
                os.makedirs(target_dir, exist_ok=True)
                print(f"Downloading model from HF repo '{repo_id}' to '{target_dir}' ...")
                snapshot_download(repo_id=repo_id, local_dir=target_dir, local_dir_use_symlinks=False)
            except Exception as e:
                # The directory was created by this call. Remove it so a later call
                # retries the download instead of mistaking the incomplete directory
                # for a finished checkpoint.
                shutil.rmtree(target_dir, ignore_errors=True)
                print(f"Warning: HF download failed: {e}. Falling back to 'models'.")
                return "models"
        return target_dir

    def get_or_load_voxcpm(self) -> voxcpm.VoxCPM:
        """Return the cached VoxCPM model, loading it on the first call."""
        if self.voxcpm_model is not None:
            return self.voxcpm_model
        print("Model not loaded, initializing...")
        model_dir = self._resolve_model_dir()
        print(f"Using model dir: {model_dir}")
        self.voxcpm_model = voxcpm.VoxCPM(voxcpm_model_path=model_dir)
        print("Model loaded successfully.")
        return self.voxcpm_model

    # ---------- Functional endpoints ----------
    def prompt_wav_recognition(self, prompt_wav: Optional[str]) -> str:
        """Transcribe the prompt audio with the ASR model.

        Args:
            prompt_wav: Path of the prompt audio file, or None when no audio is set.

        Returns:
            The recognized text, or "" when there is no audio or no ASR result.
        """
        if prompt_wav is None:
            return ""
        res = self.asr_model.generate(input=prompt_wav, language="auto", use_itn=True)
        if not res:
            # Nothing recognized: return an empty transcript rather than fail on res[0].
            return ""
        # Keep only what follows the last '|>' marker
        # (presumably the ASR output is prefixed with <|...|> tags; verify against funasr).
        return res[0]["text"].split('|>')[-1]

    def generate_tts_audio(
        self,
        text_input: str,
        prompt_wav_path_input: Optional[str] = None,
        prompt_text_input: Optional[str] = None,
        cfg_value_input: float = 2.0,
        inference_timesteps_input: int = 10,
        do_normalize: bool = True,
        denoise: bool = True,
    ) -> Tuple[int, np.ndarray]:
        """
        Generate speech from text using VoxCPM; optional reference audio for voice style guidance.
        Returns (sample_rate, waveform_numpy)

        Raises:
            ValueError: If ``text_input`` is empty or whitespace only.
        """
        current_model = self.get_or_load_voxcpm()

        text = (text_input or "").strip()
        if not text:
            raise ValueError("Please input text to synthesize.")

        # Empty strings are treated the same as "not provided".
        prompt_wav_path = prompt_wav_path_input if prompt_wav_path_input else None
        prompt_text = prompt_text_input if prompt_text_input else None

        print(f"Generating audio for text: '{text[:60]}...'")
        wav = current_model.generate(
            text=text,
            prompt_text=prompt_text,
            prompt_wav_path=prompt_wav_path,
            cfg_value=float(cfg_value_input),
            inference_timesteps=int(inference_timesteps_input),
            normalize=do_normalize,
            denoise=denoise,
        )
        # NOTE(review): the sample rate is hard-coded; presumably it matches the
        # model's output rate. Confirm against the VoxCPM checkpoint in use.
        return (16000, wav)
demo = VoxCPMDemo()
from IPython.display import clear_output
clear_output()



## Test VoxCPM TTS


### denoise

Enable to remove background noise for a clean, studio-like voice, with an external ZipEnhancer component.
Disable to preserve the original audio's background atmosphere.

### do_normalize

Enable to process general text with an external WeTextProcessing component.
Disable to use VoxCPM's native text understanding ability. For example, it supports phoneme input ({HH AH0 L OW1}); try it!

### cfg_value_input

Lower CFG if the voice prompt sounds strained or overly expressive.
Higher CFG for better adherence to the prompt speech style or input text.

### inference_timesteps_input

Lower for faster synthesis speed.
Higher for better synthesis quality.



In [5]:
#@title Upload Reference Audio [only .wav file]
import os
from google.colab import files
from IPython.display import clear_output, Audio

def upload_audio():
    """Ask the user to upload a reference audio file through the Colab file picker.

    Files are saved under /content/uploaded_audio. The working directory is
    switched there for the duration of the upload and then set back to
    /content/VoxCPM.

    Returns:
        Path of the last uploaded file if it ends with ".wav" (case-insensitive),
        otherwise None (also when nothing was uploaded).
    """
    upload_folder = "/content/uploaded_audio"
    os.makedirs(upload_folder, exist_ok=True)
    os.chdir(upload_folder)
    try:
        uploaded = files.upload()
    finally:
        # Always return to the project directory, even if the upload is interrupted.
        os.chdir("/content/VoxCPM")
    clear_output()

    if not uploaded:
        # The upload dialog was cancelled; indexing the (empty) file list would fail.
        print("No file was uploaded")
        return None

    last_name = list(uploaded.keys())[-1]
    audio_path = f"{upload_folder}/{last_name}"
    if audio_path.lower().endswith(".wav"):
        return audio_path
    print("Please upload a .wav file")
    return None

uploaded_audio = upload_audio()

# Play audio in Colab
if uploaded_audio:
    display(Audio(uploaded_audio, autoplay=False))

uploaded_audio


'/content/uploaded_audio/example.wav'

In [38]:
#@title Copy Transcription (edit if the speech-to-text is incorrect)
reference_wav_audio = '/content/uploaded_audio/example.wav'  # @param {type: "string"}
transcription=demo.prompt_wav_recognition(reference_wav_audio)
from IPython.display import clear_output
clear_output()
transcription

"Just by listening a few minutes a day, you'll be able to eliminate negative thoughts by conditioning your mind to be more positive."

In [41]:
# Colab form fields: the text to synthesize, the transcript of the reference
# audio, and the generation settings passed to demo.generate_tts_audio below.
text_input = "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly realistic speech."  # @param {type: "string"}
transcription = "Just by listening a few minutes a day, you'll be able to eliminate negative thoughts by conditioning your mind to be more positive."  # @param {type: "string"}
cfg_value_input = 2.0 # @param {type: "number"}
inference_timesteps_input = 10 # @param {type: "number"}
do_normalize = False # @param {type: "boolean"}
denoise = False # @param {type: "boolean"}

# `demo` and `reference_wav_audio` are defined in earlier cells; arguments are
# passed positionally in the order of generate_tts_audio's parameters.
sr, wav = demo.generate_tts_audio(
    text_input,
    reference_wav_audio,
    transcription,
    cfg_value_input,
    inference_timesteps_input,
    do_normalize,
    denoise,
)
# Save the waveform (with singleton dimensions removed) and play it inline.
sf.write("/content/output.wav", np.array(wav).squeeze(), sr)
clear_output()
from IPython.display import Audio, display
display(Audio("/content/output.wav"))

In [42]:
#@title Download TTS
from google.colab import files
files.download("/content/output.wav")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



## 📘 My Goal: **English-to-English Subtitle Dubbing**

⚠️ **Note:** This tool currently supports **single-speaker subtitle dubbing only**. It **does not separate multiple speakers** or perform multi-voice cloning. <br>

⚠️ **Warning / Limitations:**

* The subtitle dubbing logic is **not perfect**; the generated dubbing audio is **not 100% accurate** (in terms of lip-sync / context matching).
* For **long-duration subtitles**, Colab may crash, and the process can be **time-consuming**.

This tool generates audio directly from subtitle files, enabling a wide range of dubbing possibilities: ```English to Chinese```, ```Chinese to English```, or even ```Chinese to Chinese```.

### Why Use This Tool?

* You want a **better voice or accent** than your own.
* You struggle with **pronunciation** or make **grammatical mistakes** in English.
* You want to perform ```English-to-Chinese``` or ```Chinese-to-English``` dubbing.

With this tool, you can achieve your goal of speaking with a **professional voice and accent** using **voice cloning technology**.

### How to Use

1. **Generate Subtitles** using `faster-whisper-large-v3-turbo-ct2`:

   * [Auto Subtitle Generator – Web](https://huggingface.co/spaces/NeuralFalcon/Auto-Subtitle-Generator)
   * [Auto Subtitle Generator – GitHub](https://github.com/NeuralFalconYT/Auto-Subtitle-Generator-Free)

   **Tips:**

   * Use the **“Readable Subtitles (Multi-Line) .SRT”** if your goal is to simply change the voice.
   * To fix grammar, upload the **“Readable Subtitles (Multi-Line) .SRT”** in the **SRT Translation** tab. Select **“Fix Grammar \[English to English for dubbing]”** in the **Select Task** dropdown and generate the updated subtitle using ```gemini-2.5-pro``` on Google AI Studio. Follow the instructions in the provided links.

2. **Download the final subtitle file.**

3. **Upload the `.srt` file** in the cell below to proceed with dubbing.

4. After dubbing the subtitles, use any video editing software to replace your video's sound with the generated dubbing audio. You have now successfully completed basic video dubbing.



In [43]:
#@title Auto-Save Dubbing Audio to Drive for Long Subtitles [Optional]
# Optional: when enabled, mount Google Drive and make sure a "dubbing" folder
# exists in it. `drive_folder` is only defined when the flag is True.
Enable_Drive_Save= False  # @param {type: "boolean"}
if Enable_Drive_Save:
  from google.colab import drive
  drive.mount('/content/gdrive/')
  drive_folder = "/content/gdrive/MyDrive/dubbing"
  # `os` is expected to be imported by an earlier cell.
  os.makedirs(drive_folder, exist_ok=True)

In [12]:
#@title Upload the fixed English .SRT file
from google.colab import files
from IPython.display import clear_output

def upload_srt():
    """Ask the user to upload a subtitle file through the Colab file picker.

    Files are saved under /content/uploaded_srt. The working directory is
    switched there for the duration of the upload and then set back to
    /content/VoxCPM.

    Returns:
        Path of the last uploaded file, or None when nothing was uploaded.
    """
    upload_folder = "/content/uploaded_srt"
    os.makedirs(upload_folder, exist_ok=True)
    os.chdir(upload_folder)
    try:
        uploaded = files.upload()
    finally:
        # Always return to the project directory, even if the upload is interrupted.
        os.chdir("/content/VoxCPM")
    clear_output()

    if not uploaded:
        # The upload dialog was cancelled; indexing the (empty) file list would fail.
        print("No file was uploaded")
        return None

    last_name = list(uploaded.keys())[-1]
    return f"{upload_folder}/{last_name}"
uploaded_srt=upload_srt()
uploaded_srt

'/content/uploaded_srt/me_talking.srt'

In [44]:
#@title Upload Reference Audio [only .wav file]
import os
from google.colab import files
from IPython.display import clear_output, Audio

def upload_audio():
    """Ask the user to upload a reference audio file through the Colab file picker.

    Files are saved under /content/uploaded_audio. The working directory is
    switched there for the duration of the upload and then set back to
    /content/VoxCPM.

    Returns:
        Path of the last uploaded file if it ends with ".wav" (case-insensitive),
        otherwise None (also when nothing was uploaded).
    """
    upload_folder = "/content/uploaded_audio"
    os.makedirs(upload_folder, exist_ok=True)
    os.chdir(upload_folder)
    try:
        uploaded = files.upload()
    finally:
        # Always return to the project directory, even if the upload is interrupted.
        os.chdir("/content/VoxCPM")
    clear_output()

    if not uploaded:
        # The upload dialog was cancelled; indexing the (empty) file list would fail.
        print("No file was uploaded")
        return None

    last_name = list(uploaded.keys())[-1]
    audio_path = f"{upload_folder}/{last_name}"
    if audio_path.lower().endswith(".wav"):
        return audio_path
    print("Please upload a .wav file")
    return None

uploaded_audio = upload_audio()

# Play audio in Colab
if uploaded_audio:
    display(Audio(uploaded_audio, autoplay=False))

uploaded_audio


'/content/uploaded_audio/example.wav'

In [46]:
#@title Copy Transcription (edit if the speech-to-text is incorrect)
reference_audio = '/content/uploaded_audio/example.wav'  # @param {type: "string"}
transcription=demo.prompt_wav_recognition(reference_audio)
from IPython.display import clear_output
clear_output()
# print("Copy the transcription and edit it if the AI made any mistakes in speech-to-text.")
transcription

"Just by listening a few minutes a day, you'll be able to eliminate negative thoughts by conditioning your mind to be more positive."

In [49]:

#@title Subtitle Dubbing Code
import os
import re
import uuid
import shutil
import platform
import datetime
import subprocess
import math
import json
import pysrt
import librosa
import soundfile as sf
# from tqdm.auto import tqdm
from pydub import AudioSegment
from pydub.silence import split_on_silence
import soundfile as sf
from librosa import get_duration
from IPython.display import clear_output

# ---------------------- Utility Functions ----------------------

# Returns the current time formatted as HH_MM_AM/PM (for filenames or logs)
def get_current_time():
    """Return the local time as an ``HH_MM_AM/PM`` string (12-hour clock)."""
    now = datetime.datetime.now()
    return now.strftime("%I_%M_%p")

# Constructs an output file path for the final dubbed audio
def get_subtitle_Dub_path(srt_file_path, lang="en"):
    """Build a unique output path for the dubbed audio of an SRT file.

    The file is placed in a ``TTS_DUB`` folder under the current working
    directory (created if missing) and named
    ``<srt name>_<lang>_<6 random chars>.wav``.

    Args:
        srt_file_path: Path of the subtitle file; only its base name is used.
        lang: Language tag inserted into the file name.

    Returns:
        Full path of the ``.wav`` file to write.
    """
    stem = os.path.splitext(os.path.basename(srt_file_path))[0]
    out_dir = os.path.join(os.getcwd(), "TTS_DUB")
    os.makedirs(out_dir, exist_ok=True)
    random_string = str(uuid.uuid4())[:6]
    # Collapse double underscores in the file name only. Doing this on the whole
    # path would rewrite parent directories that contain "__" and point the
    # result at a folder that does not exist.
    file_name = f"{stem}_{lang}_{random_string}.wav".replace("__", "_")
    return os.path.join(out_dir, file_name)

# Removes noise characters like [♫] from the subtitle text and saves a cleaned SRT
def clean_srt(input_path):
    """Write a copy of an SRT file with "[", "]" and "♫" removed from the text.

    Args:
        input_path: Path of the source ``.srt`` file (read as UTF-8).

    Returns:
        Path of the cleaned file: the input path with its trailing ``.srt``
        extension (any letter case) replaced by ``_.srt``. If the input has a
        different extension, ``_.srt`` is appended to the full path.
    """
    # Translation table that deletes the unwanted characters in one pass.
    noise_table = str.maketrans("", "", "[]♫")

    def clean_srt_line(text):
        return text.translate(noise_table).strip()

    subs = pysrt.open(input_path, encoding='utf-8')

    # Strip only the trailing extension and keep the original letter case:
    # lowercasing the whole path breaks directories with capital letters on
    # case-sensitive file systems.
    root, ext = os.path.splitext(input_path)
    if ext.lower() != ".srt":
        root = input_path
    output_path = root + "_.srt"

    with open(output_path, "w", encoding='utf-8') as file:
        for sub in subs:
            file.write(f"{sub.index}\n{sub.start} --> {sub.end}\n{clean_srt_line(sub.text)}\n\n")
    return output_path





# Because FFmpeg can handle speeds from 0.5× to 2.0× only
def atempo_chain(factor):
    """Build an FFmpeg ``atempo`` filter string for an arbitrary speed factor.

    Each ``atempo`` stage is kept within 0.5x-2.0x, so factors outside that
    range are expressed as a comma-separated chain of stages whose product
    equals ``factor``.

    Args:
        factor: Desired tempo multiplier; must be a positive, finite number.

    Returns:
        A filter string such as ``"atempo=1.500"`` or ``"atempo=2.0,atempo=2.000"``.

    Raises:
        ValueError: If ``factor`` is zero, negative, infinite or NaN. Such
            values can never be brought into range and would otherwise make
            the loops below run forever.
    """
    if not math.isfinite(factor) or factor <= 0:
        raise ValueError(f"Speed factor must be a positive, finite number, got {factor!r}")

    if 0.5 <= factor <= 2.0:
        return f"atempo={factor:.3f}"

    parts = []
    # Peel off full 2.0x (or 0.5x) stages until the remainder fits in one stage.
    while factor > 2.0:
        parts.append("atempo=2.0")
        factor /= 2.0
    while factor < 0.5:
        parts.append("atempo=0.5")
        factor *= 2.0
    parts.append(f"atempo={factor:.3f}")
    return ",".join(parts)

# If FFmpeg is not found, we will use Librosa
def speedup_audio_librosa(input_file, output_file, speedup_factor):
    """Time-stretch an audio file with librosa and write the result.

    The file is loaded at its native sample rate. If anything fails, a warning
    is issued and the input is copied to ``output_file`` unchanged, so the
    caller always gets an output file.
    """
    try:
        samples, sample_rate = librosa.load(input_file, sr=None)
        stretched = librosa.effects.time_stretch(samples, rate=speedup_factor)
        sf.write(output_file, stretched, sample_rate)
    except Exception as err:
        gr.Warning(f"Librosa speedup failed: {err}")
        shutil.copy(input_file, output_file)

# Change the audio speed if it exceeds the original SRT segment duration.
def change_speed(input_file, output_file, speedup_factor):
    """Change the tempo of an audio file, preferring FFmpeg over librosa.

    Args:
        input_file: Path of the source audio.
        output_file: Path to write the re-timed audio to (overwritten if present).
        speedup_factor: Tempo multiplier passed to ``atempo_chain``.

    FFmpeg is tried first. If it is missing or fails, a warning is issued and
    the librosa-based ``speedup_audio_librosa`` is used instead.
    """
    try:
        subprocess.run(
            # -y (overwrite without asking) is a global option, so it is placed
            # before the input/output arguments.
            ["ffmpeg", "-y", "-i", input_file, "-filter:a", atempo_chain(speedup_factor), output_file],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL
        )
    except Exception as e:
        # gr.Error is an exception class: building it without raising does nothing,
        # so the failure was never reported. Use gr.Warning, as the librosa helper
        # does, and carry on with the fallback.
        gr.Warning(f"FFmpeg speedup error: {e}")
        speedup_audio_librosa(input_file, output_file, speedup_factor)


# Remove silence from the start and end of the audio.
def remove_edge_silence(input_path, output_path):
    """Trim leading and trailing silence (30 dB threshold) and save the result.

    The audio is loaded at its native sample rate and written back at that
    rate. Returns ``output_path``.
    """
    samples, sample_rate = librosa.load(input_path, sr=None)
    trimmed = librosa.effects.trim(samples, top_db=30)[0]
    sf.write(output_path, trimmed, sample_rate)
    return output_path


def remove_silence_function(file_path,minimum_silence=50):
    """Remove silent stretches from a WAV file and save the result next to it.

    Args:
        file_path: Path of the input WAV file.
        minimum_silence: Milliseconds of silence kept around each retained
            chunk (passed to pydub as ``keep_silence``).

    Returns:
        Path of the output file: the input path with its extension replaced
        by ``_no_silence.wav``.
    """
    # Derive the output name from the trailing extension only. A plain
    # str.replace(".wav", ...) returns the input path itself when the extension
    # differs (e.g. ".WAV"), which would overwrite the source file, and it also
    # rewrites ".wav" inside directory names.
    root, _ext = os.path.splitext(file_path)
    output_path = root + "_no_silence.wav"
    audio_format = "wav"
    # Reading and splitting the audio file into chunks
    sound = AudioSegment.from_file(file_path, format=audio_format)
    audio_chunks = split_on_silence(sound,
                                    min_silence_len=100,
                                    silence_thresh=-45,
                                    keep_silence=minimum_silence)
    # Putting the file back together
    combined = AudioSegment.empty()
    for chunk in audio_chunks:
        combined += chunk
    combined.export(output_path, format=audio_format)
    return output_path

# ---------------------- Main Class ----------------------
class SRTDubbing:
    """Turn an SRT subtitle file into one dubbed WAV track.

    Each (possibly merged) subtitle entry is synthesized with the voice-cloning
    TTS, fitted to the entry's time slot, and concatenated with silent gaps so
    the result lines up with the subtitle timestamps.
    """

    def __init__(self):
        # Scratch directory for intermediate files (temp.wav, speed_final.wav).
        self.cache_dir = "./cache"
        # Per-SRT working folders are created below ./dummy.
        os.makedirs("./dummy", exist_ok=True)
        os.makedirs(self.cache_dir, exist_ok=True)

    @staticmethod
    def get_avg_speaker_speed(srt_path):
        """Return the average speaking rate of the SRT in characters per second.

        The target is single-speaker dubbing, so one average over all entries
        is used. Spaces are not counted as characters. Entries with zero
        duration or no text are skipped; 14 is returned when nothing usable
        remains.
        """
        subs = pysrt.open(srt_path, encoding='utf-8')
        speeds = []
        for sub in subs:
            duration_sec = (sub.end.ordinal - sub.start.ordinal) / 1000
            char_count = len(sub.text.replace(" ", ""))
            if duration_sec > 0 and char_count > 0:
                speeds.append(char_count / duration_sec)
        return sum(speeds) / len(speeds) if speeds else 14

    @staticmethod
    def get_speed_factor(srt_path, default_tts_rate=14):
        """Return the speaker's speed relative to the TTS rate (e.g. 0.5, 1.0, 1.5).

        The factor is the SRT's average rate divided by ``default_tts_rate``,
        truncated (not rounded) to two decimals. A non-positive
        ``default_tts_rate`` yields 1.0.
        """
        avg_rate = SRTDubbing.get_avg_speaker_speed(srt_path)
        speed_factor = avg_rate / default_tts_rate if default_tts_rate > 0 else 1.0
        return math.floor(speed_factor * 100) / 100  # Truncate

    @staticmethod
    def merge_fast_entries(entries, max_pause_gap=1000, max_merged_duration_ms=8000):
        """Merge consecutive entries separated by short pauses.

        An entry absorbs its successors while the gap to the next entry is at
        most ``max_pause_gap`` ms and the merged span stays within
        ``max_merged_duration_ms`` ms. Texts are joined with a space; a comma
        is appended first when the accumulated text does not already end in
        ".", "!" or "?". The merged entry keeps every other field of the first
        entry of its run. Input dictionaries are not modified.
        """
        merged = []
        i = 0
        n = len(entries)
        while i < n:
            curr = entries[i].copy()
            j = i + 1
            while j < n:
                next_ = entries[j]
                gap = next_["start_time"] - curr["end_time"]
                new_duration = next_["end_time"] - curr["start_time"]
                if gap > max_pause_gap or new_duration > max_merged_duration_ms:
                    break
                if not curr["text"].strip().endswith((".", "!", "?")):
                    curr["text"] = curr["text"].strip() + ","
                curr["text"] += " " + next_["text"]
                curr["end_time"] = next_["end_time"]
                j += 1
            merged.append(curr)
            i = j
        return merged

    @staticmethod
    def convert_to_millisecond(t):
        """Convert an SRT timestamp (hours/minutes/seconds/milliseconds) to milliseconds."""
        return t.hours * 3600000 + t.minutes * 60000 + t.seconds * 1000 + int(t.milliseconds)

    def read_srt_file(self, file_path):
        """Read an SRT file into the list of entry dictionaries used for dubbing.

        Each entry holds ``entry_number``, ``start_time`` and ``end_time`` (ms),
        ``text``, ``pause_time`` (ms of silence before the entry: the gap to
        the previous entry's end, or the start time for the first entry) and
        the file names ``audio_name`` / ``previous_pause``. The list is passed
        through ``merge_fast_entries`` before being returned.
        """
        subs = pysrt.open(file_path, encoding='utf-8')
        entries = []
        prev_end = 0
        for idx, sub in enumerate(subs, 1):
            start = self.convert_to_millisecond(sub.start)
            end = self.convert_to_millisecond(sub.end)
            pause = start - prev_end if idx > 1 else start
            entries.append({
                'entry_number': idx,
                'start_time': start,
                'end_time': end,
                'text': sub.text.strip(),
                'pause_time': pause,
                'audio_name': f"{idx}.wav",
                'previous_pause': f"{idx}_before_pause.wav",
            })
            prev_end = end

        return self.merge_fast_entries(entries)

    def text_to_speech_srt(self, text,
                                    audio_path,
                                    reference_audio,
                                    transcription,
                                    cfg_value_input,
                                    inference_timesteps_input,
                                    do_normalize,
                                    denoise,
                                    actual_duration,
                                    default_speed_factor=None):
        """Synthesize ``text`` and fit it into ``actual_duration`` ms at ``audio_path``.

        To use a different TTS or voice-cloning tool, replace Step 1.

        Args:
            text: Text of the subtitle entry.
            audio_path: Destination WAV path for the fitted audio.
            reference_audio, transcription, cfg_value_input,
            inference_timesteps_input, do_normalize, denoise: Forwarded
                unchanged to ``demo.generate_tts_audio``.
            actual_duration: Length of the subtitle slot in milliseconds.
            default_speed_factor: Speaking-speed multiplier applied before
                fitting; ``None`` means 1.0 (no change).
        """
        TOLERANCE_MS = 30
        temp = os.path.join(self.cache_dir, "temp.wav")

        if default_speed_factor is None:
            default_speed_factor = 1.0

        # Step 1: create clone voice
        sr, wav = demo.generate_tts_audio(
            text,
            reference_audio,
            transcription,
            cfg_value_input,
            inference_timesteps_input,
            do_normalize,
            denoise,
        )
        path = "./audio.wav"
        sf.write(path, np.array(wav).squeeze(), sr)

        # Step 2: Apply user-defined speaking speed.
        # change_speed() takes exactly (input, output, factor); this class has
        # no use_ffmpeg / ffmpeg_path attributes, so nothing else is passed.
        if default_speed_factor != 1.0:
            user_speed_path = path.replace(".wav", "_user.wav")
            change_speed(path, user_speed_path, default_speed_factor)
            path = user_speed_path

        # Step 3: Trim silence
        remove_edge_silence(path, temp)

        # Step 4: Duration analysis (high precision)
        y, sr = sf.read(temp)
        duration_ms = int(get_duration(y=y, sr=sr) * 1000)

        # Step 5: If very close, skip correction
        if abs(duration_ms - actual_duration) <= TOLERANCE_MS:
            shutil.move(temp, audio_path)
            return

        # Step 6: If too long, strip internal silence and measure again
        if duration_ms > actual_duration:
            path = remove_silence_function(temp, minimum_silence=50)
            shutil.copy(path, temp)
            y, sr = sf.read(temp)
            duration_ms = int(get_duration(y=y, sr=sr) * 1000)

        # Step 7: Final correction
        # A zero-length slot cannot be fitted by speeding up (the factor would
        # divide by zero), so such audio falls through and is kept as-is.
        if actual_duration > 0 and duration_ms > actual_duration + TOLERANCE_MS:
            factor = duration_ms / actual_duration
            corrected = os.path.join(self.cache_dir, "speed_final.wav")
            change_speed(temp, corrected, factor)
            shutil.move(corrected, audio_path)
        elif duration_ms < actual_duration - TOLERANCE_MS:
            # Too short: pad the tail with silence up to the slot length.
            silence = AudioSegment.silent(duration=actual_duration - duration_ms)
            (AudioSegment.from_file(temp) + silence).export(audio_path, format="wav")
        else:
            shutil.move(temp, audio_path)

    @staticmethod
    def make_silence(duration, path):
        """Write ``duration`` ms of silence to ``path``; used as the gap between segments."""
        AudioSegment.silent(duration=duration).export(path, format="wav")

    @staticmethod
    def create_folder_for_srt(srt_file_path):
        """Create and return a working folder ``./dummy/<srt name>_<4 random chars>``."""
        base = os.path.splitext(os.path.basename(srt_file_path))[0]
        folder = f"./dummy/{base}_{str(uuid.uuid4())[:4]}"
        os.makedirs(folder, exist_ok=True)
        return folder

    @staticmethod
    def concatenate_audio_files(paths, output):
        """Join the audio files in ``paths`` in order and export them to ``output`` as WAV."""
        audio = sum((AudioSegment.from_file(p) for p in paths), AudioSegment.silent(duration=0))
        audio.export(output, format="wav")

    def srt_to_dub(self,srt_path,
                            output_path,
                            reference_audio,
                            transcription,
                            cfg_value_input,
                            inference_timesteps_input,
                            do_normalize,
                            denoise,
                            speaker_talk_speed):
        """Dub a whole SRT file into ``output_path``.

        For every entry a silence file of ``pause_time`` ms and a synthesized
        clip fitted to the entry's duration are produced in a fresh working
        folder, then all files are concatenated in order. When
        ``speaker_talk_speed`` is truthy, the speed factor derived from the SRT
        is applied to every clip; otherwise 1.0 is used.
        """
        entries = self.read_srt_file(srt_path)
        folder = self.create_folder_for_srt(srt_path)
        all_audio = []
        if speaker_talk_speed:
            default_speed_factor = self.get_speed_factor(srt_path)
        else:
            default_speed_factor = 1.0

        total = len(entries)
        for count, entry in enumerate(entries, 1):
            # Silence preceding this entry.
            pause_path = os.path.join(folder, entry['previous_pause'])
            self.make_silence(entry['pause_time'], pause_path)
            all_audio.append(pause_path)

            tts_path = os.path.join(folder, entry['audio_name'])
            print(f"{count}/{total}",end="\r")
            self.text_to_speech_srt(entry['text'],
                                    tts_path,
                                    reference_audio,
                                    transcription,
                                    cfg_value_input,
                                    inference_timesteps_input,
                                    do_normalize,
                                    denoise,
                                    entry['end_time'] - entry['start_time'],
                                    default_speed_factor)
            all_audio.append(tts_path)
            # Clear the output of the current cell
            clear_output()
        self.concatenate_audio_files(all_audio, output_path)

# Clear the output of the current cell

# ---------------------- Entrypoint ----------------------
def dubbing(srt_path, reference_audio,transcription,
                       cfg_value_input=2.0,
                       inference_timesteps_input=10,
                       do_normalize=False,
                       denoise=False,
                       speaker_talk_speed=False):
    """Dub an SRT file with the cloned voice and return the output audio path.

    Args:
        srt_path: Path to the subtitle file; must end with ``.srt``.
        reference_audio: Reference voice sample forwarded to the TTS.
        transcription: Text of the reference audio. When empty, it is obtained
            from ``demo.prompt_wav_recognition(reference_audio)``.
        cfg_value_input, inference_timesteps_input, do_normalize, denoise:
            Forwarded unchanged to ``SRTDubbing.srt_to_dub``.
        speaker_talk_speed: When truthy, match the speaking speed derived
            from the SRT.

    Returns:
        The path from ``get_subtitle_Dub_path(srt_path)``, or ``None`` when
        ``srt_path`` is not a ``.srt`` file.
    """
    if not srt_path.endswith(".srt"):
        # print() accepts no `duration` keyword; passing one raised TypeError
        # and masked the intended "invalid file" message.
        print("Please upload a valid .srt file")
        return None

    processed_srt = clean_srt(srt_path)
    output_path = get_subtitle_Dub_path(srt_path)
    if not transcription:
        transcription = demo.prompt_wav_recognition(reference_audio)
    SRTDubbing().srt_to_dub(processed_srt,
                            output_path,
                            reference_audio,
                            transcription,
                            cfg_value_input,
                            inference_timesteps_input,
                            do_normalize,
                            denoise,
                            speaker_talk_speed)
    return output_path


# -------------------------------
# Example Usage
# -------------------------------

srt_path = "/content/uploaded_srt/me_talking.srt" # @param {type: "string"}
transcription = "Just by listening a few minutes a day, you'll be able to eliminate negative thoughts by conditioning your mind to be more positive." # @param {type: "string"}
cfg_value_input = 2.0 # @param {type: "number"}
inference_timesteps_input = 10 # @param {type: "number"}
do_normalize = False # @param {type: "boolean"}
denoise = False # @param {type: "boolean"}
match_speaker_speaking_speed= False # @param {type: "boolean"}
result_path =  dubbing(srt_path, reference_audio,transcription,
                       cfg_value_input,
                       inference_timesteps_input,
                       do_normalize,
                       denoise,
                       match_speaker_speaking_speed)

clear_output()

if result_path is None:
    # dubbing() returns None when srt_path is not a .srt file. clear_output()
    # has just erased its message, and there is nothing to copy or download.
    print("Dubbing was skipped: srt_path must point to a .srt file.")
else:
    print(f"Dubbed audio saved at:\n{result_path}")

    # Keep a copy on Google Drive when the target folder exists there.
    drive_folder = "/content/gdrive/MyDrive/dubbing"
    if os.path.exists(drive_folder):
        drive_result_path = shutil.copy(result_path, drive_folder)
        print(f"Copied to Google Drive:\n{drive_result_path}")

    from google.colab import files
    files.download(result_path)




Dubbed audio saved at:
/content/VoxCPM/TTS_DUB/me_talking_en_ea32ad.wav


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [50]:
#@title Download Dubbing Audio
from google.colab import files
# Offer the dubbed file (result_path, set by the dubbing cell) as a download.
files.download(result_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#@title Play Dubbed Audio
from IPython.display import Audio
# Kept as the cell's last expression so the notebook renders the player; playback does not start automatically.
Audio(result_path, autoplay=False)
