# Voice Inpainting Debug Demo

In [1]:
from IPython.display import Audio, display
import torch
import time
from loguru import logger

from src.tokenization import AudioTokenizer
from src.semantic_edit import SemanticEditor
from src.integrated_inpainting import IntegratedVoiceInpainting
from src.main import setup_device

%load_ext autoreload
%autoreload 2

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



  from .autonotebook import tqdm as notebook_tqdm


import error: No module named 'triton'


In [2]:
# Demo inputs: the source recording and the natural-language edit to apply to it
edit_prompt = "Change 'one piece' to 'two pieces'"
input_file = "data/debug/2739d072-86c9-4606-9618-3f452716ebac/01_original.wav"

# Output locations. NOTE(review): neither name is read by the cells shown in this
# part of the notebook; presumably they are consumed elsewhere (TODO confirm).
output_file, debug_dir = "output.wav", "data/debug_output"

In [3]:
# The edit specification for this run. It may be either a single prompt string
# (as here) or an already-built list of edit operations.
edits = edit_prompt

# Wall-clock start; the inpainting cell subtracts this to report elapsed time.
start_time = time.time()

# Set up device
device = setup_device()

# Step 1: Tokenize input audio to RVQ tokens
logger.info("Tokenizing input audio to RVQ tokens...")
tokenizer = AudioTokenizer(device=device)
tokenized_audio = tokenizer.tokenize(input_file)

# Step 2: Process the edits - either convert a single prompt to an edit operation
#         or use the provided list of edits
if isinstance(edits, str):
    # Single edit prompt provided - use SemanticEditor to find edit region
    logger.info(f"Processing single edit prompt: {edits}")
    editor = SemanticEditor(tokenizer, load_llm=True)
    edit_op = editor.find_edit_region(tokenized_audio, edits)
    edit_operations = [edit_op]
else:
    # A collection of edit operations was provided - use it as-is. Previously this
    # case fell through and left `edit_operations` empty. Copy into a new list so
    # later mutation does not alias the caller's object.
    # Note: `edit_op` is only bound in the single-prompt branch above.
    edit_operations = list(edits)

[32m2025-03-29 23:01:41.259[0m | [1mINFO    [0m | [36msrc.main[0m:[36msetup_device[0m:[36m41[0m - [1mUsing device: cpu[0m
[32m2025-03-29 23:01:41.259[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mTokenizing input audio to RVQ tokens...[0m
[32m2025-03-29 23:01:41.260[0m | [1mINFO    [0m | [36msrc.tokenization[0m:[36m_initialize_tokenizers[0m:[36m71[0m - [1mInitializing Mimi RVQ tokenizer...[0m
[32m2025-03-29 23:01:41.372[0m | [1mINFO    [0m | [36msrc.mimi_tokenizer[0m:[36m_initialize_moshi_mlx[0m:[36m38[0m - [1mInitializing moshi_mlx backend[0m
[32m2025-03-29 23:01:41.373[0m | [1mINFO    [0m | [36msrc.mimi_tokenizer[0m:[36m_initialize_moshi_mlx[0m:[36m41[0m - [1mDownloading moshi_mlx tokenizer weights[0m
[32m2025-03-29 23:01:41.552[0m | [1mINFO    [0m | [36msrc.mimi_tokenizer[0m:[36m_initialize_moshi_mlx[0m:[36m47[0m - [1mCreating Mimi MLX models with 32 codebooks[0m
[32m2025-03-29 23:01:41.58

In [4]:
# Show the edit operation returned by `editor.find_edit_region` in the previous cell
edit_op

EditOperation(original_text='one piece', edited_text='two pieces', start_token_idx=17, end_token_idx=27, confidence=1.0, prepadding_text='I would like to have', prepadding_start_token_idx=6, prepadding_end_token_idx=17, postpadding_text='of chocolate as well as a strawberry cake.', postpadding_start_token_idx=27, postpadding_end_token_idx=59)

In [None]:
# Sampling settings forwarded to `batch_inpaint` below
temperature = 0.7
topk = 25

# Step 3: Perform integrated inpainting
logger.info("Performing integrated voice inpainting...")
inpainting = IntegratedVoiceInpainting(device=device)

# Run the inpainting call under torch's inference mode
with torch.inference_mode():
    inpainted_tokens, final_audio, final_sr = inpainting.batch_inpaint(
        tokenized_audio, edit_operations, temperature=temperature, topk=topk
    )

# `start_time` was recorded by the tokenization cell, so this spans everything
# executed since then, not only the inpainting call.
elapsed_time = time.time() - start_time
logger.info(f"Voice inpainting completed successfully in {elapsed_time:.2f} seconds")

# Render an inline audio player for the result
display(Audio(data=final_audio, rate=final_sr))

[32m2025-03-29 23:02:03.627[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mPerforming integrated voice inpainting...[0m
[32m2025-03-29 23:02:03.628[0m | [1mINFO    [0m | [36msrc.integrated_inpainting[0m:[36m_initialize_csm_model[0m:[36m49[0m - [1mInitializing CSM model...[0m
[32m2025-03-29 23:02:22.031[0m | [1mINFO    [0m | [36msrc.mimi_tokenizer[0m:[36m_initialize_moshi_mlx[0m:[36m38[0m - [1mInitializing moshi_mlx backend[0m
[32m2025-03-29 23:02:22.036[0m | [1mINFO    [0m | [36msrc.mimi_tokenizer[0m:[36m_initialize_moshi_mlx[0m:[36m41[0m - [1mDownloading moshi_mlx tokenizer weights[0m
[32m2025-03-29 23:02:22.161[0m | [1mINFO    [0m | [36msrc.mimi_tokenizer[0m:[36m_initialize_moshi_mlx[0m:[36m47[0m - [1mCreating Mimi MLX models with 32 codebooks[0m
[32m2025-03-29 23:02:22.186[0m | [1mINFO    [0m | [36msrc.mimi_tokenizer[0m:[36m_initialize_moshi_mlx[0m:[36m57[0m - [1mWarming up MLX models[0m
[3

ckpt path or config path does not exist! Downloading the model from the Hugging Face Hub...


Fetching 13 files: 100%|██████████| 13/13 [00:00<00:00, 199728.76it/s]
[32m2025-03-29 23:02:26.176[0m | [1mINFO    [0m | [36msrc.integrated_inpainting[0m:[36m_initialize_csm_model[0m:[36m54[0m - [1mCSM model loaded successfully[0m
[32m2025-03-29 23:02:26.176[0m | [1mINFO    [0m | [36msrc.tokenization[0m:[36m_initialize_tokenizers[0m:[36m71[0m - [1mInitializing Mimi RVQ tokenizer...[0m
[32m2025-03-29 23:02:26.177[0m | [1mINFO    [0m | [36msrc.mimi_tokenizer[0m:[36m_initialize_moshi_mlx[0m:[36m38[0m - [1mInitializing moshi_mlx backend[0m
[32m2025-03-29 23:02:26.177[0m | [1mINFO    [0m | [36msrc.mimi_tokenizer[0m:[36m_initialize_moshi_mlx[0m:[36m41[0m - [1mDownloading moshi_mlx tokenizer weights[0m
[32m2025-03-29 23:02:26.334[0m | [1mINFO    [0m | [36msrc.mimi_tokenizer[0m:[36m_initialize_moshi_mlx[0m:[36m47[0m - [1mCreating Mimi MLX models with 32 codebooks[0m
[32m2025-03-29 23:02:26.370[0m | [1mINFO    [0m | [36msrc.mimi_t