In [2]:
from google import genai
import os

client =  genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

myfile = client.files.upload(file='/home/skamalj/dev/tataplay/separated_audio/htdemucs/media_1_audio/vocals.wav')

response = client.models.generate_content(
  model='gemini-2.5-pro',
  contents=[
    """You are given an audio clip. 
    Task:
    1. Identify the speakers.
    2. Give them indian names basis gender
    3. Descrive their speaking style

    Stye description examples:
    Aditi - Slightly High-Pitched, Expressive Tone:
    "Aditi speaks with a slightly higher pitch in a close-sounding environment. Her voice is clear, with subtle emotional depth and a normal pace, all captured in high-quality recording."

    Sita - Rapid, Slightly Monotone:
    "Sita speaks at a fast pace with a slightly low-pitched voice, captured clearly in a close-sounding environment with excellent recording quality."

    Tapan - Male, Moderate Pace, Slightly Monotone:
    "Tapan speaks at a moderate pace with a slightly monotone tone. The recording is clear, with a close sound and only minimal ambient noise."
    """,
    myfile,
  ]
)
print(response.text)

Of course, here is a breakdown of the speakers in the audio clip.

### Speaker Identification

1.  **Gita** - **Female, Expressive and Questioning Tone:**
    Gita speaks with a clear, mid-to-high-pitched voice. Her tone is expressive and inquisitive, speaking at a moderate pace. The recording is high-quality with a close, intimate sound.

2.  **Gopal** - **Male, Calm and Measured Tone:**
    Gopal has a calm, mid-pitched male voice. He speaks in a measured and slightly charming tone at a moderate pace. The audio is captured clearly in a high-quality, close-sounding environment.

3.  **Rohan** - **Male Narrator, Upbeat and Rhythmic:**
    Rohan speaks in a classic, upbeat narrator's voice with a mid-to-low pitch. His delivery is fast and rhythmic, designed to be memorable for the advertisement's tagline. The recording quality is excellent.


In [1]:
from google import genai
import os

client =  genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

myfile = client.files.upload(file='/home/skamalj/dev/tataplay/separated_audio/htdemucs/media_1_audio/vocals.wav')

response = client.models.generate_content(
  model='gemini-2.5-pro',
  contents=[
    """You are given an audio clip. 
    Task:
    1. Transcribe the speech in its original language.
    2. Translate it into English, but:
       - If the original speaker uses words from another language (e.g., English words in Hindi),
         preserve those words exactly in the translated text. Do not re-translate them.
       - The translation must feel natural and conversational.
    3. Assume there are 3 speakers (Speaker 1, Speaker 2, Speaker 3).
    4. Perform speaker diarization: identify continuous segments by the same speaker and merge them.
    5. **Break the segments only when speaker changes or there is a significant pause (more than 2 second)**.
    6. Capture pitch, pauses, and tone in the SSML output where possible.
    7. Use a unique voice for each speaker in the SSML output.
    5. Return output strictly in valid JSON with this structure:

    {
      "speakers_count": <int>,
      "speak_segments": [
        {
          "speaker": "<speaker_id>",
          "start_time": "<float_seconds>",
          "end_time": "<float_seconds>",
          "original_text": "<transcribed_text_in_original_language>",
          "translated_text": "<translated_text_in_targeted_language_with_code_switch_words_preserved>",
          "ssml": "<speak>...</speak>  // built from translated_text only"
        },
        ...
      ],
      "metadata": {
        "translation_language": "<targeted language",
        "total_duration": "<float_seconds>",
        "notes": "pitch, pauses, tone captured where possible; code-switch words preserved"
      }
    }

    Rules:
    - Each segment must include both original_text and translated_text.
    - The translated_text must preserve any words from another language exactly as spoken.
      Example: If the speaker says "वैसे कल आपका स्माइल करना मेरे लिए काफी हिस्टोरिकल था",
      then translated_text could be: "<translated text." (with "smile" and "historical" kept in English).
    - The ssml field must be built from translated_text only.
    - Each segment must have its own <speak> block with a <voice> tag unique to the speaker.
    - Use <break> and <prosody> to reflect pauses, pitch, and tone.
    - Do not include any explanation outside the JSON.
    """,
    myfile,
  ]
)

### Helper Function

In [2]:
# @title Helper functions (just run that cell)

import contextlib
import wave
from IPython.display import Audio

file_index = 0

@contextlib.contextmanager
def wave_file(filename, channels=1, rate=24000, sample_width=2):
    with wave.open(filename, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        yield wf

def play_audio_blob(blob):
  global file_index
  file_index += 1

  fname = f'audio_{file_index}.wav'
  with wave_file(fname) as wav:
    wav.writeframes(blob.data)

  return Audio(fname, autoplay=True)

def play_audio(response):
    return play_audio_blob(response.candidates[0].content.parts[0].inline_data)

In [3]:
import json

# Example: Gemini response text
text = response.text

# Remove leading ```json and trailing ```
if text.startswith("```json"):
    text = text[len("```json"):].strip()
if text.endswith("```"):
    text = text[:-3].strip()

# Now parse as JSON
gemini_data = json.loads(text)

# save gemini_data to a file
with open('translated_diarized_data.json', 'w', encoding='utf-8') as f:
    json.dump(gemini_data, f, indent=2,ensure_ascii=False)


In [4]:
def create_wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
   with wave.open(filename, "wb") as wf:
      wf.setnchannels(channels)
      wf.setsampwidth(sample_width)
      wf.setframerate(rate)
      wf.writeframes(pcm)

In [5]:
from google.genai import types
from pathlib import Path

def synthesize_segment_tts(client, segment, output_dir="translated_segments", voice_name="Puck"):
    """
    Generate TTS audio for a single segment, keeping duration for lip-sync.

    segment: dict with keys
        - ssml: SSML string
        - start_time: float (seconds)
        - end_time: float (seconds)
        - speaker: string
    client: gemini client
    output_dir: where to save the segment audio
    voice_name: prebuilt Gemini TTS voice
    """
    import os
    Path(output_dir).mkdir(exist_ok=True)
    
    # Calculate duration from segment metadata
    duration = float(segment["end_time"])  - float(segment["start_time"])
    
    # Prepare output file path
    seg_index = segment.get("index", 0)

    speaker = segment.get("speaker", "Speaker")
    out_file = Path(output_dir) / f"{seg_index}_{speaker.replace(' ', '_')}.wav"

    print(f"Processing segment {seg_index} with duration {duration:.3f} seconds to {out_file}")
    print(segment['translated_text'])
    # Wrap the SSML into the TTS prompt
    tts_prompt = f"""
    You are given SSML with timing metadata. 
    Identify the target language from text. If language is english ensure that accent is **Indian English**.
    Must keep original pauses and restrict **durations to {duration:.3f} seconds**. 
    Generate **audio output**.

    Here is the SSML:

    {segment['ssml']}
    """
    #gemini-2.5-flash-preview-tts
    #emini-2.5-pro-preview-tts
    response = client.models.generate_content(
        model='gemini-2.5-flash-preview-tts',
        contents=tts_prompt,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=voice_name,
                    )
                )
            ),
        )
    )
    print(response.candidates[0])
    # Save the returned audio bytes
    if response.candidates[0].content:
        data = response.candidates[0].content.parts[0].inline_data.data # Gemini returns audio bytes in response.audio
        create_wave_file(str(out_file), data)


In [14]:
!rm translated_segments/*.wav

### Block is only for testing. Run for any single segment

In [11]:
n = 3  # change this index as needed
seg = gemini_data["speak_segments"][n]
seg["index"] = n

synthesize_segment_tts(client, seg, voice_name="Puck")


Processing segment 3 with duration 4.160 seconds to translated_segments/3_Speaker_2.wav
By the way, you smiling yesterday was quite historical for me.
content=Content(
  parts=[
    Part(
      inline_data=Blob(
        data=b'\xfc\xff\xfe\xff\x0c\x00\x10\x00\x03\x00\n\x00\x19\x00*\x009\x00/\x00\x19\x00\xed\xff\xe6\xff\xdb\xff\xde\xff\x02\x00\r\x00\x11\x00\x10\x00\xf1\xff\xe2\xff\xd7\xff\xcf\xff\xd4\xff\xf7\xff\x13\x00\x0e\x00\xf0\xff\xf5\xff\n\x00\xeb\xff\xda\xff\xdb\xff\xee\xff\x0b\x00\x1a\x00\x10\x00\xf1\xff\xeb\xff\xeb\xff\xf0\xff\xf3\xff\xfc\xff\x05\x00\x0c\x00\t\x00\xff\xff\xff\xff\xfd...',
        mime_type='audio/L16;codec=pcm;rate=24000'
      )
    ),
  ],
  role='model'
) citation_metadata=None finish_message=None token_count=None finish_reason=<FinishReason.STOP: 'STOP'> url_context_metadata=None avg_logprobs=None grounding_metadata=None index=0 logprobs_result=None safety_ratings=None


In [7]:
from google import genai
import os
client =  genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

In [8]:
# Assume `gemini_data` is your parsed Gemini JSON with speak_segments
for idx, seg in enumerate(gemini_data["speak_segments"]):
    seg["index"] = idx
    synthesize_segment_tts(client, seg, voice_name="Puck")


Processing segment 0 with duration 9.880 seconds to translated_segments/0_Speaker_1.wav
Gopal ji, what were you teaching in the history class today? You were saying that Tata Sky has now become Tata Play. What kind of a history class is this?
content=None citation_metadata=None finish_message=None token_count=None finish_reason=<FinishReason.OTHER: 'OTHER'> url_context_metadata=None avg_logprobs=None grounding_metadata=None index=0 logprobs_result=None safety_ratings=None
Processing segment 1 with duration 5.790 seconds to translated_segments/1_Speaker_2.wav
Geeta ji, Tata Sky becoming Tata Play is a very historical moment. That's why I was teaching it.
content=Content(
  parts=[
    Part(
      inline_data=Blob(
        data=b'\xef\xff\xea\xff\xec\xff\xe5\xff\xe6\xff\xf0\xff\x06\x00 \x001\x008\x00,\x00\x16\x00\x04\x00\xf5\xff\xef\xff\xea\xff\xe9\xff\xe6\xff\xf0\xff\x01\x00\x0e\x00\r\x00\x06\x00\x01\x00\x01\x00\x0c\x00\x11\x00\x0b\x00\xff\xff\xf1\xff\xe9\xff\xf0\xff\xf9\xff\x01\x00\x04

## Code to stretch a audio for segments. WIP

In [12]:
from pydub import AudioSegment
from pydub.silence import detect_silence
from audiostretchy.stretch import stretch_audio

def remove_silence_from_end(audio_path, output_path, silence_thresh=-50, chunk_size=10):
    audio = AudioSegment.from_file(audio_path)
    silence_ranges = detect_silence(audio, min_silence_len=chunk_size, silence_thresh=silence_thresh)

    # Default end_trim is full length of audio
    end_trim = len(audio)

    # Check if last silence segment is at the end of audio
    if silence_ranges and silence_ranges[-1][1] == len(audio):
        # Trim silence at the end
        end_trim = silence_ranges[-1][0]

    # Keep audio from start to end_trim (remove silence only at end)
    trimmed_audio = audio[:end_trim]
    trimmed_audio.export(output_path, format="wav")

# Usage example:
#remove_silence_from_end("translated_segments/1_Speaker_2_compress.wav", "translated_segments/trimmed_end_only.wav")


In [13]:
import json
from pathlib import Path
from pydub import AudioSegment
import soundfile as sf
from audiostretchy.stretch import stretch_audio

# Paths
json_path = "translated_diarized_data.json"
tts_dir = Path("translated_segments")          # TTS-generated segments
adjusted_dir = Path("adjusted_segments")      # Save adjusted segments
adjusted_dir.mkdir(exist_ok=True)

# Load JSON
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

segments = data["speak_segments"]


for idx, seg in enumerate(segments):
    speaker = seg.get("speaker", "Speaker")
    
    tts_file = tts_dir / f"{idx}_{speaker.replace(' ', '_')}.wav"
    if not tts_file.exists():
        print(f"Missing file: {tts_file}")
        continue

    # Target duration from JSON
    target_duration = float(seg["end_time"]) - float(seg["start_time"])  # in seconds

   # Load your audio file
    audio = AudioSegment.from_file(tts_file)

    # Get the duration in milliseconds
    current_duration = audio.duration_seconds

    # Save adjusted segment
    out_file = adjusted_dir / f"adjusted_seg_{idx}_{seg['speaker'].replace(' ', '_')}.wav"
    out_file_stretched = adjusted_dir / f"adjusted_seg_{idx}_{seg['speaker'].replace(' ', '_')}_stretched.wav"

    stretch_factor = target_duration / current_duration
    adjusted_audio = stretch_audio(tts_file, out_file, ratio=stretch_factor)
    remove_silence_from_end(out_file, out_file_stretched)

    print(f"Saved adjusted segment: {out_file} (target: {target_duration:.2f}s, actual: {current_duration:.2f}s)")


Saved adjusted segment: adjusted_segments/adjusted_seg_0_Speaker_1.wav (target: 9.88s, actual: 39.93s)
Saved adjusted segment: adjusted_segments/adjusted_seg_1_Speaker_2.wav (target: 5.79s, actual: 34.33s)
Saved adjusted segment: adjusted_segments/adjusted_seg_2_Speaker_1.wav (target: 2.56s, actual: 4.73s)
Saved adjusted segment: adjusted_segments/adjusted_seg_3_Speaker_2.wav (target: 4.16s, actual: 8.37s)
Saved adjusted segment: adjusted_segments/adjusted_seg_4_Speaker_3.wav (target: 5.46s, actual: 8.33s)


### Switch to OpenVoice Environment

In [1]:
import json
from pathlib import Path

# ----------- Load Gemini segments -----------
with open("translated_diarized_data.json", "r", encoding="utf-8") as f:
    gemini_data = json.load(f)

segments = gemini_data.get("speak_segments", [])

print(f"Loaded {len(segments)} segments from file")

Loaded 7 segments from file


In [2]:
import torch
from openvoice.api import ToneColorConverter

ckpt_converter = 'checkpoints_v2/converter'
device="cuda:0" if torch.cuda.is_available() else "cpu"

tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

  WeightNorm.apply(module, name, dim)
  from .autonotebook import tqdm as notebook_tqdm


Loaded checkpoint 'checkpoints_v2/converter/checkpoint.pth'
missing/unexpected keys: [] []


In [3]:
import re
from pathlib import Path
from collections import defaultdict

from pydub import AudioSegment
import torch
import warnings

# OpenVoice se_extractor import (make sure OpenVoice is on PYTHONPATH)
from openvoice import se_extractor

def _sanitize_speaker(s: str) -> str:
    return re.sub(r"[^A-Za-z0-9_\-]", "_", s).strip("_")

def build_reference_embeddings_from_diarization(
    original_audio_path: str,
    gemini_data: dict,
    out_dir: str = "reference_segments",
    join_silence_ms: int = 100,
    sample_rate: int = 24000,
    min_total_duration_sec: float = 2.5,
    tone_color_converter=tone_color_converter,
):
    """
    Concatenate all segments for each speaker (in chronological order),
    export a single WAV reference per speaker, then create + save OpenVoice embeddings.

    Returns:
      dict mapping speaker -> {"ref_wav": str, "embedding": str, "duration_s": float}
    """

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load full original audio once (pydub uses ffmpeg behind the scenes)
    audio = AudioSegment.from_file(str(original_audio_path))
    full_duration_ms = len(audio)
    print(f"[info] Loaded original audio '{original_audio_path}' ({full_duration_ms/1000:.2f}s)")

    # Group segments by speaker (keeps order)
    speaker_segments = defaultdict(list)
    for seg in gemini_data.get("speak_segments", []):
        try:
            start_ms = int(round(float(seg["start_time"]) * 1000))
            end_ms   = int(round(float(seg["end_time"]) * 1000))
        except Exception as e:
            print(f"[warning] skipping segment with bad times: {seg} -> {e}")
            continue
        if end_ms <= start_ms:
            print(f"[warning] skipping zero/negative-length segment: start={start_ms} end={end_ms}")
            continue
        # clamp within audio
        start_ms = max(0, min(start_ms, full_duration_ms))
        end_ms   = max(0, min(end_ms, full_duration_ms))
        dur_ms = end_ms - start_ms
        speaker = seg.get("speaker", "unknown").replace(' ', '_')
        speaker_segments[speaker].append({"start_ms": start_ms, "end_ms": end_ms, "duration_ms": dur_ms})

    results = {}

    for speaker, segs in speaker_segments.items():
        # sort by start time to keep natural order
        segs_sorted = sorted(segs, key=lambda x: x["start_ms"])
        concat = AudioSegment.silent(duration=0, frame_rate=sample_rate)

        total_ms = 0
        for s in segs_sorted:
            start_ms = s["start_ms"]
            end_ms = s["end_ms"]
            # slice original audio
            clip = audio[start_ms:end_ms]
            # append clip
            concat += clip
            total_ms += len(clip)
            # add tiny silence between clips to avoid gluing words (optional)
            concat += AudioSegment.silent(duration=join_silence_ms, frame_rate=sample_rate)

        total_s = total_ms / 1000.0

        # Warn if too short
        if total_s < min_total_duration_sec:
            warnings.warn(
                f"[warn] total concatenated duration for speaker '{speaker}' is short ({total_s:.2f}s). "
                "Embeddings may be poor. Consider merging more segments or increasing min_total_duration_sec."
            )

        # Trim trailing silence
        # If concat is longer than join silence at end, remove last join_silence_ms
        if len(concat) >= join_silence_ms:
            concat = concat[:-join_silence_ms]

        # Normalize export settings
        concat = concat.set_frame_rate(sample_rate).set_channels(1)

        # Save reference wav
        speaker_safe = _sanitize_speaker(speaker)
        ref_path = out_dir / f"ref_{speaker_safe}.wav"
        concat.export(str(ref_path), format="wav")
        print(f"[ok] Saved ref audio for '{speaker}' -> {ref_path} ({total_s:.2f}s)")

        # Create embedding via se_extractor
        try:
            # se_extractor.get_se may accept different signatures; attempt with tone_color_converter first
            if tone_color_converter is not None:
                source_se, audio_name = se_extractor.get_se(str(ref_path), tone_color_converter, vad=True)
            else:
                # fallback signature
                source_se, audio_name = se_extractor.get_se(str(ref_path), vad=True)
        except TypeError:
            # try alternate call (some builds expect only path)
            source_se, audio_name = se_extractor.get_se(str(ref_path))

        emb_path = out_dir / f"{speaker_safe}_embedding.pt"
        torch.save(source_se, str(emb_path))
        print(f"[ok] Saved embedding for '{speaker}' -> {emb_path}")

        results[speaker] = {
            "ref_wav": str(ref_path),
            "embedding": str(emb_path),
            "duration_s": total_s,
            "num_segments": len(segs_sorted),
        }

    return results


Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [4]:
# gemini_data = json.load(open("translated_diarized_data.json", encoding="utf-8"))
# original_audio = "original_audio.mp3"

ref_map = build_reference_embeddings_from_diarization(
    original_audio_path="media_1_audio.mp3",
    gemini_data=gemini_data,
    out_dir="reference_segments",
    join_silence_ms=100,
    sample_rate=24000,
    min_total_duration_sec=5.0,
    tone_color_converter=tone_color_converter  # or None
)

print(ref_map)
# -> {'Speaker 1': {'ref_wav': 'reference_segments/ref_Speaker_1.wav', 'embedding': 'reference_segments/Speaker_1_embedding.pt', ...}, ...}


[info] Loaded original audio 'media_1_audio.mp3' (35.11s)
[ok] Saved ref audio for 'Speaker_1' -> reference_segments/ref_Speaker_1.wav (7.52s)
OpenVoice version: v2
[(0.0, 7.72)]
after vad: dur = 7.72


Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /pytorch/aten/src/ATen/native/SpectralOps.cpp:875.)
  return _VF.stft(  # type: ignore[attr-defined]


[ok] Saved embedding for 'Speaker_1' -> reference_segments/Speaker_1_embedding.pt
[ok] Saved ref audio for 'Speaker_2' -> reference_segments/ref_Speaker_2.wav (12.78s)
OpenVoice version: v2
[(0.0, 12.98)]
after vad: dur = 12.98
[ok] Saved embedding for 'Speaker_2' -> reference_segments/Speaker_2_embedding.pt
[ok] Saved ref audio for 'Speaker_3' -> reference_segments/ref_Speaker_3.wav (5.62s)
OpenVoice version: v2
[(0.0, 5.62)]
after vad: dur = 5.62
[ok] Saved embedding for 'Speaker_3' -> reference_segments/Speaker_3_embedding.pt
{'Speaker_1': {'ref_wav': 'reference_segments/ref_Speaker_1.wav', 'embedding': 'reference_segments/Speaker_1_embedding.pt', 'duration_s': 7.52, 'num_segments': 3}, 'Speaker_2': {'ref_wav': 'reference_segments/ref_Speaker_2.wav', 'embedding': 'reference_segments/Speaker_2_embedding.pt', 'duration_s': 12.78, 'num_segments': 3}, 'Speaker_3': {'ref_wav': 'reference_segments/ref_Speaker_3.wav', 'embedding': 'reference_segments/Speaker_3_embedding.pt', 'duration_s': 

In [5]:
from pydub import AudioSegment
import torch
from pathlib import Path

def build_src_embedding_from_tts(tts_dir: str, gemini_data: dict, tone_color_converter=None, max_len_sec=10):
    """
    Combine TTS segments (up to max_len_sec) to create one source embedding.
    """
    combined = AudioSegment.silent(duration=0)
    total_ms = 0

    
    for idx, seg in enumerate(gemini_data["speak_segments"]):
        speaker = seg.get("speaker", "Speaker").replace(' ', '_')
        tts_path = Path(tts_dir) / f"{idx}_{speaker}.wav"
        if not tts_path.exists():
            continue

        seg_audio = AudioSegment.from_wav(tts_path)
        combined += seg_audio
        total_ms += len(seg_audio)

        if total_ms >= max_len_sec * 1000:
            break

    # Save temporary combined TTS clip
    tmp_path = Path(tts_dir) / "tts_src_reference.wav"
    combined.export(tmp_path, format="wav")

    # Extract embedding once
    if tone_color_converter is not None:
        src_se, _ = se_extractor.get_se(str(tmp_path), tone_color_converter, vad=True)
    else:
        src_se, _ = se_extractor.get_se(str(tmp_path), vad=True)

    print(f"[ok] Source embedding created from {total_ms/1000:.1f}s of TTS audio -> {tmp_path}")
    return src_se, str(tmp_path)


In [6]:
# Step 2: Build **single src embedding** from TTS segments
src_se, tts_ref_path = build_src_embedding_from_tts(
    tts_dir="translated_segments",
    gemini_data=gemini_data,
    tone_color_converter=tone_color_converter
)

OpenVoice version: v2
[(0.0, 10.7019375)]
after vad: dur = 10.701
[ok] Source embedding created from 10.7s of TTS audio -> translated_segments/tts_src_reference.wav


In [7]:
def convert_tts_segments_with_refs(tts_dir, gemini_data, ref_map, out_dir, src_se, tone_color_converter):
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    results = {}
    for idx, seg in enumerate(gemini_data.get("speak_segments", [])):
        speaker = seg["speaker"].replace(' ', '_')

        tts_wav = Path(tts_dir) / f"adjusted_seg_{idx}_{speaker}_stretched.wav"
        if not tts_wav.exists():
            print(f"[skip] No TTS audio for seg {idx}")
            continue

        if speaker not in ref_map:
            print(f"[skip] No reference for {speaker}")
            continue

        tgt_se = torch.load(ref_map[speaker]["embedding"])
        out_wav = out_dir / f"converted_seg_{idx}_{speaker}.wav"

        tone_color_converter.convert(
            audio_src_path=str(tts_wav),
            src_se=src_se,   # <-- reuse the same source embedding
            tgt_se=tgt_se,
            output_path=str(out_wav),
            message="@MyShell",
        )
        print(f"[ok] Converted seg {idx} ({speaker}) -> {out_wav}")
        results[idx] = str(out_wav)

    return results


In [8]:
converted = convert_tts_segments_with_refs(
    tts_dir="adjusted_segments",
    gemini_data=gemini_data,
    ref_map=ref_map,
    out_dir="converted_segments",
    src_se=src_se,
    tone_color_converter=tone_color_converter
)

[ok] Converted seg 0 (Speaker_1) -> converted_segments/converted_seg_0_Speaker_1.wav
[ok] Converted seg 1 (Speaker_2) -> converted_segments/converted_seg_1_Speaker_2.wav
Audio too short, fail to add watermark
[ok] Converted seg 2 (Speaker_1) -> converted_segments/converted_seg_2_Speaker_1.wav
[ok] Converted seg 3 (Speaker_2) -> converted_segments/converted_seg_3_Speaker_2.wav
[ok] Converted seg 4 (Speaker_1) -> converted_segments/converted_seg_4_Speaker_1.wav
[ok] Converted seg 5 (Speaker_2) -> converted_segments/converted_seg_5_Speaker_2.wav
[ok] Converted seg 6 (Speaker_3) -> converted_segments/converted_seg_6_Speaker_3.wav


In [9]:
import json
from pathlib import Path
from pydub import AudioSegment

def mix_segments_with_background(gemini_json_path, converted_dir, background_audio_path, output_path):
    # Load segment metadata
    with open(gemini_json_path, "r", encoding="utf-8") as f:
        gemini_data = json.load(f)

    speak_segments = gemini_data["speak_segments"]

    # Load background audio
    bg_audio = AudioSegment.from_file(background_audio_path, format="mp3")
    
    # Overlay each converted segment
    for idx, seg in enumerate(speak_segments):
        speaker = seg["speaker"].replace(' ', '_')
        seg_file = Path(converted_dir) / f"converted_seg_{idx}_{speaker}.wav"
        if not seg_file.exists():
            print(f"Skipping missing segment: {seg_file}")
            continue
        
        segment_audio = AudioSegment.from_file(seg_file, format="wav")
        start_ms = int(float(seg["start_time"]) * 1000)
        bg_audio = bg_audio.overlay(segment_audio, position=start_ms)
        print(f"Overlayed segment {idx} ({seg['speaker']}) at {start_ms} ms")

    # Export final mix
    bg_audio.export(output_path, format="wav")
    print(f"Final audio mix saved to {output_path}")

# Example usage
mix_segments_with_background(
    gemini_json_path="translated_diarized_data.json",
    converted_dir="converted_segments",
    background_audio_path="separated_audio/htdemucs/media_1_audio/no_vocals.wav",
    output_path="final_mix.wav"
)


Overlayed segment 0 (Speaker 1) at 320 ms
Overlayed segment 1 (Speaker 2) at 4990 ms
Overlayed segment 2 (Speaker 1) at 8560 ms
Overlayed segment 3 (Speaker 2) at 10220 ms
Overlayed segment 4 (Speaker 1) at 16320 ms
Overlayed segment 5 (Speaker 2) at 19460 ms
Overlayed segment 6 (Speaker 3) at 26336 ms
Final audio mix saved to final_mix.wav


In [1]:
from moviepy import VideoFileClip, AudioFileClip

# Load video and audio clips
video = VideoFileClip("media_1_video.mp4")
audio = AudioFileClip("final_mix.wav")

# Set the new audio to the video
final_video = video.with_audio(audio)

# Export the combined file
final_video.write_videofile("output_video_with_audio.mp4")


MoviePy - Building video output_video_with_audio.mp4.
MoviePy - Writing audio in output_video_with_audioTEMP_MPY_wvf_snd.mp3


                                                                    

MoviePy - Done.
MoviePy - Writing video output_video_with_audio.mp4



                                                                        

MoviePy - Done !
MoviePy - video ready output_video_with_audio.mp4
