In [None]:
from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

In [2]:
import os
# Hardcoded paths (update these based on actual locations)
wav_file_path = os.path.join(os.getcwd(), "temp_video/cf5f9419ae994200bf4af7472bc51abd.wav")
json_transcript_path = os.path.join(os.getcwd(), "temp_video/subtitles/subtitles-original.json")
output_folder = os.path.join(os.getcwd(), "segmented_wavs")  # Make sure it exists
dataset = os.path.join(os.getcwd(), "dataset")  # Make sure it exists

In [None]:
import os
import librosa
import soundfile as sf

# ‚úÖ Paths
wav_file_path = os.path.join(os.getcwd(), "temp_video/cf5f9419ae994200bf4af7472bc51abd.wav")
os.makedirs(output_folder, exist_ok=True)

# ‚úÖ Load full audio
audio, sr = librosa.load(wav_file_path, sr=22050)  # üî• Ensure XTTS-compatible sample rate

# ‚úÖ Split parameters
chunk_duration = 9.6  # Split length in seconds
samples_per_chunk = int(chunk_duration * sr)  # Convert seconds to samples

# ‚úÖ Process and Save Chunks
num_chunks = len(audio) // samples_per_chunk
for i in range(num_chunks + 1):  # +1 to ensure we get the last segment if any
    start = i * samples_per_chunk
    end = min((i + 1) * samples_per_chunk, len(audio))
    chunk_audio = audio[start:end]

    if len(chunk_audio) > 1000:  # üî• Skip empty or tiny segments
        chunk_filename = f"segment_{i:03d}.wav"
        chunk_path = os.path.join(output_folder, chunk_filename)
        sf.write(chunk_path, chunk_audio, sr)
        print(f"‚úÖ Saved {chunk_path}")

print(f"üéâ Splitting complete! Segments saved in: {output_folder}")


In [1]:
import os
import whisper

# ‚úÖ Load Whisper Model (Use 'medium' for better accuracy)
model = whisper.load_model("medium")

# ‚úÖ Paths
input_folder = os.path.join(os.getcwd(), "segmented_wavs")
output_metadata = os.path.join(os.getcwd(), "segmented_wavs/metadata.txt")

# ‚úÖ Transcribe and Save Metadata
with open(output_metadata, "w", encoding="utf-8") as f:
    for file in sorted(os.listdir(input_folder)):
        if file.endswith(".wav"):
            file_path = os.path.join(input_folder, file)
            result = model.transcribe(file_path, language="de")  # Set your language
            transcript = result["text"].strip()
            
            if transcript:
                f.write(f"{file.replace('.wav', '')}|{transcript}\n")
                print(f"‚úÖ Transcribed: {file} ‚Üí {transcript}")

print(f"üéâ Transcription complete! Metadata saved: {output_metadata}")


  checkpoint = torch.load(fp, map_location=device)


‚úÖ Transcribed: segment_000.wav ‚Üí Hallo und herzlich willkommen zur zweiten Folge von Einf√ºhrung in React mit dem Thema React Setup. Noch einmal kurz zu mir, mein Name ist David Losert.
‚úÖ Transcribed: segment_001.wav ‚Üí Ich bin Software Engineer und seit √ºber zehn Jahren im Web unterwegs und arbeite nun auch bereits seit vier Jahren mit React. Neben React mag ich die Arbeit mit Chubbys.
‚úÖ Transcribed: segment_002.wav ‚Üí TypeScript, Node.js, Linux-Servern, Docker und AWS. Die heutige Folge dreht sich also nun komplett darum, eine
‚úÖ Transcribed: segment_003.wav ‚Üí Entwicklungsumgebung aufzusetzen und dort eine erste React Hello World Applikation zu implementieren. Wenn wir uns kurz erinnern, in der letzten Folge habe ich die Geschichte
‚úÖ Transcribed: segment_004.wav ‚Üí und Prinzipien von React kurz vorgestellt und einen ersten theoretischen Einblick in den Virtual Dom und in JSX gegeben. Das habe ich an dieser Stelle auch einbezogen.
‚úÖ Transcribed: segment_005.wav ‚Üí 

In [2]:
import os
import librosa

# ‚úÖ Set Paths
audio_folder = os.path.join(os.getcwd(), "segmented_wavs/wavs")  # Update this path
max_duration = 10.0  # Maximum allowed duration in seconds

# ‚úÖ Scan and Check Durations
long_files = []
for file in sorted(os.listdir(audio_folder)):
    if file.endswith(".wav"):
        file_path = os.path.join(audio_folder, file)
        audio, sr = librosa.load(file_path, sr=None)  # Load with original sample rate
        duration = librosa.get_duration(y=audio, sr=sr)

        if duration > max_duration:
            long_files.append((file, round(duration, 2)))  # Store file name & duration

# ‚úÖ Print Results
if long_files:
    print(f"üö® {len(long_files)} audio files exceed {max_duration}s:")
    for file, duration in long_files:
        print(f"  ‚ùå {file}: {duration}s")
else:
    print("‚úÖ All audio files are within the allowed duration.")



‚úÖ All audio files are within the allowed duration.


In [6]:
import json
import os
import librosa
import soundfile as sf

def extract_audio_segments(wav_file, json_file, output_folder):
    """
    Extracts speech segments from the full WAV file using JSON timestamps.

    Args:
        wav_file (str): Path to the full WAV audio file.
        json_file (str): Path to the JSON subtitles/transcripts file.
        output_folder (str): Folder where extracted audio files and metadata.txt will be saved.
    """
    # Ensure output directory exists
    os.makedirs(output_folder, exist_ok=True)

    # Load full audio file
    audio, sr = librosa.load(wav_file, sr=22050)  # Resample to 22.05kHz for TTS

    # Load JSON transcript
    with open(json_file, "r", encoding="utf-8") as f:
        transcript_data = json.load(f)

    metadata_entries = []

    # Process each segment in the JSON file
    for idx, segment in enumerate(transcript_data):
        start_time = segment["start_time"] / 1000.0  # Convert ms to sec
        end_time = segment["end_time"] / 1000.0 +0.5  # Convert ms to sec
        text = segment["text"].strip()

        # Extract audio segment
        start_sample = int(start_time * sr)
        end_sample = int(end_time * sr)
        audio_segment = audio[start_sample:end_sample]

        # Save audio segment
        segment_filename = f"audio_{idx+1:03d}.wav"
        segment_path = os.path.join(output_folder, segment_filename)
        sf.write(segment_path, audio_segment, sr)

        # Add entry to metadata
        metadata_entries.append(f"{segment_filename}|{text}|{text.lower()}")

    # Save metadata.txt
    metadata_path = os.path.join(output_folder, "metadata.txt")
    with open(metadata_path, "w", encoding="utf-8") as f:
        f.write("\n".join(metadata_entries))

    print(f"‚úÖ Audio segments saved to: {output_folder}")
    print(f"‚úÖ Metadata file saved: {metadata_path}")

    return metadata_path


In [7]:
metadata_file = extract_audio_segments(wav_file_path, json_transcript_path, output_folder)


‚úÖ Audio segments saved to: /home/ahmet/my_projects/realtime_translator_V2_updated/server/clones
‚úÖ Metadata file saved: /home/ahmet/my_projects/realtime_translator_V2_updated/server/clones/metadata.txt


In [None]:
#!pip install pydub

Collecting pydub
  Using cached pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [23]:
import os
import librosa
import soundfile as sf
from pydub import AudioSegment, silence
import numpy as np

def segment_audio_smart_v2(wav_file, output_folder, target_chunk_duration=10, max_chunk_duration=12, min_chunk_duration=5, silence_threshold=-40, silence_padding=0.2):
    """
    Segments an audio file into chunks of approximately target_chunk_duration seconds,
    prioritizing cuts at silent periods to avoid mid-word cuts.

    Args:
        wav_file (str): Path to the input WAV file.
        output_folder (str): Directory for segmented audio files.
        target_chunk_duration (int): Target duration for each chunk in seconds.
        max_chunk_duration (int): Maximum allowed duration for a chunk in seconds.
        min_chunk_duration (int): Minimum allowed duration for a chunk in seconds.
        silence_threshold (int): Silence threshold in dBFS for pydub silence detection.
        silence_padding (float):  Seconds of silence to include after a detected silence point.

    Returns:
        List[str]: List of paths to the generated audio segment files.
    """
    os.makedirs(output_folder, exist_ok=True)
    segment_paths = []
    segment_count = 1

    audio = AudioSegment.from_wav(wav_file)
    audio_duration_sec = len(audio) / 1000  # Duration in seconds
    start_time_ms = 0

    while start_time_ms < len(audio):
        target_end_time_ms = start_time_ms + target_chunk_duration * 1000
        max_end_time_ms = min(start_time_ms + max_chunk_duration * 1000, len(audio))
        min_end_time_ms = min(start_time_ms + min_chunk_duration * 1000, len(audio))

        if max_end_time_ms >= len(audio): # Last segment
            end_time_ms = len(audio)
        else:
            # Look for silence near the target end time
            search_start_ms = max(start_time_ms + min_chunk_duration * 1000, target_end_time_ms - 2000) # Search silence after min duration and around target
            search_end_ms = max_end_time_ms

            silent_ranges = silence.detect_silence(
                audio[search_start_ms:search_end_ms],
                min_silence_len=200,  # Shorter silence detection for finer cuts
                silence_thresh=silence_threshold
            )

            if silent_ranges:
                # Take the first silence found
                silence_end_relative_ms = silent_ranges[0][0] # Start of silence relative to search_start_ms
                end_time_ms = search_start_ms + silence_end_relative_ms + int(silence_padding * 1000) # Cut at silence + padding
                end_time_ms = min(end_time_ms, max_end_time_ms) # Ensure not exceeding max duration
                end_time_ms = max(end_time_ms, min_end_time_ms) # Ensure not less than min duration
            else:
                end_time_ms = max_end_time_ms # If no silence found, cut at max duration


        segment_audio = audio[start_time_ms:end_time_ms]
        segment_filename = f"segment_{segment_count:03d}.wav"
        segment_path = os.path.join(output_folder, segment_filename)
        segment_audio.export(segment_path, format="wav")
        segment_paths.append(segment_path)

        start_time_ms = end_time_ms
        segment_count += 1

    print(f"‚úÖ Smart segmentation v2 done with {len(segment_paths)} files.")
    return segment_paths

In [24]:
segmented_files = segment_audio_smart_v2(
            wav_file_path,
            output_folder_2,
            target_chunk_duration=10,
            max_chunk_duration=12,
            min_chunk_duration=5,
            silence_threshold=-40,
            silence_padding=0.2 # Add a bit of silence after cut
        )
print("Segmented files:", segmented_files)

‚úÖ Smart segmentation v2 done with 256 files.
Segmented files: ['/home/ahmet/my_projects/realtime_translator_V2_updated/server/clones_2/segment_001.wav', '/home/ahmet/my_projects/realtime_translator_V2_updated/server/clones_2/segment_002.wav', '/home/ahmet/my_projects/realtime_translator_V2_updated/server/clones_2/segment_003.wav', '/home/ahmet/my_projects/realtime_translator_V2_updated/server/clones_2/segment_004.wav', '/home/ahmet/my_projects/realtime_translator_V2_updated/server/clones_2/segment_005.wav', '/home/ahmet/my_projects/realtime_translator_V2_updated/server/clones_2/segment_006.wav', '/home/ahmet/my_projects/realtime_translator_V2_updated/server/clones_2/segment_007.wav', '/home/ahmet/my_projects/realtime_translator_V2_updated/server/clones_2/segment_008.wav', '/home/ahmet/my_projects/realtime_translator_V2_updated/server/clones_2/segment_009.wav', '/home/ahmet/my_projects/realtime_translator_V2_updated/server/clones_2/segment_010.wav', '/home/ahmet/my_projects/realtime_t

In [25]:
import os

def clean_metadata(metadata_file):
    """
    Removes '.wav' extension from filenames in metadata.txt.
    
    Args:
        metadata_file (str): Path to metadata.txt
    
    Returns:
        str: Path to updated metadata file
    """
    new_metadata_file = metadata_file.replace(".txt", "_cleaned.txt")
    
    with open(metadata_file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Remove .wav extension from filenames
    cleaned_lines = [line.replace(".wav|", "|", 1) for line in lines]

    # Save new metadata file
    with open(new_metadata_file, "w", encoding="utf-8") as f:
        f.writelines(cleaned_lines)

    print(f"‚úÖ Updated metadata saved as: {new_metadata_file}")
    return new_metadata_file

# Run the function
metadata_path = output_folder+"/metadata.txt"  # Update with actual path
cleaned_metadata_path = clean_metadata(metadata_path)


‚úÖ Updated metadata saved as: /home/ahmet/my_projects/realtime_translator_V2_updated/server/clones/metadata_cleaned.txt


In [28]:
import os
from trainer import Trainer, TrainerArgs
from TTS.tts.datasets import load_tts_samples
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
from TTS.utils.manage import ModelManager

# ‚úÖ Define Paths
dataset = os.path.join(os.getcwd(), "dataset")
DATASET_PATH = os.path.join(os.getcwd(),"audio_dataset")  # Your dataset folder
METADATA_FILE =os.path.join(os.getcwd(),"audio_dataset/metadata.txt")
OUTPUT_PATH = "/home/ahmet/tts_finetuned_models/xtts_v2_gpt"  # Fine-tuned models output
CHECKPOINTS_OUT_PATH = os.path.join(OUTPUT_PATH, "checkpoints")  
os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)

# ‚úÖ Load XTTS-V2 Pretrained Model Checkpoint
XTTS_CHECKPOINT = "/home/ahmet/yourtts_model/tts_models--multilingual--multi-dataset--your_tts/model_file.pth"

# ‚úÖ Define Speaker Reference (One high-quality voice sample)
SPEAKER_REFERENCE = [
    os.path.join(DATASET_PATH, "wavs", "segment_001.wav")
]

# ‚úÖ Training Parameters
BATCH_SIZE = 3  # Keep small to avoid memory issues
GRAD_ACCUM_STEPS = 84  # Must be at least 252 when multiplied with batch size
LEARNING_RATE = 5e-6  # Lower learning rate for stable fine-tuning
LANGUAGE = "de"  # Adjust based on dataset

# ‚úÖ DVAE & XTTS Model Files (GPT-Based Fine-Tuning)
DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, "dvae.pth")
MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, "mel_stats.pth")

# ‚úÖ Download DVAE & XTTS Model Files If Not Available
if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
    print(" > Downloading DVAE files!")
    ModelManager._download_model_files([
        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-V2/main/mel_stats.pth",
        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-V2/main/dvae.pth"
    ], CHECKPOINTS_OUT_PATH, progress_bar=True)

# ‚úÖ Define Dataset Configuration
config_dataset = BaseDatasetConfig(
    formatter="custom",  
    dataset_name="custom_xtts_dataset",
    path=DATASET_PATH,
    meta_file_train=os.path.join(DATASET_PATH, METADATA_FILE),
    language=LANGUAGE,
)

# ‚úÖ Load Training Samples
train_samples, eval_samples = load_tts_samples(
    config_dataset,
    eval_split=True,
    eval_split_size=0.05,  # Use 5% for evaluation
)

print(f"‚úÖ Loaded {len(train_samples)} training samples and {len(eval_samples)} eval samples")

# ‚úÖ Define Model Arguments (GPT-Based Fine-Tuning)
model_args = GPTArgs(
    max_conditioning_length=132300,  # 6 seconds
    min_conditioning_length=66150,  # 3 seconds
    debug_loading_failures=False,
    max_wav_length=255995,  # ~11.6 seconds
    max_text_length=200,
    mel_norm_file=MEL_NORM_FILE,
    dvae_checkpoint=DVAE_CHECKPOINT,
    xtts_checkpoint=XTTS_CHECKPOINT,  # XTTS-V2 checkpoint path
    tokenizer_file=None,  # XTTS already has a tokenizer
    gpt_num_audio_tokens=1026,
    gpt_start_audio_token=1024,
    gpt_stop_audio_token=1025,
    gpt_use_masking_gt_prompt_approach=True,
    gpt_use_perceiver_resampler=True,
)

# ‚úÖ Define Audio Configuration
audio_config = XttsAudioConfig(
    sample_rate=22050,  
    dvae_sample_rate=22050,  
    output_sample_rate=24000
)

# ‚úÖ Define Training Config (GPT-Based)
config = GPTTrainerConfig(
    output_path=OUTPUT_PATH,
    model_args=model_args,
    run_name="GPT_XTTS_v2.0_FT",
    project_name="XTTS_trainer",
    dashboard_logger="tensorboard",
    logger_uri=None,
    audio=audio_config,
    batch_size=BATCH_SIZE,
    batch_group_size=48,  
    eval_batch_size=BATCH_SIZE,
    num_loader_workers=8,
    eval_split_max_size=256,
    print_step=50,
    plot_step=100,
    log_model_step=1000,
    save_step=5000,  # Save model every 5000 steps
    save_n_checkpoints=1,  
    save_checkpoints=True,
    optimizer="AdamW",
    optimizer_wd_only_on_weights=True,
    optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
    lr=LEARNING_RATE,  # Learning rate
    lr_scheduler="MultiStepLR",
    lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
    test_sentences=[
        {
            "text": "This is an AI-powered voice cloning model.",
            "speaker_wav": SPEAKER_REFERENCE,
            "language": LANGUAGE,
        },
        {
            "text": "Artificial intelligence is changing the world.",
            "speaker_wav": SPEAKER_REFERENCE,
            "language": LANGUAGE,
        },
    ],
)

# ‚úÖ Initialize Model
model = GPTTrainer.init_from_config(config)

# ‚úÖ Initialize Trainer
trainer = Trainer(
    TrainerArgs(
        restore_path=None,  # XTTS checkpoint is restored via `xtts_checkpoint`, so no need to restore here
        skip_train_epoch=False,
        start_with_eval=True,
        grad_accum_steps=GRAD_ACCUM_STEPS,
    ),
    config,
    output_path=OUTPUT_PATH,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

# ‚úÖ Start Training üöÄ
trainer.fit()

print(f"üéâ XTTS-V2 GPT-based fine-tuning completed! Model saved to: {OUTPUT_PATH}")


 > Downloading DVAE files!


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.07k/1.07k [00:01<00:00, 708iB/s]


AttributeError: module 'TTS.tts.datasets' has no attribute 'custom'



In [2]:
import torch

XTTS_CHECKPOINT = "/home/ahmet/xtts_gpt_model/xtts_model.pth"

try:
    model_state = torch.load(XTTS_CHECKPOINT, map_location="cpu")
    print("‚úÖ Model loaded successfully!")
    print("üî• Model Keys:", model_state.keys())  # üî• Print all keys in the checkpoint
except Exception as e:
    print(f"‚ùå Model failed to load! Error: {e}")


  model_state = torch.load(XTTS_CHECKPOINT, map_location="cpu")
  from .autonotebook import tqdm as notebook_tqdm


‚úÖ Model loaded successfully!
üî• Model Keys: dict_keys(['config', 'model', 'scaler', 'optimizer', 'step', 'date', 'model_loss'])


In [31]:
XTTS_CHECKPOINT = "/home/ahmet/xtts_gpt_model/xtts_model.pth"
try:
    model_state = torch.load(XTTS_CHECKPOINT, map_location="cpu")
    print(f"üî• Available Keys in XTTS Checkpoint: {model_state.keys()}")

    if "model" in model_state:
        print("‚úÖ Found `model` key instead of `net`. Trying to load...")
        model.load_state_dict(model_state["model"], strict=False)
        print("‚úÖ XTTS Model loaded successfully!")
    else:
        raise ValueError("‚ùå `model` key is missing in XTTS checkpoint.")

except Exception as e:
    print(f"‚ùå XTTS Model failed to load! Error: {e}")


  model_state = torch.load(XTTS_CHECKPOINT, map_location="cpu")


üî• Available Keys in XTTS Checkpoint: dict_keys(['config', 'model', 'scaler', 'optimizer', 'step', 'date', 'model_loss'])
‚úÖ Found `model` key instead of `net`. Trying to load...
‚ùå XTTS Model failed to load! Error: name 'model' is not defined


In [8]:
import os

wav_folder = "audio_dataset/wavs/"

for filename in os.listdir(wav_folder):
    file_path = os.path.join(wav_folder, filename)
    
    # ‚úÖ If file has no extension, add .wav
    if "." not in filename:
        new_file_path = file_path + ".wav"
        os.rename(file_path, new_file_path)
        print(f"‚úÖ Renamed: {filename} ‚Üí {filename}.wav")
