# TTS HQ model on Podcast scripts
### Author: Kaushik

In [None]:
import json

# Load every podcast script from disk. JSON text is UTF-8 by specification,
# so the encoding is pinned instead of being left to the platform default
# (which is not UTF-8 on every system and can break non-ASCII dialogue).
with open('podcast_scripts.json', encoding='utf-8') as f:
    d = json.load(f)

# Preview the first entry; as the last expression of a notebook cell it is
# shown as the cell's output.
d[0]

In [None]:
# SpeechT5 TTS model card: https://huggingface.co/microsoft/speecht5_tts

# Imports
import re
import json
import torch
import soundfile as sf
import numpy as np
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset

In [None]:
# --- Model setup -----------------------------------------------------------
# The processor and the text-to-speech model come from the same checkpoint;
# the HiFi-GAN vocoder is a separate checkpoint.
_TTS_CHECKPOINT = "microsoft/speecht5_tts"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)

# --- Speaker embeddings (x-vectors) ----------------------------------------
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")


def _xvector_at(row):
    """Return the x-vector stored at *row* as a tensor with a leading batch axis."""
    return torch.tensor(embeddings_dataset[row]["xvector"]).unsqueeze(0)


# Standard voices used throughout the notebook.
# Row 0 is used as the male voice (labelled 'bdl' by the original author).
# NOTE(review): confirm the speaker identity against the dataset metadata.
MALE_EMBEDDING = _xvector_at(0)
# Row 7306 is used as the female voice (labelled 'slt' by the original author).
# NOTE(review): confirm the speaker identity against the dataset metadata.
FEMALE_EMBEDDING = _xvector_at(7306)

In [None]:
# Trial run: synthesise the podcast script stored at index 0
json_data_entry = d[0]


def _normalise_gender(value):
    """Return *value* upper-cased with surrounding whitespace removed.

    A missing value (None) becomes the empty string, so it is treated as an
    unrecognised gender by the MALE/FEMALE matching that follows instead of
    raising AttributeError on ``.upper()``.
    """
    return str(value or "").strip().upper()


# Extract data from the selected JSON entry.
# Genders are normalised so that values such as "male " still match
# "MALE"/"FEMALE" rather than silently falling through to a default voice.
host_gender = _normalise_gender(json_data_entry["host_gender"])
guest_gender = _normalise_gender(json_data_entry["guest_gender"])
raw_text = json_data_entry["dialogue"]
# The output file name keeps the gender values exactly as they appear in the JSON.
output_filename = f"podcast_{json_data_entry['id']}_{json_data_entry['host_gender']}_{json_data_entry['guest_gender']}.wav"

# Pick a speaker embedding for each role from its upper-cased gender label.
_EMBEDDING_FOR_GENDER = {
    "MALE": MALE_EMBEDDING,
    "FEMALE": FEMALE_EMBEDDING,
}

# HOST voice: an unrecognised label falls back to the male embedding.
speaker_embeddings_host = _EMBEDDING_FOR_GENDER.get(host_gender)
if speaker_embeddings_host is None:
    speaker_embeddings_host = MALE_EMBEDDING
    print(f"Warning: Host gender '{host_gender}' not recognized. Defaulting to Male.")

# GUEST voice: an unrecognised label falls back to the female embedding.
speaker_embeddings_guest = _EMBEDDING_FOR_GENDER.get(guest_gender)
if speaker_embeddings_guest is None:
    speaker_embeddings_guest = FEMALE_EMBEDDING
    print(f"Warning: Guest gender '{guest_gender}' not recognized. Defaulting to Female.")

# Parse the dialogue.
# The capturing group makes re.split keep the 'HOST:' / 'GUEST:' markers in
# the result, so `segments` alternates between markers and spoken text.
segments = re.split(r'(HOST:|GUEST:)', raw_text)

# Audio parameters used to build the pause inserted after every turn.
SAMPLE_RATE = 16000
PAUSE_SECONDS = 0.3
silence = torch.zeros(int(SAMPLE_RATE * PAUSE_SECONDS))

# Longest token sequence the text encoder accepts. Inputs beyond this limit
# make generate_speech fail, so over-long turns are synthesised in pieces.
# NOTE(review): 600 is only a fallback for configs lacking the attribute;
# confirm against the checkpoint's config.
max_text_tokens = getattr(model.config, "max_text_positions", 600)


def _fits_model(text):
    """Return True when *text* tokenises to at most `max_text_tokens` ids."""
    token_ids = processor(text=text, return_tensors="pt")["input_ids"]
    return token_ids.shape[1] <= max_text_tokens


def _split_for_tts(text, fits):
    """Split *text* into pieces that each satisfy ``fits(piece)``.

    Text that already fits is returned unchanged as a single piece. Otherwise
    it is cut at sentence boundaries; a sentence that is itself too long is
    cut at word boundaries. Consecutive units are packed greedily so pieces
    stay as long as the limit allows. A single word that does not fit is
    passed through as its own piece (it cannot be split any further here).
    """
    if fits(text):
        return [text]

    # Break into sentences, falling back to words for over-long sentences.
    units = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        units.extend([sentence] if fits(sentence) else sentence.split())

    pieces = []
    current = ""
    for unit in units:
        candidate = f"{current} {unit}".strip()
        if not current or fits(candidate):
            current = candidate
        else:
            pieces.append(current)
            current = unit
    if current:
        pieces.append(current)
    return pieces


# Generate audio turn by turn.
all_speech_parts = []
current_speaker_embedding = None

for segment in segments:
    segment = segment.strip()

    # A speaker marker only switches the active voice; it carries no text.
    if segment == "HOST:":
        current_speaker_embedding = speaker_embeddings_host
        continue
    elif segment == "GUEST:":
        current_speaker_embedding = speaker_embeddings_guest
        continue

    # Skip empty strings and any text that precedes the first speaker marker.
    if not segment or current_speaker_embedding is None:
        continue

    # Synthesise the turn; turns within the model limit yield exactly one piece.
    for piece in _split_for_tts(segment, _fits_model):
        inputs = processor(text=piece, return_tensors="pt")
        speech = model.generate_speech(inputs["input_ids"], current_speaker_embedding, vocoder=vocoder)
        all_speech_parts.append(speech)

    # Add a pause after every turn (not between pieces of the same turn).
    all_speech_parts.append(silence)

# Join the generated clips and write them to disk.
if not all_speech_parts:
    # Nothing was synthesised, so there is nothing to save.
    print("No audio generated.")
else:
    # Stitch all clips (speech and pauses) into one continuous waveform.
    final_audio = torch.cat(all_speech_parts, dim=0)

    # Write the waveform under the descriptive file name at 16 kHz.
    waveform = final_audio.numpy()
    sf.write(output_filename, waveform, samplerate=16000)
    print(f"Audio saved as {output_filename}")