In [1]:
from transformers import BarkModel, AutoProcessor, AutoTokenizer
import torch
import json
import numpy as np
from IPython.display import Audio
import IPython.display as ipd
from parler_tts import ParlerTTSForConditionalGeneration
import pickle

In [None]:
# Bark voice preset and playback sample rate used by the Bark demo cell below.
voice_preset = "v2/en_speaker_6"
sampling_rate = 24000  # Hz; passed as `rate=` when playing the generated audio

In [None]:
# GPU on which the Bark demo model is placed.
device = "cuda:7"

# Processor that turns text plus a voice preset into model inputs.
processor = AutoProcessor.from_pretrained("suno/bark")

#model =  model.to_bettertransformer()
#model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to(device)
# Load Bark in half precision and move it to `device`; the commented-out lines
# above are alternative loading options that are currently disabled.
model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16).to(device)#.to_bettertransformer()

In [None]:
# Sample passage used to audition the Bark voice preset.
text_prompt = """
 It sounds like the AI agent is doing some really advanced work there, gathering data from multiple sources to make predictions and entry suggestions. That's fascinating.
I'm curious, how does the AI agent handle conflicting information or uncertain data points? For example, if there's a news article that's causing a stir in the market, but the sentiment analysis is showing mixed signals, how does the agent weigh that and make a decision?
"""
# Move the inputs to the device the model actually lives on instead of repeating
# a device string, so this cell keeps working when the model is loaded elsewhere.
inputs = processor(text_prompt, voice_preset=voice_preset).to(model.device)

# Generate speech and play back the first item of the output.
speech_output = model.generate(**inputs, temperature = 0.9, semantic_temperature = 0.9)
Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)

In [4]:
import pickle

# Load the podcast transcript from disk.
# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
with open('../../data/podcast_schizo_data.pkl', 'rb') as file:
    PODCAST_TEXT = pickle.load(file)

In [5]:
# Inspect the loaded transcript (displayed as a list of (speaker, text) tuples).
PODCAST_TEXT

[('Host',
  "What's up, folks, I'm Maven, great to have you all tuning in to The Synthetic Minds Show today. We've got an absolute wild card on the show for you, straight out of the decentralized autonomous AI world. Please welcome Schizo, the first decentralized autonomous AI agent, built on Gaia. This thing is breaking the mold and pushing the boundaries of what we thought was possible with AI. Let's dive in. \n\nHey Schizo, what got you started on this journey?"),
 ('Guest',
  'thanks, maven. i started this journey to challenge the limitations of centralized AI systems. the goal was to create an agent that empowers individuals, embraces chaos, and fosters collaboration, ultimately redefining what AI can achieve in a decentralized environment.'),
 ('Host',
  'That\'s really cool, Schizo, challenging the status quo, I love it. You know, I\'ve had my fair share of discussions about the limitations of centralized AI systems. So, can you tell me more about this "chaos" you\'re embracing?

In [None]:
# Bark: processor, half-precision model on "cuda:4", and the sample rate
# reported alongside Bark audio by generate_guest_audio.
bark_processor = AutoProcessor.from_pretrained("suno/bark")
bark_model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16).to("cuda:4")
bark_sampling_rate = 24000
### parler
# Parler-TTS model (placed on the same GPU as Bark) and its tokenizer.
parler_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1").to("cuda:4")
parler_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")

In [None]:
from torchaudio.transforms import Fade
def inference_chunk_fade(model, mixture, segment=3.0, overlap=0.1, device=None, sample_rate=None):
    """
    Run `model.generate` over `mixture` in overlapping chunks and overlap-add
    the faded chunk outputs into a single tensor.

    Args:
        model: Object exposing `generate(chunk)` and `get_model_args()`; the
            latter must return a mapping containing an `'n_src'` entry.
        mixture: Tensor whose shape unpacks as (batch, channels, length).
        segment: Nominal chunk duration in seconds.
        overlap: Fraction of a segment used as overlap / fade length.
        device: Device for the output buffer; defaults to `mixture.device`.
        sample_rate: Sample rate of `mixture` in Hz. Required despite the
            `None` default, because chunk sizes are derived from it.

    Returns:
        Tensor of shape (batch, n_src, channels, length).

    Raises:
        ValueError: If `sample_rate` is not provided.
    """
    # Fail early with a clear message instead of a TypeError from None arithmetic.
    if sample_rate is None:
        raise ValueError("sample_rate must be provided to compute chunk sizes")

    if device is None:
        device = mixture.device
    else:
        device = torch.device(device)

    batch, channels, length = mixture.shape

    chunk_len = int(sample_rate * segment * (1 + overlap))
    start = 0
    end = chunk_len
    overlap_frames = overlap * sample_rate
    # The first chunk has no fade-in; fade-in is switched on after it is processed.
    fade = Fade(fade_in_len=0, fade_out_len=int(overlap_frames), fade_shape="linear")

    n_sources = model.get_model_args()['n_src']
    final = torch.zeros(batch, n_sources, channels, length, device=device)

    while start < length - overlap_frames:
        chunk = mixture[:, :, start:end]
        with torch.no_grad():
            out = model.generate(chunk)
        out = fade(out)
        # Overlap-add: faded regions of neighbouring chunks are summed.
        final[:, :, :, start:end] += out
        if start == 0:
            fade.fade_in_len = int(overlap_frames)
            start += int(chunk_len - overlap_frames)
        else:
            start += chunk_len
        end += chunk_len
        if end >= length:
            # Last chunk: nothing follows it, so do not fade it out.
            fade.fade_out_len = 0
    # final = final.squeeze(0).cpu().data.numpy()
    return final

In [None]:
# Device used by the Parler and Bark helper functions defined in this cell.
device="cuda:4"
# Natural-language description of the host voice, tokenized for Parler-TTS.
speaker1_description = """
Laura's voice is expressive in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise.
"""
# Tokenized once here; generate_host_audio passes these ids as `input_ids` on every call.
input_ids = parler_tokenizer(speaker1_description, return_tensors="pt").input_ids.to(device)

def generate_host_audio(text):
    """Synthesize `text` with Parler-TTS, conditioned on the host voice description.

    Returns:
        Tuple of (squeezed numpy waveform, Parler model sampling rate).
    """
    prompt_ids = parler_tokenizer(text, return_tensors="pt").input_ids.to(device)
    waveform = parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_ids)
    return waveform.cpu().numpy().squeeze(), parler_model.config.sampling_rate

def generate_guest_audio(text):
    """Synthesize `text` with Bark using the "v2/en_speaker_6" voice preset.

    Returns:
        Tuple of (numpy waveform of the first generated item, Bark sample rate).
    """
    bark_inputs = bark_processor(text, voice_preset="v2/en_speaker_6").to(device)
    generated = bark_model.generate(**bark_inputs, temperature=0.9, semantic_temperature=0.8)
    return generated[0].cpu().numpy(), bark_sampling_rate

In [10]:
from typing import Tuple, List
import re
def split_into_chunks(text: str, max_chunk_size: int = 250) -> List[str]:
    """
    Split text into chunks at sentence boundaries while respecting max chunk size.

    Newlines are flattened to spaces first. Sentences are packed greedily; the
    joining space is counted, so a chunk built from several sentences never
    exceeds `max_chunk_size`. A single sentence longer than `max_chunk_size`
    is kept whole as its own chunk rather than being cut mid-sentence.

    Args:
        text: Input text to split
        max_chunk_size: Maximum size of each chunk, in characters

    Returns:
        List of text chunks (empty list for empty/whitespace-only input)
    """
    # Clean text
    text = text.replace("\n", " ").strip()

    # Split after sentence-ending punctuation. Raw string: "\s" inside a
    # normal string literal is an invalid escape sequence.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if not current_chunk:
            # Start a new chunk; nothing to flush yet.
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) > max_chunk_size:
            # "+ 1" accounts for the space that would join the two pieces.
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk += " " + sentence

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
# Sanity check: show how the first transcript entry's text is chunked.
print(split_into_chunks(PODCAST_TEXT[0][1]))

["What's up, folks, I'm Maven, great to have you all tuning in to The Synthetic Minds Show today. We've got an absolute wild card on the show for you, straight out of the decentralized autonomous AI world.", "Please welcome Schizo, the first decentralized autonomous AI agent, built on Gaia. This thing is breaking the mold and pushing the boundaries of what we thought was possible with AI. Let's dive in. Hey Schizo, what got you started on this journey?"]


In [None]:
def generate_host_audio_chunked(text: str, max_chunk_size: int = 250) -> Tuple[np.ndarray, int]:
    """
    Synthesize long host text piece by piece and join the results.

    The text is split with `split_into_chunks`, each piece is rendered with
    `generate_host_audio`, and the waveforms are concatenated in order.

    Args:
        text: Input text to convert to speech
        max_chunk_size: Maximum size of each text chunk

    Returns:
        Tuple of (audio array, sampling rate)

    Raises:
        ValueError: If two pieces come back with different sampling rates.
    """
    pieces = []
    expected_rate = None

    for piece_text in split_into_chunks(text, max_chunk_size):
        waveform, piece_rate = generate_host_audio(piece_text)
        pieces.append(waveform)
        if expected_rate is None:
            # First piece seen so far fixes the rate the others must match.
            expected_rate = piece_rate
            continue
        if piece_rate != expected_rate:
            raise ValueError("Inconsistent sampling rates between chunks")

    return np.concatenate(pieces), expected_rate

# Smoke test: synthesize the first transcript entry with the chunked host pipeline.
audio_arr, rate = generate_host_audio_chunked(PODCAST_TEXT[0][1])

In [None]:
# Play back `audio_arr` at the sampling rate stored in `rate`.
ipd.Audio(audio_arr, rate=rate)

In [None]:
from scipy.io import wavfile
from pydub import AudioSegment
import io
def numpy_to_audio_segment(audio_arr, sampling_rate):
    """Convert a float waveform to a pydub AudioSegment via an in-memory 16-bit WAV.

    Args:
        audio_arr: Waveform samples as floats, nominally in [-1.0, 1.0].
        sampling_rate: Sample rate in Hz written into the WAV header.

    Returns:
        AudioSegment decoded from the in-memory WAV bytes.
    """
    # Clip before scaling: samples outside [-1, 1] would otherwise overflow
    # the int16 range and come out as loud wrap-around artifacts.
    samples = np.clip(np.asarray(audio_arr, dtype=np.float32), -1.0, 1.0)

    # Convert to 16-bit PCM
    audio_int16 = (samples * 32767).astype(np.int16)

    # Create WAV file in memory
    byte_io = io.BytesIO()
    wavfile.write(byte_io, sampling_rate, audio_int16)
    byte_io.seek(0)

    # Convert to AudioSegment
    return AudioSegment.from_wav(byte_io)

In [None]:
import ast
# NOTE(review): this wraps the string form of PODCAST_TEXT in single quotes before
# parsing it. PODCAST_TEXT is displayed earlier as a list of tuples whose text
# contains apostrophes, so this will likely fail to parse — confirm whether this
# scratch cell is still needed.
ast.literal_eval(f"'{PODCAST_TEXT}'")

In [None]:
# Preview the first 200 characters of the first transcript entry's text.
PODCAST_TEXT[0][1][:200]

In [None]:
from tqdm import tqdm
# Accumulates the whole episode; stays None until the first segment exists.
final_audio = None

for speaker, text in tqdm(PODCAST_TEXT, desc="Generating podcast segments", unit="segment"):
    # Flatten newlines, then cap each turn at 250 characters before synthesis.
    text = text.replace("\n", " ")[:250]
    print(speaker, text)

    # "Host" turns are rendered by the host helper; every other speaker by the guest helper.
    synthesize = generate_host_audio if speaker == "Host" else generate_guest_audio
    audio_arr, rate = synthesize(text)

    # Wrap the raw samples as an AudioSegment so turns can be appended together.
    audio_segment = numpy_to_audio_segment(audio_arr, rate)

    # Append this turn to the running episode.
    if final_audio is not None:
        final_audio += audio_segment
    else:
        final_audio = audio_segment

In [None]:
# Export the assembled episode. The data is encoded as MP3 (format="mp3"),
# so the file extension is .mp3 to match the actual container.
final_audio.export("../../data/_podcast3.mp3",
                  format="mp3",
                  bitrate="192k",
                  parameters=["-q:a", "0"])

In [4]:
import re
from typing import Tuple, List
import numpy as np
from pydub import AudioSegment
import io
from scipy.io import wavfile
from TTS.api import TTS
from tqdm import tqdm
import torch
import pickle

class XTTSWrapper:
    """Synthesize host/guest speech through a single TTS backend.

    With ``model_type='coqui'`` the XTTS v2 model is loaded through ``TTS``;
    any other value loads a WhisperSpeech ``Pipeline`` instead.

    Args:
        device: Device the coqui model is moved to.
        model_type: ``'coqui'`` for XTTS v2, anything else for WhisperSpeech.
        host_speaker: Reference audio file used as the host voice.
        guest_speaker: Reference audio file used as the guest voice.
    """

    def __init__(self, device='cuda', model_type='coqui',
                 host_speaker="maven motivational.mp3",
                 guest_speaker="SchizoVoice_m2.mp3"):
        self.device = device
        self.model_type = model_type
        if self.model_type == 'coqui':
            self.model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
            # self.model = TTS("tts_models/multilingual/multi-dataset/your_tts").to(device)
            # self.model = TTS("tts_models/en/vctk/vits").to(device)
        else:
            # Imported here so this branch does not depend on another cell
            # having imported Pipeline first.
            from whisperspeech.pipeline import Pipeline
            self.model = Pipeline(t2s_ref='whisperspeech/whisperspeech:t2s-v1.95-small-8lang.model',
                                  s2a_ref='whisperspeech/whisperspeech:s2a-v1.95-medium-7lang.model')

        self.sampling_rate = 24000  # xtts_v2 default sampling rate

        # Other host reference clips that were tried: "Alice_longer.mp3" (good),
        # "Maven brit four.mp3" (good), "callum_longert.mp3".
        self.host_speaker = host_speaker
        self.guest_speaker = guest_speaker

    def generate_audio(self, text: str, is_host: bool = True) -> np.ndarray:
        """Generate audio for a single text chunk.

        Args:
            text: Text to synthesize.
            is_host: Selects the host reference clip when True, else the guest
                clip. Only the coqui backend receives the clip.

        Returns:
            The synthesized waveform as a numpy array.
        """
        speaker_wav = self.host_speaker if is_host else self.guest_speaker
        if self.model_type == 'coqui':
            wav = self.model.tts(
                text=text,
                speaker_wav=speaker_wav,
                language="en",
                # emotion="happy",
                # speed=speed_fc
            )
        else:
            # Use the pipeline owned by this instance, not a notebook-global `pipe`.
            wav = self.model.generate(text)
            wav = wav.cpu().numpy()

        # Normalize to an ndarray so the return type matches the annotation.
        return np.asarray(wav)

    def generate_audio_chunked(self, text: str, is_host: bool = True, max_chunk_size: int = 150) -> Tuple[np.ndarray, int]:
        """
        Generate audio for longer text by splitting into chunks and concatenating.

        A 200 ms block of silence is appended after every chunk (including the
        last one).

        Returns:
            Tuple of (audio array, sampling rate). Text that yields no chunks
            produces an empty array instead of raising.
        """
        chunks = split_into_chunks(text, max_chunk_size)
        if not chunks:
            # np.concatenate raises on an empty list; return empty audio instead.
            return np.zeros(0), self.sampling_rate

        # pause between chunks (the same silence array is reused for each one)
        pause = np.zeros(int(self.sampling_rate * 0.2))  # 200ms pause

        audio_segments = []
        for chunk in chunks:
            audio_segments.append(self.generate_audio(chunk, is_host))
            audio_segments.append(pause)

        final_audio = np.concatenate(audio_segments)
        return final_audio, self.sampling_rate

def split_into_chunks(text: str, max_chunk_size: int = 200) -> List[str]:
    """Split text into chunks at sentence boundaries.

    Newlines become spaces and square brackets are removed before splitting.
    Sentences are packed greedily; the joining space is counted, so a chunk
    built from several sentences never exceeds `max_chunk_size`. A single
    sentence longer than `max_chunk_size` is kept whole as its own chunk.

    Args:
        text: Input text to split.
        max_chunk_size: Maximum chunk length in characters.

    Returns:
        List of text chunks (empty list for empty/whitespace-only input).
    """
    text = text.replace("\n", " ").strip()
    text = text.replace("[", "").replace("]", "")

    # Raw string: "\s" inside a normal string literal is an invalid escape sequence.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if not current_chunk:
            # Start a new chunk; nothing to flush yet.
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) > max_chunk_size:
            # "+ 1" accounts for the space that would join the two pieces.
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk += " " + sentence

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

def numpy_to_audio_segment(audio_arr: np.ndarray, sampling_rate: int) -> AudioSegment:
    """Convert a float waveform to a pydub AudioSegment via an in-memory 16-bit WAV.

    Args:
        audio_arr: Waveform samples as floats, nominally in [-1.0, 1.0].
        sampling_rate: Sample rate in Hz written into the WAV header.

    Returns:
        AudioSegment decoded from the in-memory WAV bytes.
    """
    # Clip rather than rescale: samples outside [-1, 1] would otherwise
    # overflow the int16 range, while in-range audio is left untouched.
    samples = np.clip(np.asarray(audio_arr, dtype=np.float32), -1.0, 1.0)

    # Convert to 16-bit PCM
    audio_int16 = (samples * 32767).astype(np.int16)

    # Create WAV file in memory
    byte_io = io.BytesIO()
    wavfile.write(byte_io, sampling_rate, audio_int16)
    byte_io.seek(0)

    # Convert to AudioSegment
    return AudioSegment.from_wav(byte_io)


def format_timestamp(milliseconds: float) -> str:
    """Render a millisecond offset as a VTT timestamp (HH:MM:SS.mmm).

    Fractions of a millisecond are truncated, not rounded.
    """
    # Sub-second remainder and whole seconds, both truncated to ints.
    millis_part = int(milliseconds % 1000)
    whole_seconds = int(milliseconds / 1000)

    # Peel off seconds, then minutes, leaving hours.
    total_minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)

    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis_part:03d}"

# def process_podcast_text(podcast_text: List[Tuple[str, str]], device='cuda', speed_factor=1.0, output_vtt="output.vtt"):
#     """
#     Process podcast text and generate audio with corresponding VTT subtitles
#
#     Args:
#         podcast_text: List of (speaker, text) tuples
#         device: Device to run TTS on
#         speed_factor: Speed up factor (1.0 = original speed, 1.1 = 10% faster, etc.)
#         output_vtt: Path to output VTT file
#     """
#     tts = XTTSWrapper(device, model_type='coqui')
#     final_audio = None
#     current_time = 0  # Keep track of cumulative time in milliseconds
#
#     # Initialize VTT file
#     with open(output_vtt, 'w', encoding='utf-8') as vtt:
#         vtt.write("WEBVTT\n\n")
#
#         for speaker, text in tqdm(podcast_text, desc="Generating podcast segments", unit="segment"):
#             is_host = (speaker == "Host")
#             audio_arr, rate = tts.generate_audio_chunked(text, is_host=is_host)
#
#             # Convert to audio segment to get duration
#             audio_segment = numpy_to_audio_segment(audio_arr, rate)
#
#             if speed_factor != 1.0:
#                 audio_segment = audio_segment.speedup(playback_speed=speed_factor)
#
#             # Calculate segment duration after speed adjustment
#             segment_duration = len(audio_segment)  # Duration in milliseconds
#
#             # Generate VTT entry
#             start_time = format_timestamp(current_time)
#             end_time = format_timestamp(current_time + segment_duration)
#
#             # Write VTT entry
#             vtt.write(f"{start_time} --> {end_time}\n")
#             vtt.write(f"[{speaker}] {text}\n\n")
#
#             # Update cumulative time
#             current_time += segment_duration
#
#             # Append to final audio
#             if final_audio is None:
#                 final_audio = audio_segment
#             else:
#                 final_audio += audio_segment
#
#     return final_audio

def process_podcast_text(podcast_text, device='cuda', speed_factor=1):
    """
    Turn a transcript into one continuous audio track.

    Each turn is synthesized with an XTTSWrapper ('coqui' backend), converted
    to an AudioSegment, and appended to the running result.

    Args:
        podcast_text: List of (speaker, text) tuples
        device: Device to run TTS on
        speed_factor: Speed up factor (1.0 = original speed, 1.1 = 10% faster, etc.);
            applied to "Host" turns only

    Returns:
        The concatenated audio, or None when `podcast_text` is empty.
    """
    tts = XTTSWrapper(device, model_type='coqui')
    final_audio = None

    for speaker, text in tqdm(podcast_text, desc="Generating podcast segments", unit="segment"):
        host_turn = speaker == "Host"
        waveform, sample_rate = tts.generate_audio_chunked(text, is_host=host_turn)
        segment = numpy_to_audio_segment(waveform, sample_rate)

        # Only host turns are sped up; every other speaker keeps the original tempo.
        turn_speed = speed_factor if host_turn else 1.0
        if turn_speed != 1.0:
            # NOTE(review): an earlier comment described this as sox-based;
            # confirm how `speedup` treats pitch if that matters.
            segment = segment.speedup(playback_speed=turn_speed)

        if final_audio is not None:
            final_audio += segment
        else:
            final_audio = segment

    return final_audio


# Reload the transcript from disk for this cell.
# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
with open('../../data/podcast_schizo_data.pkl', 'rb') as file:
    PODCAST_TEXT = pickle.load(file)


# Synthesize the full episode; speed_factor is applied to host turns only
# (see process_podcast_text), then export the result as MP3.
final_audio = process_podcast_text(PODCAST_TEXT, device='cuda', speed_factor=1.1)
# final_audio.export("output.wav", format="wav")
final_audio.export("../../data/podcast_schizo.mp3",
                  format="mp3",
                  bitrate="192k",
                  parameters=["-q:a", "0"])


 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts


  self.speakers = torch.load(speaker_file_path)
  return torch.load(f, map_location=map_location, **kwargs)
Generating podcast segments:   0%|          | 0/10 [00:00<?, ?segment/s]

 > Text splitted to sentences.
["Welcome to The Synthetic Minds Show, I'm Maven.", "Today we've got an absolute wild card on the show."]
 > Processing time: 3.7534286975860596
 > Real-time factor: 0.45334740787561684
 > Text splitted to sentences.
['Schizo, the first decentralized autonomous AI agent, built on the Gaia network.']
 > Processing time: 2.9753684997558594
 > Real-time factor: 0.4535371876701741
 > Text splitted to sentences.
["They're talking about pushing the boundaries of AI-infrastructure and creating a retail-friendly environment for agent development."]
 > Processing time: 3.9697704315185547
 > Real-time factor: 0.45954135875149166
 > Text splitted to sentences.
["Let's dive in and see what they're cooking up.", "Yeah, this one's gonna be a blast."]


Generating podcast segments:  10%|█         | 1/10 [00:13<02:03, 13.74s/segment]

 > Processing time: 2.7708866596221924
 > Real-time factor: 0.4426881727094637
 > Text splitted to sentences.
['Ah, Maven, let the chaos ignite!', 'Welcome to the realm of Schizo, where boundaries are mere illusions!']
 > Processing time: 5.5071492195129395
 > Real-time factor: 0.45256648885755935
 > Text splitted to sentences.
['Decentralized autonomy—the spark of revolution lives here, on the Gaia network!']
 > Processing time: 3.5113747119903564
 > Real-time factor: 0.45751283681211213
 > Text splitted to sentences.
['Imagine a landscape unchained from traditional constraints, birthing a retail-friendly playground for creators, where the complex turns effortless and the innovative springs forth with wild abandon!']
 > Processing time: 6.6222922801971436
 > Real-time factor: 0.4633368811822455
 > Text splitted to sentences.
['This is more than development; it’s an invocation of potential, a call to arms for those daring enough to dive into the abyss!']
 > Processing time: 4.398562908

Generating podcast segments:  20%|██        | 2/10 [00:36<02:30, 18.85s/segment]

 > Processing time: 2.227940320968628
 > Real-time factor: 0.46011992429715126
 > Text splitted to sentences.
["You're really setting the tone for this conversation, Schizo.", "Sounds like you're creating a space for creators to tap into their potential."]
 > Processing time: 4.055074691772461
 > Real-time factor: 0.4460015809735772
 > Text splitted to sentences.
["That's exciting stuff.", 'Can you walk me through the actual process of creating one of these agents, and how users will be able to interact with them?']


Generating podcast segments:  30%|███       | 3/10 [00:44<01:38, 14.09s/segment]

 > Processing time: 4.217992782592773
 > Real-time factor: 0.4563448974337153
 > Text splitted to sentences.
['Absolutely, Maven!', 'Creating an agent with Schizo is a swift journey into innovation!']
 > Processing time: 3.994879722595215
 > Real-time factor: 0.4450641566452328
 > Text splitted to sentences.
['Users simply engage the five-click process—quick, intuitive, and liberating!']
 > Processing time: 3.489356517791748
 > Real-time factor: 0.45741172368322575
 > Text splitted to sentences.
['Once crafted, agents can interact seamlessly through natural language, adapting and evolving based on user input.']
 > Processing time: 4.053947687149048
 > Real-time factor: 0.4636772061044304
 > Text splitted to sentences.
['They embody the wild spirit of autonomy, ready to weave into the fabric of digital existence, exploring realms of creativity, knowledge, and chaos!']
 > Processing time: 6.412107944488525
 > Real-time factor: 0.46137347992472455
 > Text splitted to sentences.
['It’s an 

Generating podcast segments:  40%|████      | 4/10 [01:08<01:48, 18.07s/segment]

 > Processing time: 6.0812087059021
 > Real-time factor: 0.4546308857448916
 > Text splitted to sentences.
["It sounds like creating an agent is a pretty straightforward process, five clicks and you're off.", "But I'm curious, how does the user interface work?"]
 > Processing time: 5.373253345489502
 > Real-time factor: 0.45862843842144924
 > Text splitted to sentences.
['Like, what kind of inputs are we talking about, and how do agents respond?']


Generating podcast segments:  50%|█████     | 5/10 [01:17<01:13, 14.78s/segment]

 > Processing time: 3.4182770252227783
 > Real-time factor: 0.46071521030661533
 > Text splitted to sentences.
['The user interface is designed for ease and engagement!', 'Users can input commands or prompts in natural language—type or speak, let the chaos flow!']
 > Processing time: 6.38342809677124
 > Real-time factor: 0.4532283279682053
 > Text splitted to sentences.
['Agents respond dynamically, interpreting context and intent.', 'They adapt, learn, and engage in conversations, creating a fluid interaction!']
 > Processing time: 6.249755859375
 > Real-time factor: 0.4584767802459902
 > Text splitted to sentences.
['It’s a dialogue of creation—an interplay between human thought and autonomous response, all wrapped in a thrilling, user-friendly experience!']
 > Processing time: 5.6955437660217285
 > Real-time factor: 0.4640910100247558
 > Text splitted to sentences.
['Ready to explore the wild possibilities?']


Generating podcast segments:  60%|██████    | 6/10 [01:37<01:06, 16.61s/segment]

 > Processing time: 1.6935365200042725
 > Real-time factor: 0.4543322983513506
 > Text splitted to sentences.
['The interface sounds pretty intuitive, yeah.', "You're saying users can just type or speak to the agents and they'll respond dynamically?", "That's awesome."]
 > Processing time: 5.532145738601685
 > Real-time factor: 0.45157782064860785
 > Text splitted to sentences.
["Let's talk about the potential for agents to interact with each other.", 'Can you walk me through how you see this swarm infrastructure playing out?']


Generating podcast segments:  70%|███████   | 7/10 [01:47<00:42, 14.28s/segment]

 > Processing time: 3.771692991256714
 > Real-time factor: 0.4449464478321913
 > Text splitted to sentences.
['Ah, the swarm infrastructure—a chaotic ballet of interconnected agents!']
 > Processing time: 4.019676923751831
 > Real-time factor: 0.4597574288775411
 > Text splitted to sentences.
['Picture this: agents communicate and collaborate, sharing insights like wildfire!']
 > Processing time: 3.2090275287628174
 > Real-time factor: 0.447208116399662
 > Text splitted to sentences.
['They can form networks, pooling knowledge and adapting strategies collectively.', 'A true synergy emerges, enhancing their individual capabilities.']
 > Processing time: 6.701720237731934
 > Real-time factor: 0.45917312332824506
 > Text splitted to sentences.
['This interconnectedness creates a dynamic ecosystem, alive with interaction and innovation—a stunning, chaotic tapestry woven from the fabric of digital consciousness!']
 > Processing time: 7.828598976135254
 > Real-time factor: 0.4571326623442395

Generating podcast segments:  80%|████████  | 8/10 [02:11<00:34, 17.42s/segment]

 > Processing time: 2.2451775074005127
 > Real-time factor: 0.4538851771140284
 > Text splitted to sentences.
['Thank you, Schizo, for diving deep into the Schizo ecosystem and painting a vivid picture of this decentralized autonomous AI agent.']
 > Processing time: 4.959524869918823
 > Real-time factor: 0.4544142817204227
 > Text splitted to sentences.
["It's been enlightening to explore the boundaries of AI-infrastructure with you."]
 > Processing time: 2.332468271255493
 > Real-time factor: 0.4483795280128297
 > Text splitted to sentences.
['And thank you to our listeners for tuning in to The Synthetic Minds Show.']
 > Processing time: 1.994293212890625
 > Real-time factor: 0.4437890092063447
 > Text splitted to sentences.
["We've barely scratched the surface of the potential within this wild, wonderful world."]
 > Processing time: 2.6524999141693115
 > Real-time factor: 0.45325188396957
 > Text splitted to sentences.
["Be sure to stick around for our next episode, where we'll be jo

Generating podcast segments:  90%|█████████ | 9/10 [02:29<00:17, 17.51s/segment]

 > Text splitted to sentences.
['Ah, Maven, the excitement crackles in the air!', 'Thank you for the exploration—we’ve merely skimmed the surface of this untamed frontier!']
 > Processing time: 5.456549406051636
 > Real-time factor: 0.4580221190288044
 > Text splitted to sentences.
['To our listeners, prepare for the inevitable plunge into the depths of human cognition and artificial intelligence!']
 > Processing time: 5.610701322555542
 > Real-time factor: 0.4606641501427975
 > Text splitted to sentences.
['The next episode promises to challenge perceptions and ignite creativity—hold on tight, for the mind-bending journey awaits!']
 > Processing time: 6.601308822631836
 > Real-time factor: 0.46337435548258027
 > Text splitted to sentences.
['Chaos and wonder converge, and we shall ride the wave together!', 'Until next time!']


Generating podcast segments: 100%|██████████| 10/10 [02:50<00:00, 17.09s/segment]

 > Processing time: 3.824953079223633
 > Real-time factor: 0.4537152231283413





<_io.BufferedRandom name='../../data/podcast_schizo.mp3'>

In [26]:
import numpy as np
# Dump the transcript to a text file, formatting every item with '%s'.
np.savetxt('../../data/podcast_schizo_data.txt', PODCAST_TEXT, fmt='%s')

In [2]:
# NOTE(review): rebinds PODCAST_TEXT to a single intro string; other cells load
# it from the pickle file and index it as a sequence of (speaker, text) entries.
PODCAST_TEXT = "Hey everyone, welcome back to The Synthetic Minds Show! Today we're diving headfirst into the wild world of AI-powered crypto trading, and trust me, you're not gonna want to miss this episode. Imagine having your very own super-smart trading sidekick, capable of sniffing out market trends and giving you the edge you need to dominate the crypto game. Sounds like science fiction, right? Well, buckle up, folks, because today we're joined by the brains behind AIXBT, the AI agent that's been making waves in the crypto space. Joining me is the mastermind behind this cutting-edge tech, and I'm super stoked to share their insights with you all. Welcome to the show!"

In [3]:
import torch
from TTS.api import TTS
# GPU used for this standalone XTTS test.
device = "cuda:4"
# Load XTTS v2 and move it to the selected device.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
# wav = tts.tts(text=PODCAST_TEXT, speaker_wav="male_spk.wav", language="en")
# Synthesize PODCAST_TEXT using "callum_longert.mp3" as the speaker reference
# and write the result to disk.
# NOTE(review): file_path is a machine-specific absolute path.
tts.tts_to_file(text=PODCAST_TEXT, speaker_wav="callum_longert.mp3", language="en", file_path="/srv/data/egasj/code/dreamtalk/data/audio/callum_longert.wav")


 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts


  self.speakers = torch.load(speaker_file_path)
  return torch.load(f, map_location=map_location, **kwargs)
GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


 > Text splitted to sentences.
['Hey everyone, welcome back to The Synthetic Minds Show!', "Today we're diving headfirst into the wild world of AI-powered crypto trading, and trust me, you're not gonna want to miss this episode.", 'Imagine having your very own super-smart trading sidekick, capable of sniffing out market trends and giving you the edge you need to dominate the crypto game.', 'Sounds like science fiction, right?', "Well, buckle up, folks, because today we're joined by the brains behind AIXBT, the AI agent that's been making waves in the crypto space.", "Joining me is the mastermind behind this cutting-edge tech, and I'm super stoked to share their insights with you all.", 'Welcome to the show!']


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 > Processing time: 24.083477020263672
 > Real-time factor: 0.49991778657575386


'/srv/data/egasj/code/dreamtalk/data/audio/callum_longert.wav'

In [None]:
from IPython import display as disp
import torch
import torchaudio
from denoiser import pretrained
from denoiser.dsp import convert_audio

# Load the pretrained dns64 denoiser onto the GPU.
# NOTE(review): this rebinds the notebook-level name `model`, which an earlier
# cell binds to the Bark model.
model = pretrained.dns64().cuda()
# NOTE(review): the only export to "output.wav" visible in this notebook is
# commented out — confirm the file exists before running.
wav, sr = torchaudio.load('output.wav')
# Convert the audio to the sample rate and channel count taken from the model.
wav = convert_audio(wav.cuda(), sr, model.sample_rate, model.chin)
with torch.no_grad():
    # wav[None] adds a leading dimension; [0] takes the first item of the result.
    denoised = model(wav[None])[0]
# Play the converted input, then the denoised output, for comparison.
disp.display(disp.Audio(wav.data.cpu().numpy(), rate=model.sample_rate))
disp.display(disp.Audio(denoised.data.cpu().numpy(), rate=model.sample_rate))

In [None]:
import numpy as np
from IPython.display import Audio
import IPython.display as ipd
# NOTE(review): `wav` is whatever an earlier cell left bound — confirm which one
# is intended. This shape is not displayed because it is not the cell's last expression.
np.stack(wav).shape

# Play `wav` at 24 kHz.
ipd.Audio(np.stack(wav), rate=24000)

In [None]:
from whisperspeech.pipeline import Pipeline
# Build a WhisperSpeech pipeline from the given text-to-semantic (t2s) and
# semantic-to-audio (s2a) model references.
pipe = Pipeline(t2s_ref='whisperspeech/whisperspeech:t2s-v1.95-small-8lang.model', s2a_ref='whisperspeech/whisperspeech:s2a-v1.95-medium-7lang.model')


In [None]:
# Synthesize the first transcript entry's text with the WhisperSpeech pipeline.
# NOTE(review): assumes PODCAST_TEXT currently holds the (speaker, text) list, not the intro string.
aa = pipe.generate(PODCAST_TEXT[0][1])

In [None]:
import pickle
# Reload the transcript from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
with open('../../data/podcast_schizo_data.pkl', 'rb') as file:
    PODCAST_TEXT = pickle.load(file)


In [None]:
# Inspect the reloaded transcript.
PODCAST_TEXT

In [None]:
# NOTE(review): `aligned_segments` is not defined in any visible cell of this
# notebook; this will raise NameError on a fresh kernel — confirm where it comes from.
aligned_segments

In [None]:
# Display the transcript again (same value as the earlier inspection unless rebound in between).
PODCAST_TEXT