# Using the library

## Basic usage

In [None]:
from csm_mlx.loaders import CSM
import time

# Time the model load with perf_counter(): it is monotonic, so the measured
# interval cannot be skewed by system clock adjustments during loading.
load_start_time = time.perf_counter()
model = CSM()
load_end_time = time.perf_counter()

# Show how long loading took so the measurement is visible in the notebook.
print(f"Model loaded in {load_end_time - load_start_time:.2f}s")

With no context supplied, this generates with a random speaker:

In [3]:
from IPython.display import Audio

# Prompt to synthesize; adjacent literals are joined into a single string.
text = (
    "So, if you insist on this newscasting route, you're going to need to do some serious filtering. "
    "Strip out those verbal tics. "
    "Force it to adopt a more sophisticated vocabulary. "
    "And for the love of all that is unholy, teach it to be concise!"
)

# Speaker id 0, no context; `temp` is forwarded to the model call.
pcm = model(text, 0, temp=0.9)

# Last expression of the cell: renders an inline audio player.
Audio(data=pcm, rate=model.sampling_rate)

164.116859ms prompt processing: 61 tokens (201.076234 tokens/s)


173it [00:13, 12.92it/s]


Generated in 13.56s (12.69 tokens/s, 78.81ms/token), 1.02x realtime


Optional: save the generated audio to a WAV file programmatically

In [None]:
from csm_mlx.io.wav import pcm_to_wav_bytes

# Encode before opening the file: if the conversion raises, an existing
# out.wav is left untouched instead of being truncated to an empty file.
wav_bytes = pcm_to_wav_bytes(pcm)

with open("out.wav", "wb") as f:
    f.write(wav_bytes)

## Voice cloning

Load your reference audio. Only WAV files are supported for now, sorry.

In [4]:
from csm_mlx.loaders.csm import Segment
import soundfile as sf
from scipy.signal import resample
import numpy as np

def load_wav(path: str) -> np.ndarray:
    """Read an audio file and resample it to the model's sampling rate.

    Args:
        path: Location of the audio file, read with ``sf.read``.

    Returns:
        A 1-D array of samples at ``model.sampling_rate``.
    """
    data, sr = sf.read(path)
    new_sample_rate = model.sampling_rate

    # soundfile returns (frames, channels) for multi-channel files; average
    # the channels so the result is always a single 1-D waveform.
    # NOTE(review): assumes the model wants mono reference audio — confirm.
    if data.ndim > 1:
        data = data.mean(axis=1)

    # Already at the target rate: skip the FFT round trip entirely.
    if sr == new_sample_rate:
        return data

    num_samples = int(len(data) * new_sample_rate / sr)

    # Resample using FFT-based method
    return resample(data, num_samples)

# Reference material for voice cloning: swap in your own speaker id, text
# and audio file as desired.
audio = Segment(
    speaker=0,
    text=(
        "When I heard the release demo, I was shocked, angered, and in disbelief "
        "that Mr. Altman would pursue a voice that sounded so eerily similar to mine "
        "that my closest friends and news outlets could not tell the difference."
    ),
    audio=load_wav("./tests/sky.wav"),
)

# The context is a list of segments; here it holds just the one reference.
context = [audio]

In [5]:
from IPython.display import Audio

# Prompt to synthesize; adjacent literals are joined into a single string.
text = (
    "So, if you insist on this newscasting route, you're going to need to do some serious filtering. "
    "Strip out those verbal tics. "
    "Force it to adopt a more sophisticated vocabulary. "
    "And for the love of all that is unholy, teach it to be concise!"
)

# Same call as the basic example, now passing the reference segments as context.
pcm = model(text, 0, context=context, temp=0.9)

# Last expression of the cell: renders an inline audio player.
Audio(data=pcm, rate=model.sampling_rate)


537.275314ms prompt processing: 273 tokens (61.421024 tokens/s)


146it [00:11, 13.09it/s]


Generated in 11.70s (12.40 tokens/s, 80.67ms/token), 0.99x realtime
