# Setup

In [1]:
# Show the kernel's working directory: relative paths used in this notebook
# (e.g. "./bark-model", "voice_presets/...") are resolved against it.
!pwd


/home/momo/mlprojects/mockingjay/tts


In [2]:
import torch
from bark_vocos import BarkVocos
# Run on the first CUDA GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"


  from .autonotebook import tqdm as notebook_tqdm


# Initialize model

In [3]:
# Monkey-patch optimum's NormalizedConfigManager: its `_conf` registry is
# replaced wholesale, so only the two model types listed here stay registered.
# NOTE(review): presumably required so the BetterTransformer conversion below
# can resolve Bark's sub-model types — confirm against the optimum version in use.
from optimum.utils.normalized_config import (
    GPT2LikeNormalizedTextConfig,
    NormalizedConfigManager,
    NormalizedTextConfig,
)

NormalizedConfigManager._conf = dict(
    gpt2=GPT2LikeNormalizedTextConfig,
    coarse_acoustics=NormalizedTextConfig,
)

model_id = "suno/bark"
# NOTE(review): model_path is not read by any cell shown in this notebook.
model_path = "./bark-model"

# Load the float32 checkpoint, move it to the selected device, then convert it
# to the BetterTransformer implementation.
model = (
    BarkVocos.from_pretrained(model_id, torch_dtype=torch.float32)
    .to(device)
    .to_bettertransformer()
)


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


In [4]:
# Demo prompt, written one sentence per source line; the pieces are joined by
# implicit string concatenation into a single string.
text_prompt = (
    "Hello, my name is Suno. "
    "And, uh — and I like pizza. "
    "But I also have other interests such as playing tic tac toe."
)


In [5]:
import nltk
from processing_bark import BarkProcessor

# sent_tokenize relies on the Punkt models, so make sure they are available locally.
nltk.download("punkt")

# Split the prompt into individual sentences and show the result.
sentences = nltk.tokenize.sent_tokenize(text_prompt)
print(sentences)

# Load the processor that belongs to the same checkpoint id as the model.
processor = BarkProcessor.from_pretrained(model_id)


[nltk_data] Downloading package punkt to /home/momo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['Hello, my name is Suno.', 'And, uh — and I like pizza.', 'But I also have other interests such as playing tic tac toe.']


In [6]:
# Voice preset files (.npz) to generate a preview for, one preview per entry.
voice_presets = ["voice_presets/snoop-dogg-hb-7s.npz"]

# Sampling settings forwarded to model.generate(), one temperature per stage.
do_sample = True
semantic_temp, coarse_temp, fine_temp = 0.7, 0.7, 0.4

# Sample rate (Hz) used when wrapping the generated waveform for playback.
output_sample_rate = 44_100


In [7]:
from IPython.display import Audio

# Seed used for every preset; change it here to explore other samples.
generation_seed = 46
audio_previews = []

for voice_preset in voice_presets:
    print(f"Generating with voice preset {voice_preset}")
    # Re-seed before each preset so every preview is reproducible on its own,
    # regardless of how many presets were generated before it in this loop.
    torch.manual_seed(generation_seed)
    inputs = processor(sentences, voice_preset=voice_preset).to(device)
    output = model.generate(
        **inputs,
        do_sample=do_sample,
        fine_temperature=fine_temp,
        coarse_temperature=coarse_temp,
        semantic_temperature=semantic_temp,
    )
    print(type(output), output.shape)
    # Audio converts its input through NumPy, which cannot read a tensor that
    # lives on the GPU or is attached to the autograd graph, so hand it a
    # detached CPU array instead of the raw tensor.
    waveform = output.detach().cpu().numpy()
    audio_previews.append(Audio(waveform, rate=output_sample_rate))


Generating with voice preset voice_presets/snoop-dogg-hb-7s.npz


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


<class 'torch.Tensor'> torch.Size([1, 619752])


In [8]:
# Play the first generated preview, i.e. the one produced for voice_presets[0].
# NOTE(review): this cell was previously labelled "snoop-dogg-hb-7s-v0-bs16-e9.npz",
# which differs from the preset path configured in voice_presets — confirm which is current.
audio_previews[0]


# Snoop Dogg
## suno/bark
Seed 48:
- snoop-dogg-hb-12s-s.npz: best
- snoop-dogg-hb-12s-l.npz: nok
- snoop-dogg-hb-12s-2-s.npz: ok
- snoop-dogg-hb-12s-2-l.npz: gibberish

Seed 42:
- snoop-dogg-hb-12s-s.npz: ok
- snoop-dogg-hb-12s-l.npz: ok
- snoop-dogg-hb-12s-2-s.npz: nok
- snoop-dogg-hb-12s-2-l.npz: soso

Prompt switched to "It's all about the weed brother. It's your dawg Snoop, speaking from Shanghai."

Seed 43:
- snoop-dogg-hb-12s-s.npz: best
- snoop-dogg-hb-12s-l.npz: ok
- snoop-dogg-hb-12s-2-s.npz: nok
- snoop-dogg-hb-12s-2-l.npz: gibberish

Seed 44:
- snoop-dogg-hb-12s-s.npz: best - Shanghai mispronounced
- snoop-dogg-hb-12s-l.npz: ok - Shanghai mispronounced
- snoop-dogg-hb-12s-2-s.npz: nok
- snoop-dogg-hb-12s-2-l.npz: gibberish

Seed 45:
- snoop-dogg-hb-12s-s.npz: ok-ish
- snoop-dogg-hb-12s-l.npz: gibberish
- snoop-dogg-hb-12s-2-s.npz: gibberish
- snoop-dogg-hb-12s-2-l.npz: gibberish

Seed 46:
- snoop-dogg-hb-12s-s.npz: best
- snoop-dogg-hb-12s-l.npz: ok
- snoop-dogg-hb-12s-2-s.npz: gibberish
- snoop-dogg-hb-12s-2-l.npz: gibberish

Discarding the snoop-dogg-hb-12s-2 presets (both the -s and -l variants).

Switching prompt to "It's all about the weed brother. It's your dawg Snoop speaking from Shanghai. Hello Natasha, how are you doing today?"

Seed 46:
- snoop-dogg-hb-12s-s.npz: best
- snoop-dogg-hb-12s-l.npz: gibberish

Seed 47:
- snoop-dogg-hb-12s-s.npz: nok
- snoop-dogg-hb-12s-l.npz: gibberish

Seed 48:
- snoop-dogg-hb-12s-s.npz: ok
- snoop-dogg-hb-12s-l.npz: gibberish (psychopath)

Seed 49:
- snoop-dogg-hb-12s-s.npz: ok
- snoop-dogg-hb-12s-l.npz: gibberish

Seed 50:
- snoop-dogg-hb-12s-s.npz: ok
- snoop-dogg-hb-12s-l.npz: gibberish

## suno/bark-small

Seed 46:
- snoop-dogg-hb-12s-s.npz:
- snoop-dogg-hb-12s-l.npz:

Seed 47:
- snoop-dogg-hb-12s-s.npz:
- snoop-dogg-hb-12s-l.npz:

Seed 48:
- snoop-dogg-hb-12s-s.npz:
- snoop-dogg-hb-12s-l.npz:

Seed 49:
- snoop-dogg-hb-12s-s.npz:
- snoop-dogg-hb-12s-l.npz: 

Seed 50:
- snoop-dogg-hb-12s-s.npz:
- snoop-dogg-hb-12s-l.npz: 