# Setup

In [1]:
!pwd


/home/momo/mlprojects/mockingjay/tts


In [2]:
import numpy as np
voice_preset_path = 'voice_presets/obama-7s.npz'
voice_preset = np.load(voice_preset_path)


In [3]:
# Load the voice preset file and base64 encode it
import base64
import os

with open(voice_preset_path, 'rb') as f:
    voice_preset_base64 = base64.b64encode(f.read()).decode('utf-8')

len(voice_preset_base64)


# Load the np array from the base64 encoded string
import io
import numpy as np

voice_preset = np.load(io.BytesIO(base64.b64decode(voice_preset_base64)))
voice_preset
# Convert the NpzFile object to a dict
voice_preset = dict(voice_preset)


In [4]:
import torch
from src.bark.bark_vocos import BarkVocos
device = "cuda:0" if torch.cuda.is_available() else "cpu"


  from .autonotebook import tqdm as notebook_tqdm


# Initialize model

In [5]:
# monkey-patch optimum.utils.normalized_config.NormalizedConfigManager
from optimum.utils.normalized_config import NormalizedConfigManager, GPT2LikeNormalizedTextConfig, NormalizedTextConfig
NormalizedConfigManager._conf = {
        "gpt2": GPT2LikeNormalizedTextConfig,
        "coarse_acoustics": NormalizedTextConfig,
    }

model_id = "suno/bark"
model_path = "./bark-model"
model = BarkVocos.from_pretrained(model_id, torch_dtype=torch.float32)

model = model.to(device)
model =  model.to_bettertransformer()


The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


In [6]:
text_prompt = "Hello, my name is Suno. And, uh — and I like pizza. But I also have other interests such as playing tic tac toe."


In [7]:
from src.bark.processing_bark import BarkProcessor
import nltk

nltk.download('punkt')
sentences = nltk.tokenize.sent_tokenize(text_prompt)
print(sentences)
processor = BarkProcessor.from_pretrained(model_id)


[nltk_data] Downloading package punkt to /home/momo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['Hello, my name is Suno.', 'And, uh — and I like pizza.', 'But I also have other interests such as playing tic tac toe.']


In [8]:
voice_presets = [
    "voice_presets/snoop-dogg-hb-7s.npz",
    ]
semantic_temp = 0.7
coarse_temp = 0.7
fine_temp = 0.4
output_sample_rate=44100
do_sample = True


In [9]:
from IPython.display import Audio
torch.manual_seed(48)

print(f"Generating with voice preset {voice_preset}")
inputs = processor(sentences, voice_preset=voice_preset).to(device)
output = model.generate(**inputs, do_sample=do_sample, fine_temperature=fine_temp, coarse_temperature=coarse_temp, semantic_temperature=semantic_temp)
print(type(output), output.shape)
Audio(output.T, rate=output_sample_rate)


Generating with voice preset {'semantic_prompt': array([ 206,  302,   10,   41,   41, 1229, 6960, 6960, 7367, 7367, 7367,
       1359, 1359,  704,  326,   33, 1025, 1025,   10, 6814, 6814, 6814,
         10,   10, 7385, 7385,  173,  173,  147,  302, 2074, 9817, 3589,
       3589, 3589,   17,  298, 9182, 9182, 6948, 5626, 1465,   92,   59,
         59, 7867,   41,   41, 7735,  362,  362, 7660, 2457, 2622,   71,
         59,   28,   28,   28,   28, 1133, 3195, 6155, 3182, 3755, 4667,
         50,   27,   27,   27, 3399, 6389, 6389,  300,  300, 1755,  959,
        959,  100, 1243, 2597, 2514, 2969, 5231, 1699, 1243,  328, 4228,
         41,   10,   41,   43, 3956, 1463, 3840, 3890, 1450,   92,   59,
         28, 9379, 4284, 4284,  321, 1314, 1314, 1314,  657, 6453, 9458,
       9458,  173,   10,   10,  140, 1095, 1095,  635,  292,   41,   41,
        927, 8590, 5371, 3941,  836,  541,   27,   27,   27,   27,   10,
          5, 2194, 6265, 1025, 1025, 7231, 1983, 5673, 4119,   41,   41,
  

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


<class 'torch.Tensor'> torch.Size([413952, 1])
