## Setup Notebook, Install


In [None]:
# Fetch the Bark Infinity fork and work from inside the checkout.
!git clone https://github.com/JonathanFly/bark.git
%cd bark
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirements-pip.txt
%pip install encodec rich-argparse
%pip install librosa pydub

Cloning into 'bark'...
remote: Enumerating objects: 854, done.[K
remote: Counting objects: 100% (393/393), done.[K
remote: Compressing objects: 100% (187/187), done.[K
remote: Total 854 (delta 297), reused 249 (delta 204), pack-reused 461[K
Receiving objects: 100% (854/854), 3.50 MiB | 9.79 MiB/s, done.
Resolving deltas: 100% (329/329), done.
/content/bark
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting boto3
  Downloading boto3-1.26.127-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m967.6 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting funcy
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Collecting tokenizers
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Do

## Run Once Per Notebook Restart (if files still exist)

In [None]:
import os
import time
from bark_infinity import config
import numpy as np

logger = config.logger
logger.setLevel("WARNING")

from bark_infinity import generation
from bark_infinity import api

import rich
from rich import print
from rich import pretty
from rich.pretty import pprint
from rich import inspect

import librosa
from pydub import AudioSegment
import ipywidgets as widgets
from IPython.display import display, Audio
from io import BytesIO

# None of this code is important, just fiddling with Colab stuff.
# It is just meant to keep the saved Colab outputs small, because float32 wavs are GIGANTIC.
# Actually this doesn't work: the IPython widget converts it back to float32? Or I messed up.

def display_audio_int16_but(audio_arr_segments, file_name, sample_rate=generation.SAMPLE_RATE,  width='200px'):
    """Show a "Playing: <file_name>" label followed by an inline audio player.

    Args:
        audio_arr_segments: Audio samples, either a single array or a list of
            arrays; a list is joined end to end with ``np.concatenate``.
        file_name: Name shown in the label above the player.
        sample_rate: Playback rate handed to ``Audio``; defaults to
            ``generation.SAMPLE_RATE``.
        width: Width assigned to the label widget's layout.

    Note:
        Despite the function name, the samples are passed through unchanged;
        no int16 conversion is performed here.
    """
    caption = widgets.Label(value=f"Playing: {file_name}")
    caption.layout.width = width

    # Only a list of segments needs merging; anything else is used as given.
    samples = np.concatenate(audio_arr_segments) if isinstance(audio_arr_segments, list) else audio_arr_segments

    display(caption, Audio(samples, rate=sample_rate))
    

def on_button_click(button):
    """Click handler: load the wav file attached to ``button`` and play it inline.

    Args:
        button: The clicked widget; its ``wav_path`` attribute names the file to load.
    """
    wav_path = button.wav_path
    # sr=None asks librosa to keep the file's own sample rate instead of resampling.
    samples, native_rate = librosa.load(wav_path, sr=None)
    display_audio_int16_but(samples, os.path.basename(wav_path), native_rate)


def display_wav_files(directory):
    """Recursively list ``directory`` and show a "Play" button for every .wav file.

    Wav files directly inside ``directory`` are handled first, ordered by file
    name; each one is printed and gets a button wired to ``on_button_click``.
    Sub-directories are then visited in sorted order, each announced as
    ``<path>`` before its own contents are shown.

    Args:
        directory: Path of the folder to scan.
    """
    entries = [os.path.join(directory, name) for name in os.listdir(directory)]
    wav_paths = [path for path in entries if os.path.isfile(path) and path.endswith('.wav')]
    nested_dirs = [path for path in entries if os.path.isdir(path)]

    for wav_path in sorted(wav_paths, key=os.path.basename):
        label = os.path.basename(wav_path)
        print(f" {label}")
        play_button = widgets.Button(description=f"Play {label}")
        # Stash the path on the widget so the click handler knows what to load.
        play_button.wav_path = wav_path
        play_button.on_click(on_button_click)
        display(play_button)

    nested_dirs.sort()
    for nested in nested_dirs:
        print(f"<{nested}>")
        display_wav_files(nested)



## Generate


### Choose Bark Models

In [None]:
# On your home system you probably want OFFLOAD_CPU = True, but a Colab GPU
# should have plenty of memory for all three models.
generation.OFFLOAD_CPU = False
# Optional: models are lazy-loaded if not preloaded. The first run in a new Colab has to download them.
generation.preload_models()

Downloading text suno/bark remote model file https://huggingface.co/suno/bark/resolve/main/text_2.pt text_2.pt to /root/.cache/suno/bark_v0


Downloading text_2.pt:   0%|          | 0.00/5.35G [00:00<?, ?B/s]

Loading text model from /root/.cache/suno/bark_v0/text_2.pt to cuda


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading coarse suno/bark remote model file https://huggingface.co/suno/bark/resolve/main/coarse_2.pt coarse_2.pt to /root/.cache/suno/bark_v0


Downloading coarse_2.pt:   0%|          | 0.00/3.93G [00:00<?, ?B/s]

Loading coarse model from /root/.cache/suno/bark_v0/coarse_2.pt to cuda
Downloading fine suno/bark remote model file https://huggingface.co/suno/bark/resolve/main/fine_2.pt fine_2.pt to /root/.cache/suno/bark_v0


Downloading fine_2.pt:   0%|          | 0.00/3.74G [00:00<?, ?B/s]

Loading fine model from /root/.cache/suno/bark_v0/fine_2.pt to cuda


Downloading: "https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th" to /root/.cache/torch/hub/checkpoints/encodec_24khz-d7cc33bc.th
100%|██████████| 88.9M/88.9M [00:00<00:00, 137MB/s]


### Set Text and Other Generation Options

In [None]:
text = """
Hey, have you heard about this new text-to-audio model called "Bark"? 
It's like rain on your wedding day. It's a free ride when you've already paid. It's the good advice that you just didn't take.
And who would've thought? It figures.

Well, life has a funny way of sneaking up on you. When you think everything's okay and everything's going right. 
And life has a funny way of helping you out. When you think everything's gone wrong. 
And everything blows up in your face.

It's a traffic jam when you're already late. A "No smoking" sign on your cigarette break.
It's like ten thousand spoons when all you need is a knife. It's meeting the man of my dreams.
And then meeting his beautiful wife.

And isn't it ironic? Don't you think? A little too ironic.
And yeah, I really do think.
"""

# To control how the text is split, set split_character_goal_length and split_character_max_length.
# Start from the library defaults, then override only what this run needs.
kwargs = config.load_all_defaults()
kwargs['text_prompt'] = text
kwargs['hoarder_mode'] = True
kwargs["output_dir"] = 'bark_samples'
kwargs["history_prompt"] = None
# kwargs["single_starting_seed"] = None
# If you set a seed you might want to manually call generation.set_seed(-1) afterwards to disable the deterministic generation settings.
# I'm not cleaning up after this parameter at the moment and I'm not sure about other side effects.
kwargs["stable_mode_interval"] = 1 # 0 for continuous, 2,3,4 for mixed
kwargs["split_character_goal_length"] = 90
kwargs["split_character_max_length"] = 130
# kwargs["output_iterations"] = 1
kwargs["add_silence_between_segments"] = .025 # See: https://github.com/suno-ai/bark/blob/main/notebooks/long_form_generation.ipynb but not great for songs or stable_mode_interval 0
kwargs["semantic_min_eos_p"] = 0.05 # 0.20 is default, lower means more likely to stop


# Not sure of the overall effect of these so far, but for example:
kwargs["semantic_top_k"] = 50
kwargs["semantic_top_p"] = 0.95

### First Attempt

#### Before we run, let's double check our settings

In [None]:
# Dry run: check how the text is being split without actually running the model.
kwargs["dry_run"] = True
full_generation_segments, audio_arr_segments, final_filename_will_be = api.generate_audio_long(**kwargs)

Hey, have you heard about this new text-to-audio model called "Bark"? It's like rain on your wedding day. It's a free ride when you've already paid.


It's the good advice that you just didn't take. And who would've thought? It figures. Well, life has a funny way of sneaking up on you.


When you think everything's okay and everything's going right. And life has a funny way of helping you out. When you think everything's gone wrong.


And everything blows up in your face. It's a traffic jam when you're already late. A "No smoking" sign on your cigarette break.


It's like ten thousand spoons when all you need is a knife. It's meeting the man of my dreams. And then meeting his beautiful wife. And isn't it ironic? Don't you think?


A little too ironic. And yeah, I really do think.
Saved to bark_samples/Hey_have_you_heard_a-SPK-random.wav/final_A_little_too_ironic_-SPK-random.wav


In [None]:
# That's the output we expect to see; we didn't generate audio yet.
# These text segments look a little small, so let's try larger split lengths instead.
kwargs["split_character_goal_length"] = 110
kwargs["split_character_max_length"] = 175

full_generation_segments, audio_arr_segments, final_filename_will_be = api.generate_audio_long(**kwargs)

Hey, have you heard about this new text-to-audio model called "Bark"? It's like rain on your wedding day. It's a free ride when you've already paid.


It's the good advice that you just didn't take. And who would've thought? It figures. Well, life has a funny way of sneaking up on you.


When you think everything's okay and everything's going right. And life has a funny way of helping you out. When you think everything's gone wrong.


And everything blows up in your face. It's a traffic jam when you're already late. A "No smoking" sign on your cigarette break.


It's like ten thousand spoons when all you need is a knife. It's meeting the man of my dreams. And then meeting his beautiful wife. And isn't it ironic? Don't you think?


A little too ironic. And yeah, I really do think.
Saved to bark_samples/Hey_have_you_heard_a-SPK-random.wav/final_A_little_too_ironic_-SPK-random.wav


#### Run Bark

In [None]:
# These segment sizes look better, so now set dry_run to False to run for real.
# Because we set hoarder_mode, we can see the wav files for each segment in the Colab File Manager.

kwargs["dry_run"] = False
full_generation_segments, audio_arr_segments, final_filename_will_be = api.generate_audio_long(**kwargs)

Hey, have you heard about this new text-to-audio model called "Bark"? It's like rain on your wedding day. It's a free ride when you've already paid.


100%|██████████| 100/100 [00:07<00:00, 13.53it/s]
100%|██████████| 28/28 [00:25<00:00,  1.10it/s]
100%|██████████| 1/1 [00:02<00:00,  2.55s/it]


It's the good advice that you just didn't take. And who would've thought? It figures. Well, life has a funny way of sneaking up on you.


100%|██████████| 100/100 [00:07<00:00, 13.78it/s]
100%|██████████| 25/25 [00:23<00:00,  1.07it/s]
100%|██████████| 2/2 [00:03<00:00,  1.54s/it]


When you think everything's okay and everything's going right. And life has a funny way of helping you out. When you think everything's gone wrong.


100%|██████████| 100/100 [00:06<00:00, 14.97it/s]
100%|██████████| 28/28 [00:26<00:00,  1.04it/s]
100%|██████████| 2/2 [00:03<00:00,  1.76s/it]


And everything blows up in your face. It's a traffic jam when you're already late. A "No smoking" sign on your cigarette break.


100%|██████████| 100/100 [00:08<00:00, 12.41it/s]
100%|██████████| 32/32 [00:31<00:00,  1.03it/s]
100%|██████████| 2/2 [00:03<00:00,  1.76s/it]


It's like ten thousand spoons when all you need is a knife. It's meeting the man of my dreams. And then meeting his beautiful wife. And isn't it ironic? Don't you think?


100%|██████████| 100/100 [00:07<00:00, 14.09it/s]
100%|██████████| 30/30 [00:28<00:00,  1.04it/s]
100%|██████████| 2/2 [00:03<00:00,  1.63s/it]


A little too ironic. And yeah, I really do think.


100%|██████████| 100/100 [00:03<00:00, 26.92it/s] 
100%|██████████| 12/12 [00:11<00:00,  1.06it/s]
100%|██████████| 1/1 [00:01<00:00,  1.80s/it]


Saved to bark_samples/Hey_have_you_heard_a-SPK-random.wav/final_A_little_too_ironic_-SPK-random.wav


In [None]:
print(f"  final wav at {final_filename_will_be}  ")
# (We see many wavs because we set hoarder_mode, but one file will be the final product.
# Set hoarder_mode=False if you just want the final wav and aren't in explore mode.)

# Or play it here: all segments joined into one clip.
Audio(np.concatenate(audio_arr_segments), rate=generation.SAMPLE_RATE) 


In [None]:
# Because we set hoarder mode, each segment was also saved as its own separate sample (.npz) along with its wav.

!find "bark_samples/" -name "*.npz"

display_wav_files("bark_samples/")

bark_samples/Hey_have_you_heard_a-SPK-random.wav/006_A_little_too_ironic_-SPK-random.wav.npz
bark_samples/Hey_have_you_heard_a-SPK-random.wav/003_When_you_think_every-SPK-random.wav.npz
bark_samples/Hey_have_you_heard_a-SPK-random.wav/001_Hey_have_you_heard_a-SPK-random.wav.npz
bark_samples/Hey_have_you_heard_a-SPK-random.wav/004_And_everything_blows-SPK-random.wav.npz
bark_samples/Hey_have_you_heard_a-SPK-random.wav/final_A_little_too_ironic_-SPK-random.wav.npz
bark_samples/Hey_have_you_heard_a-SPK-random.wav/002_Its_the_good_advice_-SPK-random.wav.npz
bark_samples/Hey_have_you_heard_a-SPK-random.wav/005_Its_like_ten_thousan-SPK-random.wav.npz


Button(description='Play 001_Hey_have_you_heard_a-SPK-random.wav', style=ButtonStyle())

Button(description='Play 002_Its_the_good_advice_-SPK-random.wav', style=ButtonStyle())

Button(description='Play 003_When_you_think_every-SPK-random.wav', style=ButtonStyle())

Button(description='Play 004_And_everything_blows-SPK-random.wav', style=ButtonStyle())

Button(description='Play 005_Its_like_ten_thousan-SPK-random.wav', style=ButtonStyle())

Button(description='Play 006_A_little_too_ironic_-SPK-random.wav', style=ButtonStyle())

Button(description='Play final_A_little_too_ironic_-SPK-random.wav', style=ButtonStyle())

### Second Attempt. Can we do better?

In [None]:
# We used stable_mode_interval = 1, so the history_prompt does not evolve between segments.
# Even so, the voices saved for each segment are one generation removed from the original history prompt.
# This means they are a *little* bit different, and we may prefer one of them over the original.
# For example, maybe segment 2 was a little clearer, or had a particular emotion; we could use that segment's version as the speaker.
# In the particular run I'm doing now, that segment ended with a bit of an interesting accent. I'm curious if I can bring that out more.

# (should probably rename the file to something sensible though)
# NOTE: this absolute path assumes the repo was cloned into /content/bark, as in the Colab run above; adjust it elsewhere.

kwargs["history_prompt"] = "/content/bark/bark_samples/Hey_have_you_heard_a-SPK-random.wav/002_Its_the_good_advice_-SPK-random.wav.npz"

In [40]:
# Second attempt: keep the existing kwargs and override only what differs for
# the evolving-voice run (a short speaker intro is prepended to the original text).
kwargs.update(
    text_prompt="I'm speaker number two. I'm the best speaker. Also I'm a free spirit. Let me evolve my voice with every step. Here's my version." + text,
    stable_mode_interval=0,
    output_dir="speaker_2_test",
    add_silence_between_segments=0.0,  # No silence, fully merge clips
    semantic_min_eos_p=0.20,  # Back to default, let Bark umm and ahh a bit
)
full_generation_segments, audio_arr_segments, final_filename_will_be = api.generate_audio_long(**kwargs)



 83%|████████▎ | 83/100 [01:27<00:17,  1.05s/it]
 80%|████████  | 80/100 [00:53<00:13,  1.48it/s]


I'm speaker number two. I'm the best speaker. Also I'm a free spirit. Let me evolve my voice with every step. Here's my version.


100%|██████████| 100/100 [00:09<00:00, 10.38it/s]
100%|██████████| 36/36 [00:35<00:00,  1.02it/s]
100%|██████████| 3/3 [00:04<00:00,  1.58s/it]


Hey, have you heard about this new text-to-audio model called "Bark"? It's like rain on your wedding day. It's a free ride when you've already paid.


100%|██████████| 100/100 [00:09<00:00, 10.69it/s]
100%|██████████| 32/32 [00:30<00:00,  1.04it/s]
100%|██████████| 2/2 [00:03<00:00,  1.61s/it]


It's the good advice that you just didn't take. And who would've thought? It figures. Well, life has a funny way of sneaking up on you.


100%|██████████| 100/100 [00:09<00:00, 10.20it/s]
100%|██████████| 34/34 [00:33<00:00,  1.02it/s]
100%|██████████| 2/2 [00:03<00:00,  1.74s/it]


When you think everything's okay and everything's going right. And life has a funny way of helping you out. When you think everything's gone wrong.


100%|██████████| 100/100 [00:07<00:00, 13.63it/s]
100%|██████████| 31/31 [00:29<00:00,  1.05it/s]
100%|██████████| 2/2 [00:03<00:00,  1.75s/it]


And everything blows up in your face. It's a traffic jam when you're already late. A "No smoking" sign on your cigarette break.


100%|██████████| 100/100 [00:07<00:00, 12.86it/s]
100%|██████████| 30/30 [00:29<00:00,  1.02it/s]
100%|██████████| 2/2 [00:03<00:00,  1.64s/it]


It's like ten thousand spoons when all you need is a knife. It's meeting the man of my dreams. And then meeting his beautiful wife. And isn't it ironic? Don't you think?


100%|██████████| 100/100 [00:09<00:00, 10.32it/s]
100%|██████████| 35/35 [00:34<00:00,  1.01it/s]
100%|██████████| 3/3 [00:04<00:00,  1.60s/it]


A little too ironic. And yeah, I really do think.


100%|██████████| 100/100 [00:06<00:00, 16.28it/s]
100%|██████████| 26/26 [00:25<00:00,  1.01it/s]
100%|██████████| 2/2 [00:03<00:00,  1.67s/it]


Saved to speaker_2_test/Im_speaker_number_tw-SPK-002_Its_the_good_advice_-SPK-random.wav.wav/final_A_little_too_ironic_-SPK-002_Its_the_good_advice_-SPK-random.wav.wav


In [None]:
# Print the final filename returned by generate_audio_long, then play all segments joined into one clip.
print(f"  final wave at {final_filename_will_be}")
Audio(np.concatenate(audio_arr_segments), rate=generation.SAMPLE_RATE) 

In [43]:
# This clip probably got really weird after a few segments, fully feeding back into itself, so kwargs["stable_mode_interval"] = 3 might be a good compromise.

display_wav_files("speaker_2_test")

Button(description='Play 001_Im_speaker_number_tw-SPK-002_Its_the_good_advice_-SPK-random.wav.wav', style=Butt…

Button(description='Play 002_Hey_have_you_heard_a-SPK-002_Its_the_good_advice_-SPK-random.wav.wav', style=Butt…

Button(description='Play 003_Its_the_good_advice_-SPK-002_Its_the_good_advice_-SPK-random.wav.wav', style=Butt…

Button(description='Play 004_When_you_think_every-SPK-002_Its_the_good_advice_-SPK-random.wav.wav', style=Butt…

Button(description='Play 005_And_everything_blows-SPK-002_Its_the_good_advice_-SPK-random.wav.wav', style=Butt…

Button(description='Play 006_Its_like_ten_thousan-SPK-002_Its_the_good_advice_-SPK-random.wav.wav', style=Butt…

Button(description='Play 007_A_little_too_ironic_-SPK-002_Its_the_good_advice_-SPK-random.wav.wav', style=Butt…

Button(description='Play final_A_little_too_ironic_-SPK-002_Its_the_good_advice_-SPK-random.wav.wav', style=Bu…

### Finding Our Voice

In [None]:
# That final clip is an improvement. The random voice we got isn't bad, but it's not quite doing our beautiful prose justice.
# We could use an existing history_prompt, but let's try to summon a perfect speaker from the model.
# We do that by generating many speakers randomly.
# We could use our first segment's text, but in my experience there is a better method:
# try to imagine what type of text would be a context in which the voice I want to hear is likely to appear.
# Then let's generate 20 sample clips from that.

# TODO