In [1]:
# 🔹 Step 1: Install required libraries
!pip install git+https://github.com/suno-ai/bark.git
!pip install librosa soundfile

# 🔹 Step 2: Import libraries
from bark import SAMPLE_RATE, generate_audio, preload_models
from scipy.io.wavfile import write as write_wav
from google.colab import files
import torch
import librosa
import numpy as np
import soundfile as sf

# 🔹 Step 3: Make torch.load default to weights_only=False
# The wrapper below only supplies a *default*: a caller that explicitly passes
# weights_only=True keeps that stricter setting.
# SECURITY: weights_only=False allows arbitrary pickled objects to be loaded,
# so only load checkpoint files from sources you trust.
# The guard makes the cell safe to re-run: without it every re-execution would
# wrap the previous wrapper one more time.
if not getattr(torch.load, "_weights_only_default_patched", False):
    original_torch_load = torch.load

    def custom_torch_load(*args, **kwargs):
        """Call the original ``torch.load`` with ``weights_only`` defaulting to False.

        All positional and keyword arguments are forwarded unchanged; an
        explicit ``weights_only`` supplied by the caller is respected.
        """
        kwargs.setdefault('weights_only', False)
        return original_torch_load(*args, **kwargs)

    # Marker read by the guard above on subsequent executions of this cell.
    custom_torch_load._weights_only_default_patched = True
    torch.load = custom_torch_load

# 🔹 Step 4: Load Bark Models
# Every model stage is requested with its "use_small" flag switched on.
preload_models(
    fine_use_small=True,
    coarse_use_small=True,
    text_use_small=True,
)

# 🔹 Step 5: Choose Mode (Text-only OR Reference Voice)

def preprocess_audio(audio_path, target_sr=24000):
    """Load an audio file as a mono, peak-normalized signal at ``target_sr``.

    Args:
        audio_path: Path of the audio file; it is read with ``soundfile.read``.
        target_sr: Sample rate in Hz that the returned signal should have.

    Returns:
        NumPy array of samples, averaged across channels, resampled to
        ``target_sr`` when needed and scaled so the peak magnitude is ~1.

    Raises:
        ValueError: If the file contains no samples.
    """
    audio, sr = sf.read(audio_path)
    if audio.size == 0:
        raise ValueError(f"Reference audio {audio_path!r} contains no samples.")
    # Downmix BEFORE resampling: soundfile returns multi-channel audio shaped
    # (frames, channels), while librosa.resample operates on the last axis by
    # default, so resampling first would resample across the channels instead
    # of across time.
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
    # The small epsilon keeps an all-zero (silent) clip from dividing by zero.
    audio = audio / (np.max(np.abs(audio)) + 1e-6)
    return audio


def _generate_and_download(text, output_path, history_prompt=None):
    """Synthesize ``text`` with Bark, save it as a WAV file and download it.

    Args:
        text: Text to be spoken.
        output_path: File name of the WAV file to write and then download.
        history_prompt: Value forwarded to ``generate_audio`` as
            ``history_prompt``; ``None`` is Bark's own default.

    Returns:
        The audio array returned by ``generate_audio``.
    """
    generated = generate_audio(text, history_prompt=history_prompt)
    write_wav(output_path, SAMPLE_RATE, generated)
    files.download(output_path)
    return generated


# Strip surrounding whitespace so that an answer such as "1 " is still accepted.
mode = input("Enter mode: '1' for text-to-voice (Week 1), '2' for reference voice + text (Week 2): ").strip()

if mode == '1':
    # Week 1: Just text to speech using default Bark voice
    text = input("Enter your text: ")
    audio_array = _generate_and_download(text, "bark_week1.wav")
    print("✅ Week 1 voice generated and downloaded.")

elif mode == '2':
    # Week 2: Upload reference audio and generate speech (simulated cloning)
    print("Upload your 30–60 sec reference WAV file (mono, 24kHz)...")
    uploaded = files.upload()

    if not uploaded:
        # Nothing was uploaded (e.g. the dialog was cancelled); indexing the
        # first file name would otherwise fail with an IndexError.
        print("❌ No file uploaded. Run again and upload a reference WAV file.")
    else:
        # Only the first uploaded file is used as the reference.
        audio_file = next(iter(uploaded))

        # Load and normalize reference audio
        reference_audio = preprocess_audio(audio_file)

        # NOTE(review): `reference_audio` is not passed to Bark below. The
        # speech is generated with the preset voice "v2/en_speaker_9", so the
        # cloning is only simulated and the upload does not affect the output.
        text = input("Enter the text you want the cloned voice to say: ")
        audio_array = _generate_and_download(
            text,
            "cloned_voice_week2.wav",
            history_prompt="v2/en_speaker_9",  # default voice
        )
        print("✅ Week 2 simulated voice generated and downloaded.")

else:
    print("❌ Invalid input. Run again and enter '1' or '2'.")


Collecting git+https://github.com/suno-ai/bark.git
  Cloning https://github.com/suno-ai/bark.git to /tmp/pip-req-build-uc3800e2
  Running command git clone --filter=blob:none --quiet https://github.com/suno-ai/bark.git /tmp/pip-req-build-uc3800e2
  Resolved https://github.com/suno-ai/bark.git to commit f4f32d4cd480dfec1c245d258174bc9bde3c2148
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting boto3 (from suno-bark==0.0.1a0)
  Downloading boto3-1.40.1-py3-none-any.whl.metadata (6.7 kB)
Collecting encodec (from suno-bark==0.0.1a0)
  Downloading encodec-0.1.1.tar.gz (3.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting funcy (from suno-bark==0.0.1a0)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Collec

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


text.pt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

coarse.pt:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

fine.pt:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

  WeightNorm.apply(module, name, dim)
Downloading: "https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th" to /root/.cache/torch/hub/checkpoints/encodec_24khz-d7cc33bc.th
100%|██████████| 88.9M/88.9M [00:00<00:00, 141MB/s]


Enter mode: '1' for text-to-voice (Week 1), '2' for reference voice + text (Week 2): 1
Enter your text: hello my name is shan and this is my week one and week two combine task


100%|██████████| 396/396 [00:22<00:00, 17.36it/s]
100%|██████████| 20/20 [01:53<00:00,  5.70s/it]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Week 1 voice generated and downloaded.
