In [12]:
!pip install -q gradio soundfile numpy scipy sqlalchemy python-dotenv pydub librosa requests
# Optional heavy libs if you want HF/transformers locally (GPU required)
# !pip install -q transformers accelerate diffusers "huggingface_hub>=0.10.0" replicate openai sentence-transformers

In [13]:
# AI Music & Voice Generator
# Usage: set HF_TOKEN and REPLICATE_API_TOKEN as environment variables if you want heavy models.
import warnings
# Suppress every SyntaxWarning for the rest of the session.
# NOTE(review): presumably meant to hide warnings emitted while importing third-party packages — confirm it is still needed.
warnings.filterwarnings("ignore", category=SyntaxWarning)

In [14]:
import os
import io
import uuid
import time
import json
import sqlite3
import base64
import random
import logging
import shutil
from pathlib import Path
from typing import Optional

import numpy as np
import soundfile as sf
from pydub import AudioSegment  # optional helper (kept for future use)
import gradio as gr

In [15]:
#  Config / Tokens

# Both tokens are optional; an unset variable yields "" and the heavy-model
# helpers check for that before calling any remote API.
# The variable name must match the documented one (HF_TOKEN): reading
# os.environ.get("") would always return the default and disable HF usage.
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN", "")  # set in environment for HF usage
REPLICATE_TOKEN = os.environ.get("REPLICATE_API_TOKEN", "")  # set in environment for Replicate usage

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ai music voice")

In [16]:
# Directories & DB init

# All artifacts live under a directory that is relative to the current working directory.
DATA_DIR = Path("ai_music_voice_data")
AUDIO_DIR = DATA_DIR / "audio"  # audio files written by the generators and UI handlers
DB_PATH = DATA_DIR / "meta.db"  # SQLite file holding one row per generation
# DATA_DIR is created first because AUDIO_DIR.mkdir() is called without parents=True.
DATA_DIR.mkdir(exist_ok=True)
AUDIO_DIR.mkdir(exist_ok=True)

# Simple sqlite DB to record generations
# check_same_thread=False turns off sqlite3's check that the connection is only
# used from the thread that created it.
# NOTE(review): presumably required because UI callbacks may run on other threads — confirm.
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
c = conn.cursor()  # module-level cursor, also used by register_generation()
# Idempotent schema creation: safe to re-run the cell against an existing DB file.
c.execute(
    """
    CREATE TABLE IF NOT EXISTS generations (
        id TEXT PRIMARY KEY,
        kind TEXT,
        prompt TEXT,
        voice_sample_path TEXT,
        file_path TEXT,
        provider TEXT,
        created_at REAL
    )
    """
)
conn.commit()

In [17]:
# Procedural / offline generators

def synth_simple_music(duration_sec: int = 20, bpm: int = 100, seed: Optional[int] = None) -> str:
    """
    Procedural background music generator -> writes WAV and returns path.
    Simple pad + chord progression, offline (no external APIs).

    Args:
        duration_sec: Clip length in seconds; must yield at least one sample.
        bpm: Accepted for interface compatibility; the synthesis does not use it.
        seed: When given, seeds the global `random` and `np.random` generators so
            the per-note detune values are reproducible.

    Returns:
        Path (as str) of the WAV file written under AUDIO_DIR
        (mono, 24 kHz, samples handed to soundfile as float32).

    Raises:
        ValueError: if duration_sec does not produce at least one audio sample.
    """
    sr = 24000
    n_samples = int(sr * duration_sec)
    if n_samples <= 0:
        # Without this guard the failure surfaces later as an obscure numpy error.
        raise ValueError(f"duration_sec must be positive, got {duration_sec!r}")

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    t = np.linspace(0, duration_sec, n_samples, endpoint=False)

    # simple chord palette (triads)
    base_freqs = [
        [261.63, 329.63, 392.00],  # C major-ish
        [293.66, 369.99, 440.00],  # D minor-ish
        [329.63, 415.30, 493.88],  # E-ish
        [349.23, 440.00, 523.25],  # F major-ish
    ]

    pad = np.zeros_like(t)
    # Each chord gets an equal share of the clip; the last one absorbs the remainder.
    block_len = len(t) // len(base_freqs)

    for ci, freqs in enumerate(base_freqs):
        start = ci * block_len
        end = start + block_len if ci < len(base_freqs) - 1 else len(t)
        if end <= start:
            # Only happens for clips shorter than four samples.
            continue
        seg_t = t[start:end]
        s = np.zeros_like(seg_t)
        for f in freqs:
            # Slight random detune (in Hz) keeps the pad from sounding sterile.
            detune = random.uniform(-1.0, 1.0)
            s += 0.3 * np.sin(2 * np.pi * (f + detune) * seg_t)
        # short fade in/out envelope for each block
        env = np.ones_like(s)
        ramp = max(1, int(len(s) * 0.05))
        env[:ramp] *= np.linspace(0, 1, ramp)
        env[-ramp:] *= np.linspace(1, 0, ramp)
        pad[start:end] += s * env

    # mild reverb-ish convolution with exponential kernel
    kernel = np.exp(-np.linspace(0, 2, 800))
    pad = np.convolve(pad, kernel, mode="same")
    # Normalize to a 0.6 peak; the epsilon avoids division by zero on silence.
    pad = pad / (np.max(np.abs(pad)) + 1e-9) * 0.6

    # The random suffix keeps two generations within the same second from
    # overwriting each other.
    out_path = AUDIO_DIR / f"music_proc_{int(time.time())}_{uuid.uuid4().hex[:8]}.wav"
    sf.write(str(out_path), pad.astype(np.float32), sr)
    return str(out_path)

def synth_simple_voice(prompt_text: str, voice_seed: Optional[int] = None) -> str:
    """
    Lightweight synthetic 'singing' voice: maps token sequence to simple sine-note sequence.
    Offline, deterministic-ish if seed provided.

    Args:
        prompt_text: Text whose whitespace-separated tokens each become one note.
        voice_seed: When given, seeds the global `random` and `np.random`
            generators so the added noise floor is reproducible.

    Returns:
        Path (as str) of the WAV file written under AUDIO_DIR.

    Raises:
        ValueError: if prompt_text is empty or contains only whitespace
            (previously this surfaced as a ZeroDivisionError).
    """
    tokens = (prompt_text or "").split()
    if not tokens:
        raise ValueError("prompt_text must contain at least one word to synthesize.")

    if voice_seed is not None:
        random.seed(voice_seed)
        np.random.seed(voice_seed)

    sr = 24000
    # Clip length grows with the token count but is capped at 60 tokens' worth.
    syl = max(1, min(60, len(tokens)))
    duration = 2.5 + syl * 0.12
    t = np.linspace(0, duration, int(sr * duration), endpoint=False)
    mel = np.zeros_like(t)

    samples_per_token = max(1, len(t) // len(tokens))
    for i, token in enumerate(tokens):
        f = 220 + (i % 12) * 30  # simple scale mapping
        start = i * samples_per_token
        end = min(len(t), start + samples_per_token)
        seg_len = end - start
        if seg_len <= 0:
            # More tokens than samples: nothing left to fill.
            break
        seg_t = np.linspace(0, seg_len / sr, seg_len, endpoint=False)
        # Hann window gives each note a smooth attack and release.
        env = np.hanning(len(seg_t))
        mel[start:end] += 0.5 * env * np.sin(2 * np.pi * f * seg_t)

    # Small noise floor, then normalize to a 0.7 peak (epsilon guards against /0).
    mel = mel + 0.01 * np.random.randn(len(mel))
    mel = mel / (np.max(np.abs(mel)) + 1e-9) * 0.7
    # The random suffix keeps two generations within the same second from
    # overwriting each other.
    out_path = AUDIO_DIR / f"voice_proc_{int(time.time())}_{uuid.uuid4().hex[:8]}.wav"
    sf.write(str(out_path), mel.astype(np.float32), sr)
    return str(out_path)

In [19]:
# Heavy model templates (placeholders) -- require tokens

def generate_music_hf_inference(prompt: str, duration: int = 10) -> str:
    """
    Template for Hugging Face Inference API usage.
    NOTE: This is a minimal template — adjust endpoint/model and parameters per chosen model.

    Args:
        prompt: Text description sent as the model input.
        duration: Forwarded to the API as the `duration` parameter.

    Returns:
        Path (as str) of the file holding the raw response body, under AUDIO_DIR.

    Raises:
        RuntimeError: if no Hugging Face token is configured or the API answers
            with a non-200 status.
    """
    if not HUGGINGFACE_TOKEN:
        raise RuntimeError("Hugging Face token not set (HF_TOKEN).")
    import requests

    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
    api_url = "https://api-inference.huggingface.co/models/facebook/musicgen-small"  # example
    payload = {"inputs": prompt, "parameters": {"duration": duration}}
    # (connect, read) timeout: requests waits forever by default, which would
    # hang the UI callback if the endpoint never answers.
    r = requests.post(api_url, headers=headers, json=payload, timeout=(10, 300))
    if r.status_code != 200:
        raise RuntimeError(f"HuggingFace API error: {r.status_code} {r.text}")

    # The random suffix keeps two generations within the same second from
    # overwriting each other.
    out_path = AUDIO_DIR / f"music_hf_{int(time.time())}_{uuid.uuid4().hex[:8]}.wav"
    with open(out_path, "wb") as f:
        f.write(r.content)
    return str(out_path)


def generate_music_replicate(prompt: str, duration: int = 10) -> str:
    """
    Template for Replicate model usage.
    NOTE: replace model slug and handling as per the actual model's output format.

    Args:
        prompt: Text description passed to the model.
        duration: Passed to the model as `duration`.

    Returns:
        Path (as str) of the downloaded audio file under AUDIO_DIR.

    Raises:
        RuntimeError: if REPLICATE_API_TOKEN is not set, or if any step of the
            Replicate call or the download fails (original error is chained).
    """
    if not REPLICATE_TOKEN:
        raise RuntimeError("REPLICATE_API_TOKEN not set.")
    try:
        import replicate
        import requests

        client = replicate.Client(api_token=REPLICATE_TOKEN)
        # model slug here is placeholder
        model = client.models.get("suno/musicgen-small")
        output = model.predict(prompt=prompt, duration=duration)
        # assume output is a URL, or a sequence whose first element is a URL;
        # indexing a bare string would yield only its first character.
        url = output if isinstance(output, str) else output[0]
        r = requests.get(url, timeout=(10, 300))
        # Fail loudly instead of saving an HTTP error page as a .wav file.
        r.raise_for_status()
        # The random suffix keeps two generations within the same second from
        # overwriting each other.
        out_path = AUDIO_DIR / f"music_repl_{int(time.time())}_{uuid.uuid4().hex[:8]}.wav"
        with open(out_path, "wb") as f:
            f.write(r.content)
        return str(out_path)
    except Exception as e:
        raise RuntimeError("Replicate integration failed: " + str(e)) from e


def generate_voice_bark_replicate(text: str, voice_sample_path: Optional[str] = None) -> str:
    """
    Template for calling a voice synthesis/cloning model on Replicate.
    Replace model slug and inputs per actual model spec.

    Args:
        text: Text passed to the model as `text`.
        voice_sample_path: Optional path of an audio file; when given it is
            opened in binary mode and passed to the model as `voice_sample`.

    Returns:
        Path (as str) of the downloaded audio file under AUDIO_DIR.

    Raises:
        RuntimeError: if REPLICATE_API_TOKEN is not set, or if any step of the
            Replicate call or the download fails (original error is chained).
    """
    if not REPLICATE_TOKEN:
        raise RuntimeError("REPLICATE_API_TOKEN not set.")
    try:
        import contextlib
        import replicate
        import requests

        client = replicate.Client(api_token=REPLICATE_TOKEN)
        # placeholder slug — change to real model
        model = client.models.get("example/bark-v1")
        inputs = {"text": text}
        # ExitStack closes the optional sample file once the prediction call
        # returns, even if it raises; previously the handle was never closed.
        with contextlib.ExitStack() as stack:
            if voice_sample_path:
                inputs["voice_sample"] = stack.enter_context(open(voice_sample_path, "rb"))
            output = model.predict(**inputs)
        # assume output is a URL, or a sequence whose first element is a URL
        url = output if isinstance(output, str) else output[0]
        r = requests.get(url, timeout=(10, 300))
        # Fail loudly instead of saving an HTTP error page as a .wav file.
        r.raise_for_status()
        # The random suffix keeps two generations within the same second from
        # overwriting each other.
        out_path = AUDIO_DIR / f"voice_repl_{int(time.time())}_{uuid.uuid4().hex[:8]}.wav"
        with open(out_path, "wb") as f:
            f.write(r.content)
        return str(out_path)
    except Exception as e:
        raise RuntimeError("Replicate Bark template failed: " + str(e)) from e


In [20]:
# Helpers: DB register

def register_generation(
    kind: str,
    prompt: str,
    file_path: str,
    provider: str = "procedural",
    voice_sample_path: Optional[str] = None,
) -> str:
    """
    Record one generation in the `generations` table and return its new id.

    Args:
        kind: Category label stored in the `kind` column (e.g. "music", "voice").
        prompt: Prompt text stored alongside the result.
        file_path: Path of the generated audio file.
        provider: Name of the backend that produced the file.
        voice_sample_path: Optional path of the voice sample used for the
            generation; stored as NULL when omitted (the previous behavior).

    Returns:
        The generated UUID4 string used as the row's primary key.
    """
    gen_id = str(uuid.uuid4())
    created = time.time()
    # `with conn` commits on success and rolls back if the INSERT raises;
    # conn.execute() uses a fresh cursor instead of the shared module-level one.
    with conn:
        conn.execute(
            "INSERT INTO generations (id, kind, prompt, voice_sample_path, file_path, provider, created_at) VALUES (?,?,?,?,?,?,?)",
            (gen_id, kind, prompt, voice_sample_path, file_path, provider, created),
        )
    return gen_id

In [21]:
# UI functions (wired to Gradio)

def ui_generate_music(prompt, duration, use_heavy, provider_choice):
    """
    Gradio callback for the music tab.

    Picks a backend, generates the clip, records it in the database and returns
    a `(status message, audio path)` pair. A heavy backend is only used when
    `use_heavy` is truthy AND `provider_choice` names it ("hf" or "replicate");
    every other combination uses the offline procedural generator. Any failure
    is logged and reported as `("Error: ...", None)` instead of being raised.
    """
    try:
        seconds = int(duration)
        # Ignore the radio selection entirely unless the heavy checkbox is ticked.
        backend = provider_choice if use_heavy else None

        if backend == "hf":
            audio_path = generate_music_hf_inference(prompt, duration=seconds)
            provider = "hf_musicgen"
        elif backend == "replicate":
            audio_path = generate_music_replicate(prompt, duration=seconds)
            provider = "replicate_musicgen"
        else:
            audio_path = synth_simple_music(duration_sec=seconds)
            provider = "procedural"

        record_id = register_generation("music", prompt, audio_path, provider)
        status = f"Generated music (id={record_id}) using {provider}"
        return status, audio_path
    except Exception as exc:
        logger.exception("Music generation error")
        return f"Error: {exc}", None


def _save_voice_sample(voice_upload) -> Optional[str]:
    """Persist an uploaded voice sample under AUDIO_DIR; return its path, or None if unusable."""
    if voice_upload is None:
        return None

    # Timestamp plus random suffix so concurrent uploads never share a file name.
    stamp = f"{int(time.time())}_{uuid.uuid4().hex[:8]}"

    if isinstance(voice_upload, tuple) and len(voice_upload) == 2:
        # (sample_rate, np_array)
        sr, arr = voice_upload
        sample_path = AUDIO_DIR / f"voice_sample_{stamp}.wav"
        sf.write(str(sample_path), np.array(arr), sr)
        return str(sample_path)

    if isinstance(voice_upload, str) and os.path.exists(voice_upload):
        sample_path = str(AUDIO_DIR / f"voice_sample_copy_{stamp}.wav")
        shutil.copy(voice_upload, sample_path)
        return sample_path

    # fallback: best-effort handling of file-like objects
    if not hasattr(voice_upload, "read"):
        # Nothing readable: skip without creating an empty placeholder file.
        logger.warning("Ignoring voice sample of unsupported type %s", type(voice_upload).__name__)
        return None
    try:
        data = voice_upload.read()
        sample_path = AUDIO_DIR / f"voice_sample_{stamp}.wav"
        sample_path.write_bytes(data)
        return str(sample_path)
    except Exception:
        # Stay best-effort (generation continues without a sample) but leave a trace.
        logger.warning("Could not store uploaded voice sample; continuing without it", exc_info=True)
        return None


def ui_generate_voice(prompt, voice_upload, use_heavy, provider_choice):
    """
    Gradio callback for the voice tab.

    Stores the optional voice sample, generates the voice clip, records it in
    the database and returns a `(status message, audio path)` pair. The
    Replicate backend is used only when `use_heavy` is truthy AND
    `provider_choice` is "replicate"; otherwise the offline synthesizer runs.
    Failures are logged and reported as `("Error: ...", None)`.
    """
    # Reject blank prompts up front with a readable message instead of letting
    # the synthesizer fail on an empty token list.
    if prompt is None or not str(prompt).strip():
        return "Error: please enter some lyrics or text to voice.", None

    try:
        # Gradio may send np array, file path, or file-like — handle common cases
        sample_path = _save_voice_sample(voice_upload)

        if use_heavy and provider_choice == "replicate":
            out_path = generate_voice_bark_replicate(prompt, voice_sample_path=sample_path)
            provider = "replicate_bark"
        else:
            out_path = synth_simple_voice(prompt)
            provider = "procedural_tts"
        gen_id = register_generation("voice", prompt, out_path, provider)
        return f"Voice generated (id={gen_id}) using {provider}", out_path
    except Exception as e:
        logger.exception("Voice generation error")
        return f"Error: {str(e)}", None

In [11]:
# Build Gradio UI and wire up button click handlers

def build_and_launch(share: bool = True, inbrowser: bool = False):
    """
    Build the two-tab Gradio interface and launch it.

    Args:
        share: Forwarded to `demo.launch(share=...)`.
        inbrowser: Forwarded to `demo.launch(inbrowser=...)`.

    The "Music Generator" tab calls `ui_generate_music` and the
    "Voice / Singer Generator" tab calls `ui_generate_voice`; each handler
    returns a status string and an audio path shown in that tab's outputs.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# AI Music & Voice Generator — Prototype (Clean)")

        # Components are created in display order inside each tab.
        with gr.Tab("Music Generator"):
            prompt_m = gr.Textbox(lines=3, label="Music Prompt (describe mood/instruments)", value="Calm ambient pad with gentle bells")
            duration = gr.Slider(5, 120, value=20, step=5, label="Duration (sec)")
            # The handler only uses a heavy provider when this box is ticked AND
            # the radio below selects "hf" or "replicate".
            use_heavy_music = gr.Checkbox(label="Use heavy model (MusicGen/Riffusion) if available", value=False)
            provider_music = gr.Radio(["procedural", "hf", "replicate"], value="procedural", label="Provider (hf/replicate/procedural)")
            btn_m = gr.Button("Generate Music")
            out_msg_m = gr.Textbox(label="Status")
            out_audio_m = gr.Audio(label="Generated Music")

        with gr.Tab("Voice / Singer Generator"):
            prompt_v = gr.Textbox(lines=3, label="Lyrics or text to sing/voice", value="Hello world, this is a demo singing line.")
            # NOTE(review): with type="numpy" the handler is expected to receive a
            # (sample_rate, array) tuple, which ui_generate_voice handles — confirm
            # against the installed Gradio version.
            voice_sample = gr.Audio(type="numpy", label="Upload voice sample for cloning (optional)")
            use_heavy_voice = gr.Checkbox(label="Use heavy model (Bark/VALL-E/RVC) if available", value=False)
            provider_voice = gr.Radio(["procedural", "replicate"], value="procedural", label="Provider")
            btn_v = gr.Button("Generate Voice")
            out_msg_v = gr.Textbox(label="Status")
            out_audio_v = gr.Audio(label="Generated Voice")

        # Input order must match each handler's positional parameters;
        # output order must match the (status, audio path) tuple it returns.
        btn_m.click(fn=ui_generate_music,
                    inputs=[prompt_m, duration, use_heavy_music, provider_music],
                    outputs=[out_msg_m, out_audio_m])
        btn_v.click(fn=ui_generate_voice,
                    inputs=[prompt_v, voice_sample, use_heavy_voice, provider_voice],
                    outputs=[out_msg_v, out_audio_v])

    demo.launch(share=share, inbrowser=inbrowser)
    print("Demo launched. If running in Colab you will see a Gradio link above.")


# Start the UI when run as a script. The guard is also true when this cell is
# executed in a notebook kernel, so running the cell launches the demo as well.
if __name__ == "__main__":
    build_and_launch(share=True, inbrowser=False)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b25d4b87481048c237.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Demo launched. If running in Colab you will see a Gradio link above.
