<a href="https://colab.research.google.com/github/steinhaug/stable-diffusion/blob/main/tool/nogui__demucs_and_whisper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Mount Google Drive

In [1]:
# Mount Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Refresh Drive connection
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


# GPU Mode: Stem Splitting (VOICE, BASS, etc)

In [None]:
# Install the demucs library by Facebook research
!python3 -m pip install -U git+https://github.com/facebookresearch/demucs#egg=demucs

Collecting demucs
  Cloning https://github.com/facebookresearch/demucs to /tmp/pip-install-g92vrxp3/demucs_23ff90c2f5954f49b1e5b9981e0de8a1
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/demucs /tmp/pip-install-g92vrxp3/demucs_23ff90c2f5954f49b1e5b9981e0de8a1
  Resolved https://github.com/facebookresearch/demucs to commit e976d93ecc3865e5757426930257e200846a520a
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dora-search (from demucs)
  Downloading dora_search-0.1.12.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.1/87.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting julius>=0.2.3 (from demucs)
  Downloading julius-0.2.7.tar.gz (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 k

In [None]:
#@title **Basic Configuration**

# Customize the following options!
model = "htdemucs_ft" #"htdemucs_ft
extensions = ["mp3", "wav", "ogg", "flac"]  # we will look for all those file types.
two_stems = None   # only separate one stems from the rest, for instance
# two_stems = "vocals"

# Options for the output audio.
mp3 = True
mp3_rate = 320
float32 = False  # output as float 32 wavs, unsused if 'mp3' is True.
int24 = False    # output as int24 wavs, unused if 'mp3' is True.
# You cannot set both `float32 = True` and `int24 = True` !!


# @markdown **`in_path`**: Choose source folder, music files
in_path = '/content/drive/MyDrive/demucs/split_parts' # @param {type:"string"}

# @markdown **`out_path`**: Choose save folder for stems
out_path = '/content/drive/MyDrive/demucs/other_stems' # @param {type:"string"}


import io
from pathlib import Path
import select
from shutil import rmtree
import subprocess as sp
import sys
from typing import Dict, Tuple, Optional, IO

from google.colab import files

def find_files(in_path):
    out = []
    for file in Path(in_path).iterdir():
        if file.suffix.lower().lstrip(".") in extensions:
            out.append(file)
    return out

def copy_process_streams(process: sp.Popen):
    def raw(stream: Optional[IO[bytes]]) -> IO[bytes]:
        assert stream is not None
        if isinstance(stream, io.BufferedIOBase):
            stream = stream.raw
        return stream

    p_stdout, p_stderr = raw(process.stdout), raw(process.stderr)
    stream_by_fd: Dict[int, Tuple[IO[bytes], io.StringIO, IO[str]]] = {
        p_stdout.fileno(): (p_stdout, sys.stdout),
        p_stderr.fileno(): (p_stderr, sys.stderr),
    }
    fds = list(stream_by_fd.keys())

    while fds:
        # `select` syscall will wait until one of the file descriptors has content.
        ready, _, _ = select.select(fds, [], [])
        for fd in ready:
            p_stream, std = stream_by_fd[fd]
            raw_buf = p_stream.read(2 ** 16)
            if not raw_buf:
                fds.remove(fd)
                continue
            buf = raw_buf.decode()
            std.write(buf)
            std.flush()

def separate(inp=None, outp=None):
    inp = inp or in_path
    outp = outp or out_path
    cmd = ["python3", "-m", "demucs.separate", "-o", str(outp), "-n", model]
    if mp3:
        cmd += ["--mp3", f"--mp3-bitrate={mp3_rate}"]
    if float32:
        cmd += ["--float32"]
    if int24:
        cmd += ["--int24"]
    if two_stems is not None:
        cmd += [f"--two-stems={two_stems}"]
    files = [str(f) for f in find_files(inp)]
    if not files:
        print(f"No valid audio files in {in_path}")
        return
    print("Going to separate the files:")
    print('\n'.join(files))
    print("With command: ", " ".join(cmd))
    p = sp.Popen(cmd + files, stdout=sp.PIPE, stderr=sp.PIPE)
    copy_process_streams(p)
    p.wait()
    if p.returncode != 0:
        print("Command failed, something went wrong.")


def from_upload():
    out_path = Path('separated')
    in_path = Path('tmp_in')

    if in_path.exists():
        rmtree(in_path)
    in_path.mkdir()

    if out_path.exists():
        rmtree(out_path)
    out_path.mkdir()

    uploaded = files.upload()
    for name, content in uploaded.items():
        (in_path / name).write_bytes(content)
    separate(in_path, out_path)


In [None]:
#@title Process **in_path** folder.
#@markdown For large files 10+ minutes, consider splitting into smaller chunks if colab crashes. Larger files require more Vram,

#@markdown **execute seperate()**

# This can be quite slow, in particular the loading, and saving from GDrive. Please be patient!
# This is from google drive! Also, this will separate all the files inside the MyDrive/demucs folder,
# so when you are happy with the results, remove the songs from there.
separate()

Going to separate the files:
/content/drive/MyDrive/demucs/split_parts/part_8.mp3
/content/drive/MyDrive/demucs/split_parts/part_4.mp3
/content/drive/MyDrive/demucs/split_parts/part_3.mp3
/content/drive/MyDrive/demucs/split_parts/part_2.mp3
/content/drive/MyDrive/demucs/split_parts/part_6.mp3
/content/drive/MyDrive/demucs/split_parts/part_5.mp3
/content/drive/MyDrive/demucs/split_parts/part_7.mp3
/content/drive/MyDrive/demucs/split_parts/part_1.mp3
With command:  python3 -m demucs.separate -o /content/drive/MyDrive/demucs/other_stems/ -n htdemucs_ft --mp3 --mp3-bitrate=320
Selected model is a bag of 4 models. You will see that many progress bars per track.
Separated tracks will be stored in /content/drive/MyDrive/demucs/other_stems/htdemucs_ft
Separating track /content/drive/MyDrive/demucs/split_parts/part_8.mp3


100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:19<00:00, 17.28seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:12<00:00, 25.95seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 25.45seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 25.45seconds/s]


Separating track /content/drive/MyDrive/demucs/split_parts/part_4.mp3


100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.43seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.43seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.80seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.37seconds/s]


Separating track /content/drive/MyDrive/demucs/split_parts/part_3.mp3


100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.62seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.09seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.18seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.39seconds/s]


Separating track /content/drive/MyDrive/demucs/split_parts/part_2.mp3


100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.90seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.77seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:14<00:00, 23.79seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.41seconds/s]


Separating track /content/drive/MyDrive/demucs/split_parts/part_6.mp3


100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.83seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.67seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.47seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.49seconds/s]


Separating track /content/drive/MyDrive/demucs/split_parts/part_5.mp3


100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.82seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.64seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.36seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.40seconds/s]


Separating track /content/drive/MyDrive/demucs/split_parts/part_7.mp3


100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.71seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.26seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.14seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.52seconds/s]


Separating track /content/drive/MyDrive/demucs/split_parts/part_1.mp3


100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.81seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.48seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.40seconds/s]
100%|██████████████████████████████████████████████████████████████████████| 333.45/333.45 [00:13<00:00, 24.44seconds/s]


In [None]:
# This is manual upload and download :)
from_upload()
!zip -r separated.zip separated
files.download('./separated.zip')

# open-ai whisper

In [None]:
#@title install whisper library by openai
!pip install -U openai-whisper
#!pip install git+https://github.com/openai/whisper.git


In [None]:
#@title Quick explernation
# Syntax:
# !whisper [audio_file_paths...] --model [model_name]

# Required VRAM in parentheses
# models english only: medium.en, small.en
# models: small (2gb), medium (5gb), large (10gb)

# speed: 1x large, 2x medium, 4x small, 8x turbo

In [None]:
# CD into the folder first, so that transcribed files are saved correctly
%cd "/content/drive/MyDrive/audio/subtitle/"
!whisper "/content/drive/MyDrive/audio/subtitle/serotonin.aac" --model medium.en

# tool: splitting large MP3 files

In [None]:
#@title Install libraries
!pip install pydub
!apt-get install ffmpeg

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
#@title Functions for splitting
from pydub import AudioSegment
import math
import os

def split_mp3_into_parts(mp3_file_path, output_dir='split_parts', parts=4):
    # Load the MP3 file
    audio = AudioSegment.from_mp3(mp3_file_path)

    # Calculate duration of each part
    duration_ms = len(audio)
    part_duration_ms = math.ceil(duration_ms / parts)

    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Split the audio and save parts
    for i in range(parts):
        start_time = i * part_duration_ms
        end_time = min(start_time + part_duration_ms, duration_ms)
        part_audio = audio[start_time:end_time]

        # Export each part as a new MP3 file
        part_filename = os.path.join(output_dir, f'part_{i+1}.mp3')
        part_audio.export(part_filename, format="mp3")
        print(f'Exported {part_filename}')

# Example usage, splits in 4 save in ./split_parts
#split_mp3_into_parts('/content/drive/MyDrive/demucs/other/Sin Cognito # 1.mp3')

In [None]:
#@title Split cmd
# Syntax
# split_mp3_into_parts( (string) file_path, (string) save_dir, (int) parts )

# Example:
# split_mp3_into_parts( 'myAudio.mp3', './splitFiles/', 3 )

# CPU mode for stem splitting (no GPU)

[source link](https://colab.research.google.com/drive/1nLVmRk3Je_v965fsLmYtRGpLqW3Nt37M)


In [None]:
# @title Set up

# @markdown **TLDR**; upload your audio files a folder named `inputs`, and hit `Ctrl + F9`.
# @markdown
# @markdown Once done (a matter of minutes), separated music sources will become available under `outputs`.

# @markdown **`inputs_folder`**: Choose which folder to take unseparated music from. Supported files include `.mp3`, `.wav`, `.ogg` or `.flac`.
inputs_folder = 'inputs' # @param {type:"string"}

# @markdown **`outputs_folder`**: Choose which folder separated sources will be output to.
outputs_folder = 'outputs' # @param {type:"string"}

# @markdown **`karaoke_mode`**: Separate vocals from the rest of the accompaniment. This will mix the files after separating the mix fully, so this won't be faster or use less memory.
karaoke_mode = False # @param {type:"boolean"}

# @markdown **`output_format`**: How outputs will be saved.
output_format = "mp3 - 160 kbps" # @param ["mp3 - 160 kbps", "mp3 - 320 kbps", "wav - 16 bit", "wav - 24 bit", "wav - 32 bit (float)"] {allow-input: true}

# @markdown **`model`**: Choose which model is used for separation:
# @markdown - `htdemucs`: first version of Hybrid Transformer Demucs. Trained on MusDB + 800 songs. Default model.
# @markdown - `htdemucs_ft`: fine-tuned version of `htdemucs`, separation will take 4 times more time
# @markdown     but might be a bit better. Same training set as `htdemucs`.
# @markdown - `htdemucs_6s`: 6 sources version of `htdemucs`, with `piano` and `guitar` being added as sources.
# @markdown     Note that the `piano` source is not working great at the moment.
# @markdown - `hdemucs_mmi`: Hybrid Demucs v3, retrained on MusDB + 800 songs.
# @markdown - `mdx`: trained only on MusDB HQ, winning model on track A at the [MDX][mdx] challenge.
# @markdown - `mdx_extra`: trained with extra training data (**including MusDB test set**), ranked 2nd on the track B
# @markdown     of the [MDX][mdx] challenge.
# @markdown - `mdx_q`, `mdx_extra_q`: quantized version of the previous models. Smaller download and storage
# @markdown     but quality can be slightly worse.
# @markdown - `SIG`: where `SIG` is a single model from the [model zoo](docs/training.md#model-zoo).
# @markdown
# @markdown [mdx]: https://www.aicrowd.com/challenges/music-demixing-challenge-ismir-2021

model = 'htdemucs_ft' # @param ["htdemucs", "htdemucs_ft", "htdemucs_6s", "hdemucs_mmi", "mdx", "mdx_extra", "mdx_q", "mdx_extra_q"] {allow-input: true}

!mkdir -p {inputs_folder}

### Separate music sources

In [None]:
#@title Install dependenies

has_gpu = !nvidia-smi > /dev/null 2>&1 && echo 1
if has_gpu:
  print("Using GPU packages")
  !pip install -qU git+https://github.com/adefossez/demucs#egg=demucs
else:
  # avoid installing dependencies to get rid of CUDA dependency
  print("Using CPU packages; separation will be slow. Consider switching to a GPU-enabled runtime")
  !git clone https://github.com/adefossez/demucs.git
  !pip install -q -e demucs -r demucs/requirements_minimal.txt --no-deps
  !pip install -q treetable omegaconf submitit retrying
  !pip install -q torch==2.0.1 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cpu

# Test that everything is in place
!python3 -m demucs.separate --list-models

In [None]:

# @title Run model

import re

inputs = !find {inputs_folder} -name *.mp3 -o -name *.flac -o -name *.wav -o -name *.ogg || echo error
if inputs[-1] == "error":
  raise RuntimeError(f'Please make sure you have uploaded your audio files under "{inputs_folder}" folder')

extra_flags = ""
if karaoke_mode:
  extra_flags += " --two-stems=vocals"
if output_format == "wav - 16 bit":
  pass
elif output_format == "wav - 24 bit":
  extra_flags += " --int24"
elif output_format == "wav - 32 bit (float)":
  extra_flags += " --float32"
elif output_format.startswith("mp3"):
  bitrate = int(re.search('([0-9]{2,3}) kbps', output_format).group(1))
  extra_flags += f" --mp3 --mp3-bitrate={bitrate}"

inputs_str = " ".join(f'"{s}"' for s in inputs)

!python3 -m demucs.separate -o {outputs_folder} -n {model} {extra_flags} {inputs_str}