In [5]:
"""
    Install python packages.
    You might see an error like this:

    ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
    flask 2.2.4 requires click>=8.0, but you have click 7.1.2 which is incompatible.
    pip-tools 6.13.0 requires click>=8, but you have click 7.1.2 which is incompatible.

    You can ignore it, everything works without click.
"""
!pip -q install spleeter ipython soundfile

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.2/51.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.3/77.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.8/203.8 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.5/34.5 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.5/57.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
"""
    Connect to the Google Drive so we can load our songs
"""
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
"""
    Basic Imports
"""

"""
    Spleeter
"""
import spleeter
from spleeter.separator import Separator
from spleeter.audio.adapter import AudioAdapter

"""
    Audio manipulations
"""
import librosa
import soundfile as sf

"""
    Calculations
"""
import numpy as np
import scipy
from scipy.signal import convolve

"""
    Fancy graphs and diagrams
"""
import librosa.display
from IPython.display import Audio
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [16]:
"""
    Helper functions
"""

"""
    Visualize audio as a spectrogam
"""
def visualize_spectrogram(audio, sr=44100):
    plt.figure(figsize=(10, 4))
    D = librosa.amplitude_to_db(np.abs(librosa.stft(audio)), ref=np.max)
    librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Spectrogram')
    plt.tight_layout()
    plt.show()


"""
    Cut track in order to speed up separation and post-processing
"""
def cut(audio, sr=42100):
    return audio[sr*25:sr*80]

"""
    Get mono version of a stereo audio
"""
def mono(audio):
    if len(audio.shape) == 2:
        return audio.mean(axis=1)
    else:
        return audio

"""
    Load and cut songs
    Make sure to adjust acc_path and vocal_path.
"""



"""
    Normalize the loudness of the vocal track based on the accompaniment track.
"""

def normalize_loudness(acc, vocal, frame_length=512, hop_length=512):

    # Make a copy of the vocal track to avoid modifying the original
    vocal_normalized = vocal.copy()

    loudness = {}

    # Calculate the root mean square (loudness) for each frame
    acc_loudness = librosa.feature.rms(acc, frame_length=frame_length, hop_length=hop_length)[0]
    vocal_loudness = librosa.feature.rms(vocal_normalized, frame_length=frame_length, hop_length=hop_length)[0]

    loudness['acc'] = acc_loudness
    loudness['vocal'] = vocal_loudness

    # Adjust the loudness of each frame of the vocal track based on the accompaniment
    for i, (a, v) in enumerate(zip(acc_loudness, vocal_loudness)):
        l = i * hop_length
        r = (i + 1) * hop_length
        ratio = a / v
        vocal_normalized[l:r] *= ratio

    loudness['normalized'] = librosa.feature.rms(vocal_normalized, frame_length=frame_length, hop_length=hop_length)[0]

    return vocal_normalized, loudness

"""
    Aligns the beats of vocals and accompaniment to the same beat structure.
"""

def align_beat(vocal, vocal_unseparated, acc, acc_unseparated, sr=44100, hop_length=1024, verbose=False):
    def beat(song):
        return librosa.beat.beat_track(y=song, sr=sr, hop_length=hop_length, units='samples')

    def calc_stretch(beat_times_from, beat_times_to):
        return ((beat_times_to[1:] - beat_times_to[:-1]).mean() /
                (beat_times_from[1:] - beat_times_from[:-1]).mean())

    def stretch(song, mult):
        return librosa.effects.time_stretch(song, mult)

    def process(song, unseparated, other_unseparated, mult):
        song_unseparated_speedup = stretch(unseparated, mult)
        song_speedup = stretch(song, mult)

        _, beat_samples_song_speedup = beat(song_unseparated_speedup)
        _, beat_samples_other_speedup = beat(other_unseparated)

        shift = beat_samples_song_speedup[1] - beat_samples_other_speedup[0]
        song_speedup_shifted = song_speedup[shift if shift >= 0 else 0:]

        return song_speedup_shifted

    # Detect the bpm in each track
    tempo_vocal, beat_samples_vocal = beat(vocal_unseparated)
    tempo_acc, beat_samples_acc = beat(acc_unseparated)

    if verbose: print(f'Detected bpms:\nVocal - {tempo_vocal}\nAcc   - {tempo_acc}')

    # Calculate the necessary time stretch to align them
    mult = calc_stretch(beat_samples_vocal, beat_samples_acc)
    mult_vocal = 1. / (mult ** 0.5)
    mult_acc = mult ** 0.5

    if verbose: print(f'Detected bpm ratio: {mult}\nSpeeding up vocal by {mult_vocal}\nSpeeding up acc by   {mult_acc}')

    # Shift tracks as their lenght changes after speeding up (slowing down)
    vocal_speedup_shifted = process(vocal, vocal_unseparated, acc_unseparated, mult_vocal)
    acc_speedup_shifted = process(acc, acc_unseparated, vocal_unseparated, mult_acc)

    common_length = min(acc_speedup_shifted.shape[0], vocal_speedup_shifted.shape[0])

    return acc_speedup_shifted[:common_length],  vocal_speedup_shifted[:common_length]


"""
    Actually run normalizing and beat alignment.
"""

def merge(name1, name2):

    acc_path = '/content/drive/My Drive/songs/'+name1
    vocal_path = '/content/drive/My Drive/songs/'+name2

    separator = Separator('spleeter:4stems', stft_backend=spleeter.audio.STFTBackend.LIBROSA)

    sr = 44100

    audio_loader = AudioAdapter.default()
    acc_unseparated, _ = audio_loader.load(acc_path, sample_rate=sr)
    vocal_unseparated, _ = audio_loader.load(vocal_path, sample_rate=sr)
    acc_unseparated = cut(acc_unseparated)
    vocal_unseparated = cut(vocal_unseparated)


    vocal_separated = separator.separate(vocal_unseparated)
    acc_separated = separator.separate(acc_unseparated)

    vocal = mono(vocal_separated['vocals']).T
    vocal_unseparated = mono(vocal_unseparated).T

    acc = mono(acc_separated['other'] + acc_separated['drums'] + acc_separated['bass']).T
    acc_unseparated = mono(acc_unseparated).T
    vocal_normalized, loudness = normalize_loudness(acc, vocal, frame_length=512, hop_length=512)

    acc_adjusted, vocal_adjusted = align_beat(
        vocal,
        vocal_unseparated,
        acc,
        acc_unseparated,
        verbose=False
    )

    merged = acc_adjusted + vocal_adjusted
    name=name1[:-4]+'-'+name2[:-4]+'.wav'
    sf.write(name, merged, sr)
    from google.colab import files
    #files.download(name)

In [17]:
names = [
    '50cent.mp3', 'dralban2.mp3', 'future.mp3', 'song1.mp3',
    'astro.mp3', 'eminem.mp3', 'kanye.mp3',
    'bach.mp3', 'eminem2.mp3', 'oxxy.m4a',
    'dralban.mp3', 'eminem3.mp3', 'pugac.mp3'
]
for i in tqdm(range(len(names))):
    for j in range(len(names)):
        if not (names[i]+ names[j] in ['50cent.mp3future.mp3'])
        merge(names[i], names[j])
        print(f'MERGED {names[i]}, {names[j]}')


  0%|          | 0/13 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

MERGED 50cent.mp3, 50cent.mp3


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

MERGED 50cent.mp3, dralban2.mp3


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

MERGED 50cent.mp3, future.mp3


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

MERGED 50cent.mp3, song1.mp3


Process ForkPoolWorker-11:
Process ForkPoolWorker-12:
Process ForkPoolWorker-3:
Process ForkPoolWorker-10:
Process ForkPoolWorker-4:
Process ForkPoolWorker-8:
Process ForkPoolWorker-7:
Process ForkPoolWorker-9:
Process ForkPoolWorker-6:
Process ForkPoolWorker-5:
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self

KeyboardInterrupt: ignored