In [3]:
import spleeter
from spleeter.separator import Separator
from spleeter.audio.adapter import AudioAdapter

from IPython.display import Audio

# Build the 2-stem Spleeter separator; the stems read from its output below are
# 'vocals' and 'accompaniment'. The STFT is computed with the librosa backend.
separator = Separator(
    'spleeter:2stems',
    stft_backend=spleeter.audio.STFTBackend.LIBROSA,
)

# Sample rate shared by every load / playback call in this notebook.
sample_rate = 22050  # 44100 is much better, but this is much faster
audio_loader = AudioAdapter.default()

# The loader returns (waveform, rate); only the waveform is needed here.
waveform, _ = audio_loader.load('data/kanye.mp3', sample_rate=sample_rate)

# `separated` maps stem name -> waveform.
separated = separator.separate(waveform)
print(separated)

vocals = separated['vocals']
accom = separated['accompaniment']

ModuleNotFoundError: No module named 'spleeter'

In [None]:
import librosa
# Load the track with mono=False so librosa does not mix the channels down.
# No `sr` is given, so librosa resamples to its own default rate.
y, sr = librosa.load('data/kanye.mp3', mono=False)  # 2 channels

# Print the loaded array's shape next to the shapes of the separated stems.
# NOTE(review): the stems are transposed (`.T`) before playback elsewhere, so
# their axis order presumably differs from `y` - confirm from this output.
print(y.shape)
print(vocals.shape, accom.shape)

In [None]:
# Listen to the separated accompaniment; it is transposed before being handed
# to the player.
Audio(accom.T, rate=sample_rate)

In [None]:
# Use one channel: beat tracking does not support more, see
# https://librosa.org/doc/latest/multichannel.html#exceptions
y, sr = librosa.load('data/kanye.mp3', mono=True)

In [None]:
# Length of the first axis of `y` divided by the sample rate: the duration in
# seconds when `y` is a 1-D waveform.
song_length = y.shape[0] / sr
print(song_length)

In [None]:
# Hop between analysis frames, in samples; reused by later cells.
hop_length = 512  # default
# Estimate the tempo and the beat positions of `y`. The second return value is
# converted with frames_to_time in the next cell, i.e. it holds frame indices.
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr, hop_length=hop_length)

In [None]:
# Convert beat frame indices to seconds. The hop length must be the one used
# for beat tracking; passing it explicitly keeps the conversion correct if
# `hop_length` is ever changed away from librosa's default.
beat_times = librosa.frames_to_time(beat_frames, sr=sr, hop_length=hop_length)

In [None]:
# Play back the loaded waveform at the rate librosa loaded it with.
Audio(data=y, rate=sr)

In [None]:
# Load the first track and keep only its 'vocals' stem.
kanye_waveform, _ = audio_loader.load('data/kanye.mp3', sample_rate=sample_rate)
kanye_stems = separator.separate(kanye_waveform)
kanye_vocals = kanye_stems['vocals']

# Peek at the first rows, then render a player for the stem.
print(kanye_vocals[:10])
Audio(data=kanye_vocals.T, rate=sample_rate)

In [None]:
# Load the second track and keep only its 'accompaniment' stem.
eminem_waveform, _ = audio_loader.load('data/eminem.mp3', sample_rate=sample_rate)
eminem_stems = separator.separate(eminem_waveform)
eminem_accom = eminem_stems['accompaniment']

# Peek at the first rows, then render a player for the stem.
print(eminem_accom[:10])
Audio(data=eminem_accom.T, rate=sample_rate)

In [None]:
# Same preview as the cell that computed `eminem_accom`: first rows, then a
# player, without re-running the separation.
print(eminem_accom[:10])
Audio(data=eminem_accom.T, rate=sample_rate)

In [None]:
import librosa

hop_length = 512  # default

# Beat tracking wants a single channel, so each stem is averaged over axis 1
# before being analysed.
kanye_vocals_mono = kanye_vocals.mean(axis=1)
tempo_vocals, beat_frames_vocals = librosa.beat.beat_track(
    y=kanye_vocals_mono,
    sr=sample_rate,
    hop_length=hop_length,
)

eminem_accom_mono = eminem_accom.mean(axis=1)
tempo_accom, beat_frames_accom = librosa.beat.beat_track(
    y=eminem_accom_mono,
    sr=sample_rate,
    hop_length=hop_length,
)

In [None]:
# Convert beat frames to seconds using the rate and hop length the beats were
# tracked with. Without these arguments librosa falls back to its defaults
# (sr=22050, hop_length=512), which is only right while `sample_rate` and
# `hop_length` happen to equal them.
beat_times_vocals = librosa.frames_to_time(beat_frames_vocals, sr=sample_rate, hop_length=hop_length)
beat_times_accom = librosa.frames_to_time(beat_frames_accom, sr=sample_rate, hop_length=hop_length)

def calc_stretch(beat_times_from, beat_times_to):
    """Return the ratio of mean beat intervals, ``to / from``.

    Both arguments are 1-D arrays of beat positions in the same unit (seconds,
    samples or frames - the unit cancels out in the ratio).

    Args:
        beat_times_from: Beat positions whose mean interval is the denominator.
        beat_times_to: Beat positions whose mean interval is the numerator.

    Returns:
        ``mean(diff(beat_times_to)) / mean(diff(beat_times_from))``.

    Raises:
        ValueError: If either input has fewer than two beats, because no
            interval can be measured (the mean of an empty difference array
            would otherwise be a silent NaN).
    """
    if len(beat_times_from) < 2 or len(beat_times_to) < 2:
        raise ValueError(
            "calc_stretch needs at least two beats in each input, got "
            f"{len(beat_times_from)} and {len(beat_times_to)}"
        )
    # Mean spacing between consecutive beats in each sequence.
    time_per_beat_from = (beat_times_from[1:] - beat_times_from[:-1]).mean()
    time_per_beat_to = (beat_times_to[1:] - beat_times_to[:-1]).mean()
    return time_per_beat_to / time_per_beat_from

In [None]:
# Convert beat frames to sample offsets. The hop length is passed explicitly so
# the conversion stays consistent with the beat tracker if `hop_length` is ever
# changed away from librosa's default of 512.
beat_samples_vocals = librosa.frames_to_samples(beat_frames_vocals, hop_length=hop_length)
beat_samples_accom = librosa.frames_to_samples(beat_frames_accom, hop_length=hop_length)

In [None]:
# Ratio of the accompaniment's mean beat interval to the vocals' mean beat
# interval; used below as the time-stretch rate for the accompaniment.
mult = calc_stretch(
    beat_times_from=beat_samples_vocals,
    beat_times_to=beat_samples_accom,
)
mult

In [None]:
# Time-stretch the accompaniment (averaged over axis 1 first) by `mult`.
# `rate` is passed by keyword: it is keyword-only in current librosa releases,
# where a positional rate raises TypeError; the keyword form also works on
# older versions.
accom_spedup = librosa.effects.time_stretch(eminem_accom.mean(1), rate=mult)

In [None]:
# Listen to the time-stretched accompaniment.
Audio(data=accom_spedup, rate=sample_rate)

In [None]:
# Listen to the vocal stem for comparison with the stretched accompaniment.
Audio(data=kanye_vocals.T, rate=sample_rate)

In [None]:
# Re-run beat tracking on the stretched accompaniment; element [1] of the
# result holds the beat frame indices.
beat_frames_accom_spedup = librosa.beat.beat_track(
    y=accom_spedup, sr=sample_rate, hop_length=hop_length
)[1]
# Convert frames to sample offsets with the same hop length the tracker used,
# instead of relying on the library default matching `hop_length`.
beat_samples_accom_spedup = librosa.frames_to_samples(beat_frames_accom_spedup, hop_length=hop_length)

In [None]:
# Print the mean spacing between consecutive beats (in samples) for the vocals
# and for the stretched accompaniment, one value per line.
for beats in (beat_samples_vocals, beat_samples_accom_spedup):
    print((beats[1:] - beats[:-1]).mean())

In [None]:
# Offset (in samples) between the first detected beat of the vocals and the
# first detected beat of the stretched accompaniment.
shift = beat_samples_vocals[0] - beat_samples_accom_spedup[0]

# Drop leading samples from whichever track has the later first beat; the other
# track is used unchanged.
if shift < 0:
    vocals_spedup_shifted = kanye_vocals
    accom_spedup_shifted = accom_spedup[-shift:]
else:
    vocals_spedup_shifted = kanye_vocals[shift:]
    accom_spedup_shifted = accom_spedup

In [None]:
# Beat positions (in samples) of the aligned vocals, averaged over axis 1.
# The waveform is passed as `y=`: beat_track's arguments are keyword-only in
# current librosa releases, so a positional waveform raises TypeError.
x = librosa.frames_to_samples(
    librosa.beat.beat_track(y=vocals_spedup_shifted.mean(1), sr=sample_rate, hop_length=hop_length)[1],
    hop_length=hop_length,
)
# Display the first beat positions.
x[:100]

In [None]:
# Beat positions (in samples) of the aligned accompaniment. Stored under its
# own name rather than `y`, which an earlier cell uses for the loaded waveform;
# rebinding `y` here would corrupt those cells if they are re-run.
# The waveform is passed as `y=` because beat_track's arguments are
# keyword-only in current librosa releases.
beat_samples_accom_shifted = librosa.frames_to_samples(
    librosa.beat.beat_track(y=accom_spedup_shifted, sr=sample_rate, hop_length=hop_length)[1],
    hop_length=hop_length,
)
# Compare only beats present in both tracks (at most the first 100), so the
# subtraction cannot fail when the two beat counts differ.
n_compare = min(100, len(x), len(beat_samples_accom_shifted))
beat_samples_accom_shifted[:n_compare] - x[:n_compare]

In [None]:
import soundfile as sf

# Only the overlapping part of the two aligned tracks can be mixed.
common_length = min(accom_spedup_shifted.shape[0], vocals_spedup_shifted.shape[0])

# Mix: accompaniment plus the vocals averaged over axis 1.
combined = accom_spedup_shifted[:common_length] + vocals_spedup_shifted.mean(1)[:common_length]

# Adding two signals can push samples outside [-1, 1], which would be clipped
# when the file is stored as integer PCM. Scale down only when that happens, so
# a mix that already fits is written unchanged.
peak = abs(combined).max() if common_length else 0.0
if peak > 1.0:
    combined = combined / peak

sf.write('combined.wav', combined, sample_rate)

In [None]:
# Rebuild the mix over the overlapping samples and play it back.
mix_accom = accom_spedup_shifted[:common_length]
mix_vocals = vocals_spedup_shifted.mean(1)[:common_length]
Audio(data=mix_accom + mix_vocals, rate=sample_rate)