In [1]:
%%capture
!pip install nussl
!pip install tensorflow
import numpy as np
import librosa
import tensorflow as tf
import IPython.display as ipd
import IPython
import librosa.display
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
import torch
from torch import nn
import nussl
from nussl.ml.networks.modules import AmplitudeToDB
import museval

In [2]:
# Mount Google Drive at /content/drive (Colab-only API). Later cells load the
# trained model from, and write audio to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load MUSDB18 through nussl. With download=True the cell output reports that
# the 7-second sample dataset is fetched into /root/.nussl/musdb18.
musdb = nussl.datasets.MUSDB18(download=True)

Downloading MUSDB 7s Sample Dataset to /root/.nussl/musdb18...
Done!


In [4]:
# Index constants for the stems; not referenced by the visible cells
# (presumably a leftover from training code -- confirm before removing).
mixture, drum, bass, rest, vocal = 0, 1, 2, 3, 4
# STFT parameters passed to every AudioSignal.stft / istft call below.
hop_length = 768
window_length = 1024
# Sample rate handed to nussl.AudioSignal when rebuilding signals.
srate = 44100
# Width (in STFT frames) of each window fed to the model.
frame_size = 128
# Number of zero columns appended by padding() and trimmed by merge_masks().
pad_size = 120
# Number of STFT frames per track assumed by overlap().
spec_len = 392
# First index of the evaluation split: the first 70% of tracks are skipped.
# Builtin int() replaces np.int, which was removed in NumPy 1.24.
train_size = int(len(musdb)*0.7)

In [5]:
def min_max_norm(data):
  """Rescale `data` linearly so its global minimum maps to 0 and maximum to 1.

  The minimum and maximum are taken over the whole array (not per axis).

  Args:
    data: array-like of numbers.

  Returns:
    Array of the same shape as `data` with values in [0, 1]. When every
    element of `data` is identical the range is zero; an all-zero array is
    returned instead of dividing by zero (which would yield NaN).
  """
  data = np.asarray(data)
  # Local names avoid shadowing the builtins min() and max().
  lo, hi = np.min(data), np.max(data)
  shifted = data - lo
  span = hi - lo
  if span == 0:
    # `shifted / 1` only serves to pick the same dtype the division would give.
    return np.zeros_like(shifted / 1)
  return shifted / span

In [6]:
def padding(data, frame_size):
  """Zero-pad both channels of `data` along axis 1 and keep the first 512 rows.

  Args:
    data: array whose last axis indexes the channel (index 0 = left,
      index 1 = right); each channel slice must be 2-D.
    frame_size: accepted for call compatibility; not used by the body.

  Returns:
    Tuple `(left, right)`; each is the channel with `pad_size` (module-level
    constant) zero columns appended, truncated to rows 0..511.
  """
  zero_cols = np.zeros((data.shape[0], pad_size))
  padded_left, padded_right = (
      np.concatenate((data[..., channel], zero_cols), axis=1)
      for channel in (0, 1)
  )
  return padded_left[:512, :], padded_right[:512, :]

In [7]:
def overlap(data):
  """Average overlapping 512-row frames back into one spectrogram-sized array.

  Frame `i` is added into columns `i .. i+frame_size-1`, and every column is
  then divided by the number of frames that covered it. The width of the
  result is the module-level `spec_len`; `frame_size` is also module-level.

  Args:
    data: sequence of arrays broadcastable to `(512, frame_size)`, one per
      window position (window `i` starts at column `i`).

  Returns:
    Array of shape `(513, spec_len)`: the averaged frames with one all-zero
    row appended at the bottom. Columns covered by no frame are left at 0
    rather than becoming NaN.
  """
  n_rows = 512
  spec = np.zeros((n_rows, spec_len))
  count = np.zeros(spec_len)

  for start, frame in enumerate(data):
    spec[:, start:start+frame_size] += frame
    count[start:start+frame_size] += 1.0

  # Divide only where at least one frame contributed; elsewhere keep the 0.
  np.divide(spec, count, out=spec, where=count > 0)

  zero_row = np.zeros((1, spec.shape[1]))
  return np.append(spec, zero_row, axis=0)

In [8]:
def slice(left_spec, right_spec, frame_size):
  """Cut two spectrograms into consecutive, non-overlapping column blocks.

  Block boundaries are taken from the width of `left_spec` and applied to
  both inputs. The blocks are stacked and reshaped to `(-1, 512, 128, 1)`,
  so the inputs are expected to have 512 rows and `frame_size` to be 128.

  Args:
    left_spec: 2-D array (rows x columns).
    right_spec: 2-D array cut at the same column offsets as `left_spec`.
    frame_size: number of columns per block.

  Returns:
    Tuple `(X_left, X_right)` of arrays shaped `(n_blocks, 512, 128, 1)`.
  """
  starts = range(0, left_spec.shape[1], frame_size)

  left_blocks = [left_spec[:, s:s+frame_size] for s in starts]
  right_blocks = [right_spec[:, s:s+frame_size] for s in starts]

  X_left = np.stack(left_blocks).reshape(-1, 512, 128, 1)
  X_right = np.stack(right_blocks).reshape(-1, 512, 128, 1)
  return X_left, X_right

In [9]:
def slice_dense(spec, frame_size):
  """Extract every window of `frame_size` columns (stride 1) from both channels.

  Only rows 0..511 of each channel are kept. The windows are stacked and
  reshaped to `(-1, 512, 128, 1)`, so `frame_size` is expected to be 128.

  Args:
    spec: array whose last axis indexes the channel (0 = left, 1 = right);
      each channel slice is rows x columns.
    frame_size: number of columns per window.

  Returns:
    Tuple `(X_left, X_right)` of arrays shaped `(n_windows, 512, 128, 1)`,
    where `n_windows = columns - frame_size + 1`.
  """
  left_spec, right_spec = spec[..., 0], spec[..., 1]
  n_windows = left_spec.shape[1] - frame_size + 1

  left_windows = [left_spec[0:512, k:k+frame_size] for k in range(n_windows)]
  right_windows = [right_spec[0:512, k:k+frame_size] for k in range(n_windows)]

  X_left = np.stack(left_windows).reshape(-1, 512, 128, 1)
  X_right = np.stack(right_windows).reshape(-1, 512, 128, 1)
  return X_left, X_right

In [10]:
def merge_masks(left, right):
  """Binarise per-block masks and stitch them into one stereo mask.

  Each channel is thresholded at 0.5 (values > 0.5 become 1, everything
  else, including NaN, becomes 0), its blocks are laid side by side along
  the column axis, the trailing `pad_size` columns (module-level constant)
  are dropped, and one all-zero row is appended at the bottom.

  The inputs are not modified.

  Args:
    left: array of blocks, each reshapeable to `(512, 128)`.
    right: array of blocks with the same layout as `left`.

  Returns:
    Array of shape `(513, n_blocks*128 - pad_size, 2)` holding the left mask
    in `[..., 0]` and the right mask in `[..., 1]`.
  """
  def _stitch(channel):
    # Threshold into a fresh array so the caller's predictions stay intact.
    binary = (np.asarray(channel) > 0.5).astype(float)
    blocks = [block.reshape(512, 128) for block in binary]
    total = np.concatenate(blocks, axis=1)
    # Plain slicing drops exactly the last pad_size columns; the previous
    # np.delete index range ran one past the end of the axis.
    trimmed = total[:, :total.shape[1] - pad_size]
    zero_row = np.zeros((1, trimmed.shape[1]))
    return np.append(trimmed, zero_row, axis=0)

  return np.dstack((_stitch(left), _stitch(right)))


In [11]:
def merge_masks_dense(left, right):
  """Binarise densely-sliced mask windows and merge them into one stereo mask.

  Each window is thresholded at 0.5 (values > 0.5 become 1, everything else
  0), the windows of each channel are averaged back together by `overlap`,
  and the averaged result is thresholded again (values >= 0.5 become 1,
  everything else, including NaN, becomes 0).

  The inputs are not modified.

  Args:
    left: array of windows, each reshapeable to `(512, 128)`; window `i` is
      treated by `overlap` as starting at column `i`.
    right: array of windows with the same layout as `left`.

  Returns:
    Binary array with the left mask in `[..., 0]` and the right mask in
    `[..., 1]`; each channel has the shape returned by `overlap`.
  """
  def _binary_windows(channel):
    # Threshold into a fresh array so the caller's predictions stay intact.
    binary = (np.asarray(channel) > 0.5).astype(float)
    return [window.reshape(512, 128) for window in binary]

  left_mask = overlap(_binary_windows(left))
  right_mask = overlap(_binary_windows(right))
  mask = np.dstack((left_mask, right_mask))

  # Majority vote over the overlapping windows.
  return np.where(mask >= 0.5, 1.0, 0.0)



In [12]:
def get_softmask(item):
  """Compute a ratio mask for the vocals of one dataset item.

  The mask is the vocal STFT magnitude divided by the element-wise maximum
  of the vocal and mixture magnitudes (plus 1e-8 to avoid division by zero),
  so it never exceeds 1.

  Args:
    item: mapping with `item['mix']` and `item['sources']['vocals']`, both
      exposing `.stft(hop_length=..., window_length=..., window_type=...)`.

  Returns:
    Array with the same shape as the STFT magnitudes.
  """
  stft_kwargs = dict(hop_length=hop_length, window_length=window_length,
                     window_type='hann')

  mix_magnitude = np.abs(item['mix'].stft(**stft_kwargs))
  vocal_magnitude = np.abs(item['sources']['vocals'].stft(**stft_kwargs))

  denominator = np.maximum(vocal_magnitude, mix_magnitude) + 1e-8
  return vocal_magnitude / denominator

In [13]:
def get_binarymask(item):
  """Threshold the soft vocal mask of `item` at 0.5.

  Args:
    item: dataset item accepted by `get_softmask`.

  Returns:
    The soft-mask array, modified in place: entries > 0.5 are set to 1 and
    entries <= 0.5 are set to 0.
  """
  soft = get_softmask(item)

  vocal_dominant = soft > 0.5
  accompaniment_dominant = soft <= 0.5

  soft[vocal_dominant] = 1
  soft[accompaniment_dominant] = 0
  return soft

In [14]:
def get_vocal(mask, track):
  """Apply `mask` to the mixture STFT of `track` and invert it to audio.

  The mask scales the mixture magnitude; the mixture phase is reused
  unchanged for the reconstruction.

  Args:
    mask: array broadcastable against the mixture STFT.
    track: mapping whose `track['mix']` exposes
      `.stft(hop_length=..., window_length=..., window_type=...)`.

  Returns:
    Whatever `nussl.AudioSignal.istft` returns for the masked STFT (callers
    in this notebook pass it on as `audio_data_array`).
  """
  # Compute the mixture STFT once and derive magnitude and phase from it.
  mix_stft = track['mix'].stft(hop_length=hop_length,
                        window_length=window_length, window_type='hann' )
  masked = mask * np.abs(mix_stft)
  mix_phase = np.angle(mix_stft)

  masked_stft = masked * np.exp(1j * mix_phase)
  new_signal = nussl.AudioSignal(stft=masked_stft, sample_rate=srate)
  return new_signal.istft(hop_length=hop_length,window_length=window_length, window_type='hann' )

In [15]:
# Load the trained Keras model from Google Drive; requires the Drive mount
# performed in an earlier cell.
model = tf.keras.models.load_model('/content/drive/My Drive/Umodels2/final')

In [16]:
# For every track from index train_size onwards, build three signals:
#   ori_vocals   - vocals reconstructed with the ideal binary mask (reference)
#   ori_mixs     - mixture passed through the same STFT/iSTFT with an all-ones mask
#   reget_vocals - vocals reconstructed with the mask predicted by the model
# `names` collects the ground-truth vocal AudioSignal objects (not strings).
ori_mixs = []
ori_vocals = []
reget_vocals = []
names = []
for i in np.arange(train_size, len(musdb)):
    # Progress output: track index, with a line break after every i%30 == 29.
    print(i, end=' ')
    if i%30 == 29:
      print()

    track = musdb[i]
    mixture_spec = librosa.amplitude_to_db(np.abs(track['mix'].stft(hop_length=hop_length,
                        window_length=window_length, window_type='hann' )))

    names.append(track['sources']['vocals'])

    ori_mask = get_binarymask(track)
    ori_new_signal = get_vocal(ori_mask, track)
    ori_signal = nussl.AudioSignal(audio_data_array=ori_new_signal, sample_rate=srate)
    ori_vocals.append(ori_signal)

    mix = track['mix'].audio_data
    # All-ones mask sized from the spectrogram itself instead of a hard-coded
    # (513, 392, 2), so the mixture round-trip works for any STFT shape.
    ori_mix_mask = np.ones(mixture_spec.shape)
    ori_mix_new_signal = get_vocal(ori_mix_mask, track)
    ori_mix_signal = nussl.AudioSignal(audio_data_array=ori_mix_new_signal, sample_rate=srate)
    ori_mixs.append(ori_mix_signal)

    X_left, X_right= slice_dense(mixture_spec, frame_size)

    # Each channel is normalised independently over all of its windows.
    X_left = min_max_norm(X_left)
    X_right = min_max_norm(X_right)

    mask_left = model.predict(X_left)
    mask_right = model.predict(X_right)

    mask = merge_masks_dense(mask_left, mask_right)
    
    new_signal = get_vocal(mask, track)
    signal = nussl.AudioSignal(audio_data_array=new_signal, sample_rate=srate)
    reget_vocals.append(signal)

100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 
120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 

In [21]:
# Score every model estimate against the ideal-binary-mask reference with
# BSSEvalV4. SDR is stored as the improvement over the unprocessed mixture
# (estimate SDR minus mixture SDR); ISR, SIR and SAR are stored as-is.
SDR, ISR, SIR, SAR = [], [], [], []

for i in range(len(ori_vocals)):
  reference = ori_vocals[i]

  bss = nussl.evaluation.BSSEvalV4(reference, reget_vocals[i])
  bss_scores = bss.evaluate()

  bss_mix = nussl.evaluation.BSSEvalV4(reference, ori_mixs[i])
  bss_scores_mix = bss_mix.evaluate()

  estimate_scores = bss_scores['source_0']
  mixture_sdr = np.array(bss_scores_mix['source_0']['SDR'])

  SDR.append(np.array(estimate_scores['SDR']) - mixture_sdr)
  ISR.append(np.array(estimate_scores['ISR']))
  SIR.append(np.array(estimate_scores['SIR']))
  SAR.append(np.array(estimate_scores['SAR']))

SDR, ISR, SIR, SAR = (np.array(values) for values in (SDR, ISR, SIR, SAR))

print("SDR", SDR.shape)
print(np.mean(SDR))
for label, values in (("ISR", ISR), ("SIR", SIR), ("SAR", SAR)):
  print(label, values.shape)

SDR (44, 4)
6.355782751243791
ISR (44, 4)
SIR (44, 4)
SAR (44, 4)


In [22]:
# Mean, standard deviation, minimum, maximum and median of the SDR values.
sdr_summary = (np.mean(SDR), np.sqrt(np.var(SDR)), np.min(SDR), np.max(SDR), np.median(SDR))
print(*sdr_summary)

6.355782751243791 3.3887576144442426 -0.6795370322599203 18.505054704083292 5.961784891857439


In [23]:
# One line per metric: mean, standard deviation, minimum, maximum, median.
for label, scores in (("SDR", SDR), ("ISR", ISR), ("SIR", SIR), ("SAR", SAR)):
  print(label, np.mean(scores), np.sqrt(np.var(scores)), np.min(scores), np.max(scores), np.median(scores))

SDR 6.355782751243791 3.3887576144442426 -0.6795370322599203 18.505054704083292 5.961784891857439
ISR 6.8586019315144995 3.9509858172160452 -0.7724343890228282 16.94093607202514 6.380907692384336
SIR inf nan inf inf inf
SAR -0.9072132956705339 5.7761537634623465 -33.102314039556255 8.342426443431513 -0.3133597905858489


  x = asanyarray(arr - arrmean)


In [24]:
# Audio player for the mixture of track 112.
ipd.Audio(musdb[112]['mix'].audio_data, rate=srate)

In [25]:
# Audio player for the ground-truth vocals of track 112.
ipd.Audio(musdb[112]['sources']['vocals'].audio_data, rate=srate)

In [26]:
# Model-estimated vocals at position 12 of reget_vocals. The evaluation loop
# started at index 100 (see its printed output), so this is track 112.
reget_vocals[12].embed_audio()

In [27]:
# Save the same estimate as a WAV file on the mounted Google Drive.
reget_vocals[12].write_audio_to_file('/content/drive/My Drive/Convmodels/U-net.wav')

In [28]:
# Run the model on the first 10 tracks (indices below train_size):
#   ori_train_vocals - ground-truth vocal AudioSignal objects from the dataset
#   train_vocals     - vocals reconstructed with the mask predicted by the model
train_vocals = []
ori_train_vocals = []
for i in np.arange(0, 10):
    # Progress output; with only ten tracks no line break is ever needed.
    print(i, end=' ')

    track = musdb[i]
    mixture_spec = librosa.amplitude_to_db(np.abs(track['mix'].stft(hop_length=hop_length,
                        window_length=window_length, window_type='hann' )))

    ori_train_vocals.append(track['sources']['vocals'])
    X_left, X_right= slice_dense(mixture_spec, frame_size)

    # Each channel is normalised independently over all of its windows.
    X_left = min_max_norm(X_left)
    X_right = min_max_norm(X_right)

    mask_left = model.predict(X_left)
    mask_right = model.predict(X_right)

    mask = merge_masks_dense(mask_left, mask_right)
    
    new_signal = get_vocal(mask, track)
    signal = nussl.AudioSignal(audio_data_array=new_signal, sample_rate=srate)
    train_vocals.append(signal)

0 1 2 3 4 5 6 7 8 9 