In [1]:
%%capture
!pip install nussl
!pip install tensorflow
import numpy as np
import librosa
import tensorflow as tf
import IPython.display as ipd
import IPython
import librosa.display
import numpy as gfg 
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
import torch
from torch import nn
import nussl
import nussl.evaluation
import museval

In [2]:
# Mount Google Drive at /content/drive; later cells load the model from, and
# write audio to, paths under '/content/drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Build the MUSDB18 dataset object through nussl with download=True; the
# recorded output shows the 7-second sample version being fetched.
musdb = nussl.datasets.MUSDB18(download=True)

Downloading MUSDB 7s Sample Dataset to /root/.nussl/musdb18...
Done!


In [4]:
# Integer constants named after the mixture and the four stems.
mixture, drum, bass, rest, vocal = 0, 1, 2, 3, 4

# STFT parameters passed to the stft()/istft() calls in this notebook.
# (An earlier `hop_length = 256` was immediately overwritten by 512 and has
# been dropped as a dead assignment.)
hop_length = 512
window_length = 4096

# Sample rate used when wrapping estimates in AudioSignal and when playing audio.
srate = 44100

# Number of spectrogram columns in one model input frame.
frame_size = 9

# First index of the held-out portion: the evaluation loop processes tracks
# [train_size, len(musdb)). The builtin int() replaces np.int, which was only
# an alias for it and has been removed from NumPy (AttributeError on >= 1.24).
train_size = int(len(musdb)*0.7)

# Number of zero columns appended by padding() and stripped by merge_masks().
pad_size = 7

# Number of columns of the array rebuilt by overlap().
spec_len = 587

In [5]:
def min_max_norm(data):
  """Rescale ``data`` linearly so its global minimum maps to 0 and its maximum to 1.

  Parameters
  ----------
  data : array-like
      Values to normalise. The minimum and maximum are taken over all elements.

  Returns
  -------
  numpy.ndarray
      ``(data - min) / (max - min)``, same shape as ``data``. A constant input
      (``max == min``) yields all zeros rather than the NaNs a 0/0 division
      would produce.

  Raises
  ------
  ValueError
      If ``data`` is empty (raised by ``np.min``).
  """
  data = np.asanyarray(data)
  # Local names avoid shadowing the builtins min() and max().
  lowest, highest = np.min(data), np.max(data)
  span = highest - lowest
  if span == 0:
    # No range to scale by: divide by 1 so the result is zeros, not NaN.
    span = 1
  return (data - lowest) / span

In [6]:
def padding(data, frame_size):
  """Append ``pad_size`` zero columns to each of the two channels of ``data``.

  ``frame_size`` is accepted but not used by this function; the pad width
  comes from the module-level ``pad_size``.

  Parameters
  ----------
  data : numpy.ndarray
      Array whose last axis is indexed with 0 and 1 to select the two channels;
      each selected channel must be 2-D so columns can be appended on axis 1.
  frame_size : int
      Unused.

  Returns
  -------
  tuple
      ``(left, right)``: channel 0 and channel 1, each with ``pad_size`` extra
      zero columns on the right.
  """
  zero_cols = np.zeros((data.shape[0], pad_size))
  left, right = (np.append(data[..., ch], zero_cols, axis=1) for ch in (0, 1))
  return left, right

In [7]:
def overlap(data):
  """Overlap-add a sequence of frames into one array and average the overlaps.

  Frame ``i`` is accumulated into columns ``i .. i + frame_size - 1``; each
  column is then divided by the number of frames that contributed to it.

  Relies on the module-level ``spec_len`` (output width) and ``frame_size``.
  The output width now uses ``spec_len`` throughout instead of mixing it with
  a literal 587.

  Parameters
  ----------
  data : sequence of numpy.ndarray
      Frames in order; each must be addable to a ``(2049, frame_size)`` slice.

  Returns
  -------
  numpy.ndarray
      Array of shape ``(2049, spec_len)``. Columns that no frame covered stay
      at 0 instead of becoming NaN from a 0/0 division.
  """
  spec = np.zeros((2049, spec_len))
  count = np.zeros(spec_len)

  for start, frame in enumerate(data):
    spec[:, start:start + frame_size] += frame
    count[start:start + frame_size] += 1.0

  # Average only where at least one frame contributed.
  covered = count > 0
  spec[:, covered] /= count[covered]
  return spec

In [8]:
def slice(left_spec, right_spec, frame_size):
  """Split two 2-D arrays into consecutive, non-overlapping blocks of columns.

  Note: this name shadows the builtin ``slice``; it is kept so existing
  callers continue to work.

  Parameters
  ----------
  left_spec, right_spec : numpy.ndarray
      2-D arrays indexed ``[row, column]``. The column count is taken from
      ``left_spec`` and applied to both.
  frame_size : int
      Number of columns per block.

  Returns
  -------
  tuple of numpy.ndarray
      ``(X_left, X_right)``, each of shape ``(n_blocks, rows, frame_size, 1)``.
      The shape now follows the inputs and ``frame_size`` instead of being
      hard-coded to 2049 x 9.

  Raises
  ------
  ValueError
      If the column count is not a multiple of ``frame_size`` (the last block
      would be ragged), or if there are no columns to stack.
  """
  length = left_spec.shape[1]
  if length % frame_size != 0:
    raise ValueError(
        "number of columns (%d) is not a multiple of frame_size (%d)"
        % (length, frame_size))

  starts = range(0, length, frame_size)
  # Stack blocks on a new leading axis, then add a trailing singleton axis.
  X_left = np.stack([left_spec[:, s:s + frame_size] for s in starts])[..., np.newaxis]
  X_right = np.stack([right_spec[:, s:s + frame_size] for s in starts])[..., np.newaxis]

  return X_left, X_right

In [9]:
def slice_dense(spec, frame_size):
  """Extract every window of ``frame_size`` adjacent columns (stride 1) per channel.

  Parameters
  ----------
  spec : numpy.ndarray
      Array whose last axis is indexed with 0 and 1 to select the two channels;
      each channel is indexed ``[row, column]``.
  frame_size : int
      Number of columns per window.

  Returns
  -------
  tuple of numpy.ndarray
      ``(X_left, X_right)``, each of shape
      ``(columns - frame_size + 1, rows, frame_size, 1)``. The shape now
      follows the input and ``frame_size`` instead of being hard-coded to
      2049 x 9.

  Raises
  ------
  ValueError
      If there are fewer than ``frame_size`` columns (nothing to stack).
  """
  left_spec = spec[..., 0]
  right_spec = spec[..., 1]

  n_windows = left_spec.shape[1] - frame_size + 1
  starts = range(0, n_windows)

  # Stack windows on a new leading axis, then add a trailing singleton axis.
  X_left = np.stack([left_spec[:, s:s + frame_size] for s in starts])[..., np.newaxis]
  X_right = np.stack([right_spec[:, s:s + frame_size] for s in starts])[..., np.newaxis]

  return X_left, X_right

In [10]:
def merge_masks(left, right):
  """Binarise per-frame mask predictions and stitch them side by side.

  Each element of ``left`` / ``right`` is reshaped to ``(2049, 9)``, the
  frames are concatenated along the column axis, and the last ``pad_size``
  columns (module-level constant) are removed.

  The previous implementation passed ``np.delete`` an index range that ran one
  past the final column, which raises ``IndexError`` on NumPy >= 1.19; plain
  slicing removes exactly the trailing ``pad_size`` columns. It also used a
  local named ``range``, shadowing the builtin.

  Parameters
  ----------
  left, right : numpy.ndarray
      Per-frame predictions; each frame must hold 2049 * 9 values.
      NOTE: both arrays are thresholded in place (> 0.5 -> 1, <= 0.5 -> 0).

  Returns
  -------
  numpy.ndarray
      Left and right masks stacked on a new last axis via ``np.dstack``.
  """
  # In-place thresholding, as before: the caller's arrays are modified.
  left[left > 0.5] = 1
  left[left <= 0.5] = 0

  right[right > 0.5] = 1
  right[right <= 0.5] = 0

  def _stitch(frames):
    # Concatenate frames column-wise and drop the trailing padding columns.
    total = np.concatenate([frame.reshape(2049, 9) for frame in frames], axis=1)
    return total[:, :total.shape[1] - pad_size]

  left_mask = _stitch(left)
  right_mask = _stitch(right)

  mask = np.dstack((left_mask, right_mask))
  return mask



In [11]:
def merge_masks_dense(left, right):
  """Binarise stride-1 frame predictions and merge them by overlap-averaging.

  Both inputs are thresholded in place (> 0.5 -> 1, <= 0.5 -> 0), every frame
  is reshaped to ``(2049, 9)``, each channel is merged with ``overlap`` and
  the two results are stacked on a new last axis. The stacked mask is then
  thresholded again (>= 0.5 -> 1, < 0.5 -> 0).

  Parameters
  ----------
  left, right : numpy.ndarray
      Per-frame predictions; each frame must hold 2049 * 9 values.
      NOTE: both arrays are modified in place.

  Returns
  -------
  numpy.ndarray
      The two-channel mask produced by ``np.dstack``.
  """
  for channel in (left, right):
    channel[channel > 0.5] = 1
    channel[channel <= 0.5] = 0

  left_frames = [frame.reshape(2049, 9) for frame in left]
  right_frames = [frame.reshape(2049, 9) for frame in right]

  merged_left = overlap(left_frames)
  merged_right = overlap(right_frames)
  mask = np.dstack((merged_left, merged_right))

  # Averaging overlapping frames gives fractional values; binarise once more.
  mask[mask >= 0.5] = 1
  mask[mask < 0.5] = 0
  return mask




In [12]:
def get_vocal(mask, track):
  """Apply ``mask`` to the mixture's STFT magnitude and resynthesise audio.

  The mixture STFT is computed once and reused for both magnitude and phase;
  previously it was computed twice with identical arguments. Uses the
  module-level ``hop_length`` and ``window_length`` with a Hann window.

  Parameters
  ----------
  mask : array-like
      Multiplied element-wise with the magnitude of the mixture STFT, so it
      must be broadcastable to that STFT's shape.
  track : mapping
      Must provide ``track['mix']``, an object exposing ``stft(...)`` and
      ``sample_rate``.

  Returns
  -------
  The result of ``istft`` on a ``nussl.AudioSignal`` built from the masked
  magnitude combined with the mixture phase.
  """
  mix = track['mix']
  mix_stft = mix.stft(hop_length=hop_length,
                      window_length=window_length, window_type='hann')

  masked = mask * np.abs(mix_stft)
  mix_phase = np.angle(mix_stft)

  # Recombine the masked magnitude with the unmodified mixture phase.
  masked_stft = masked * np.exp(1j * mix_phase)
  new_signal = nussl.AudioSignal(stft=masked_stft, sample_rate=mix.sample_rate)
  return new_signal.istft(hop_length=hop_length, window_length=window_length, window_type='hann')

In [13]:
# Load the saved Keras model from Google Drive; later cells call model.predict
# on the normalised frames to obtain the mask predictions.
model = tf.keras.models.load_model('/content/drive/My Drive/Convmodels2/final')

In [14]:
# Run the model on every track from index train_size onward, collecting the
# original mixture, the reference vocals and the re-synthesised estimate.
ori_mixs, reget_vocals, ori_vocals = [], [], []
for i in range(train_size, len(musdb)):
    # Progress counter; a line break follows every index with i % 30 == 29.
    print(i, end=' \n' if i % 30 == 29 else ' ')

    track = musdb[i]
    mix_magnitude = np.abs(track['mix'].stft(hop_length=hop_length,
                                             window_length=window_length,
                                             window_type='hann'))
    mixture_spec = librosa.amplitude_to_db(mix_magnitude)

    ori_vocals.append(track['sources']['vocals'])
    ori_mixs.append(track['mix'])

    # Stride-1 frames per channel, each channel min-max normalised on its own.
    X_left, X_right = (min_max_norm(side)
                       for side in slice_dense(mixture_spec, frame_size))

    mask_left, mask_right = model.predict(X_left), model.predict(X_right)
    mask = merge_masks_dense(mask_left, mask_right)

    # Mask the mixture and wrap the resulting audio in an AudioSignal.
    new_signal = get_vocal(mask, track)
    signal = nussl.AudioSignal(audio_data_array=new_signal, sample_rate=srate)
    reget_vocals.append(signal)

100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 
120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 

In [16]:
# BSSEval v4 scores for every held-out track. SDR is stored as the difference
# between the estimate's SDR and the SDR of the raw mixture scored against the
# same reference; ISR, SIR and SAR are the estimate's own values.
SDR, ISR, SIR, SAR = [], [], [], []

for i, reference in enumerate(ori_vocals):
  bss = nussl.evaluation.BSSEvalV4(reference, reget_vocals[i])
  bss_scores = bss.evaluate()

  bss_mix = nussl.evaluation.BSSEvalV4(reference, ori_mixs[i])
  bss_scores_mix = bss_mix.evaluate()

  # Both score dictionaries are keyed by the reference's file name.
  key = 'musdb/'+reference.file_name
  estimate_scores, mixture_scores = bss_scores[key], bss_scores_mix[key]

  SDR.append(np.array(estimate_scores['SDR'])-np.array(mixture_scores['SDR']))
  ISR.append(np.array(estimate_scores['ISR']))
  SIR.append(np.array(estimate_scores['SIR']))
  SAR.append(np.array(estimate_scores['SAR']))

SDR, ISR, SIR, SAR = (np.array(metric) for metric in (SDR, ISR, SIR, SAR))
for label, metric in (("SDR", SDR), ("ISR", ISR), ("SIR", SIR), ("SAR", SAR)):
  print(label, metric.shape)

SDR (44, 4)
ISR (44, 4)
SIR (44, 4)
SAR (44, 4)


In [17]:
# Summary of SDR over all entries: mean, standard deviation (square root of
# the variance), minimum, maximum and median.
print(np.mean(SDR), np.sqrt(np.var(SDR)), np.min(SDR), np.max(SDR), np.median(SDR))

6.866666056281562 3.7234650126864843 -1.4698862324517954 19.64137173505305 6.214615151573565


In [18]:
# One line per metric: label, mean, standard deviation (sqrt of variance),
# minimum, maximum and median over all entries.
for label, values in (("SDR", SDR), ("ISR", ISR), ("SIR", SIR), ("SAR", SAR)):
  print(label, np.mean(values), np.sqrt(np.var(values)),
        np.min(values), np.max(values), np.median(values))

SDR 6.866666056281562 3.7234650126864843 -1.4698862324517954 19.64137173505305 6.214615151573565
ISR 3.0320095244435534 2.488774166754534 -0.07580086917985134 12.689332894337461 2.33499725464627
SIR inf nan inf inf inf
SAR -2.065170371800676 5.875968354990354 -31.497348829901412 8.58120423692625 -0.9273895846361468


  x = asanyarray(arr - arrmean)


In [19]:
# Inline audio player for the mixture of dataset item 112.
ipd.Audio(musdb[112]['mix'].audio_data, rate=srate)

In [20]:
# Inline audio player for the reference vocals source of dataset item 112.
ipd.Audio(musdb[112]['sources']['vocals'].audio_data, rate=srate)

In [23]:
# Write the 13th estimate collected by the held-out loop to a WAV on Drive.
# NOTE(review): presumably the same track as musdb[112], since the recorded
# loop output starts at index 100 — confirm. The target directory
# ('Convmodels') also differs from the model's ('Convmodels2') — confirm intended.
reget_vocals[12].write_audio_to_file('/content/drive/My Drive/Convmodels/first_CNN.wav')

In [24]:
# Score one held-out example: reference vocals against the model's estimate,
# both at position 12 of the lists built by the held-out loop.
# (nussl.evaluation is already imported in the notebook's import cell, so the
# redundant mid-notebook import was removed.)
bss = nussl.evaluation.BSSEvalV4(ori_vocals[12], reget_vocals[12])
bss_scores = bss.evaluate()

In [25]:
# Dump the raw score dictionary returned by bss.evaluate() for that example.
print(bss_scores)

{'combination': [0], 'permutation': [0], "musdb/Juliet's Rescue - Heartbeats_vocals.wav": {'SDR': [3.284680133458759, 4.09985093868975, 3.6665839419134922, 2.765715627929404], 'ISR': [4.735310932471189, 6.130368782916836, 5.739841621937064, 5.557761416857319], 'SIR': [inf, inf, inf, inf], 'SAR': [1.5214125017933964, 2.8235772681841875, 2.7087497637471296, 2.092970946649527]}}


In [26]:
# Same separation pipeline as for the held-out tracks, applied to dataset
# items 0..9; keeps the reference vocals and the re-synthesised estimates.
train_vocals, ori_train_vocals = [], []
for i in range(10):
    # Progress counter; a line break follows every index with i % 30 == 29.
    print(i, end=' \n' if i % 30 == 29 else ' ')

    track = musdb[i]
    mix_magnitude = np.abs(track['mix'].stft(hop_length=hop_length,
                                             window_length=window_length,
                                             window_type='hann'))
    mixture_spec = librosa.amplitude_to_db(mix_magnitude)

    ori_train_vocals.append(track['sources']['vocals'])

    # Stride-1 frames per channel, each channel min-max normalised on its own.
    X_left, X_right = (min_max_norm(side)
                       for side in slice_dense(mixture_spec, frame_size))

    mask_left, mask_right = model.predict(X_left), model.predict(X_right)
    mask = merge_masks_dense(mask_left, mask_right)

    # Mask the mixture and wrap the resulting audio in an AudioSignal.
    new_signal = get_vocal(mask, track)
    signal = nussl.AudioSignal(audio_data_array=new_signal, sample_rate=srate)
    train_vocals.append(signal)

0 1 2 3 4 5 6 7 8 9 

In [27]:
# BSSEval v4 on items 0..9: store each track's mean score per metric, then
# print the mean of those per-track means.
tSDR, tISR, tSIR, tSAR = [], [], [], []

for i, reference in enumerate(ori_train_vocals):
  bss = nussl.evaluation.BSSEvalV4(reference, train_vocals[i])
  bss_scores = bss.evaluate()

  # The score dictionary is keyed by the reference's file name.
  track_scores = bss_scores['musdb/'+reference.file_name]
  tSDR.append(np.mean(track_scores['SDR']))
  tISR.append(np.mean(track_scores['ISR']))
  tSIR.append(np.mean(track_scores['SIR']))
  tSAR.append(np.mean(track_scores['SAR']))

for label, per_track in (("SDR", tSDR), ("ISR", tISR), ("SIR", tSIR), ("SAR", tSAR)):
  print(label, np.mean(per_track))

SDR 5.803688101655602
ISR 10.785131694966516
SIR inf
SAR 5.505044929534519


In [28]:
# Inline audio player for the mixture of dataset item 8.
ipd.Audio(musdb[8]['mix'].audio_data, rate=srate)

In [29]:
# Embed a player for the reference vocals source of dataset item 8.
musdb[8]['sources']['vocals'].embed_audio()

In [31]:
# Embed a player for the model's estimate at position 8 of train_vocals
# (built from dataset items 0..9, so it pairs with dataset item 8).
train_vocals[8].embed_audio()