<a href="https://colab.research.google.com/github/shreya0505/MusicalSourceSeparation/blob/master/UnetAudioSeparator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mount Drive**

In [None]:
# Mount Google Drive: every path in model_config below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Imports**

In [None]:
pip install tensorflow==1.8



In [None]:
import tensorflow as tf
# Print the active version to confirm the tensorflow==1.8 pin took effect.
print(tf.__version__)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


1.8.0


In [None]:
pip install FFProbe



In [None]:
pip install ffmpeg



In [None]:
pip install ffprobe



In [None]:
pip install stempeg



In [None]:
pip install musdb



In [None]:
pip install soundfile



In [None]:
import glob2 as glob
import os
import random
import numpy as np
import librosa
from multiprocessing import Process
from lxml import etree
import soundfile as sf
import musdb
import functools
import csv 
import ffmpeg
import stempeg
from tensorflow.contrib.signal.python.ops import window_ops


# **Config**

In [None]:
 model_config = {"musdb_path" : "/content/drive/My Drive/BTP/Dataset", # SET MUSDB PATH HERE, AND SET CCMIXTER PATH IN CCMixter.xml
                 "directory" : "/content/drive/My Drive/BTP/Dataset/partitions", # Folder in which get_dataset writes and reads the per-partition TFRecords
                    "estimates_path" : "/content/drive/My Drive/BTP/Dataset/source_estimates", # SET THIS PATH TO WHERE YOU WANT SOURCE ESTIMATES PRODUCED BY THE TRAINED MODEL TO BE SAVED. Folder itself must exist!
                    "data_path" : "/content/drive/My Drive/BTP/Dataset/data", # Set this to where the preprocessed dataset should be saved
                    "model_base_dir" : "/content/drive/My Drive/BTP/Dataset/checkpoints", # Base folder for model checkpoints
                    "log_dir" : "/content/drive/My Drive/BTP/Dataset/logs", # Base folder for logs files
                    "batch_size" : 16, # Batch size
                    "init_sup_sep_lr" : 1e-4, # Supervised separator learning rate
                    "epoch_it" : 0, # Number of supervised separator steps per epoch. NOTE(review): set to 0 here - confirm it is overridden before training
                    'cache_size': 4000, # Number of audio snippets buffered in the random shuffle queue. Larger is better, since workers put multiple examples of one song into this queue. The number of different songs that is sampled from with each batch equals cache_size / num_snippets_per_track. Set as high as your RAM allows.
                    'num_workers' : 4, # Number of processes used for each TF map operation used when loading the dataset
                    "num_snippets_per_track" : 100, # Number of snippets that should be extracted from each song at a time after loading it. Higher values make data loading faster, but can reduce the batches song diversity
                    'num_layers' : 12, # How many U-Net layers
                    'filter_size' : 15, # For Wave-U-Net: Filter size of conv in downsampling block
                    'merge_filter_size' : 5, # For Wave-U-Net: Filter size of conv in upsampling block
                    'input_filter_size' : 15, # For Wave-U-Net: Filter size of first convolution in first downsampling block
                    'output_filter_size': 1, # For Wave-U-Net: Filter size of convolution in the output layer
                    'num_initial_filters' : 24, # Number of filters for convolution in first layer of network
                    "num_frames": 16384, # DESIRED number of time frames in the output waveform per samples (could be changed when using valid padding)
                    'expected_sr': 22050,  # Downsample all audio input to this sampling rate
                    'mono_downmix': True,  # Whether to downmix the audio input to mono (True: 1 channel, False: 2 channels)
                    'output_type' : 'direct', # Type of output layer, either "direct" or "difference". Direct output: Each source is result of tanh activation and independent. Difference: Last source output is equal to mixture input - sum(all other sources)
                    'output_activation' : 'tanh', # Activation function for output layer. "tanh" or "linear". Linear output involves clipping to [-1,1] at test time, and might be more stable than tanh
                    'context' : False, # Type of padding for convolutions in separator. If False, feature maps double or half in dimensions after each convolution, and convolutions are padded with zeros ("same" padding). If True, convolution is only performed on the available mixture input, thus the output is smaller than the input
                    'network' : 'unet', # Type of network architecture, either unet (our model) or unet_spectrogram (Jansson et al 2017 model)
                    'upsampling' : 'linear', # Type of technique used for upsampling the feature maps in a unet architecture, either 'linear' interpolation or 'learned' filling in of extra samples
                    'task' : 'voice', # Type of separation task. 'voice' : Separate music into voice and accompaniment. 'multi_instrument': Separate music into guitar, bass, vocals, drums and other (Sisec)
                    'augmentation' : True, # Random attenuation of source signals to improve generalisation performance (data augmentation)
                    'raw_audio_loss' : True, # Only active for unet_spectrogram network. True: L2 loss on audio. False: L1 loss on spectrogram magnitudes for training and validation and test loss
                    'worse_epochs' : 20, # Patience for early stopping on validation set
                    # NOTE(review): "source_names" is read by write_records, get_dataset and UnetAudioSeparator.__init__
                    # but is not set in this dict - confirm it is added before any of those are called.
                    }


In [None]:
# Draw a random experiment id uniformly from [0, 1000000).
experiment_id = np.random.randint(low=0, high=1000000)

In [None]:
def baseline():
    """
    Announce the baseline variant, which uses the default configuration unchanged.

    :return: Empty dict of config overrides, so the result can be applied with ``model_config.update(...)``
    """
    print("Training baseline model")
    return {}

def baseline_diff():
    """
    Announce the difference-output baseline and return its config overrides.

    The overrides are returned (instead of being bound to a discarded local) so the
    caller can apply them, e.g. ``model_config.update(baseline_diff())``.

    :return: Dict of config keys to override
    """
    print("Training baseline model with difference output")
    return {
        "output_type" : "difference"
    }
def baseline_context():
    """
    Announce the difference-output + input-context variant and return its config overrides.

    The overrides are returned (instead of being bound to a discarded local) so the
    caller can apply them, e.g. ``model_config.update(baseline_context())``.

    :return: Dict of config keys to override
    """
    print("Training baseline model with difference output and input context (valid convolutions)")
    return {
        "output_type" : "difference",
        "context" : True
    }

def baseline_stereo():
    """
    Announce the stereo variant (difference output, input context) and return its config overrides.

    The overrides are returned (instead of being bound to a discarded local) so the
    caller can apply them, e.g. ``model_config.update(baseline_stereo())``.

    :return: Dict of config keys to override
    """
    print("Training baseline model with difference output and input context (valid convolutions) and stereo input/output")
    return {
        "output_type" : "difference",
        "context" : True,
        "mono_downmix" : False
    }
def full():
    """
    Announce the full singing-voice model (stereo, input context, learned upsampling) and return its config overrides.

    The overrides are returned (instead of being bound to a discarded local) so the
    caller can apply them, e.g. ``model_config.update(full())``.

    :return: Dict of config keys to override
    """
    print("Training full singing voice separation model, with difference output and input context (valid convolutions) and stereo input/output, and learned upsampling layer")
    return {
        "output_type" : "difference",
        "context" : True,
        "upsampling": "learned",
        "mono_downmix" : False
    }
def full_44KHz():
    """
    Announce the full singing-voice model at 44.1 KHz and return its config overrides.

    The overrides are returned (instead of being bound to a discarded local) so the
    caller can apply them, e.g. ``model_config.update(full_44KHz())``.

    :return: Dict of config keys to override
    """
    print("Training full singing voice separation model, with difference output and input context (valid convolutions) and stereo input/output, and learned upsampling layer, and 44.1 KHz sampling rate")
    return {
        "output_type" : "difference",
        "context" : True,
        "upsampling": "learned",
        "mono_downmix" : False,
        "expected_sr" : 44100
    }
def baseline_context_smallfilter_deep():
    """
    Return the config overrides for the deeper, small-filter context variant.

    The overrides are returned (instead of being bound to a discarded local) so the
    caller can apply them, e.g. ``model_config.update(baseline_context_smallfilter_deep())``.

    :return: Dict of config keys to override
    """
    return {
        "output_type": "difference",
        "context": True,
        "num_layers" : 14,
        "duration" : 7,
        "filter_size" : 5,
        "merge_filter_size" : 1
    }

def full_multi_instrument():
    """
    Announce the multi-instrument separation variant and return its config overrides.

    The overrides are returned (instead of being bound to a discarded local) so the
    caller can apply them, e.g. ``model_config.update(full_multi_instrument())``.

    :return: Dict of config keys to override
    """
    print("Training multi-instrument separation with best model")
    return {
        "output_type": "difference",
        "context": True,
        "upsampling": "linear",
        "mono_downmix": False,
        "task" : "multi_instrument"
    }

def baseline_comparison():
    """
    Return the config overrides for the waveform model used for comparison at 8192 Hz.

    The overrides are returned (instead of being bound to a discarded local) so the
    caller can apply them, e.g. ``model_config.update(baseline_comparison())``.

    :return: Dict of config keys to override
    """
    return {
        "batch_size": 4, # Smaller batch since the model is so big. Doesn't matter since the model's output is not dependent on its output or input size (only convolutions)

        "output_type": "difference",
        "context": True,
        "num_frames" : 768*127 + 1024,
        "duration" : 13,
        "expected_sr" : 8192,
        "num_initial_filters" : 34
    }

def unet_spectrogram():
    """
    Return the config overrides for the spectrogram U-Net variant.

    The overrides are returned (instead of being bound to a discarded local) so the
    caller can apply them, e.g. ``model_config.update(unet_spectrogram())``.

    :return: Dict of config keys to override
    """
    return {
        "batch_size": 4, # Smaller batch since the model is so big.

        "network" : "unet_spectrogram",
        "num_layers" : 6,
        "expected_sr" : 8192,
        "num_frames" : 768 * 127 + 1024, # hop_size * (time_frames_of_spectrogram_input - 1) + fft_length
        "duration" : 13,
        "num_initial_filters" : 16
    }
def unet_spectrogram_l1():
    """
    Return the config overrides for the spectrogram U-Net variant with ``raw_audio_loss`` disabled.

    The overrides are returned (instead of being bound to a discarded local) so the
    caller can apply them, e.g. ``model_config.update(unet_spectrogram_l1())``.

    :return: Dict of config keys to override
    """
    return {
        "batch_size": 4, # Smaller batch since the model is so big.

        "network" : "unet_spectrogram",
        "num_layers" : 6,
        "expected_sr" : 8192,
        "num_frames" : 768 * 127 + 1024, # hop_size * (time_frames_of_spectrogram_input - 1) + fft_length
        "duration" : 13,
        "num_initial_filters" : 16,
        "raw_audio_loss" : False
    }


# **Utils**

In [None]:
def getTrainableVariables(tag=""):
    """
    Collect the trainable TensorFlow variables whose name contains ``tag``.

    :param tag: Substring to look for in each variable name; the default "" matches every variable
    :return: List of matching variables, in the order tf.trainable_variables() reports them
    """
    selected = []
    for variable in tf.trainable_variables():
        if tag in variable.name:
            selected.append(variable)
    return selected

In [None]:
def getNumParams(tensors):
    """
    Count the total number of entries across the given tensors.

    :param tensors: Iterable of objects exposing get_shape().as_list()
    :return: Sum over all tensors of the product of their dimensions
    """
    sizes = []
    for tensor in tensors:
        dims = tensor.get_shape().as_list()
        sizes.append(np.prod(dims))
    return np.sum(sizes)

In [None]:
def crop_and_concat(x1,x2, match_feature_dim=True):
    '''
    Copy-and-crop step for two feature maps of different width.
    x1 is cropped (via crop) to the shape of x2 and the two are then concatenated along axis 2.
    :param x1: Feature map that gets cropped and placed first in the concatenation
    :param x2: Feature map providing the target shape; if None, x1 is returned untouched
    :param match_feature_dim: Forwarded to crop
    :return: Concatenated feature map, or x1 when x2 is None
    '''
    if x2 is not None:
        target_shape = x2.get_shape().as_list()
        x1_cropped = crop(x1, target_shape, match_feature_dim)
        return tf.concat([x1_cropped, x2], axis=2)
    return x1


In [None]:
def random_amplify(sample):
    '''
    Scales every source signal by an independent random gain drawn from [0.7, 1.0)
    and rebuilds the "mix" entry as the sum of the scaled sources. Modifies sample in place.
    :param sample: Dict of signals; every key other than "mix" is treated as a source
    :return: The same dict, with scaled sources and recomputed "mix"
    '''
    source_keys = [name for name in list(sample.keys()) if name != "mix"]
    for name in source_keys:
        gain = tf.random_uniform([], 0.7, 1.0)
        sample[name] = gain * sample[name]

    sample["mix"] = tf.add_n([sample[name] for name in source_keys])
    return sample


In [None]:
def crop_sample(sample, crop_frames):
    '''
    Removes crop_frames entries from both ends of axis 0 of every entry except "mix". Modifies sample in place.
    :param sample: Dict of 2D signals
    :param crop_frames: Number of frames to cut at the start and at the end; nothing is cut when it is not positive
    :return: The same dict
    '''
    if crop_frames > 0:
        for key in list(sample.keys()):
            if key == "mix":
                continue
            sample[key] = sample[key][crop_frames:-crop_frames,:]
    return sample

In [None]:
def pad_freqs(tensor, target_shape):
    '''
    Pads the frequency axis of a 4D tensor of shape [batch_size, freqs, timeframes, channels] or 2D tensor [freqs, timeframes] with zeros
    so that it reaches the target shape. If the number of frequencies to pad is uneven, the extra row is appended at the end.
    :param tensor: Input tensor (tf.Tensor or numpy array) to pad with zeros along the frequency axis
    :param target_shape: Shape of tensor after zero-padding (2 or 4 entries)
    :return: Padded tensor, of the same kind (tf.Tensor or numpy array) as the input
    '''
    target_freqs = target_shape[1] if len(target_shape) == 4 else target_shape[0]

    is_tf_tensor = isinstance(tensor, tf.Tensor)
    input_shape = tensor.get_shape().as_list() if is_tf_tensor else tensor.shape
    input_freqs = input_shape[0] if len(input_shape) == 2 else input_shape[1]

    diff = target_freqs - input_freqs
    # Integer division on purpose: "/" yields floats in Python 3, which are not valid pad widths.
    pad_before = diff // 2
    pad_after = diff - pad_before # For an uneven diff the extra frequency bin goes at the end
    freq_pad = [(pad_before, pad_after)]

    if len(target_shape) == 2:
        pad = freq_pad + [(0,0)]
    else:
        pad = [(0,0)] + freq_pad + [(0,0), (0,0)]

    if is_tf_tensor:
        return tf.pad(tensor, pad, mode='constant', constant_values=0.0)
    return np.pad(tensor, pad, mode='constant', constant_values=0.0)


In [None]:
def LeakyReLU(x, alpha=0.2):
    """
    Leaky rectifier: element-wise maximum of alpha*x and x.

    :param x: Input tensor
    :param alpha: Factor applied to x for the first argument of the maximum
    :return: tf.maximum(alpha*x, x)
    """
    leak = alpha*x
    return tf.maximum(leak, x)

In [None]:
def AudioClip(x, training):
    '''
    Identity during training; outside of training the input is limited to the range [-1,1].
    :param x: Input tensor
    :param training: Truthy to pass x through unchanged, falsy to clip it
    :return: x itself, or x clipped to [-1,1]
    '''
    if not training:
        upper_bounded = tf.minimum(x, 1.0)
        return tf.maximum(upper_bounded, -1.0)
    return x

In [None]:
def resample(audio, orig_sr, new_sr):
    """
    Resample audio with librosa.

    The input is transposed before and after the librosa call, so the caller's axis layout is preserved.

    :param audio: Audio array; its transpose is what librosa.resample receives
    :param orig_sr: Sampling rate of the input
    :param new_sr: Desired sampling rate
    :return: Resampled audio, in the same axis layout as the input
    """
    # Rates are passed by keyword: recent librosa releases no longer accept them positionally.
    return librosa.resample(audio.T, orig_sr=orig_sr, target_sr=new_sr).T

In [None]:
def load(path, sr=22050, mono=True, offset=0.0, duration=None, dtype=np.float32):
    """
    Load an audio file with librosa and always return it as a 2D (n_frames, n_channels) array.

    :param path: Path of the audio file
    :param sr: Sampling rate forwarded to librosa.load
    :param mono: Forwarded to librosa.load
    :param offset: Forwarded to librosa.load
    :param duration: Forwarded to librosa.load
    :param dtype: Forwarded to librosa.load
    :return: Tuple (audio, rate): audio with a channel axis added when librosa returned a 1D signal,
             and the sampling rate reported by librosa.load
    """
    # Options are passed by keyword: recent librosa releases accept only the path positionally.
    y, loaded_sr = librosa.load(path, sr=sr, mono=mono, offset=offset, duration=duration, dtype=dtype)
    if y.ndim == 1:
        y = np.expand_dims(y, axis=0)
    return y.T, loaded_sr

In [None]:
def crop(tensor, target_shape, match_feature_dim=True):
    '''
    Centre-crops a 3D tensor along its second axis (axis 1) so that its size there equals target_shape[1].
    If the number of surplus entries is uneven, one more entry is removed at the end than at the start.
    :param tensor: 3D tensor exposing get_shape().as_list() and slicing
    :param target_shape: Target shape (3 entries). Axis 0 must match the tensor; axis 1 must not be larger than the tensor's
    :param match_feature_dim: If True, axis 2 must match as well
    :return: Cropped tensor, or the input itself when no cropping is needed
    '''
    current_shape = np.array(tensor.get_shape().as_list())
    surplus = current_shape - np.array(target_shape)
    # Only axis 1 is allowed to differ (axis 2 too when match_feature_dim is False)
    assert(surplus[0] == 0 and (surplus[2] == 0 or not match_feature_dim))
    width_surplus = surplus[1]
    if (width_surplus % 2 != 0):
        print("WARNING: Cropping with uneven number of extra entries on one side")
    assert width_surplus >= 0 # Cannot crop to something larger
    if width_surplus == 0:
        return tensor

    front = width_surplus // 2
    back = width_surplus - front
    return tensor[:,front:-back,:]


In [None]:
def spectrogramToAudioFile(magnitude, fftWindowSize, hopSize, phaseIterations=10, phase=None, length=None):
    '''
    Computes an audio signal from the given magnitude spectrogram, and optionally an initial phase.
    Without a phase, or with a phase and phaseIterations > 0, the signal is produced by reconPhase (Griffin-Lim).
    With a phase and phaseIterations <= 0, the phase is used as-is in a single inverse STFT.
    :param magnitude: Magnitudes to be converted to audio
    :param fftWindowSize: Size of FFT window used to create magnitudes
    :param hopSize: Hop size in frames used to create magnitudes
    :param phaseIterations: Number of Griffin-Lim iterations to recover phase
    :param phase: If given, starts ISTFT with this particular phase matrix
    :param length: If given, audio signal is clipped/padded to this number of frames
    :return: Audio signal
    '''
    if phase is None:
        # length is forwarded here as well, so the output length is honoured with and without a phase
        return reconPhase(magnitude, fftWindowSize, hopSize, phaseIterations, length=length)

    if phaseIterations > 0:
        # Refine audio given initial phase with a number of iterations
        return reconPhase(magnitude, fftWindowSize, hopSize, phaseIterations, phase, length)

    # Rebuild the complex matrix directly: magnitude * e^(j*phase)
    stftMatrix = magnitude * np.exp(phase * 1j)
    return librosa.istft(stftMatrix, hop_length=hopSize, length=length)


In [None]:

def reconPhase(magnitude, fftWindowSize, hopSize, phaseIterations=10, initPhase=None, length=None):
    '''
    Griffin-Lim algorithm for reconstructing the phase for a given magnitude spectrogram, optionally with a given
    initial phase.
    :param magnitude: Magnitudes to be converted to audio
    :param fftWindowSize: Size of FFT window used to create magnitudes
    :param hopSize: Hop size in frames used to create magnitudes
    :param phaseIterations: Number of Griffin-Lim iterations to recover phase (at least 1)
    :param initPhase: If given, starts reconstruction with this particular phase matrix
    :param length: If given, audio signal is clipped/padded to this number of frames
    :return: Audio signal produced by the last iteration
    :raises ValueError: If phaseIterations is smaller than 1, since no audio would be produced
    '''
    if phaseIterations < 1:
        raise ValueError("phaseIterations must be at least 1, got " + str(phaseIterations))

    if initPhase is None:
        # Random complex matrix; only its angle is used below
        reconstruction = np.random.random_sample(magnitude.shape) + 1j * (2 * np.pi * np.random.random_sample(magnitude.shape) - np.pi)
    else:
        reconstruction = np.exp(initPhase * 1j) # e^(j*phase), so that angle => phase

    for i in range(phaseIterations):
        if i > 0:
            # Re-analyse the previous estimate to get a phase consistent with it.
            # Keyword arguments: recent librosa releases accept only the signal positionally.
            reconstruction = librosa.stft(audio, n_fft=fftWindowSize, hop_length=hopSize)
        spectrum = magnitude * np.exp(1j * np.angle(reconstruction))
        # Only the final synthesis is forced to the requested length
        is_last = i == phaseIterations - 1
        audio = librosa.istft(spectrum, hop_length=hopSize, length=length if is_last else None)
    return audio

# **Dataset**

In [None]:
def take_random_snippets(sample, keys, input_shape, num_samples):
    """
    Extract num_samples snippets from a sample at random start positions.

    Start positions are drawn with tf.random_uniform from [0, sample["length"] - input_shape[0]).

    :param sample: Dict of audio signals plus a "length" entry
    :param keys: Keys of the signals to extract snippets from
    :param input_shape: Snippet shape; input_shape[0] is the snippet length
    :param num_samples: Number of snippets to draw
    :return: Result of take_snippets_at_pos for the drawn positions
    """
    snippet_len = input_shape[0]
    start_limit = sample["length"] - snippet_len
    start_pos = tf.random_uniform([num_samples], minval=0, maxval=start_limit, dtype=tf.int64)
    return take_snippets_at_pos(sample, keys, start_pos, input_shape, num_samples)


In [None]:
def take_all_snippets(sample, keys, input_shape, output_shape):
    """
    Extract snippets covering a sample at regular start positions.

    Start positions run from 0 up to (excluding) sample["length"] - input_shape[0], in steps of output_shape[0].

    :param sample: Dict of audio signals plus a "length" entry
    :param keys: Keys of the signals to extract snippets from
    :param input_shape: Snippet shape; input_shape[0] is the snippet length
    :param output_shape: output_shape[0] is used as the hop between consecutive snippets
    :return: Result of take_snippets_at_pos for these positions
    """
    start_limit = sample["length"] - input_shape[0]
    hop = output_shape[0]
    start_pos = tf.range(0, start_limit, delta=hop, dtype=tf.int64)
    return take_snippets_at_pos(sample, keys, start_pos, input_shape, start_pos.shape[0])


In [None]:
def take_snippets_at_pos(sample, keys, start_pos, input_shape, num_samples):
    """
    Cut snippets of input_shape[0] frames out of each selected signal at the given start positions.

    :param sample: Dict of 2D audio signals
    :param keys: Keys of the signals to cut snippets from
    :param start_pos: Start positions, one per snippet
    :param input_shape: (snippet length, channels) used for slicing and for the declared static shape
    :param num_samples: Number of snippets, used for the declared static shape
    :return: tf.data.Dataset built with from_tensor_slices over a dict mapping key -> stacked snippets
    """
    batch = {}
    for key in keys:
        def extract_snippet(pos):
            return sample[key][pos:pos + input_shape[0], :]

        snippets = tf.map_fn(extract_snippet, start_pos, dtype=tf.float32)
        # map_fn does not carry a full static shape, so declare it explicitly
        snippets.set_shape([num_samples, input_shape[0], input_shape[1]])
        batch[key] = snippets

    return tf.data.Dataset.from_tensor_slices(batch)

In [None]:
def _floats_feature(value):
  """Returns a float_list Feature holding the flattened (reshape(-1)) contents of value."""
  flat_values = value.reshape(-1)
  float_list = tf.train.FloatList(value=flat_values)
  return tf.train.Feature(float_list=float_list)


In [None]:
def _int64_feature(value):
  """Returns an int64_list Feature wrapping a single bool / enum / int / uint value."""
  int_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int_list)


In [None]:
def write_records(sample_list, model_config, input_shape, output_shape, records_path):
    '''
    Writes the samples in the given list as TFRecords, using the current model config and in/output shapes.
    Every track is loaded, zero-padded at both ends by half the difference between input and output width,
    flattened and stored together with its length and channel count.
    Songs whose audio cannot be loaded are reported and skipped.
    :param sample_list: List of dicts mapping each source name and "mix" to an audio file path
    :param model_config: Model config; "source_names", "expected_sr" and "mono_downmix" are read
    :param input_shape: Input shape of network; index 1 is used as the input width
    :param output_shape: Output shape of network; index 1 is used as the output width
    :param records_path: Path prefix of the output file(s); "<index>.tfrecords" is appended
    :raises AssertionError: If the tracks of one song differ in length or channel count
    '''
    # Compute padding
    if (input_shape[1] - output_shape[1]) % 2 != 0:
        print("WARNING: Required number of padding of " + str(input_shape[1] - output_shape[1]) + " is uneven!")
    pad_frames = (input_shape[1] - output_shape[1]) // 2

    # Set up writers
    num_writers = 1
    writers = [tf.python_io.TFRecordWriter(records_path + str(i) + ".tfrecords") for i in range(num_writers)]

    # Everything after opening the writers runs under try/finally so they are closed even if a song fails hard
    try:
        # Go through songs and write them to TFRecords
        all_keys = model_config["source_names"] + ["mix"]
        for sample in sample_list:
            print("Reading song")
            try:
                audio_tracks = dict()

                for key in all_keys:
                    audio, _ = load(sample[key], sr=model_config["expected_sr"], mono=model_config["mono_downmix"])

                    if not model_config["mono_downmix"] and audio.shape[1] == 1:
                        print("WARNING: Had to duplicate mono track to generate stereo")
                        audio = np.tile(audio, [1, 2])

                    audio_tracks[key] = audio
            except Exception as e:
                # Deliberate best effort: one unreadable song must not abort the whole partition
                print(e)
                print("ERROR occurred during loading file " + str(sample) + ". Skipping")
                continue

            # Pad at beginning and end with zeros
            audio_tracks = {key : np.pad(track, [(pad_frames, pad_frames), (0, 0)], mode="constant", constant_values=0.0)
                            for key, track in audio_tracks.items()}

            # All audio tracks must be exactly same length and channels
            length = audio_tracks["mix"].shape[0]
            channels = audio_tracks["mix"].shape[1]
            for audio in audio_tracks.values():
                assert(audio.shape[0] == length)
                assert (audio.shape[1] == channels)

            # Write to TFrecords the flattened version
            feature = {key: _floats_feature(audio_tracks[key]) for key in all_keys}
            feature["length"] = _int64_feature(length)
            feature["channels"] = _int64_feature(channels)
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writers[np.random.randint(0, num_writers)].write(example.SerializeToString())
    finally:
        for writer in writers:
            writer.close()


In [None]:
def parse_record(example_proto, source_names, shape):
    """
    Parse one serialized example from a TFRecord file back into per-track 2D signals.

    :param example_proto: Serialized example to parse
    :param source_names: List of source names stored in the record; "mix" is added automatically
    :param shape: Shape whose last entry is used as the channel count for reshaping
                  (the "channels" value stored in the record is parsed but not used)
    :return: Dict mapping every track name to a [length, channels] tensor, plus "length" and "channels"
    """
    all_names = source_names + ["mix"]

    # Feature spec: one variable-length float sequence per track, plus two scalar ints
    features = dict()
    for key in all_names:
        features[key] = tf.FixedLenSequenceFeature([], allow_missing=True, dtype=tf.float32)
    features["length"] = tf.FixedLenFeature([], tf.int64)
    features["channels"] = tf.FixedLenFeature([], tf.int64)

    parsed = tf.parse_single_example(example_proto, features)

    # Undo the flattening done when the record was written
    length = tf.cast(parsed["length"], tf.int64)
    channels = tf.constant(shape[-1], tf.int64)
    sample = {key: tf.reshape(parsed[key], tf.stack([length, channels])) for key in all_names}
    sample["length"] = length
    sample["channels"] = channels

    return sample

In [None]:
def getMUSDBHQ(directory):
    '''
    Unpacks the stem files found in <directory>/fake_train and <directory>/fake_test into one WAV file per stem
    (plus an "accompaniment" file: drums + bass + others, clipped to [-1,1]) under <directory>/separated/<subset>/<track>.
    Existing WAV files are not rewritten.
    Also writes <directory>/outputs/getMUSDBHQ.csv (recreated on every call, one row of file paths per track) and appends
    to <directory>/outputs/separation_info.csv the max/mean absolute value of mix - accompaniment - vocals for every
    track whose accompaniment file had to be created.
    :param directory: Base folder containing the "fake_train" and "fake_test" subfolders
    :return: List [train_samples, test_samples]; each entry is a list of dicts with the keys
             "mix", "drums", "bass", "others", "vocals", "accompaniment" (file paths) and "track" (track name)
    '''
    csv_fields = ['track', 'mix', 'drums', 'bass', 'others' ,'vocals', 'accompaniment']
    # Order must match the stem index used for audio[i] below
    stem_names = ["mix", "drums", "bass", "others", "vocals"]

    # Make sure the folder for both CSV files exists before opening them
    outputs_dir = os.path.join(directory, "outputs")
    os.makedirs(outputs_dir, exist_ok=True)
    output_path = os.path.join(outputs_dir, "getMUSDBHQ.csv")
    log_path = os.path.join(outputs_dir, "separation_info.csv")

    # The path index is rebuilt from scratch on every call ('w' truncates an existing file)
    with open(output_path, 'w', newline='') as file:
        csv.DictWriter(file, fieldnames=csv_fields).writeheader()

    # The deviation log is kept across calls; only a missing file gets a header
    if not os.path.exists(log_path):
        with open(log_path, 'w', newline='') as file:
            csv.writer(file).writerow(['Track Name', 'Max abs Deviation', 'Mean abs Deviation'])

    subsets = list()
    for subset in ["fake_train", "fake_test"]:
        print("\n\nLoading " + subset + " set...")
        subset_path = os.path.join(directory, subset)
        samples = list()

        for track in os.listdir(subset_path):
            print("\nProcessing track\t\t\t", track)
            audio, rate = stempeg.read_stems(os.path.join(subset_path, track))

            # Drops the last 9 characters of the file name (presumably a ".stem.mp4" suffix - confirm)
            track_name = track[:-9]
            target_path = os.path.join(directory, "separated", subset, track_name)
            # makedirs: the intermediate "separated/<subset>" folders may not exist yet
            os.makedirs(target_path, exist_ok=True)

            example = dict()
            for i, stem in enumerate(stem_names):
                stem_path = os.path.join(target_path, stem + ".wav")
                example[stem] = stem_path
                if not os.path.exists(stem_path):
                    sf.write(stem_path, audio[i], rate)

            acc_path = os.path.join(target_path, "accompaniment.wav")
            if not os.path.exists(acc_path):
                acc_audio = np.clip(audio[1] + audio[2] + audio[3], -1.0, 1.0)
                sf.write(acc_path, acc_audio, rate)

                # How far the mix is from accompaniment + vocals
                diff_signal = np.abs(audio[0] - acc_audio - audio[4])
                with open(log_path, 'a', newline='') as csvfile:
                    csv.writer(csvfile, delimiter=',').writerow([track_name, np.max(diff_signal), np.mean(diff_signal)])

            example["accompaniment"] = acc_path
            # The track name is part of the returned dict as well as of the CSV row
            example["track"] = track_name

            with open(output_path, 'a', newline='') as file:
                csv.DictWriter(file, fieldnames=csv_fields).writerow(example)

            samples.append(example)

        subsets.append(samples)

    return subsets



In [None]:
def get_path(db_path, instrument_node):
    """
    Build the file path of an instrument from its XML node.

    :param db_path: Base path that is prepended
    :param instrument_node: Node whose first "./relativeFilepath" xpath match provides the relative path as text
    :return: db_path, the OS path separator and the relative path, concatenated
    """
    relative_node = instrument_node.xpath("./relativeFilepath")[0]
    relative_path = relative_node.text
    return db_path + os.path.sep + relative_path

In [None]:
def get_dataset(model_config, input_shape, output_shape, partition):

    '''
    For a model configuration and input/output shapes of the network, get the corresponding dataset for a given partition.
    If the folder model_config["directory"] does not exist yet, the dataset is first prepared: the tracks are unpacked
    with getMUSDBHQ, split into train/valid/test and written as TFRecords by parallel worker processes.
    The folder is created before any record is written, so an interrupted preparation is NOT repeated on the next call.
    :param model_config: Model config
    :param input_shape: Input shape of network
    :param output_shape: Output shape of network
    :param partition: "train", "valid", or "test" partition
    :return: Tensorflow dataset object yielding batches of model_config["batch_size"] snippets
    '''

    # Check if pre-processed dataset is already available (only the existence of the folder is checked)
    main_folder = model_config["directory"]
    if not os.path.exists(main_folder):
        os.makedirs(main_folder)
        # We have to prepare the MUSDB dataset
        print("Preparing MUSDB dataset! This could take a while...")
        dsd_train, dsd_test = getMUSDBHQ(model_config["musdb_path"])  # Two lists (train, test) of per-track dicts of audio file paths

        # Pick 1 random song for validation from the train set.
        # NOTE(review): no random seed is set in the visible code, so the selection may differ between runs - confirm.
        val_idx = np.random.choice(len(dsd_train), size=1, replace=False)
        train_idx = [i for i in range(len(dsd_train)) if i not in val_idx]
        print("Validation with MUSDB training songs no. " + str(val_idx))

        # Draw randomly from datasets
        dataset = dict()
        dataset["train"] = [dsd_train[i] for i in train_idx]
        dataset["valid"] = [dsd_train[i] for i in val_idx]
        dataset["test"] = dsd_test


        # Convert audio files into TFRecords now

        # The dataset structure is a dictionary with "train", "valid", "test" keys, whose entries are lists, where each element represents a song.
        # Each song is represented as a dictionary of audio file paths, keyed by track name.

        # Number of writer processes started per partition
        num_cores = 8

        for curr_partition in ["train", "valid", "test"]:
            print("Writing " + curr_partition + " partition...")

            # Shuffle sample order
            sample_list = dataset[curr_partition]
            random.shuffle(sample_list)

            # Create folder
            partition_folder = os.path.join(main_folder, curr_partition)
            os.makedirs(partition_folder)

            # Songs per process, rounded up; trailing processes may receive an empty list
            part_entries = int(np.ceil(float(len(sample_list) / float(num_cores))))
            processes = list()
            for core in range(num_cores):
                train_filename = os.path.join(partition_folder, str(core) + "_")  # path prefix for this process' TFRecords file
                sample_list_subset = sample_list[core * part_entries:min((core + 1) * part_entries, len(sample_list))]
                proc = Process(target=write_records,
                               args=(sample_list_subset, model_config, input_shape, output_shape, train_filename))
                proc.start()
                processes.append(proc)
            # Wait until every process has finished writing this partition
            for p in processes:
                p.join()

    print("Dataset ready!")
    # Finally, load TFRecords dataset based on the desired partition
    dataset_folder = os.path.join(main_folder, partition)


    records_files = glob.glob(os.path.join(dataset_folder, "*.tfrecords"))
    random.shuffle(records_files)
    dataset = tf.data.TFRecordDataset(records_files)
    dataset = dataset.map(lambda x : parse_record(x, model_config["source_names"], input_shape[1:]), num_parallel_calls=model_config["num_workers"])
    dataset = dataset.prefetch(10)

    # Training: random snippets from each song. Otherwise: snippets at regular positions over each song
    if partition == "train":
        dataset = dataset.flat_map(lambda x : take_random_snippets(x, model_config["source_names"] + ["mix"], input_shape[1:], model_config["num_snippets_per_track"]))
    else:
        dataset = dataset.flat_map(lambda x : take_all_snippets(x, model_config["source_names"] + ["mix"], input_shape[1:], output_shape[1:]))
    dataset = dataset.prefetch(100)

    if partition == "train" and model_config["augmentation"]: # If its the train partition, activate data augmentation if desired
            dataset = dataset.map(random_amplify, num_parallel_calls=model_config["num_workers"]).prefetch(100)

    # Cut source outputs to centre part (the mixture keeps its full input width)
    dataset = dataset.map(lambda x : crop_sample(x, (input_shape[1] - output_shape[1])//2)).prefetch(100)

    if partition == "train": # Repeat endlessly and shuffle when training
        dataset = dataset.repeat()
        dataset = dataset.shuffle(buffer_size=model_config["cache_size"])

    # Fixed-size batches only; a final incomplete batch is dropped
    dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(model_config["batch_size"]))
    dataset = dataset.prefetch(1)

    return dataset


# **Models**

### **Output Layer**

In [None]:
def independent_outputs(featuremap, source_names, num_channels, filter_width, padding, activation):
    """
    Build one separate 1D convolution on top of the feature map for every source.

    :param featuremap: Input to each output convolution
    :param source_names: Names of the sources; one convolution is created per name, in this order
    :param num_channels: Number of filters of each convolution
    :param filter_width: Kernel size of each convolution
    :param padding: Padding mode passed to tf.layers.conv1d
    :param activation: Activation passed to tf.layers.conv1d
    :return: Dict mapping source name -> output of its convolution
    """
    return {
        name: tf.layers.conv1d(featuremap, num_channels, filter_width, activation=activation, padding=padding)
        for name in source_names
    }

def difference_output(input_mix, featuremap, source_names, num_channels, filter_width, padding, activation, training):
    """
    Build the outputs so that the last source is the mixture minus the sum of all other sources.

    Every source except the last gets its own 1D convolution on the feature map. The last source is the
    input mixture, cropped to the shape of the summed outputs, minus that sum, passed through AudioClip.

    :param input_mix: Mixture input of the network
    :param featuremap: Input to each output convolution
    :param source_names: Names of the sources; the last one is computed as the difference
    :param num_channels: Number of filters of each convolution
    :param filter_width: Kernel size of each convolution
    :param padding: Padding mode passed to tf.layers.conv1d
    :param activation: Activation passed to tf.layers.conv1d
    :param training: Forwarded to AudioClip
    :return: Dict mapping source name -> output tensor
    """
    outputs = {}
    running_sum = 0
    for name in source_names[:-1]:
        outputs[name] = tf.layers.conv1d(featuremap, num_channels, filter_width, activation=activation, padding=padding)
        running_sum = running_sum + outputs[name]

    # Whatever the other sources do not explain is assigned to the last source
    mix_cropped = crop(input_mix, running_sum.get_shape().as_list())
    residual = AudioClip(mix_cropped - running_sum, training)
    outputs[source_names[-1]] = residual
    return outputs

### **Interpolation Layer**

In [None]:
def learned_interpolation_layer(input, padding, level):
    '''
    Trainable upsampling by a factor of two along axis 2: from N entries to 2N - 1 ("valid") or 2N ("same").
    Between neighbouring feature vectors v_1 and v_2 (F features each) a new vector w * v_1 + (1 - w) * v_2 is
    inserted, where * is the point-wise product and w is an F-dimensional weight vector kept in [0,1] by a sigmoid.
    :param input: 4D input features; axis 2 is upsampled and axis 3 holds the F features
    :param padding: "valid" or "same", used for the interpolating convolution
    :param level: Used to name the weight variable ("interp_<level>")
    :return: Tensor in which original and interpolated feature vectors alternate along axis 2, starting with an original one
    '''
    assert padding in ("valid", "same")
    num_features = input.get_shape().as_list()[3]

    # One weight per feature channel, squashed to [0,1]; the neighbouring time step gets the mirrored weight 1 - w
    raw_weights = tf.get_variable("interp_" + str(level), shape=[num_features], dtype=tf.float32)
    weights_scaled = tf.nn.sigmoid(raw_weights)
    counter_weights = 1.0 - weights_scaled

    # Convolution kernel of shape [1, 2, F, F] made of the two diagonal matrices diag(w) and diag(1 - w)
    first_tap = tf.expand_dims(tf.diag(weights_scaled), axis=0)
    second_tap = tf.expand_dims(tf.diag(counter_weights), axis=0)
    kernel = tf.expand_dims(tf.concat([first_tap, second_tap], axis=0), axis=0)
    interpolated = tf.nn.conv2d(input, kernel, strides=[1,1,1,1], padding=padding.upper())

    # Bring the upsampled axis to the front so that entries can be gathered by index
    interpolated = tf.transpose(interpolated, [2, 0, 1, 3])
    originals = tf.transpose(input, [2, 0, 1, 3])
    num_entries = originals.get_shape().as_list()[0]
    stacked = tf.concat([originals, interpolated], axis=0)

    # Interleave: even output positions take originals, odd positions take the interpolated values stored behind them
    num_outputs = (2*num_entries - 1) if padding == "valid" else 2*num_entries
    indices = [(idx // 2) if idx % 2 == 0 else (num_entries + idx//2) for idx in range(num_outputs)]
    interleaved = tf.gather(stacked, indices)
    return tf.transpose(interleaved, [1, 2, 0, 3])

### **Unet Audio Separator**

In [None]:
class UnetAudioSeparator:
    '''
    U-Net operating on audio batches that estimates the individual sources of a mixture.
    With context enabled the convolutions are unpadded ("valid"), so only the centre part of the input
    is predicted and only certain input/output lengths are feasible (see get_padding).
    '''

    def __init__(self, model_config):
        '''
        Copy the network hyper-parameters out of the configuration dictionary
        :param model_config: Dictionary providing "num_layers", "num_initial_filters", "filter_size", "merge_filter_size", "input_filter_size", "output_filter_size", "upsampling", "output_type", "context", "source_names", "mono_downmix" and "output_activation"
        '''
        # These settings are stored on the instance under the name of their config key
        for key in ("num_layers", "num_initial_filters", "filter_size", "merge_filter_size",
                    "input_filter_size", "output_filter_size", "upsampling", "output_type", "context"):
            setattr(self, key, model_config[key])

        # Context means unpadded convolutions; otherwise every convolution keeps the length
        self.padding = "valid" if self.context else "same"
        self.source_names = model_config["source_names"]
        # A mono downmix has a single channel, anything else is treated as stereo
        self.num_channels = 1 if model_config["mono_downmix"] else 2
        self.output_activation = model_config["output_activation"]

    def get_padding(self, shape):
        '''
        Derive the input and output lengths the network has to use for a requested output length
        :param shape: Desired output shape; shape[0] is the batch size, shape[1] the number of time steps
        :return: (input_shape, output_shape), each of the form [batch_size, time_steps, channels]
        '''
        if not self.context:
            # Padded convolutions preserve the length: input and output both get the requested length
            return [shape[0], shape[1], self.num_channels], [shape[0], shape[1], self.num_channels]

        # Walk backwards from the requested output length to the lowest-resolution feature map
        bottleneck_len = float(shape[1])
        bottleneck_len = bottleneck_len - self.output_filter_size + 1 # Undo the output convolution
        for _ in range(self.num_layers):
            bottleneck_len = bottleneck_len + self.merge_filter_size - 1 # Undo the merge convolution
            bottleneck_len = (bottleneck_len + 1.) / 2. # Undo the upsampling: out = 2*in - 1 <=> in = (out+1)/2

        # Feature map lengths are integers, so round the (possibly fractional) length up
        x = np.asarray(np.ceil(bottleneck_len), dtype=np.int64)
        assert x >= 2

        # Decoder side: from the bottleneck through every upsampling + merge convolution, then the output filter
        out_len = x
        for _ in range(self.num_layers):
            out_len = 2*out_len - 1
            out_len = out_len - self.merge_filter_size + 1
        out_len = out_len - self.output_filter_size + 1

        # Encoder side, traversed from the bottleneck (after its extra convolution) back to the raw input
        in_len = x + self.filter_size - 1
        for level in range(self.num_layers):
            in_len = 2*in_len - 1 # Undo the decimation
            # NOTE(review): the last step here (the first convolution of the network) is sized with
            # input_filter_size, while get_output builds every encoder convolution with filter_size -
            # confirm the two settings are meant to be equal
            reached_network_input = level == self.num_layers - 1
            conv_size = self.input_filter_size if reached_network_input else self.filter_size
            in_len = in_len + conv_size - 1

        input_shape = np.concatenate([[shape[0]], [in_len], [self.num_channels]])
        output_shape = np.concatenate([[shape[0]], [out_len], [self.num_channels]])
        return input_shape, output_shape

    def get_output(self, input, training, return_spectrogram=False, reuse=True):
        '''
        Build the symbolic computation graph of the U-Net for one input batch
        :param input: Input batch of mixtures, 3D tensor [batch_size, num_samples, num_channels]
        :param training: Forwarded to AudioClip and difference_output
        :param return_spectrogram: Not used by this method
        :param reuse: Reuse setting handed to the "separator" variable scope
        :return: Source estimates as returned by independent_outputs ("direct" output type) or difference_output ("difference" output type)
        '''
        with tf.variable_scope("separator", reuse=reuse):
            skip_features = []
            current_layer = input

            # Encoder: convolve, keep the features for the skip connection, then drop every second time step
            for level in range(self.num_layers):
                num_filters = self.num_initial_filters + (self.num_initial_filters * level)
                current_layer = tf.layers.conv1d(current_layer, num_filters, self.filter_size, strides=1,
                                                 activation=LeakyReLU, padding=self.padding) # out = in - filter + 1
                skip_features.append(current_layer)
                current_layer = current_layer[:,::2,:] # out = (in-1)/2 + 1

            # One more convolution so that features are computed after the last decimation
            bottleneck_filters = self.num_initial_filters + (self.num_initial_filters * self.num_layers)
            current_layer = tf.layers.conv1d(current_layer, bottleneck_filters, self.filter_size,
                                             activation=LeakyReLU, padding=self.padding)

            # Decoder: upsample, merge with the matching encoder features (deepest first), convolve
            for level, skip in enumerate(reversed(skip_features)):
                # The upsampling ops work on 4D tensors, so add a dummy height axis for their duration
                current_layer = tf.expand_dims(current_layer, axis=1)
                if self.upsampling == 'learned':
                    # Width-2 convolution whose responses are inserted between neighbouring time steps
                    current_layer = learned_interpolation_layer(current_layer, self.padding, level)
                elif self.context:
                    width = current_layer.get_shape().as_list()[2]
                    current_layer = tf.image.resize_bilinear(current_layer, [1, width * 2 - 1], align_corners=True) # out = in + in - 1
                else:
                    width = current_layer.get_shape().as_list()[2]
                    current_layer = tf.image.resize_bilinear(current_layer, [1, width*2])
                current_layer = tf.squeeze(current_layer, axis=1)

                # Cropping of the skip features should only ever be needed when using context
                skip_len = skip.get_shape().as_list()[1]
                upsampled_len = current_layer.get_shape().as_list()[1]
                assert self.context or skip_len == upsampled_len

                current_layer = crop_and_concat(skip, current_layer, match_feature_dim=False)
                num_filters = self.num_initial_filters + (self.num_initial_filters * (self.num_layers - level - 1))
                current_layer = tf.layers.conv1d(current_layer, num_filters, self.merge_filter_size,
                                                 activation=LeakyReLU,
                                                 padding=self.padding)  # out = in - filter + 1

            # Final skip connection from the raw input mixture
            current_layer = crop_and_concat(input, current_layer, match_feature_dim=False)

            # Pick the output activation first, so an unknown setting fails before any output layer is built
            if self.output_activation == "tanh":
                out_activation = tf.tanh
            elif self.output_activation == "linear":
                out_activation = lambda features: AudioClip(features, training)
            else:
                raise NotImplementedError

            if self.output_type == "direct":
                return independent_outputs(current_layer, self.source_names, self.num_channels,
                                           self.output_filter_size, self.padding, out_activation)
            if self.output_type == "difference":
                cropped_input = crop(input, current_layer.get_shape().as_list(), match_feature_dim=False)
                return difference_output(cropped_input, current_layer, self.source_names, self.num_channels,
                                         self.output_filter_size, self.padding, out_activation, training)
            raise NotImplementedError

# **Train**

In [None]:
def config(task_name):
    '''
    Select the separation task and fill in the task-dependent entries of the global model_config
    :param task_name: "multi_instrument" (bass, drums, other, vocals) or "voice" (accompaniment, vocals)
    :raises NotImplementedError: For any other task name; model_config["task"] has already been overwritten by then
    '''
    model_config["task"] = task_name

    # Known tasks are tried in order; the first match decides the list of source names
    for known_task, source_names in (
        ("multi_instrument", ["bass", "drums", "other", "vocals"]),
        ("voice", ["accompaniment", "vocals"]),
    ):
        if model_config["task"] == known_task:
            model_config["source_names"] = source_names
            break
    else:
        raise NotImplementedError

    model_config["num_sources"] = len(model_config["source_names"])
    # A mono downmix has one channel, otherwise two channels are used
    if model_config["mono_downmix"]:
        model_config["num_channels"] = 1
    else:
        model_config["num_channels"] = 2


def train(model_config, load_model=None):
    '''
    Build the supervised training graph for the U-Net separator, run one epoch of optimisation steps
    and save a checkpoint.

    Relies on names defined elsewhere in the notebook: UnetAudioSeparator, get_dataset,
    getTrainableVariables, getNumParams and the global experiment_id (used to name the log and
    checkpoint folders).

    :param model_config: Configuration dictionary. Keys read here: "batch_size", "num_frames", "raw_audio_loss", "source_names", "num_sources", "init_sup_sep_lr", "log_dir", "model_base_dir" and, optionally, "epoch_it" (number of optimisation steps to run; when the key is absent no step is run and the current weights are only checkpointed)
    :param load_model: Optional checkpoint path; when given, all global variables are restored from it before training
    :return: Path of the checkpoint written at the end of the epoch
    '''
    sess = None
    writer = None
    try:
        # Determine input and output shapes
        # The channel entry is a placeholder: get_padding only reads batch size and number of frames
        disc_input_shape = [model_config["batch_size"], model_config["num_frames"], 0]
        separator_class = UnetAudioSeparator(model_config)
        sep_input_shape, sep_output_shape = separator_class.get_padding(np.array(disc_input_shape))
        separator_func = separator_class.get_output

        dataset = get_dataset(model_config, sep_input_shape, sep_output_shape, partition="train")
        iterator = dataset.make_one_shot_iterator()
        batch = iterator.get_next()

        print("Training...")

        separator_sources = separator_func(batch["mix"], True, not model_config["raw_audio_loss"], reuse=tf.AUTO_REUSE)

        # Mean squared error between true and estimated source, averaged over all sources
        separator_loss = 0
        for key in model_config["source_names"]:
            real_source = batch[key]
            sep_source = separator_sources[key]
            separator_loss += tf.reduce_mean(tf.square(real_source - sep_source))
        separator_loss = separator_loss / float(model_config["num_sources"]) # Normalise by number of sources

        # TRAINING CONTROL VARIABLES
        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False, dtype=tf.int64)
        increment_global_step = tf.assign(global_step, global_step + 1)

        # Set up optimizers
        separator_vars = getTrainableVariables("separator")
        print("Sep_Vars: " + str(getNumParams(separator_vars)))
        print("Num of variables " + str(len(tf.global_variables())))

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            with tf.variable_scope("separator_solver"):
                separator_solver = tf.train.AdamOptimizer(learning_rate=model_config["init_sup_sep_lr"]).minimize(separator_loss, var_list=separator_vars)

        # SUMMARIES
        tf.summary.scalar("sep_loss", separator_loss, collections=["sup"])
        sup_summaries = tf.summary.merge_all(key='sup')

        # Start session
        # Named session_config so that the notebook's config() function is not shadowed
        session_config = tf.ConfigProto()
        session_config.gpu_options.allow_growth = True
        sess = tf.Session(config=session_config)
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(os.path.join(model_config["log_dir"], str(experiment_id)), graph=sess.graph)

        # CHECKPOINTING
        # Load pretrained model to continue training, if we are supposed to
        if load_model is not None:
            restorer = tf.train.Saver(tf.global_variables(), write_version=tf.train.SaverDef.V2)
            print("Num of variables" + str(len(tf.global_variables())))
            restorer.restore(sess, load_model)
            print('Pre-trained model restored from file ' + load_model)

        saver = tf.train.Saver(tf.global_variables(), write_version=tf.train.SaverDef.V2)

        # Start training loop
        _global_step = sess.run(global_step)
        for _ in range(model_config.get("epoch_it", 0)):
            try:
                # One optimisation step on the separator, together with its loss summary
                _, _sup_summaries = sess.run([separator_solver, sup_summaries])
            except tf.errors.OutOfRangeError:
                # The one-shot iterator ran out of data: end the epoch early and still checkpoint
                break
            writer.add_summary(_sup_summaries, global_step=_global_step)
            _global_step = sess.run(increment_global_step)

        # Epoch finished - Save model
        print("Finished epoch!")
        checkpoint_prefix = os.path.join(model_config["model_base_dir"], str(experiment_id), str(experiment_id))
        save_path = saver.save(sess, checkpoint_prefix, global_step=int(_global_step))
    finally:
        # Release resources and clear the computational graph even when something above failed,
        # so that the cell can be re-run without clashing with leftover variables
        if writer is not None:
            writer.flush()
            writer.close()
        if sess is not None:
            sess.close()
        tf.reset_default_graph()

    return save_path





    




In [None]:
# Configure the two-source "voice" task, then train without restoring a checkpoint.
# The checkpoint path returned by train() is shown as the cell output.
config(task_name="voice")
train(model_config=model_config, load_model=None)

Dataset ready!
Training...
Sep_Vars: 10263028
Num of variables 55
Finished epoch!


'/content/drive/My Drive/BTP/Dataset/checkpoints/876218/876218-0'