In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, LeakyReLU, UpSampling1D, Concatenate, Subtract
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Cropping1D
from tensorflow.keras.layers import Reshape
from tensorflow.nn import sigmoid
import os
import librosa
import numpy as np
import random
from concurrent.futures import ProcessPoolExecutor
import matplotlib as plt


In [2]:
# Directory the trained model file is saved into. The path is absolute and
# machine-specific; the trailing slash matters because the file name is
# appended to it by plain string concatenation when the model is saved.
models_folder = "/Users/rei/Documents/Machine_Learning/MODELS/Unet/Unet_Sound_Seperation/Unet-Sound-Seperation/Models/"


In [3]:
def crop(tensor, target_shape, match_feature_dim=True):
    """Centre-crop a rank-3 tensor along its second (time) axis.

    Only static shape information is used, so the function works while a Keras
    functional model is being built (where shapes are symbolic and the batch
    entry is usually ``None``) as well as on concrete arrays.

    Args:
        tensor: Rank-3 object exposing ``.shape`` and slicing, laid out as
            (batch, time, features).
        target_shape: Length-3 shape (list, tuple or TensorShape). Entry 1 is
            the desired time length. Entry 2 is only inspected when
            ``match_feature_dim`` is true. Entry 0 is ignored, so it may be
            ``None``.
        match_feature_dim: When true, the feature sizes of ``tensor`` and
            ``target_shape`` must agree (checked only if both are known).

    Returns:
        ``tensor`` itself when the time lengths already match, otherwise the
        centred slice of length ``target_shape[1]``. When the surplus is odd,
        the extra sample is removed from the end.

    Raises:
        ValueError: if either shape is not rank 3, a time length is unknown,
            the target is longer than the tensor, or the feature sizes differ
            while ``match_feature_dim`` is true.
    """
    source_shape = tuple(tensor.shape)
    wanted_shape = tuple(target_shape)
    if len(source_shape) != 3 or len(wanted_shape) != 3:
        raise ValueError(
            "crop expects rank-3 shapes, got %r and %r" % (source_shape, wanted_shape)
        )

    source_features, wanted_features = source_shape[2], wanted_shape[2]
    if (
        match_feature_dim
        and source_features is not None
        and wanted_features is not None
        and source_features != wanted_features
    ):
        raise ValueError(
            "feature dimensions differ: %r vs %r" % (source_features, wanted_features)
        )

    source_len, wanted_len = source_shape[1], wanted_shape[1]
    if source_len is None or wanted_len is None:
        raise ValueError("the time dimension must be statically known to crop")

    surplus = source_len - wanted_len
    if surplus < 0:
        # Cropping can only shrink; a longer target would need padding instead.
        raise ValueError(
            "cannot crop length %d up to the longer target length %d"
            % (source_len, wanted_len)
        )
    if surplus == 0:
        return tensor

    start = surplus // 2
    return tensor[:, start:start + wanted_len, :]

def AudioClip(x, training):
    """Return ``x`` untouched while training; otherwise limit it to [-1, 1].

    Args:
        x: Value or tensor to limit.
        training: Truthy to skip clipping, falsy to clip.
    """
    if not training:
        capped_above = tf.minimum(x, 1.0)
        return tf.maximum(capped_above, -1.0)
    return x

def difference_output(input_mix, featuremap, source_names, num_channels, filter_width, padding, activation, training):
    """Predict every source but the last directly and derive the last as a residual.

    One Conv1D is applied to ``featuremap`` for each name in
    ``source_names[:-1]``. The final source is ``input_mix`` (cropped to the
    shape of the summed predictions) minus that sum, passed through
    ``AudioClip``.

    Returns:
        dict mapping each source name to its output tensor, in the order of
        ``source_names``.
    """
    outputs = {}
    estimates = []
    for source in source_names[:-1]:
        conv = tf.keras.layers.Conv1D(num_channels, filter_width, activation=activation, padding=padding)
        outputs[source] = conv(featuremap)
        estimates.append(outputs[source])

    # Built-in sum starts from 0 and adds the estimates left to right.
    estimated_total = sum(estimates)

    residual = crop(input_mix, estimated_total.shape) - estimated_total
    outputs[source_names[-1]] = AudioClip(residual, training)
    return outputs
def learned_interpolation_layer(input, padding, level):
    """Double the time resolution by inserting a learned blend between neighbours.

    For every time step ``t`` an extra sample
    ``w * x[t] + (1 - w) * x[t + 1]`` is inserted after ``x[t]``, where ``w``
    is a per-feature weight squashed into (0, 1) by a sigmoid. The weight lives
    inside a Keras layer, so it is tracked by the model and updated by the
    optimizer.

    Args:
        input: Tensor laid out as (batch, time, features); the feature size
            must be statically known.
        padding: ``"valid"`` drops the final inserted sample (which has no
            right neighbour), giving ``2 * T - 1`` steps. Any other value
            keeps it, blended against zeros, giving ``2 * T`` steps.
        level: Decoder level, used only to name the layer ``interp_<level>``.

    Returns:
        The upsampled tensor with the same batch and feature sizes as ``input``.
    """

    class LearnedInterpolation(tf.keras.layers.Layer):
        """Interleaves each time step with a learned blend of it and its successor."""

        def __init__(self, padding, **kwargs):
            super().__init__(**kwargs)
            self.padding = padding

        def build(self, input_shape):
            features = input_shape[-1]
            if features is None:
                raise ValueError("the feature dimension must be statically known")
            # One logit per feature; the sigmoid in call() keeps the blend in (0, 1).
            self.blend_logits = self.add_weight(
                name="blend_logits",
                shape=(int(features),),
                initializer="glorot_uniform",
                trainable=True,
            )
            super().build(input_shape)

        def call(self, inputs):
            blend = tf.nn.sigmoid(self.blend_logits)
            # Right-hand neighbour of every step; the last step has none, so
            # zeros stand in for it.
            right_neighbour = tf.pad(inputs[:, 1:, :], [[0, 0], [0, 1], [0, 0]])
            between = blend * inputs + (1.0 - blend) * right_neighbour

            steps = inputs.shape[1]
            if steps is None:
                steps = tf.shape(inputs)[1]
            # (batch, T, 2, F) -> (batch, 2T, F): each original step is
            # followed by its interpolated successor.
            paired = tf.stack([inputs, between], axis=2)
            interleaved = tf.reshape(paired, [-1, 2 * steps, inputs.shape[2]])

            if self.padding == "valid":
                interleaved = interleaved[:, :-1, :]
            return interleaved

        def get_config(self):
            config = super().get_config()
            config["padding"] = self.padding
            return config

    return LearnedInterpolation(padding, name="interp_" + str(level))(input)


def create_encoder(input, num_layers, num_initial_filters, filter_size, input_filter_size, padding, dropout_rate=0.3):
    """Build the downsampling path and return every intermediate feature map.

    The first block is a Conv1D (kernel ``input_filter_size``) plus dropout at
    the input resolution. Each of the following ``num_layers - 1`` blocks
    applies a Conv1D (kernel ``filter_size``) plus dropout and then keeps every
    second time step.

    Returns:
        list of ``num_layers`` tensors, ordered from the input resolution down
        to the most decimated one.
    """

    def conv_then_dropout(features, filters, kernel_size):
        # LeakyReLU-activated convolution followed by dropout.
        convolved = tf.keras.layers.Conv1D(filters, kernel_size, strides=1, activation=LeakyReLU(), padding=padding)(features)
        return tf.keras.layers.Dropout(dropout_rate)(convolved)

    enc_outputs = [conv_then_dropout(input, num_initial_filters, input_filter_size)]

    for depth in range(1, num_layers):
        # Filter count grows linearly: 1x, 2x, 3x, ... the initial count.
        block = conv_then_dropout(enc_outputs[-1], num_initial_filters * depth, filter_size)
        enc_outputs.append(block[:, ::2, :])  # decimate by a factor of 2

    return enc_outputs

def create_decoder(enc_outputs, num_layers, num_initial_filters, filter_size, merge_filter_size, padding, upsampling):
    """Build the upsampling path on top of the encoder feature maps.

    Starting from the deepest encoder output, each level upsamples by 2,
    concatenates the encoder output of the matching resolution along the
    feature axis, and applies a LeakyReLU Conv1D with kernel
    ``merge_filter_size``.

    Args:
        enc_outputs: Encoder feature maps, shallowest first.
        num_layers: Number of encoder feature maps; ``num_layers - 1``
            decoder levels are created.
        num_initial_filters: Base filter count; level ``i`` uses
            ``num_initial_filters * (num_layers - i)`` filters.
        filter_size: Accepted for signature symmetry with the encoder; not
            used here.
        merge_filter_size: Kernel width of the merge convolutions.
        padding: Conv1D padding mode.
        upsampling: ``'linear'`` (Keras ``UpSampling1D``, which is documented
            as repeating each time step) or ``'learned'``
            (``learned_interpolation_layer``).

    Returns:
        The feature map produced by the last decoder level.

    Raises:
        ValueError: if a decoder level is built with an unknown ``upsampling``
            mode.
    """
    current_layer = enc_outputs[-1]

    for i in range(num_layers - 1, 0, -1):
        if upsampling == 'linear':
            current_layer = tf.keras.layers.UpSampling1D(size=2)(current_layer)
        elif upsampling == 'learned':
            current_layer = learned_interpolation_layer(current_layer, padding=padding, level=i)
        else:
            # Without this, the level would skip upsampling and fail later
            # with an opaque shape mismatch in the concatenation below.
            raise ValueError(
                "Unknown upsampling mode %r; expected 'linear' or 'learned'" % (upsampling,)
            )

        current_layer = tf.concat([current_layer, enc_outputs[i - 1]], axis=2)
        current_layer = tf.keras.layers.Conv1D(num_initial_filters * (num_layers - i), merge_filter_size, strides=1, activation=LeakyReLU(), padding=padding)(current_layer)

    return current_layer


def get_output_layer(current_layer, input_mix, output_type, source_names, num_channels, output_filter_size, padding, activation, training):
    """Attach the per-source output heads to the final decoder feature map.

    Args:
        current_layer: Final decoder feature map.
        input_mix: The mixture tensor fed to the network; only used by the
            "difference" output type.
        output_type: ``"direct"`` predicts every source with its own Conv1D.
            ``"difference"`` predicts all but the last source and derives the
            last one by subtracting them from the mixture.
        source_names: Names of the sources; they become the output dict keys.
        num_channels: Number of channels of each source output.
        output_filter_size: Kernel width of the output convolutions.
        padding: Conv1D padding mode.
        activation: Activation of the output convolutions.
        training: Forwarded to ``difference_output``.

    Returns:
        dict mapping source name to output tensor.

    Raises:
        NotImplementedError: if ``output_type`` is not recognised.
    """
    if output_type == "direct":
        return independent_outputs(current_layer, source_names, num_channels, output_filter_size, padding, activation)
    if output_type == "difference":
        # The mixture has a different number of feature channels than the
        # decoder output, so only the time axis is matched.
        cropped_input = crop(input_mix, current_layer.get_shape().as_list(), match_feature_dim=False)
        return difference_output(cropped_input, current_layer, source_names, num_channels, output_filter_size, padding, activation, training)
    raise NotImplementedError("Unknown output type")

def independent_outputs(featuremap, source_names, num_channels, filter_width, padding, activation):
    """Give every source its own Conv1D head on top of ``featuremap``.

    Returns:
        dict mapping each name in ``source_names`` to the output of a fresh
        Conv1D applied to ``featuremap``.
    """
    return {
        source: tf.keras.layers.Conv1D(num_channels, filter_width, activation=activation, padding=padding)(featuremap)
        for source in source_names
    }


In [6]:
# Model hyperparameters read as globals by build_model().
num_frames = 16384  # time steps per input snippet (also used as SNIPPET_LENGTH); a "* 2" variant was left commented out
num_channels = 1  # channels of the input mixture and of every source output
num_layers = 12  # encoder feature maps: one input conv plus num_layers - 1 decimating blocks (16384 -> 8 steps with 'same' padding)
num_initial_filters = 24  # filters of the first conv; deeper blocks use integer multiples of it
filter_size = 15  # kernel width of the decimating encoder convs
merge_filter_size = 5  # kernel width of the decoder convs applied after each skip concatenation
input_filter_size = 15  # kernel width of the first encoder conv
output_filter_size = 1  # kernel width of the per-source output convs
padding = 'same'  # Conv1D padding mode used by every convolution
upsampling = 'linear'  # or 'learned'
output_type = 'difference'  # or  'direct'
# Keys of the model's output dict; with output_type 'difference' the LAST name
# is computed as the mixture minus the other predicted sources.
source_names = ["accompaniment", "vocals"]
activation = 'tanh'  # activation of the per-source output convs
training = True  # when True, AudioClip leaves the residual source unclipped


ValueError: Invalid value in tensor used for shape: -263

In [4]:

def build_model():
    """Assemble the encoder/decoder network from the notebook-level hyperparameters.

    Reads the module-level settings (``num_frames``, ``num_channels``,
    ``num_layers``, filter sizes, ``padding``, ``upsampling``,
    ``output_type``, ``source_names``, ``activation``, ``training``).

    Returns:
        A Keras ``Model`` mapping the mixture input (named "input") to the
        outputs returned by ``get_output_layer``.
    """
    mixture = Input(shape=(num_frames, num_channels), name="input")

    # Downsampling path: keeps every intermediate map for the skip connections.
    skip_features = create_encoder(mixture, num_layers, num_initial_filters, filter_size, input_filter_size, padding)

    # Upsampling path fed by the encoder outputs.
    decoded = create_decoder(skip_features, num_layers, num_initial_filters, filter_size, merge_filter_size, padding, upsampling)

    # Output heads; the mixture itself is passed along as the second argument.
    source_estimates = get_output_layer(decoded, mixture, output_type, source_names, num_channels, output_filter_size, padding, activation, training)

    return Model(inputs=mixture, outputs=source_estimates)



In [5]:
# Training settings consumed by the training cell.
learning_rate = 1.13e-5  # step size handed to the Adam optimizer
BATCH_SIZE = 16  # number of snippets per training batch
EPOCHS = 2000  # upper bound on epochs; early stopping on val_loss may end training sooner

In [None]:

# Build the network from the hyperparameter globals.
model = build_model()

# Provisional compile with Keras' default Adam settings and MSE loss.
# The training cell compiles `model` again with its own optimizer and metric,
# which replaces this configuration.
model.compile(optimizer='adam', loss='mse') # or other appropriate loss and optimizer

# Print the layer-by-layer summary of the model.
model.summary()


In [7]:
# NOTE(review): SAMPLE_RATE is not referenced in the visible cells; presumably
# the rate the audio was resampled to when the TFRecords were written - confirm.
SAMPLE_RATE = 22050
# Length of each stored signal; must equal the model's time axis (num_frames).
SNIPPET_LENGTH = num_frames
# Directory holding train.tfrecord / val.tfrecord / test.tfrecord (absolute, machine-specific path).
tfRecord_Datasets = '/Users/rei/Documents/Machine_Learning/MODELS/Unet/Unet_Sound_Seperation/Unet-Sound-Seperation/tf_Record'


def parse_tfrecord_fn(example):
    """Decode one serialized example into a ``(mixed_signal, vocal_signal)`` pair.

    Both features are parsed as fixed-length float32 vectors of
    ``SNIPPET_LENGTH`` values.
    """
    parsed = tf.io.parse_single_example(
        example,
        {
            'mixed_signal': tf.io.FixedLenFeature([SNIPPET_LENGTH], tf.float32),
            'vocal_signal': tf.io.FixedLenFeature([SNIPPET_LENGTH], tf.float32),
        },
    )
    mixed = parsed['mixed_signal']
    vocal = parsed['vocal_signal']
    return mixed, vocal

def load_dataset(filename):
    """Open a TFRecord file and decode every record with ``parse_tfrecord_fn``.

    Returns:
        A ``tf.data`` dataset of un-batched ``(mixed_signal, vocal_signal)``
        pairs.
    """
    return tf.data.TFRecordDataset(filename).map(parse_tfrecord_fn)

# Load the training, validation and test splits from the TFRecord directory.
# Each dataset yields un-batched (mixed_signal, vocal_signal) pairs.
train_dataset = load_dataset(os.path.join(tfRecord_Datasets, 'train.tfrecord'))
val_dataset = load_dataset(os.path.join(tfRecord_Datasets, 'val.tfrecord'))
test_dataset = load_dataset(os.path.join(tfRecord_Datasets, 'test.tfrecord'))


In [None]:
import wandb
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Start a Weights & Biases run; the run name records the device, batch size
# and learning rate by hand, so keep it in sync with the training constants.
wandb.init(project='Large_Shaking_Through_Unet_model', name='CPU-Batch16-lr1.13e-5')

def r_squared(y_true, y_pred):
    """
    Calculate R-squared, the coefficient of determination, over a whole batch.

    Computes ``1 - SS_res / SS_tot`` where ``SS_res`` is the summed squared
    error and ``SS_tot`` the summed squared deviation of ``y_true`` from its
    mean, both taken over every element of the batch.

    A batch whose targets are all identical (for example, silent snippets) has
    ``SS_tot == 0``; the denominator is clamped to the Keras epsilon so the
    metric stays finite instead of turning into NaN/inf.
    """
    residual = tf.reduce_sum(tf.square(tf.subtract(y_true, y_pred)))
    total = tf.reduce_sum(tf.square(tf.subtract(y_true, tf.reduce_mean(y_true))))
    # Only affects the degenerate case; any larger `total` passes through unchanged.
    safe_total = tf.maximum(total, tf.keras.backend.epsilon())
    r2 = tf.subtract(1.0, tf.divide(residual, safe_total))
    return r2

# Reuses the `model` built in the earlier cell (uncomment to rebuild from scratch).
#model = build_model()

# Recompile with the tuned Adam settings and the R^2 metric.
optimizer = Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999)
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=[r_squared])

# Stop once validation loss has not improved for 20 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

# A tf.data.Dataset has to produce its own batches: Keras rejects a
# `batch_size` argument when the input is a Dataset, and the datasets loaded
# above yield single un-batched snippets.
train_batches = train_dataset.batch(BATCH_SIZE)
val_batches = val_dataset.batch(BATCH_SIZE)

# NOTE(review): the datasets yield (mixed_signal, vocal_signal) pairs, while
# the model returns a dict with one tensor per entry in source_names. Confirm
# how the single target is matched against the two outputs before trusting
# the reported loss and R^2.
history = model.fit(train_batches, validation_data=val_batches, epochs=EPOCHS, callbacks=[early_stopping, wandb.keras.WandbCallback()])

# Optional fine-tuning phase (disabled):
#model.compile(optimizer=Adam(learning_rate=0.00001, beta_1=0.9, beta_2=0.999), loss='mean_squared_error')
#history_fine_tune = model.fit(train_dataset.batch(32), validation_data=val_dataset.batch(32), epochs=2000, callbacks=[early_stopping, wandb.keras.WandbCallback()])

# Persist the trained model (best weights if early stopping restored them).
model.save(models_folder + 'best_model.h5')
