In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, LeakyReLU, UpSampling1D, Concatenate, Subtract
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Cropping1D
from tensorflow.keras.layers import Reshape
from tensorflow.nn import sigmoid
import os
import librosa
import numpy as np
import random
from concurrent.futures import ProcessPoolExecutor


In [2]:
# Absolute locations of the saved model and of the dataset splits.
# NOTE(review): these paths are specific to one machine; point them at your own
# copies before running the notebook.
model_path = "/Users/rei/Documents/Machine_Learning/MODELS/Unet/Shaking Through Sound Seperation/Shaking_Through_Model_01.keras"
# Directory passed to load_and_process_data() for the training data.
trainDir = '/Users/rei/Documents/Machine_Learning/Data/Audio/Shaking_Through/Dataset/Train'
# Directory passed to load_and_process_data() for the test data.
testDir = '/Users/rei/Documents/Machine_Learning/Data/Audio/Shaking_Through/Dataset/Test'


In [3]:
def crop(tensor, target_shape, match_feature_dim=True):
    """Centre-crop ``tensor`` along axis 1 so its length equals ``target_shape[1]``.

    Only axis 1 is compared, and the comparison uses static shapes. The batch
    and feature entries of ``target_shape`` are ignored, so a shape list that
    contains ``None`` for the batch dimension (as produced by
    ``get_shape().as_list()``) or a different feature count is accepted.

    Args:
        tensor: Rank-3 array or tensor; it is sliced as ``tensor[:, a:b, :]``.
        target_shape: Sequence (list, tuple or ``TensorShape``) whose entry 1 is
            the desired length of axis 1.
        match_feature_dim: Kept for backward compatibility; it has no effect.

    Returns:
        ``tensor`` itself when the lengths already match, otherwise the centred
        slice. For an odd surplus the extra element is removed from the end.

    Raises:
        ValueError: If either length is unknown (``None``) or the target is
            longer than ``tensor``.
    """
    current_length = tensor.shape[1]
    target_length = target_shape[1]
    if current_length is None or target_length is None:
        raise ValueError("crop() needs statically known lengths on axis 1")

    current_length = int(current_length)
    target_length = int(target_length)
    surplus = current_length - target_length
    if surplus < 0:
        raise ValueError(
            "Cannot crop axis 1 from length %d up to %d" % (current_length, target_length)
        )
    if surplus == 0:
        return tensor

    start = surplus // 2
    return tensor[:, start:start + target_length, :]

def AudioClip(x, training):
    """Limit ``x`` to the range [-1, 1] outside of training.

    Args:
        x: Value or tensor to limit.
        training: When truthy, ``x`` is returned untouched.

    Returns:
        ``x`` itself while training, otherwise ``x`` bounded above by 1.0 and
        below by -1.0.
    """
    if not training:
        upper_bounded = tf.minimum(x, 1.0)
        return tf.maximum(upper_bounded, -1.0)
    return x

def difference_output(input_mix, featuremap, source_names, num_channels, filter_width, padding, activation, training):
    """Predict all sources but the last; derive the last as ``mix - sum(others)``.

    Args:
        input_mix: Mixture tensor the predicted sources are subtracted from.
        featuremap: Feature tensor fed to one Conv1D head per predicted source.
        source_names: Ordered source names; the final entry is the residual source.
        num_channels: Number of filters of every Conv1D head.
        filter_width: Kernel size of every Conv1D head.
        padding: Conv1D padding mode.
        activation: Conv1D activation.
        training: Forwarded to ``AudioClip``; clipping is skipped when truthy.

    Returns:
        dict mapping each source name to its output tensor.

    Raises:
        ValueError: If ``source_names`` is empty.
    """
    if len(source_names) == 0:
        raise ValueError("source_names must contain at least one source")

    outputs = dict()
    sum_source = 0
    for name in source_names[:-1]:
        out = tf.keras.layers.Conv1D(num_channels, filter_width, activation=activation, padding=padding)(featuremap)
        outputs[name] = out
        sum_source += out

    if len(source_names) == 1:
        # Nothing was predicted, so the residual is the mixture itself.
        # (sum_source is still the int 0 here and has no .shape to crop to.)
        last_source = input_mix
    else:
        last_source = crop(input_mix, sum_source.shape) - sum_source
    last_source = AudioClip(last_source, training)
    outputs[source_names[-1]] = last_source
    return outputs

In [4]:
def learned_interpolation_layer(input, padding, level):
    """Double the length of axis 1 using a learned per-feature gate.

    A variable with one entry per feature (``input.shape[2]``) is squashed with
    a sigmoid to give a gate ``g``. The input is scaled feature-wise by ``g``
    and by ``1 - g`` (via diagonal matrices), and the two scaled copies are
    concatenated along axis 1, the first copy followed by the second.

    NOTE(review): the two copies are stacked one after the other, not
    interleaved sample by sample; confirm this is the intended upsampling.
    NOTE(review): the gate is a bare ``tf.Variable`` created inside this
    function; confirm the surrounding Keras model tracks and trains it.

    Args:
        input: Rank-3 tensor; axis 2 holds the features.
        padding: When equal to ``"valid"`` the last step of axis 1 is dropped.
        level: Used only to name the variable (``"interp_<level>"``).

    Returns:
        The concatenated tensor, minus its last step for ``"valid"`` padding.
    """
    num_features = input.shape[2]
    raw_gate = tf.Variable(tf.initializers.GlorotUniform()(shape=[num_features]), dtype=tf.float32, name="interp_" + str(level))
    gate = tf.nn.sigmoid(raw_gate)
    complement = 1.0 - gate

    def _scale_features(per_feature_gain):
        # A diagonal matrix with a leading broadcast axis scales each feature.
        scaling = tf.expand_dims(tf.linalg.diag(per_feature_gain), axis=0)
        return tf.linalg.matmul(input, scaling)

    stacked = tf.concat([_scale_features(gate), _scale_features(complement)], axis=1)
    return stacked[:, :-1, :] if padding == "valid" else stacked



In [5]:
def create_encoder(input, num_layers, num_initial_filters, filter_size, input_filter_size, padding):
    """Build the downsampling path and collect one feature map per level.

    The first Conv1D (kernel ``input_filter_size``) keeps the input resolution.
    Each of the following ``num_layers - 1`` levels applies a Conv1D with
    ``num_initial_filters * (level + 1)`` filters and then keeps every second
    step of axis 1.

    Args:
        input: Tensor fed to the first convolution.
        num_layers: Number of feature maps to produce.
        num_initial_filters: Filter count of the first convolution and the
            per-level increment.
        filter_size: Kernel size of the per-level convolutions.
        input_filter_size: Kernel size of the first convolution.
        padding: Conv1D padding mode.

    Returns:
        list of ``num_layers`` tensors, first level first.
    """
    stem = tf.keras.layers.Conv1D(num_initial_filters, input_filter_size, strides=1, activation=LeakyReLU(), padding=padding)
    features = stem(input)
    enc_outputs = [features]

    for level in range(num_layers - 1):
        level_filters = num_initial_filters + (num_initial_filters * level)
        conv = tf.keras.layers.Conv1D(level_filters, filter_size, strides=1, activation=LeakyReLU(), padding=padding)
        # Keep every second step of axis 1 (decimation by two).
        features = conv(features)[:, ::2, :]
        enc_outputs.append(features)

    return enc_outputs

In [5]:
def create_decoder(enc_outputs, num_layers, num_initial_filters, filter_size, merge_filter_size, padding, upsampling):
    """Build the upsampling path on top of the encoder feature maps.

    Starting from the last encoder output, each level upsamples by two,
    concatenates the encoder output of the next shallower level along the
    feature axis and merges them with a Conv1D of
    ``num_initial_filters * (num_layers - i)`` filters.

    Args:
        enc_outputs: Encoder feature maps, first level first.
        num_layers: Number of encoder levels; ``num_layers - 1`` merges are built.
        num_initial_filters: Base filter count.
        filter_size: Not used by this function.
        merge_filter_size: Kernel size of the merge convolutions.
        padding: Conv1D padding mode.
        upsampling: ``'linear'`` uses ``UpSampling1D(size=2)`` (NOTE(review):
            Keras documents that layer as repeating each step, not as linear
            interpolation); ``'learned'`` uses ``learned_interpolation_layer``.

    Returns:
        The output tensor of the last merge convolution, or
        ``enc_outputs[-1]`` unchanged when no merge is built.

    Raises:
        NotImplementedError: If a level has to be upsampled and ``upsampling``
            is neither ``'linear'`` nor ``'learned'``.
    """
    current_layer = enc_outputs[-1]

    for i in range(num_layers - 1, 0, -1):
        if upsampling == 'linear':
            current_layer = tf.keras.layers.UpSampling1D(size=2)(current_layer)
        elif upsampling == 'learned':
            current_layer = learned_interpolation_layer(current_layer, padding=padding, level=i)
        else:
            # Fail here instead of with an opaque shape mismatch in the concat below.
            raise NotImplementedError("Unknown upsampling type: " + repr(upsampling))

        current_layer = tf.concat([current_layer, enc_outputs[i - 1]], axis=2)
        current_layer = tf.keras.layers.Conv1D(num_initial_filters * (num_layers - i), merge_filter_size, strides=1, activation=LeakyReLU(), padding=padding)(current_layer)

    return current_layer



In [6]:
def get_output_layer(current_layer, output_type, source_names, num_channels, output_filter_size, padding, activation, training):
    if output_type == "direct":
        return independent_outputs(current_layer, source_names, num_channels, output_filter_size, padding, activation)
    elif output_type == "difference":
        cropped_input = crop(input, current_layer.get_shape().as_list(), match_feature_dim=False)
        return difference_output(cropped_input, current_layer, source_names, num_channels, output_filter_size, padding, activation, training)
    else:
        raise NotImplementedError("Unknown output type")

def independent_outputs(featuremap, source_names, num_channels, filter_width, padding, activation):
    outputs = dict()
    for name in source_names:
        outputs[name] = tf.keras.layers.Conv1D(num_channels, filter_width, activation=activation, padding=padding)(featuremap)
    return outputs


In [7]:
num_frames = 16384
num_channels = 1
num_layers = 12
num_initial_filters = 24
filter_size = 15
merge_filter_size = 5
input_filter_size = 15
output_filter_size = 1
padding = 'same'  
upsampling = 'linear'  # or 'learned'
output_type = 'direct'  # or 'difference'
source_names = ["accompaniment", "vocals"]
activation = 'tanh'
training = True

def build_model():
    # Input
    input_mix = Input(shape=(num_frames, num_channels), name="input")

    # Encoder
    enc_outputs = create_encoder(input_mix, num_layers, num_initial_filters, filter_size, input_filter_size, padding)

    # Decoder
    current_layer = create_decoder(enc_outputs, num_layers, num_initial_filters, filter_size, merge_filter_size, padding, upsampling)

    # Output Layer
    outputs = get_output_layer(current_layer, output_type, source_names, num_channels, output_filter_size, padding, activation, training)

    # Build Model
    model = Model(inputs=input_mix, outputs=outputs)
    return model

In [8]:

SAMPLE_RATE = 22_050      # Sampling rate handed to librosa.load
SNIPPET_LENGTH = 2 ** 14  # Length of random snippets (16384 samples)
AUGMENTATION = True       # Toggle data augmentation
# Time Jittering
def time_jitter(audio, max_offset=500):
    """Delay ``audio`` by a random number of samples, keeping its length.

    A shift is drawn from ``[0, max_offset)``; that many zeros are prepended
    and the result is cut back to the original length.
    """
    shift = np.random.randint(max_offset)
    original_length = len(audio)
    delayed = np.pad(audio, (shift, 0), "constant")
    return delayed[:original_length]

# Noise Injection
def add_noise(audio, noise_level=0.005):
    """Add scaled standard-normal noise to ``audio`` and clip to [-1, 1].

    One noise sample is drawn per element of ``audio`` (``len(audio)`` samples)
    and multiplied by ``noise_level`` before being added.
    """
    perturbation = np.random.randn(len(audio))
    return np.clip(audio + noise_level * perturbation, -1, 1)

# Reverb (simple decay)
def add_reverb(audio, decay=0.5, delay=4000):
    """Add equally weighted echoes every ``delay`` samples and clip to [-1, 1].

    The impulse response has the value ``decay`` at offsets 0, ``delay``,
    ``2 * delay``, ... (the same taps the original kernel used). The echoes
    are applied causally: output sample ``i`` only depends on input samples at
    or before ``i``. The previous ``np.convolve(..., mode='same')`` call
    returned the centre of the full convolution with a kernel as long as the
    signal, which advanced the output by about half the clip length.

    Args:
        audio: 1-D signal.
        decay: Gain of every tap, including the zero-delay tap.
        delay: Spacing of the taps in samples; must be positive.

    Returns:
        Array of the same length as ``audio``, clipped to [-1, 1].

    Raises:
        ValueError: If ``delay`` is not positive.
    """
    if delay <= 0:
        raise ValueError("delay must be a positive number of samples")

    audio = np.asarray(audio)
    num_samples = len(audio)
    wet = np.zeros(num_samples, dtype=np.result_type(audio.dtype, np.float64))
    # The kernel is sparse, so sum shifted copies instead of a dense convolution.
    for shift in range(0, num_samples, delay):
        wet[shift:] += decay * audio[:num_samples - shift]
    return np.clip(wet, -1, 1)

# Random Cropping
def random_cropping(audio, segment_length=SNIPPET_LENGTH):
    """Return a random contiguous window of ``segment_length`` samples.

    Every valid start position, including the last one, can be drawn. A signal
    that is exactly ``segment_length`` long is returned whole; previously that
    case raised because ``np.random.randint(0, 0)`` has an empty range.

    Args:
        audio: 1-D signal.
        segment_length: Number of samples to keep.

    Returns:
        ``audio[start:start + segment_length]`` for a random ``start``.

    Raises:
        ValueError: If ``audio`` is shorter than ``segment_length``.
    """
    last_start = len(audio) - segment_length
    if last_start < 0:
        raise ValueError(
            "audio has %d samples but segment_length is %d" % (len(audio), segment_length)
        )
    # randint's upper bound is exclusive, hence the + 1.
    start = np.random.randint(0, last_start + 1)
    return audio[start: start + segment_length]

# Frequency Masking (in the spectrogram domain)
def freq_masking(spec, F=30, num_masks=1):
    """Zero out random bands of rows (axis 0) of a 2-D spectrogram, in place.

    Each mask has a random height below ``F``. The height is clamped to the
    number of rows so that the start index can never become negative;
    previously a spectrogram with fewer than ``F`` rows could be masked at
    wrapped-around positions or entirely.

    Args:
        spec: 2-D array; it is modified in place.
        F: Exclusive upper bound on the mask height.
        num_masks: Number of masks to apply.

    Returns:
        The same ``spec`` object.
    """
    num_bins, _ = spec.shape
    max_height = max(0, min(F, num_bins))
    for _ in range(num_masks):
        height = int(np.random.uniform(low=0.0, high=max_height))
        first_bin = int(np.random.uniform(low=0.0, high=num_bins - height))
        spec[first_bin:first_bin + height, :] = 0
    return spec

# Time Masking (in the spectrogram domain)
def time_masking(spec, T=40, num_masks=1):
    """Zero out random bands of columns (axis 1) of a 2-D spectrogram, in place.

    Each mask has a random width below ``T``. The width is clamped to the
    number of columns so that the start index can never become negative;
    previously a spectrogram with fewer than ``T`` columns (e.g. from a short
    clip) could be masked at wrapped-around positions or entirely.

    Args:
        spec: 2-D array; it is modified in place.
        T: Exclusive upper bound on the mask width.
        num_masks: Number of masks to apply.

    Returns:
        The same ``spec`` object.
    """
    _, num_steps = spec.shape
    max_width = max(0, min(T, num_steps))
    for _ in range(num_masks):
        width = int(np.random.uniform(low=0.0, high=max_width))
        first_step = int(np.random.uniform(low=0.0, high=num_steps - width))
        spec[:, first_step:first_step + width] = 0
    return spec

def random_amplify(audio):
    """Scale ``audio`` by a gain drawn uniformly from [0.7, 1.3]."""
    gain = random.uniform(0.7, 1.3)
    scaled = audio * gain
    return scaled

def load_and_process_data(directory, min_mix=2, max_mix=5, augmentations=None):
    """Build (mixture, vocal) training pairs from a directory of audio stems.

    ``directory`` must contain a ``08Vox`` sub-folder with the vocal files.
    Every other non-hidden sub-directory is a pool from which files are drawn
    at random and summed onto the vocal to form the mixture.

    For each vocal file: load, optionally augment, normalise, optionally mask
    in the STFT domain, mix with ``min_mix``..``max_mix`` random files and cut
    both signals into ``SNIPPET_LENGTH``-sample segments (the last one is
    zero-padded).

    Changes from the previous version:
      * Spectrogram masking now happens before the mixture is built, so the
        mixture contains exactly the target that is returned.
      * The STFT round trip only runs when a mask is enabled and keeps the
        signal length.
      * Pool entries that are not directories, and pools without audio files,
        are skipped instead of crashing.
      * ``augmentations`` defaults to ``None`` instead of a shared ``{}``.

    Args:
        directory: Root folder of one dataset split.
        min_mix: Minimum number of files drawn for a mixture.
        max_mix: Maximum number of files drawn for a mixture.
        augmentations: Optional dict of flags. The keys read are
            ``time_jitter``, ``noise_injection``, ``reverb``,
            ``random_cropping``, ``freq_masking`` and ``time_masking``.

    Returns:
        Tuple ``(X, y)`` of numpy arrays: mixture segments and the matching
        vocal segments.
    """
    augmentations = augmentations or {}
    audio_extensions = ('.wav', '.mp3', '.flac')

    def enabled(flag):
        return bool(augmentations.get(flag))

    X = []
    y = []
    vocal_dir = os.path.join(directory, '08Vox')
    other_dirs = [
        os.path.join(directory, folder)
        for folder in os.listdir(directory)
        if folder != '08Vox'
        and not folder.startswith('.')
        and os.path.isdir(os.path.join(directory, folder))
    ]

    for vocal_file in os.listdir(vocal_dir):
        if not vocal_file.lower().endswith(audio_extensions):
            continue

        vocal_path = os.path.join(vocal_dir, vocal_file)
        vocal_signal, _ = librosa.load(vocal_path, sr=SAMPLE_RATE)

        # Skip if the length is shorter than the snippet length
        if len(vocal_signal) < SNIPPET_LENGTH:
            continue

        # Waveform-domain augmentations
        if enabled("time_jitter"):
            vocal_signal = time_jitter(vocal_signal)
        if enabled("noise_injection"):
            vocal_signal = add_noise(vocal_signal)
        if enabled("reverb"):
            vocal_signal = add_reverb(vocal_signal)
        if enabled("random_cropping"):
            vocal_signal = random_cropping(vocal_signal)

        # Normalize the vocal signal
        vocal_signal = normalize_audio(vocal_signal)

        # Spectrogram-domain masking, applied BEFORE mixing so that the
        # mixture is built from the same vocal that is returned as the target.
        if enabled("freq_masking") or enabled("time_masking"):
            S = librosa.stft(vocal_signal)
            if enabled("freq_masking"):
                S = freq_masking(S)
            if enabled("time_masking"):
                S = time_masking(S)
            # length= keeps the round trip from shortening the signal.
            vocal_signal = librosa.istft(S, length=len(vocal_signal))

        # Randomly select a number of mixes
        num_mixes = random.randint(min_mix, max_mix)

        mixed_signal = vocal_signal.copy()  # The mixture starts as the vocal itself

        # Randomly select other samples to mix with the vocal
        for _ in range(num_mixes):
            if not other_dirs:
                break
            other_dir = random.choice(other_dirs)
            candidates = [f for f in os.listdir(other_dir) if f.lower().endswith(audio_extensions)]
            if not candidates:
                # Nothing usable in this pool; skip this draw.
                continue
            other_path = os.path.join(other_dir, random.choice(candidates))
            other_signal, _ = librosa.load(other_path, sr=SAMPLE_RATE)

            # Skip if the length is shorter than the snippet length
            if len(other_signal) < SNIPPET_LENGTH:
                continue

            other_signal = normalize_audio(other_signal)
            other_signal = pad_or_crop(other_signal, target_length=len(mixed_signal))
            mixed_signal += other_signal

        # Divide into segments of SNIPPET_LENGTH samples
        for i in range(0, len(vocal_signal), SNIPPET_LENGTH):
            vocal_segment = pad_or_crop(vocal_signal[i:i + SNIPPET_LENGTH], SNIPPET_LENGTH)
            mixed_segment = pad_or_crop(mixed_signal[i:i + SNIPPET_LENGTH], SNIPPET_LENGTH)

            X.append(mixed_segment)
            y.append(vocal_segment)

    return np.array(X), np.array(y)


def pad_or_crop(audio, target_length):
    """Force ``audio`` to ``target_length`` samples.

    Shorter inputs are zero-padded at the end, longer inputs are truncated,
    and inputs of the right length are returned as they are.
    """
    shortfall = target_length - len(audio)
    if shortfall > 0:
        return np.pad(audio, (0, shortfall), 'constant')
    if shortfall < 0:
        return audio[:target_length]
    return audio

def normalize_audio(audio):
    """Rescale ``audio`` linearly so its minimum maps to -1 and its maximum to +1.

    A constant signal (for example digital silence) has no range to rescale.
    It is returned as all zeros, the midpoint of the target range, instead of
    the NaNs the 0/0 division used to produce.

    Args:
        audio: Non-empty array of samples.

    Returns:
        Floating-point array of the same shape with values in [-1, 1].
    """
    lowest = np.min(audio)
    value_range = np.max(audio) - lowest
    if value_range == 0:
        float_dtype = np.result_type(np.asarray(audio).dtype, np.float32)
        return np.zeros(np.shape(audio), dtype=float_dtype)
    return 2 * (audio - lowest) / value_range - 1


In [10]:
# Load and process the data for training and testing
from sklearn.model_selection import train_test_split

# Augmentations applied to the training data only.
augmentation_config = {
    "time_jitter": True,
    "noise_injection": True,
    "reverb": False,
    "random_cropping": True,
    "freq_masking": True,
    "time_masking": True
}
X_train, y_train = load_and_process_data(trainDir, augmentations=augmentation_config)
# The held-out test split is loaded without random augmentation so that
# evaluation runs on unmodified recordings.
X_test, y_test = load_and_process_data(testDir)
# Carve the validation set out of the training data (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [11]:

def create_dataset(X, y, batch_size=32, shuffle=True):
    """Wrap ``(X, y)`` in a batched, prefetching ``tf.data.Dataset``.

    Args:
        X: Inputs, sliced along the first axis.
        y: Targets, sliced along the first axis.
        batch_size: Number of examples per batch.
        shuffle: When true, shuffle with a buffer covering all ``len(X)`` examples.

    Returns:
        The resulting dataset.
    """
    pairs = tf.data.Dataset.from_tensor_slices((X, y))
    if shuffle:
        pairs = pairs.shuffle(buffer_size=len(X))
    batched = pairs.batch(batch_size)
    return batched.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Create TensorFlow datasets from the arrays prepared by the data loading cell.
# The data is not loaded a second time here: a second, un-augmented
# load_and_process_data() call used to overwrite X_train / X_val / X_test and
# silently discard the augmented arrays, at twice the loading cost.
batch_size = 64
train_dataset = create_dataset(X_train, y_train, batch_size=batch_size)
val_dataset = create_dataset(X_val, y_val, batch_size=batch_size)
test_dataset = create_dataset(X_test, y_test, batch_size=batch_size, shuffle=False)  # No shuffling for test set


In [12]:
# Import the loader for a complete saved Keras model.
from tensorflow.keras.models import load_model

# Path to the trained model (same value as the model_path set near the top of the notebook).
model_path = "/Users/rei/Documents/Machine_Learning/MODELS/Unet/Shaking Through Sound Seperation/Shaking_Through_Model_01.keras"

# Load the saved model without its optimizer/loss state.
Unet_Model = load_model(model_path, compile=False)

# Compile the loaded model with a fresh optimizer and loss.
# NOTE(review): Unet_Model is not referenced again in the visible part of the
# notebook; the later compile/fit cells operate on `model` below.
Unet_Model.compile(optimizer='adam', loss='mse')

# Second, independent instance: rebuild the architecture from code and load the
# weights from the same file.
# NOTE(review): confirm that load_weights() accepts a .keras archive in the
# installed Keras version.
model = build_model()  # Builds the architecture defined by the hyper-parameter cell
model.load_weights(model_path)


In [13]:
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Multiply the learning rate by 0.2 after 5 epochs without val_loss improvement,
# never going below 1e-4.
# NOTE(review): the EarlyStopping callback added in a later cell uses patience=3
# on the same metric, so training may stop before this scheduler gets to act -
# confirm the two patience values are intended.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)
# Callback list handed to model.fit(); later cells add to it.
callbacks = [reduce_lr]


In [14]:
import wandb
from wandb.keras import WandbCallback
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Start a Weights & Biases run for this training session.
wandb.init(project='Shaking_Through_Unet_model')

# Keep the best model (lowest val_loss) on disk and stop after 3 epochs
# without improvement.
checkpoint = ModelCheckpoint('model.h5', save_best_only=True, monitor='val_loss')
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Re-running this cell must not stack duplicate callbacks: drop any instances a
# previous run of this cell added before appending the fresh ones.
_managed_here = (WandbCallback, ModelCheckpoint, EarlyStopping)
callbacks = [cb for cb in callbacks if not isinstance(cb, _managed_here)]
callbacks += [WandbCallback(log_weights=True), checkpoint, early_stopping]



Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mreinert-wasserman[0m. Use [1m`wandb login --relogin`[0m to force relogin




In [15]:
# Compile the rebuilt model with the Adam optimizer and a mean-squared-error loss.
# Requires `model` from the cell that calls build_model() and load_weights().
model.compile(optimizer='adam', loss='mse')

In [12]:
# Train for up to 10 epochs with the callbacks assembled above.
# Requires `model`, `train_dataset`, `val_dataset` and `callbacks` from earlier
# cells; the recorded NameError below means the cell defining `model` had not
# been run in that kernel session.
# NOTE(review): build_model() returns a model whose outputs are a dict keyed by
# source_names, while the datasets yield a single target array per example -
# confirm the loss receives targets in the structure Keras expects.
history = model.fit(train_dataset, validation_data=val_dataset, epochs=10, callbacks=callbacks)


NameError: name 'model' is not defined