In [30]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tqdm import tqdm
from pathlib import PurePath
import pandas as pd
import librosa, librosa.display
import matplotlib.pyplot as plt
import numpy as np
import os
import math
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from pathlib import Path



from tensorflow.keras import layers, models, datasets, callbacks, utils, metrics, optimizers
import tensorflow.keras.backend as K
from tensorflow.keras.losses import BinaryCrossentropy, MeanSquaredError
from tensorflow.keras.utils import plot_model


In [2]:
DATASET_PATH ="Train_Submission/Sound_Drum"
SAMPLE_RATE=22050
DURATION=10
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION
JSON_PATH = "data.json"

In [3]:
def sort_audio_files_by_channels(folder_path):
    # List all files in the folder
    files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]

    # Separate mono and multichannel files
    mono_files = []
    multichannel_files = []

    for file_name in files:
        file_path = os.path.join(folder_path, file_name)

        # Check the number of channels using librosa
        signal,_ = librosa.load(file_path, mono=False)
#         print(signal.shape)

        if len(signal.shape) > 1:
            multichannel_files.append(file_name)
            
        else:
            mono_files.append(file_name)
           

    return mono_files, multichannel_files



In [4]:
monofiles, multichannel_files = sort_audio_files_by_channels(DATASET_PATH)

In [5]:
len(monofiles)

563

In [6]:
len(multichannel_files)

137

In [9]:
import soundfile as sf
from scipy.io import wavfile
def convert_to_fixed_size(dataset_path,  target_duration, multichannel_files):
    for i, (dirpath, dirname, filenames) in enumerate(os.walk(dataset_path)):
        
        #ensure that we're not at the root level
        if dirpath is not dataset_path:
            #Get the Semantic Label
            dirpath_components = dirpath.split("\\")
            sematic_label = dirpath_components[-1]
                        
            for f in multichannel_files:
                input_file = os.path.join(dirpath, f)
                output_folder = os.path.dirname(dataset_path) +'/fixed'
                
                if not os.path.exists(output_folder):
                    os.makedirs(output_folder)
                
                output_file = f
                # Load the audio file
                y, sr = librosa.load(input_file, mono=False, sr=None)
#                 print(len(y[0]))
                
                target_samples = int(sr*target_duration)
                
                if len(y[0]) > target_samples:
                    y_fixed_length = y[:, :target_samples]
                else:
                    pad_length = target_samples - len(y[0])
                    y_fixed_length = np.pad(y, ((0,0),(0, pad_length)), 'constant')
#                 print("{} : {}\n".format(type(y_fixed_length), y_fixed_length.shape))
                y_fixed_length = y_fixed_length.T
                    
                sf.write(output_file, y_fixed_length, sr )
                

In [10]:
convert_to_fixed_size('Train_Submission', 10, multichannel_files)

In [7]:
multichannel_files

['04-47367.wav',
 '046578_tribal-drum-rhythms-03wav-68096.wav',
 '054399_8039s-old-school-rap-drum-loop-80433.wav',
 '056514_drum-40118.wav',
 '059960_sonido2-sincopa-alta-7mp3-47316.wav',
 '065339_metal-bass-drum-90850.wav',
 '066166_qui-c39est-qu39est-tombe-loop-t85wav-39366.wav',
 '068320_hang-drum-2wav-80568.wav',
 '102-bpm-boom-groove-82498.wav',
 '109-bpm-70s-style-drum-loop-76138.wav',
 '10_drumloop_minimal-32725.wav',
 '120bpm_kick-build-up-98848.wav',
 '140-bpm-amen-break-original-processed-6945 (1).wav',
 '140-bpm-amen-break-original-processed-6945.wav',
 '170-beat-box-hpf-loop-103412.wav',
 '174-txls4-106297.wav',
 '2018-11-15-22563.wav',
 '808-d3-38858.wav',
 '80s-drums-fl-studio-70248.wav',
 '8bit-sample-69080.wav',
 'action-drums-78-low-67673.wav',
 'african-98600.wav',
 'agressive-straight-drums-105-bpm-99895.wav',
 'alien-beeper-103420.wav',
 'asian-gong-102397.wav',
 'atari-st-beat-09-106443.wav',
 'ba-dum-bum-all-74740.wav',
 'bar-bq-chicken-drumpsticks-129354.wav',
 

In [8]:
with open("Train_Meta_File.txt", 'w') as file:
    file.write(','.join(multichannel_files))

In [9]:
def load_and_preprocess_audio(file_path, target_sr=22050, duration=10, n_fft=2048, hop_length=512, n_mfcc=100):
    #Load audio file
    y,sr = librosa.load(file_path, sr=target_sr, duration=duration, mono=False)
    
    #Extract features (e.g. spectrogram)
    features = librosa.feature.mfcc(y=y, sr=target_sr, n_fft=n_fft, hop_length=hop_length, n_mels=128)
    features = librosa.power_to_db(features, ref=np.max)
    
    
    #if you have multiple channels, stack them along the third axis
    if len(y.shape) > 1:
        features = np.stack([features]*y.shape[0], axis=-1)
        
    else: 
        print("Skipping File...\n")
        return
        
    reduced_features = np.mean(features, axis=-1)
    #Normalize the features
    normalized_features = (reduced_features - np.min(reduced_features)) / (np.max(reduced_features) - np.min(reduced_features))
    normalized_features = normalized_features.T
    return normalized_features


def preprocess_and_save_to_npz(input_folder, sr=22050, duration=10, hop_length=512):
    #Assuming each aduio file in the folder is a multichannel audio file
    audio_files = [f for f in os.listdir(input_folder) if f.endswith('.wav')]
    
    num_of_samples_per_file = int(sr*duration)
    expected_num_vectors_per_file = math.ceil(num_of_samples_per_file/hop_length)
    
    data = {}
    mfcc_dict = {}
    for file_name in audio_files:
        file_path = os.path.join(input_folder, file_name)
        audio_data = load_and_preprocess_audio(file_path)
        mfcc_dict[file_name] = audio_data
#         print(audio_data.shape[2], expected_num_vectors_per_file, type(audio_data))
#         semantics = input_folder.split('/')
#         print(semantics)
#         key = semantics[-1]
#         if audio_data.shape[2] == expected_num_vectors_per_file:
#             if key in data:
#                 data[key].append(audio_data.tolist())
#             else:
#                 data[key]=audio_data.tolist()
        
#     with open(output_json, 'w') as json_file:
#         json.dump(data, json_file, indent=4)
    np.savez("mfcc_data_drums.npz", **mfcc_dict)

In [10]:
preprocess_and_save_to_npz("Multichannel_Fixed_Time")

In [11]:
loaded_data = np.load("mfcc_data_drums.npz")

In [12]:
input_data = loaded_data['04-47367.wav']

In [13]:
input_data.shape

(431, 20, 2)

In [14]:
input_data

array([[[0.        , 0.        ],
        [0.85915077, 0.8529997 ],
        [0.        , 0.        ],
        ...,
        [0.        , 0.        ],
        [0.7974676 , 0.7970575 ],
        [0.7091631 , 0.6313097 ]],

       [[0.        , 0.        ],
        [0.95580405, 0.9553362 ],
        [0.9215112 , 0.9210318 ],
        ...,
        [0.73916245, 0.74817485],
        [0.        , 0.        ],
        [0.83489865, 0.8291292 ]],

       [[0.9309942 , 0.93171597],
        [0.95782626, 0.9569068 ],
        [0.9176909 , 0.9177766 ],
        ...,
        [0.715199  , 0.61785376],
        [0.        , 0.        ],
        [0.75366133, 0.7416283 ]],

       ...,

       [[0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ],
        ...,
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ]],

       [[0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ],
        .

In [2]:
with open('Train_Meta_File.txt', 'r') as file:
  data = file.read().replace('\n', '')
  files_list = data.split(',')

print(files_list)
print(len(files_list))

['04-47367.wav', '046578_tribal-drum-rhythms-03wav-68096.wav', '054399_8039s-old-school-rap-drum-loop-80433.wav', '056514_drum-40118.wav', '059960_sonido2-sincopa-alta-7mp3-47316.wav', '065339_metal-bass-drum-90850.wav', '066166_qui-c39est-qu39est-tombe-loop-t85wav-39366.wav', '068320_hang-drum-2wav-80568.wav', '102-bpm-boom-groove-82498.wav', '109-bpm-70s-style-drum-loop-76138.wav', '10_drumloop_minimal-32725.wav', '120bpm_kick-build-up-98848.wav', '140-bpm-amen-break-original-processed-6945 (1).wav', '140-bpm-amen-break-original-processed-6945.wav', '170-beat-box-hpf-loop-103412.wav', '174-txls4-106297.wav', '2018-11-15-22563.wav', '808-d3-38858.wav', '80s-drums-fl-studio-70248.wav', '8bit-sample-69080.wav', 'action-drums-78-low-67673.wav', 'african-98600.wav', 'agressive-straight-drums-105-bpm-99895.wav', 'alien-beeper-103420.wav', 'asian-gong-102397.wav', 'atari-st-beat-09-106443.wav', 'ba-dum-bum-all-74740.wav', 'bar-bq-chicken-drumpsticks-129354.wav', 'bass-loops-003-with-drums-l

In [6]:
train_data = np.load('mfcc_data_drums.npz')
X_train = []
for file in files_list:
  X_train.append(train_data[file])

X_train = np.asarray(X_train)
X_train.shape

(137, 431, 20, 2)

In [27]:
X_train_trimmed = X_train[:, :420, :, :]

In [28]:
X_train_trimmed.shape

(137, 420, 20, 2)

In [12]:
batch_size = 13
real_series = (tf.data.Dataset.from_tensor_slices(X_train)
                .batch(batch_size, drop_remainder=True))
real_series_iter = iter(real_series.repeat())

In [15]:
real_series_iter

<tensorflow.python.data.ops.iterator_ops.OwnedIterator at 0x1aac299deb0>

In [16]:
def make_random_data():
    while True:
        yield np.random.normal(low=0, high=1, size=(431,20,2))

In [19]:
random_series = iter(tf.data.Dataset
                     .from_generator(make_random_data, output_types=tf.float32)
                     .batch(batch_size)
                     .repeat())

In [24]:
print(random_series)

<_RepeatDataset element_spec=TensorSpec(shape=<unknown>, dtype=tf.float32, name=None)>


In [29]:
X_DIM = 420
Y_DIM = 20
Z_DIM = 2
EMBEDDING_DIM = 420

In [52]:
#Embedder
embedder_input = layers.Input(shape=(X_DIM, Y_DIM, Z_DIM))   ## Image Size height, width, channels
# label_input = layers.Input(shape = (87, 50, 15))

# e = layers.Concatenate(axis=-1)([embedder_input, label_input])
e = layers.Conv2D(64, kernel_size=3, strides=2, padding="same")(embedder_input)
e = layers.LeakyReLU(0.2)(e)
e = layers.Conv2D(128, kernel_size=3, strides=2, padding="same")(e)
e = layers.LeakyReLU()(e)
e = layers.Dropout(0.3)(e)
e = layers.Conv2D(128, kernel_size=3, strides=1, padding="same")(e)
e = layers.LeakyReLU(0.2)(e)
e = layers.Dropout(0.3)(e)
e = layers.Conv2D(128, kernel_size=3, strides=1, padding="same")(e)
e = layers.LeakyReLU(0.2)(e)
shape_before_flattening = K.int_shape(e)[1:]
print("SHAPE BEFORE FLATTENING THE EMBEDDER NETWORK {}".format(shape_before_flattening))
#e = layers.Flatten()(e)
embedder_output = layers.Dense(EMBEDDING_DIM, name = "embedder_ouput")(e)
embedder = tf.keras.Model(embedder_input, embedder_output)

# embedder = models.Model([embedder_input, label_input], embedder_ouput)

print("EMBEDDER NETWORK LOOKS LIKE : \n")
embedder.summary()



SHAPE BEFORE FLATTENING THE EMBEDDER NETWORK (105, 5, 128)
EMBEDDER NETWORK LOOKS LIKE : 

Model: "model_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 420, 20, 2)]      0         
                                                                 
 conv2d_8 (Conv2D)           (None, 210, 10, 64)       1216      
                                                                 
 leaky_re_lu_84 (LeakyReLU)  (None, 210, 10, 64)       0         
                                                                 
 conv2d_9 (Conv2D)           (None, 105, 5, 128)       73856     
                                                                 
 leaky_re_lu_85 (LeakyReLU)  (None, 105, 5, 128)       0         
                                                                 
 dropout_4 (Dropout)         (None, 105, 5, 128)       0         
                                 

In [51]:
#Recovery
recovery_input = layers.Input(shape=(EMBEDDING_DIM,), name="RECOVERY_INPUT")
# label_input = layers.Input(shape=(15,))
# r = layers.Concatenate(axis=-1)([recovery_input, label_input])
r = layers.Dense(np.prod(shape_before_flattening))(recovery_input)
r = layers.Reshape(shape_before_flattening)(r)
r = layers.Conv2DTranspose(128, kernel_size=3, strides=1, padding="same", use_bias=False)(r)
r = layers.BatchNormalization(momentum=0.9)(r)
r = layers.LeakyReLU(0.2)(r)
r = layers.Conv2DTranspose(128, kernel_size=3, strides=1, padding="same", use_bias=False)(r)
r = layers.BatchNormalization(momentum=0.9)(r)
r = layers.LeakyReLU(0.2)(r)
r = layers.Conv2DTranspose(128, kernel_size=3, strides=2, padding="same", use_bias=False)(r)
r = layers.BatchNormalization(momentum=0.9)(r)
r = layers.LeakyReLU(0.2)(r)
r = layers.Conv2DTranspose(128, kernel_size=3, strides=1, padding="same", use_bias=False)(r)
r = layers.BatchNormalization(momentum=0.9)(r)
r = layers.LeakyReLU(0.2)(r)
recovery_output = layers.Conv2DTranspose(2, kernel_size=3, strides=2, padding="same", activation="tanh")(r)

# recovery = models.Model([recovery_input, label_input], recovery_output)
recovery = tf.keras.Model(recovery_input, recovery_output)
print("RECOVERY NETWORK LOOKS LIKE: \n")
recovery.summary()


RECOVERY NETWORK LOOKS LIKE: 

Model: "model_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 RECOVERY_INPUT (InputLayer  [(None, 420)]             0         
 )                                                               
                                                                 
 dense_18 (Dense)            (None, 67200)             28291200  
                                                                 
 reshape_18 (Reshape)        (None, 105, 5, 128)       0         
                                                                 
 conv2d_transpose_90 (Conv2  (None, 105, 5, 128)       147456    
 DTranspose)                                                     
                                                                 
 batch_normalization_72 (Ba  (None, 105, 5, 128)       512       
 tchNormalization)                                               
                           