In [63]:
import tensorflow as tf
from glob import glob
import librosa
import soundfile as sf
from numpy.random import choice
import numpy as np  
from tensorflow.keras import layers,Model,Sequential
from tensorflow.data import Dataset

In [4]:
data_dir="/mnt/d/Programs/Python/PW/projects/asteroid/zip-hindi-2k"

In [5]:
files=glob(f"{data_dir}/**/*.wav")
len(files)

2000

In [53]:
noise_files=glob('/mnt/d/Programs/Python/PW/projects/asteroid/noise-2k/**/*.wav')
len(noise_files)

2000

In [6]:
SAMPLE_RATE=16_000
SEG_LENGTH=0.63
NUM_FBANK=64
WINDOW_LENGTH=0.025
OVERLAP=0.010

In [None]:
layers.BatchNormalization

In [10]:
class Prologue(Model):
    def __init__(self,
                 out_channels=128,
                 kernel_size=11,):
        super(Prologue,self).__init__()
        self.prolog=layers.Conv2D(filters=out_channels,
                              kernel_size=kernel_size,
                              padding='same')
        self.norm1=layers.BatchNormalization()
        self.relu=layers.ReLU()
    
    def call(self,x):
        x=self.prolog(x)
        x=self.norm1(x)
        x=self.relu(x)
        return x

    def build(self,input_shape):
        super(Prologue,self).build(input_shape)

In [None]:
layers.DepthwiseConv2D

In [29]:
class QuartzSubBlock(Model):
    def __init__(self,
                 out_channels,
                 kernel_size) -> None:
        super(QuartzSubBlock,self).__init__()
        self.depthwise_conv =layers.DepthwiseConv2D(kernel_size=kernel_size, 
                                        padding='same')
        self.pointwise_conv = layers.Conv2D( filters=out_channels, 
                                        kernel_size=1)
        self.norm=layers.BatchNormalization()
        self.relu=layers.ReLU()
        self.dropout=layers.Dropout(0.5)
    
    def call(self,x):
        x=self.pointwise_conv(x)
        x=self.depthwise_conv(x)
        x=self.norm(x)
        x=self.relu(x)
        x=self.dropout(x)
        return x
    
    def build(self,input_shape):
        super(QuartzSubBlock,self).build(input_shape)

In [37]:
class QuartzBlock(Model):
    def __init__(self,
                 out_channels,
                 kernel_size,
                 num_sub_blocks=2):
        super(QuartzBlock,self).__init__()
        self.sub_block_list=[QuartzSubBlock(
            out_channels,
            kernel_size
        ) for _ in range(num_sub_blocks)]
        self.sub_blocks=Sequential(self.sub_block_list)
        self.depthwise_conv1 = layers.DepthwiseConv2D(kernel_size=kernel_size, 
                                        padding='same')
        self.pointwise_conv1 = layers.Conv2D(filters=out_channels, 
                                        kernel_size=1,
                                        padding='same')
        self.norm1=layers.BatchNormalization()
        self.relu1=layers.ReLU()
        self.dropout1=layers.Dropout(0.5)
        self.pointwise_conv2 = layers.Conv2D(filters=out_channels,
                                        kernel_size=1,
                                        padding='same')
        self.norm2=layers.BatchNormalization()
    
    def call(self,x):
        y=self.sub_blocks(x)
        y=self.pointwise_conv1(x)
        y=self.depthwise_conv1(x)
        y=self.norm1(y)
        x=self.pointwise_conv2(x)
        x=self.norm2(x)
        out=self.relu1(x+y)
        out=self.dropout1(out)
        return out

    def build(self,input_shape):
        super(QuartzBlock,self).build(input_shape)

In [38]:
q_block=QuartzBlock(out_channels=64,kernel_size=13)
q_block.build(input_shape=(None,64,64,64))

In [39]:
out=q_block(tf.random.uniform(shape=(2,64,64,64)))
out.shape

TensorShape([2, 64, 64, 64])

In [43]:
class Epilogue(Model):
    def __init__(self,
                 out_channels,
                 kernel_size,
                 dilation=1):
        super(Epilogue,self).__init__()
        self.conv=layers.Conv2D(filters=out_channels,
                            kernel_size=kernel_size,
                            dilation_rate=dilation)
        self.norm=layers.BatchNormalization()
        self.relu=layers.ReLU()
    
    def call(self,x):
        x=self.conv(x)
        x=self.norm(x)
        x=self.relu(x)
        return x

    def build(self,input_shape):
        super(Epilogue,self).build(input_shape)

In [44]:
epilog=Epilogue(out_channels=128,kernel_size=29,dilation=2)
epilog.build((None,64,64,64))

In [46]:
epilog(tf.random.uniform(shape=(2,64,64,64))).shape

TensorShape([2, 8, 8, 128])

In [48]:
class MarbleNet(Model):
    def __init__(self) -> None:
        super(MarbleNet,self).__init__()
        self.prolog=Prologue()
        self.resizer=layers.Conv2D(filters=64,kernel_size=1)
        self.block_b1=QuartzBlock(out_channels=64,
                                  kernel_size=13,
                                  num_sub_blocks=2)
        self.block_b2=QuartzBlock(out_channels=64,
                                  kernel_size=15,
                                  num_sub_blocks=2)
        self.block_b3=QuartzBlock(out_channels=64,
                                  kernel_size=17,
                                  num_sub_blocks=2)
        self.epilogue1=Epilogue(out_channels=128,
                                kernel_size=29,
                                dilation=2)
        self.epilogue2=Epilogue(out_channels=128,
                                kernel_size=1)
        self.conv1x1=layers.Conv2D(filters=2,
                            kernel_size=1)
        self.linear=layers.Dense(2,activation='softmax')

    def call(self,x):
        x=self.prolog(x)
        x=self.resizer(x)
        x=self.block_b1(x)
        x=self.block_b2(x)
        x=self.block_b3(x)
        x=self.epilogue1(x)
        x=self.epilogue2(x)
        x=self.conv1x1(x)
        batch=x.shape[0]
        x=tf.reshape(x,shape=(batch,-1))
        x=self.linear(x)
        return x
    
    def build(self,input_shape):
        super(MarbleNet,self).build(input_shape)

In [50]:
marble_net=MarbleNet()
marble_net.build((None,64,64,1))

In [51]:
marble_net(tf.random.uniform(shape=(2,64,64,1))).shape

TensorShape([2, 2])

In [85]:
class MarbleNetDataset:
    def __init__(self,audio_files,
                 noise_files,
                 sample_rate=16_000,
                 seg_len=0.63,
                 num_filts=64,
                 win_len=0.025,
                 overlap=0.01):
        self.audio_files=audio_files
        self.noise_files=noise_files
        self.sample_rate=sample_rate
        self.seg_len=int(seg_len*sample_rate)
        self.num_filts=num_filts
        self.win_len=int(win_len*sample_rate)
        self.overlap=int(overlap*sample_rate)
    
    def __len__(self):
        return len(self.audio_files)+len(self.noise_files)
    
    def __iter__(self):
        chance=np.random.rand()
        file=None
        label=None
        if chance > 0.5:
            file=choice(self.audio_files,1).item()
            label=1
        else:
            file=choice(self.noise_files,1).item()
            label=0
        data,_=librosa.load(file,sr=self.sample_rate,mono=True)
        mel_spectrogram = librosa.feature.melspectrogram(y=data, sr=self.sample_rate, 
                                                         n_fft=512,
                                                         hop_length=self.overlap, 
                                                         win_length=self.win_len, 
                                                         n_mels=self.num_filts)
        mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
        if mel_spectrogram_db.shape[1] > 64:
            mel_spectrogram_db = mel_spectrogram_db[:, :64]
        elif mel_spectrogram_db.shape[1] < 64:
            mel_spectrogram_db = np.pad(mel_spectrogram_db, ((0, 0), (0, 64 - mel_spectrogram_db.shape[1])), mode='constant')
        return np.expand_dims(mel_spectrogram_db,axis=-1),np.array(label)
    def generator(self):
        yield self.__iter__()

In [87]:
import numpy as np
import librosa
from random import choice
import tensorflow as tf

class MarbleNetDataset:
    def __init__(self, audio_files, noise_files, sample_rate=16_000, seg_len=0.63, num_filts=64, win_len=0.025, overlap=0.01):
        self.audio_files = audio_files
        self.noise_files = noise_files
        self.sample_rate = sample_rate
        self.seg_len = int(seg_len * sample_rate)
        self.num_filts = num_filts
        self.win_len = int(win_len * sample_rate)
        self.overlap = int(overlap * sample_rate)

    def __len__(self):
        return len(self.audio_files) + len(self.noise_files)

    def __iter__(self):
        chance = np.random.rand()
        file = None
        label = None
        if chance > 0.5:
            file = choice(self.audio_files)
            label = 1
        else:
            file = choice(self.noise_files)
            label = 0
        data, _ = librosa.load(file, sr=self.sample_rate, mono=True)
        mel_spectrogram = librosa.feature.melspectrogram(
            y=data, sr=self.sample_rate, 
            n_fft=512,
            hop_length=self.overlap, 
            win_length=self.win_len, 
            n_mels=self.num_filts)
        mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
        if mel_spectrogram_db.shape[1] > 64:
            mel_spectrogram_db = mel_spectrogram_db[:, :64]
        elif mel_spectrogram_db.shape[1] < 64:
            mel_spectrogram_db = np.pad(mel_spectrogram_db, ((0, 0), (0, 64 - mel_spectrogram_db.shape[1])), mode='constant')
        return np.expand_dims(mel_spectrogram_db, axis=-1), np.array(label)

    def generator(self, batch_size=5):
        while True:
            batch_x, batch_y = [], []
            for _ in range(batch_size):
                x, y = self.__iter__()
                batch_x.append(x)
                batch_y.append(y)
            yield np.array(batch_x), np.array(batch_y)

In [88]:
dataset_generator=MarbleNetDataset(audio_files=files,noise_files=noise_files)


In [91]:
batch_size=5

In [92]:
dataset=Dataset.from_generator(dataset_generator.generator,output_signature=(
    tf.TensorSpec(shape=(batch_size,64,64,1),dtype='float32'),
    tf.TensorSpec(shape=(batch_size),dtype='float32')
), args=(batch_size,))


In [94]:
for x,y in dataset:
    print(x.shape)
    print(y.shape)
    out=marble_net(x)
    print(out.shape)
    break

(5, 64, 64, 1)
(5,)


W0000 00:00:1727380379.849611   67724 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1727380379.866975   67724 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1727380379.882135   67724 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1727380379.896761   67724 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1727380379.910703   67724 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1727380379.925852   67724 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1727380379.943683   67724 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1727380379.958787   67724 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1727380379.974401   67724 gp

(5, 2)


In [95]:
marble_net.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [96]:
marble_net.fit(dataset,epochs=1)

I0000 00:00:1727380521.361019   69967 service.cc:146] XLA service 0x7f2c6c00c350 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1727380521.362464   69967 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Laptop GPU, Compute Capability 8.6
2024-09-26 19:55:21.964261: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1727380538.580044   69967 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


   2099/Unknown [1m358s[0m 157ms/step - accuracy: 0.8135 - loss: 1.3624

KeyboardInterrupt: 