In [1]:
import warnings
warnings.filterwarnings(action='ignore')
import logging

In [2]:
# Module-level logger; DeepSpeakerModel emits its info messages through it.
logger=logging.getLogger(__name__)

In [3]:
import tensorflow as tf 

2024-09-07 13:33:09.698446: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-07 13:33:12.454236: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-07 13:33:13.262151: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-07 13:33:19.281353: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
from tensorflow.keras.layers import Dense,Input,Reshape,Conv2D,BatchNormalization,Lambda,add,Dropout
from tensorflow.keras import regularizers,Model,layers
import tensorflow.keras.backend as K

In [5]:
# Audio sample rate in Hz; tf_fbank derives its frame sizes and mel filterbank from it.
SAMPLE_RATE=16_000
# Default size of the model input along axis 1 (the axis the model averages over as "time").
NUM_FRAMES=160
# Number of mel filterbank bins produced by tf_fbank; default size of the model input along axis 2.
NUM_FBANKS=64

In [6]:
@tf.function
def tf_normalize(data, ndims, eps=0, adjusted=False):
    """Standardize `data` to zero mean / unit standard deviation over its trailing axes.

    Args:
        data: Tensor-like input. It is converted to a tensor and cast to float32.
        ndims: Python int, the number of trailing axes to reduce over.
        eps: Lower bound for the standard deviation. It is only applied when
            positive; with the default of 0 a constant input divides by zero.
        adjusted: When True, the bound is raised to at least ``1 / sqrt(N)``,
            where ``N`` is the number of elements in the trailing `ndims` axes.

    Returns:
        A float32 tensor with the same shape as `data`.
    """
    data = tf.convert_to_tensor(data, name='data')

    reduce_dims = [-i - 1 for i in range(ndims)]
    # pylint: disable=E1123,E1120
    data = tf.cast(data, dtype=tf.dtypes.float32)
    data_mean = tf.reduce_mean(data, axis=reduce_dims, keepdims=True)

    # Apply a minimum normalization that protects us against uniform inputs.
    stddev = tf.math.reduce_std(data, axis=reduce_dims, keepdims=True)
    adjusted_stddev = stddev
    if adjusted:
        # The element count is only needed here. It is read from the dynamic
        # shape so that statically unknown (None) trailing dimensions work too.
        data_num = tf.reduce_prod(tf.shape(data)[-ndims:])
        min_stddev = tf.math.rsqrt(tf.cast(data_num, tf.dtypes.float32))
        eps = tf.maximum(eps, min_stddev)
    if eps > 0:
        adjusted_stddev = tf.maximum(adjusted_stddev, eps)

    return (data - data_mean) / adjusted_stddev


@tf.function
def tf_fbank(samples):
    """
    Compute Mel-filterbank energy features from an audio signal.
    See python_speech_features.fbank

    `samples` is sliced as ``samples[:, 1:]``, so it needs at least two axes
    (presumably (batch, num_samples) -- confirm against the caller). The last
    axis of the result has NUM_FBANKS entries.
    """
    # Analysis window of 0.025 s advanced in steps of 0.01 s, expressed in samples.
    win_len = int(0.025 * SAMPLE_RATE)
    hop_len = int(0.01 * SAMPLE_RATE)
    n_fft = 512
    n_bins = n_fft // 2 + 1

    # First-order pre-emphasis; the result is one sample shorter than the input.
    emphasized = samples[:, 1:] - 0.97 * samples[:, :-1]

    # window_fn=tf.ones gives an all-ones (rectangular) window.
    magnitude = tf.abs(
        tf.signal.stft(
            emphasized,
            frame_length=win_len,
            frame_step=hop_len,
            fft_length=n_fft,
            window_fn=tf.ones,
        )
    )
    power = tf.square(magnitude) / n_fft

    mel_weights = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=NUM_FBANKS,
        num_spectrogram_bins=n_bins,
        sample_rate=SAMPLE_RATE,
        lower_edge_hertz=0,
        upper_edge_hertz=SAMPLE_RATE / 2,
    )

    # Zero energies are returned as-is; no epsilon floor is applied here.
    return tf.matmul(power, mel_weights)


class DeepSpeakerModel:
    """ResCNN speaker model assembled with the Keras functional API.

    The network stacks four stride-2 convolution stages (64, 128, 256 and 512
    filters), each followed by three residual identity blocks. The result is
    averaged over axis 1 and projected to 512 units by a Dense layer. The
    output is either an L2-normalized 512-d vector or, when `include_softmax`
    is set, a softmax over `num_speakers_softmax` classes.

    The finished Keras model is stored in `self.m`. Construction prints every
    intermediate tensor, which is handy when inspecting shapes in a notebook.
    """

    def __init__(
            self,
            batch_input_shape=(None, NUM_FRAMES, NUM_FBANKS, 1),
            include_softmax=False,
            num_speakers_softmax=None,
            pcm_input=False
    ):
        """Build the model graph.

        Args:
            batch_input_shape: Full input shape including the batch axis.
                Ignored when `pcm_input` is True.
            include_softmax: If True, add Dropout before the 512-unit Dense
                layer and finish with a softmax classifier instead of L2
                normalization.
            num_speakers_softmax: Number of softmax classes. Must be a positive
                number when `include_softmax` is True.
            pcm_input: If True, the model takes inputs of shape (None, None)
                and computes normalized filterbank features itself via
                tf_fbank / tf_normalize.

        Raises:
            AssertionError: If `include_softmax` is True and
                `num_speakers_softmax` is missing or not positive.
        """
        if pcm_input:
            # Raw-audio mode discards any caller-supplied shape; it is replaced
            # by (None, None) further down.
            batch_input_shape = None
        self.include_softmax = include_softmax
        if self.include_softmax:
            # Guard against None explicitly so the failure carries a message
            # instead of surfacing as a bare TypeError from the comparison.
            assert num_speakers_softmax is not None and num_speakers_softmax > 0, (
                'num_speakers_softmax must be a positive integer when include_softmax=True'
            )
        # Running index that gives every clipped-ReLU Lambda layer a unique name.
        self.clipped_relu_count = 0

        if pcm_input:
            batch_input_shape = batch_input_shape or (None, None)  # Batch-size, num-samples
            inputs = Input(batch_shape=batch_input_shape, name='raw_inputs')
            x = inputs
            x = Lambda(tf_fbank)(x)
            # Normalize over the last axis only, with a tiny floor on the std.
            x = Lambda(lambda x_: tf_normalize(x_, 1, 1e-12))(x)
            # Append a trailing channel axis for the 2-D convolutions.
            x = Lambda(lambda x_: tf.expand_dims(x_, axis=-1))(x)
            print(f"pcm input shape {x}")
        else:
            batch_input_shape = batch_input_shape or (None, None, NUM_FBANKS, 1)
            inputs = Input(batch_shape=batch_input_shape, name='input')
            x = inputs
            print(f"inputs shape {x}")

        x = self.cnn_component(x)
        print(f"cnn component output shape {x}")

        # Merge the last two axes. 2048 = 4 * 512 for the default 64 filterbanks:
        # four stride-2 stages reduce 64 to 4, and the last stage has 512 filters.
        x = Reshape((-1, 2048))(x)
        print(f" shape after reshape {x}")
        # Temporal average layer. axis=1 is time.
        x = Lambda(lambda y: K.mean(y, axis=1), name='average')(x)
        print(f" shape after average {x}")
        if include_softmax:
            logger.info('Including a Dropout layer to reduce overfitting.')
            # used for softmax because the dataset we pre-train on might be too small. easy to overfit.
            x = Dropout(0.5)(x)
        x = Dense(512, name='affine')(x)
        print(f" shape after dense 512 {x}")
        if include_softmax:
            # Those weights are just when we train on softmax.
            x = Dense(num_speakers_softmax, activation='softmax')(x)
        else:
            # Does not contain any weights.
            x = Lambda(lambda y: K.l2_normalize(y, axis=1), name='ln')(x)
        self.m = Model(inputs, x, name='ResCNN')

    def keras_model(self):
        """Return the underlying Keras `Model`."""
        return self.m

    def get_weights(self):
        """Return the model's weight list, without the softmax layer's weights.

        When the model was built with `include_softmax`, the last two entries
        of the list are dropped.
        """
        w = self.m.get_weights()
        if self.include_softmax:
            w.pop()  # last 2 are the W_softmax and b_softmax.
            w.pop()
        return w

    def clipped_relu(self, inputs):
        """Clamp `inputs` element-wise to [0, 20] via a uniquely named Lambda layer."""
        relu = Lambda(lambda y: K.minimum(K.maximum(y, 0), 20), name=f'clipped_relu_{self.clipped_relu_count}')(inputs)
        print(f" shape after clipped relu {relu}")
        self.clipped_relu_count += 1
        return relu

    def identity_block(self, input_tensor, kernel_size, filters, stage, block):
        """Residual block: two stride-1 conv/BN/clipped-ReLU steps plus a skip connection.

        Args:
            input_tensor: Block input; it is added back to the branch output.
            kernel_size: Kernel size of both convolutions.
            filters: Number of filters of both convolutions.
            stage: Stage identifier used in the layer names.
            block: Block identifier used in the layer names.

        Returns:
            The clipped-ReLU output of the residual sum.
        """
        conv_name_base = f'res{stage}_{block}_branch'
        print(f"shape received in identity block {input_tensor}")
        x = Conv2D(filters,
                   kernel_size=kernel_size,
                   strides=1,
                   activation=None,
                   padding='same',
                   kernel_initializer='glorot_uniform',
                   kernel_regularizer=regularizers.L2(0.0001),
                   name=conv_name_base + '_2a')(input_tensor)
        x = BatchNormalization(name=conv_name_base + '_2a_bn')(x)
        x = self.clipped_relu(x)

        x = Conv2D(
            filters,
            kernel_size=kernel_size,
            strides=1,
            activation=None,
            padding='same',
            kernel_initializer='glorot_uniform',
            kernel_regularizer=regularizers.L2(0.0001),
            name=conv_name_base + '_2b',
        )(x)
        x = BatchNormalization(name=conv_name_base + '_2b_bn')(x)

        x = self.clipped_relu(x)

        x = layers.add([x, input_tensor])
        x = self.clipped_relu(x)
        # Report the block's actual output (this used to print the input tensor).
        print(f"shape output in identity block {x}")
        return x

    def conv_and_res_block(self, inp, filters, stage):
        """One stage: a 5x5 stride-2 conv/BN/clipped-ReLU followed by three identity blocks.

        Args:
            inp: Stage input tensor.
            filters: Number of filters used throughout the stage.
            stage: Stage identifier forwarded to the identity blocks.

        Returns:
            The output of the third identity block.
        """
        conv_name = 'conv{}-s'.format(filters)
        # TODO: why kernel_regularizer?
        print(f"shape received in conv-res-block {inp}")
        o = Conv2D(filters,
                   kernel_size=5,
                   strides=2,
                   activation=None,
                   padding='same',
                   kernel_initializer='glorot_uniform',
                   kernel_regularizer=regularizers.L2(0.0001), name=conv_name)(inp)
        o = BatchNormalization(name=conv_name + '_bn')(o)
        o = self.clipped_relu(o)
        for i in range(3):
            o = self.identity_block(o, kernel_size=3, filters=filters, stage=stage, block=i)
        # Report the stage's actual output (this used to print the input tensor).
        print(f"shape output in conv-res-block {o}")
        return o

    def cnn_component(self, inp):
        """Chain the four stages with 64, 128, 256 and 512 filters."""
        x = self.conv_and_res_block(inp, 64, stage=1)
        x = self.conv_and_res_block(x, 128, stage=2)
        x = self.conv_and_res_block(x, 256, stage=3)
        x = self.conv_and_res_block(x, 512, stage=4)
        return x

    def set_weights(self, w):
        """Assign weights layer by layer.

        The i-th entry of `w` is passed to the i-th layer's `set_weights`;
        pairing stops at the shorter of the two sequences.

        NOTE(review): get_weights() returns the model-level weight list, which
        presumably is not the per-layer structure consumed here -- confirm
        before round-tripping weights between the two methods.
        """
        for layer, layer_w in zip(self.m.layers, w):
            layer.set_weights(layer_w)
            logger.info(f'Setting weights for [{layer.name}]...')

In [7]:
# Build the network with its default arguments; the constructor prints each
# intermediate tensor while the graph is assembled.
model=DeepSpeakerModel()

inputs shape <KerasTensor shape=(None, 160, 64, 1), dtype=float32, sparse=False, name=input>
shape received in conv-res-block <KerasTensor shape=(None, 160, 64, 1), dtype=float32, sparse=False, name=input>


I0000 00:00:1725716106.600408   58320 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1725716113.973621   58320 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1725716113.973759   58320 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1725716113.991509   58320 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1725716113.991627   58320 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

 shape after clipped relu <KerasTensor shape=(None, 80, 32, 64), dtype=float32, sparse=False, name=keras_tensor_4>
shape received in identity block <KerasTensor shape=(None, 80, 32, 64), dtype=float32, sparse=False, name=keras_tensor_4>
 shape after clipped relu <KerasTensor shape=(None, 80, 32, 64), dtype=float32, sparse=False, name=keras_tensor_9>
 shape after clipped relu <KerasTensor shape=(None, 80, 32, 64), dtype=float32, sparse=False, name=keras_tensor_14>
 shape after clipped relu <KerasTensor shape=(None, 80, 32, 64), dtype=float32, sparse=False, name=keras_tensor_18>
shape output in identity block <KerasTensor shape=(None, 80, 32, 64), dtype=float32, sparse=False, name=keras_tensor_4>
shape received in identity block <KerasTensor shape=(None, 80, 32, 64), dtype=float32, sparse=False, name=keras_tensor_18>
 shape after clipped relu <KerasTensor shape=(None, 80, 32, 64), dtype=float32, sparse=False, name=keras_tensor_23>
 shape after clipped relu <KerasTensor shape=(None, 80, 3

In [8]:
# Print the summary of the wrapped Keras model.
model.m.summary()

In [21]:
import numpy as np  

In [22]:
# Dummy batch of 10 random feature maps used to probe layer output shapes below.
# The shape is taken from the notebook constants instead of repeating 160/64, and
# the data is float32 rather than NumPy's default float64.
x = tf.constant(np.random.rand(10, NUM_FRAMES, NUM_FBANKS, 1).astype(np.float32))

In [24]:
# Sanity check of the dummy batch shape.
x.shape

TensorShape([10, 160, 64, 1])

In [23]:
def clipped_relu(inputs):
    """Clamp `inputs` element-wise to the range [0, 20] through an unnamed Lambda layer."""
    clamp = Lambda(lambda t: K.minimum(K.maximum(t, 0), 20))
    return clamp(inputs)

In [44]:
def identity_block(input_tensor, kernel_size, filters, stage, block):
    """Stand-alone residual block: two conv/BN/clipped-ReLU steps plus a skip connection.

    Args:
        input_tensor: Block input; it is added back to the branch output.
        kernel_size: Kernel size of both convolutions.
        filters: Number of filters of both convolutions.
        stage: Stage identifier used in the layer names.
        block: Block identifier used in the layer names.

    Returns:
        The clipped-ReLU output of the residual sum.
    """
    prefix = f'res{stage}_{block}_branch'

    def _conv_bn_relu(tensor, suffix):
        # Stride-1 'same' convolution, then batch norm, then the clipped ReLU.
        conv = Conv2D(
            filters,
            kernel_size=kernel_size,
            strides=1,
            activation=None,
            padding='same',
            kernel_initializer='glorot_uniform',
            kernel_regularizer=regularizers.L2(0.0001),
            name=prefix + suffix,
        )
        normed = BatchNormalization(name=prefix + suffix + '_bn')(conv(tensor))
        return clipped_relu(normed)

    branch = _conv_bn_relu(input_tensor, '_2a')
    branch = _conv_bn_relu(branch, '_2b')

    # Skip connection followed by a final clipped ReLU.
    merged = layers.add([branch, input_tensor])
    return clipped_relu(merged)

In [45]:
def conv_and_res_block(inp, filters, stage):
    """Stand-alone stage: a 5x5 stride-2 conv/BN/clipped-ReLU, then three identity blocks.

    Args:
        inp: Stage input tensor.
        filters: Number of filters used throughout the stage.
        stage: Stage identifier forwarded to the identity blocks.

    Returns:
        The output of the third identity block.
    """
    layer_name = f'conv{filters}-s'
    # TODO: why kernel_regularizer?
    downsample = Conv2D(
        filters,
        kernel_size=5,
        strides=2,
        activation=None,
        padding='same',
        kernel_initializer='glorot_uniform',
        kernel_regularizer=regularizers.L2(0.0001),
        name=layer_name,
    )
    out = downsample(inp)
    out = BatchNormalization(name=layer_name + '_bn')(out)
    out = clipped_relu(out)
    for block_idx in (0, 1, 2):
        out = identity_block(out, kernel_size=3, filters=filters, stage=stage, block=block_idx)
    return out


In [47]:
def cnn_component(inp):
    """Chain four conv-and-residual stages with 64, 128, 256 and 512 filters (stages 1-4)."""
    out = inp
    for stage_no, width in enumerate((64, 128, 256, 512), start=1):
        out = conv_and_res_block(out, width, stage=stage_no)
    return out

In [48]:
# Re-check the input shape before probing the individual blocks.
x.shape

TensorShape([10, 160, 64, 1])

In [54]:
# Probe a single identity block: stride 1 with 'same' padding keeps the 160x64
# spatial size, and the output ends up with `filters` (64) channels even though
# the skip input `x` has a single channel.
identity_out=identity_block(x,kernel_size=3,filters=64,stage=1,block=1)
# identity_out=identity_block(identity_out,kernel_size=3,filters=64,stage=2,block=2)
identity_out.shape

TensorShape([10, 160, 64, 64])

In [59]:
# Probe one full stage: the stride-2 convolution halves both spatial axes
# (160x64 -> 80x32) and the identity blocks keep that size.
conv_res_block_out=conv_and_res_block(x,64,stage=1)
conv_res_block_out.shape

TensorShape([10, 80, 32, 64])

In [55]:
# Isolated 3x3 stride-1 convolution with the same settings as in identity_block,
# to confirm that 'same' padding preserves the spatial size.
conv_out= Conv2D(64,
                   kernel_size=3,
                   strides=1,
                   activation=None,
                   padding='same',
                   kernel_initializer='glorot_uniform',
                   kernel_regularizer=regularizers.L2(0.0001))(x)
conv_out.shape

TensorShape([10, 160, 64, 64])

In [58]:
# Input shape for comparison with the convolution outputs in the neighbouring cells.
x.shape

TensorShape([10, 160, 64, 1])

In [57]:
# Isolated 5x5 stride-2 convolution with the same settings as in conv_and_res_block,
# to confirm that it halves both spatial axes.
o = Conv2D(64,
            kernel_size=5,
            strides=2,
            activation=None,
            padding='same',
            kernel_initializer='glorot_uniform',
            kernel_regularizer=regularizers.L2(0.0001))(x)
o.shape 

TensorShape([10, 80, 32, 64])

In [60]:
# Run the full CNN stack on the dummy batch and compare input and output shapes;
# four stride-2 stages shrink each spatial axis by a factor of 16.
print(x.shape)
cn_out=cnn_component(x)
print(cn_out.shape)

(10, 160, 64, 1)


W0000 00:00:1725710254.946217    7260 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1725710254.976729    7260 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1725710254.995916    7260 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1725710255.014089    7260 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1725710255.031363    7260 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1725710255.050101    7260 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1725710255.068593    7260 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1725710255.086908    7260 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1725710255.104718    7260 gp

(10, 10, 4, 512)


In [61]:
# Merge the last two axes of the CNN output (4 * 512 = 2048), keeping axis 1,
# as the model does before averaging.
print(cn_out.shape)
rs=Reshape((-1,2048))(cn_out)
print(rs.shape)

(10, 10, 4, 512)
(10, 10, 2048)


In [62]:
# Average over axis 1, giving one 2048-d vector per batch item.
# NOTE(review): this rebinds `rs`, so the cell is not idempotent -- running it a
# second time would average the already-reduced (10, 2048) tensor over axis 1 again.
rs = Lambda(lambda y: K.mean(y, axis=1), name='average')(rs)
print(rs.shape)

(10, 2048)
