Group No : 28

| BITS ID       | Name                         | Contribution |
|---------------|------------------------------|-------------|
| 2024aa05366   | JOKARE MAHESH SHIVANAND      | 100%        |
| 2024aa05367   | ROHIT SANWARIYA              | 100%        |
| 2024aa05369   | ANEESH K.V.                  | 100%        |
| 2024aa05370   | JAISINGHANI ANJALI CHANDER VARSHA | 100%       |

In [6]:
# Import required libraries
import tensorflow as tf
from tensorflow import keras
from keras import layers
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
import pickle
import os
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
# Set random seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)

print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.10.0


In [13]:
# Create synthetic speech dataset for demonstration
def create_synthetic_speech_dataset(num_samples=1000, seq_length=8000):
    """Create synthetic audio data for speech recognition demo"""
    print("Creating synthetic speech dataset...")
    
    # Generate synthetic audio (mixture of sine waves)
    X = []
    y = []
    
    # Simple vocabulary for demonstration
    vocab = ['hello', 'world', 'ai', 'ml', 'test', 'data', 'model']
    
    for i in range(num_samples):
        # Create synthetic audio (multiple sine waves)
        t = np.linspace(0, 1, seq_length)
        freq1 = 100 + np.random.randint(0, 200)
        freq2 = 300 + np.random.randint(0, 200)
        
        audio = (0.5 * np.sin(2 * np.pi * freq1 * t) + 
                 0.3 * np.sin(2 * np.pi * freq2 * t) +
                 0.1 * np.random.randn(seq_length))
        
        # Normalize
        audio = audio / np.max(np.abs(audio))
        
        # Create simple text label
        text_label = ' '.join(np.random.choice(vocab, size=2))
        
        X.append(audio)
        y.append(text_label)
    
    return np.array(X), np.array(y)

# Generate dataset
X, y = create_synthetic_speech_dataset(1000)
X = np.expand_dims(X, -1)  # Add channel dimension

print(f"Dataset shape: {X.shape}")
print(f"Sample labels: {y[:5]}")

Creating synthetic speech dataset...
Dataset shape: (1000, 8000, 1)
Sample labels: ['model model' 'test data' 'ai world' 'world world' 'model ai']


In [15]:
# Text preprocessing
def create_text_processor(texts):
    """Create text tokenizer and preprocess labels"""
    # Create character-level tokenizer
    tokenizer = keras.preprocessing.text.Tokenizer(
        char_level=True,
        oov_token='<UNK>',
        filters=''
    )
    
    tokenizer.fit_on_texts(texts)
    
    # Convert texts to sequences
    sequences = tokenizer.texts_to_sequences(texts)
    
    # Pad sequences
    max_len = max(len(seq) for seq in sequences)
    sequences_padded = keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=max_len, padding='post'
    )
    
    return sequences_padded, tokenizer, max_len

# Process text labels
y_processed, tokenizer, max_text_len = create_text_processor(y)
vocab_size = len(tokenizer.word_index) + 1

print(f"Processed labels shape: {y_processed.shape}")
print(f"Vocabulary size: {vocab_size}")
print(f"Max text length: {max_text_len}")

Processed labels shape: (1000, 11)
Vocabulary size: 15
Max text length: 11


In [16]:
# Conformer components
class ConformerBlock(layers.Layer):
    """Conformer block as described in the paper"""
    def __init__(self, embed_dim, num_heads, ff_dim, conv_kernel_size=32, dropout_rate=0.1):
        super(ConformerBlock, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        
        # First feed-forward layer
        self.ffn1 = keras.Sequential([
            layers.Dense(ff_dim, activation='swish'),
            layers.Dropout(dropout_rate),
            layers.Dense(embed_dim),
            layers.Dropout(dropout_rate)
        ])
        
        # Self-attention layer
        self.self_attention = layers.MultiHeadAttention(
            num_heads=num_heads, 
            key_dim=embed_dim//num_heads,
            dropout=dropout_rate
        )
        self.attention_norm = layers.LayerNormalization(epsilon=1e-6)
        
        # Convolution module
        self.conv_module = keras.Sequential([
            layers.LayerNormalization(epsilon=1e-6),
            layers.Conv1D(embed_dim, 1),  # Pointwise conv
            layers.Activation('swish'),
            layers.DepthwiseConv1D(conv_kernel_size, padding='same'),
            layers.BatchNormalization(),
            layers.Activation('swish'),
            layers.Conv1D(embed_dim, 1),  # Pointwise conv
            layers.Dropout(dropout_rate)
        ])
        self.conv_norm = layers.LayerNormalization(epsilon=1e-6)
        
        # Second feed-forward layer
        self.ffn2 = keras.Sequential([
            layers.Dense(ff_dim, activation='swish'),
            layers.Dropout(dropout_rate),
            layers.Dense(embed_dim),
            layers.Dropout(dropout_rate)
        ])
        self.final_norm = layers.LayerNormalization(epsilon=1e-6)
        
    def call(self, inputs, training=False):
        # First FFN (half-step residual)
        ffn1_out = self.ffn1(inputs, training=training)
        x = inputs + 0.5 * ffn1_out
        
        # Self-attention
        attn_output = self.self_attention(x, x, training=training)
        x = self.attention_norm(x + attn_output)
        
        # Convolution module
        conv_output = self.conv_module(x, training=training)
        x = self.conv_norm(x + conv_output)
        
        # Second FFN (half-step residual)
        ffn2_out = self.ffn2(x, training=training)
        x = x + 0.5 * ffn2_out
        
        return self.final_norm(x)

In [17]:
def build_conformer_model(input_shape, vocab_size, max_text_len, 
                         embed_dim=144, num_heads=4, ff_dim=576, 
                         num_blocks=4, conv_kernel_size=32):
    """Build complete Conformer model for speech recognition"""
    
    # Input layer
    audio_input = layers.Input(shape=input_shape, name='audio_input')
    
    # Initial feature extraction
    x = layers.Conv1D(embed_dim, 3, padding='same')(audio_input)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('swish')(x)
    
    # Add positional encoding
    positions = tf.range(start=0, limit=tf.shape(x)[1], delta=1)
    position_embedding = layers.Embedding(input_shape[0], embed_dim)(positions)
    x = x + position_embedding
    
    # Stack Conformer blocks
    for _ in range(num_blocks):
        x = ConformerBlock(embed_dim, num_heads, ff_dim, conv_kernel_size)(x)
    
    # Output layers
    x = layers.Dense(embed_dim, activation='swish')(x)
    x = layers.Dropout(0.2)(x)
    
    # CTC output layer
    output = layers.Dense(vocab_size, activation='softmax', name='ctc_output')(x)
    
    model = keras.Model(inputs=audio_input, outputs=output)
    return model

# Build model
input_shape = (X.shape[1], X.shape[2])
model = build_conformer_model(input_shape, vocab_size, max_text_len)

print("Conformer Model Summary:")
model.summary()

Conformer Model Summary:
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 audio_input (InputLayer)       [(None, 8000, 1)]    0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 8000, 144)    576         ['audio_input[0][0]']            
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 audio_input (InputLayer)       [(None, 8000, 1)]    0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 8000, 144)    576         ['au

In [21]:
# CTC loss implementation
def ctc_loss(y_true, y_pred):
    """Custom CTC loss function"""
    batch_size = tf.shape(y_true)[0]
    input_length = tf.shape(y_pred)[1]
    label_length = tf.shape(y_true)[1]
    
    input_length = input_length * tf.ones(shape=(batch_size, 1), dtype='int32')
    label_length = label_length * tf.ones(shape=(batch_size, 1), dtype='int32')
    
    return keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)

# Custom accuracy metric for CTC
# Fixed CTC accuracy metric
class CTCAccuracy(keras.metrics.Metric):
    def __init__(self, name='ctc_accuracy', **kwargs):
        super(CTCAccuracy, self).__init__(name=name, **kwargs)
        self.correct_count = self.add_weight(name='correct', initializer='zeros')
        self.total_count = self.add_weight(name='total', initializer='zeros')
    
    def update_state(self, y_true, y_pred, sample_weight=None):
        # Decode predictions
        input_length = tf.shape(y_pred)[1]
        input_length = input_length * tf.ones(shape=(tf.shape(y_pred)[0],), dtype='int32')
        
        decoded, _ = tf.nn.ctc_greedy_decoder(
            tf.transpose(y_pred, perm=[1, 0, 2]),
            input_length
        )
        
        # Convert to dense and ensure same type
        decoded = tf.sparse.to_dense(decoded[0], default_value=-1)
        decoded = tf.cast(decoded, tf.int32)  # Convert to int32
        
        # Ensure y_true is also int32
        y_true = tf.cast(y_true, tf.int32)
        
        # Calculate accuracy - handle variable length sequences
        correct = tf.reduce_all(
            tf.equal(decoded[:, :tf.shape(y_true)[1]], y_true), 
            axis=1
        )
        correct = tf.cast(correct, tf.float32)
        
        self.correct_count.assign_add(tf.reduce_sum(correct))
        self.total_count.assign_add(tf.cast(tf.shape(y_true)[0], tf.float32))
    
    def result(self):
        return self.correct_count / self.total_count
    
    def reset_states(self):
        self.correct_count.assign(0.)
        self.total_count.assign(0.)

# Alternative simpler CTC accuracy metric (if above still has issues)
class SimpleCTCAccuracy(keras.metrics.Metric):
    def __init__(self, name='ctc_accuracy', **kwargs):
        super(SimpleCTCAccuracy, self).__init__(name=name, **kwargs)
        self.correct_count = self.add_weight(name='correct', initializer='zeros')
        self.total_count = self.add_weight(name='total', initializer='zeros')
    
    def update_state(self, y_true, y_pred, sample_weight=None):
        # Simple implementation - just track if we can decode something
        input_length = tf.shape(y_pred)[1]
        input_length = input_length * tf.ones(shape=(tf.shape(y_pred)[0],), dtype='int32')
        
        decoded, _ = tf.nn.ctc_greedy_decoder(
            tf.transpose(y_pred, perm=[1, 0, 2]),
            input_length
        )
        
        # Just check if we decoded anything (simplified)
        decoded_dense = tf.sparse.to_dense(decoded[0], default_value=-1)
        has_prediction = tf.cast(tf.reduce_any(decoded_dense != -1, axis=1), tf.float32)
        
        self.correct_count.assign_add(tf.reduce_sum(has_prediction))
        self.total_count.assign_add(tf.cast(tf.shape(y_true)[0], tf.float32))
    
    def result(self):
        return self.correct_count / self.total_count if self.total_count > 0 else 0.0
    
    def reset_states(self):
        self.correct_count.assign(0.)
        self.total_count.assign(0.)
    def __init__(self, name='ctc_accuracy', **kwargs):
        super(CTCAccuracy, self).__init__(name=name, **kwargs)
        self.correct_count = self.add_weight(name='correct', initializer='zeros')
        self.total_count = self.add_weight(name='total', initializer='zeros')
    
    def update_state(self, y_true, y_pred, sample_weight=None):
        # Decode predictions
        input_length = tf.shape(y_pred)[1]
        input_length = input_length * tf.ones(shape=(tf.shape(y_pred)[0],), dtype='int32')
        
        decoded, _ = tf.nn.ctc_greedy_decoder(
            tf.transpose(y_pred, perm=[1, 0, 2]),
            input_length
        )
        
        # Compare with true labels
        y_true = tf.cast(y_true, tf.int32)
        decoded = tf.sparse.to_dense(decoded[0], default_value=-1)
        
        # Calculate accuracy
        correct = tf.reduce_all(tf.equal(decoded, y_true), axis=1)
        correct = tf.cast(correct, tf.float32)
        
        self.correct_count.assign_add(tf.reduce_sum(correct))
        self.total_count.assign_add(tf.cast(tf.shape(y_true)[0], tf.float32))
    
    def result(self):
        return self.correct_count / self.total_count
    
    def reset_states(self):
        self.correct_count.assign(0.)
        self.total_count.assign(0.)

In [22]:
# Compile model
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=ctc_loss,
    metrics=[CTCAccuracy()]
)

# Split data
X_train, X_val, y_train, y_val = train_test_split(
    X, y_processed, test_size=0.2, random_state=42
)

print(f"Training data: {X_train.shape}, {y_train.shape}")
print(f"Validation data: {X_val.shape}, {y_val.shape}")

# Train model
print("Training Conformer model...")
history = model.fit(
    X_train, y_train,
    batch_size=8,
    epochs=15,
    validation_data=(X_val, y_val),
    callbacks=[
        keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
        keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=2)
    ],
    verbose=1
)

Training data: (800, 8000, 1), (800, 11)
Validation data: (200, 8000, 1), (200, 11)
Training Conformer model...
Epoch 1/15


ResourceExhaustedError: Graph execution error:

Detected at node 'model/conformer_block/multi_head_attention/einsum/Einsum' defined at (most recent call last):
    File "c:\Users\rohit\.conda\envs\tf\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\Users\rohit\.conda\envs\tf\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
      app.start()
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\ipykernel\kernelapp.py", line 739, in start
      self.io_loop.start()
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\tornado\platform\asyncio.py", line 211, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\rohit\.conda\envs\tf\lib\asyncio\base_events.py", line 603, in run_forever
      self._run_once()
    File "c:\Users\rohit\.conda\envs\tf\lib\asyncio\base_events.py", line 1909, in _run_once
      handle._run()
    File "c:\Users\rohit\.conda\envs\tf\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 519, in dispatch_queue
      await self.process_one()
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 508, in process_one
      await dispatch(*args)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 400, in dispatch_shell
      await result
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\ipykernel\ipkernel.py", line 368, in execute_request
      await super().execute_request(stream, ident, parent)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 767, in execute_request
      reply_content = await reply_content
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\ipykernel\ipkernel.py", line 455, in do_execute
      res = shell.run_cell(
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\ipykernel\zmqshell.py", line 577, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3077, in run_cell
      result = self._run_cell(
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3132, in _run_cell
      result = runner(coro)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\IPython\core\async_helpers.py", line 128, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3336, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3519, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3579, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\rohit\AppData\Local\Temp\ipykernel_11616\1885145611.py", line 18, in <module>
      history = model.fit(
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\engine\training.py", line 993, in train_step
      y_pred = self(x, training=True)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\engine\functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\engine\functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\rohit\AppData\Local\Temp\ipykernel_11616\3308336230.py", line 54, in call
      attn_output = self.self_attention(x, x, training=training)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\layers\attention\multi_head_attention.py", line 596, in call
      attention_output, attention_scores = self._compute_attention(
    File "c:\Users\rohit\.conda\envs\tf\lib\site-packages\keras\layers\attention\multi_head_attention.py", line 525, in _compute_attention
      attention_scores = tf.einsum(self._dot_product_equation, key, query)
Node: 'model/conformer_block/multi_head_attention/einsum/Einsum'
OOM when allocating tensor with shape[8,4,8000,8000] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator mklcpu
	 [[{{node model/conformer_block/multi_head_attention/einsum/Einsum}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_17872]

In [None]:
# Evaluate model
print("Evaluating model...")
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=0)
print(f"Validation CTC Accuracy: {val_accuracy:.4f}")
print(f"Validation Loss: {val_loss:.4f}")

# Plot training history
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['ctc_accuracy'], label='Training Accuracy')
plt.plot(history.history['val_ctc_accuracy'], label='Validation Accuracy')
plt.title('CTC Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('CTC Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.savefig('conformer_training.png')
plt.show()

In [None]:
# Prediction function
def predict_speech(audio_sample, model, tokenizer):
    """Predict text from audio sample"""
    # Add batch dimension
    audio_sample = np.expand_dims(audio_sample, axis=0)
    
    # Predict
    predictions = model.predict(audio_sample, verbose=0)
    
    # Decode using greedy decoder
    input_length = np.ones(predictions.shape[0]) * predictions.shape[1]
    decoded, _ = tf.nn.ctc_greedy_decoder(
        tf.transpose(predictions, perm=[1, 0, 2]),
        input_length
    )
    
    # Convert to text
    decoded = tf.sparse.to_dense(decoded[0]).numpy()
    predicted_text = tokenizer.sequences_to_texts(decoded)[0]
    
    return predicted_text

# Test prediction
test_sample = X_val[0]
true_text = tokenizer.sequences_to_texts([y_val[0]])[0]
predicted_text = predict_speech(test_sample, model, tokenizer)

print(f"True text: {true_text}")
print(f"Predicted text: {predicted_text}")

# Save model
model.save('conformer_speech_model.h5')
print("Model saved as 'conformer_speech_model.h5'")