In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import audio_dataset_from_directory
import numpy as np
import librosa

In [None]:

def load_audio_file(file_path):
    """Read a WAV file and return its first channel plus the sample rate.

    Args:
        file_path: Path of the file to read; the content is assumed to be WAV
            because it is handed straight to ``tf.audio.decode_wav``.

    Returns:
        A ``(audio, sample_rate)`` pair: column 0 of the decoded audio and the
        sample rate exactly as reported by ``tf.audio.decode_wav``.
    """
    wav_bytes = tf.io.read_file(file_path)
    decoded_samples, rate = tf.audio.decode_wav(wav_bytes)
    # Index 0 on the last axis keeps the first channel only, so mono and
    # stereo inputs both come back without a channel axis.
    first_channel = decoded_samples[:, 0]
    return first_channel, rate


def preprocess_audio_mfps(audio, sample_rate):
    """Turn a 1-D waveform into a batched mel spectrogram.

    Args:
        audio: Waveform samples; cast to float32 and divided by 32768.0.
        sample_rate: Sample rate passed to the mel filterbank construction.

    Returns:
        A float32 tensor reshaped to ``[1, frames, 128]``.

    NOTE(review): the division by 32768.0 presumably expects int16-range
    samples; if ``audio`` comes from ``tf.audio.decode_wav`` it may already be
    scaled -- confirm against the training pipeline before changing it.
    NOTE(review): the spectrogram is ``tf.abs`` of the STFT (no squaring), even
    though the name says "power spectra".
    """
    scaled_audio = tf.cast(audio, tf.float32) / 32768.0

    def _extract_mel(waveform):
        # Short-time Fourier transform: 1024-sample frames, hop of 512.
        stft_frames = tf.signal.stft(
            waveform, frame_length=1024, frame_step=512, fft_length=1024)
        magnitudes = tf.abs(stft_frames)

        # Mel filterbank settings.
        n_mel_bins = 128
        low_hz = 80.0
        high_hz = 7600.0
        n_linear_bins = stft_frames.shape[-1]
        mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=n_mel_bins,
            num_spectrogram_bins=n_linear_bins,
            sample_rate=sample_rate,
            lower_edge_hertz=low_hz,
            upper_edge_hertz=high_hz,
        )

        # Project linear-frequency bins onto the mel scale, then add a
        # leading batch axis of size 1.
        mel = tf.tensordot(magnitudes, mel_filterbank, 1)
        return tf.reshape(mel, [1, -1, n_mel_bins])

    # The extraction is executed through tf.py_function, as in the original.
    return tf.py_function(_extract_mel, [scaled_audio], tf.float32)

def pad_sequence(seq, target_length=None):
    """Zero-pad (or clip) ``seq`` along axis 1 to exactly ``target_length``.

    Args:
        seq: Rank-3 tensor; axis 1 is the one being padded.
        target_length: Desired size of axis 1. When omitted, the module-level
            ``max_length`` is used, which keeps existing one-argument calls
            working unchanged.

    Returns:
        A tensor whose axis 1 has ``target_length`` entries: the original
        values followed by zeros, or the first ``target_length`` entries when
        the input is longer.
    """
    if target_length is None:
        # Backward-compatible default: the notebook-level constant, which must
        # have been assigned before the first call.
        target_length = max_length

    # Clip first so the padding amount below can never be negative; a negative
    # amount is what the previous version produced for over-long inputs.
    seq = seq[:, :target_length, :]
    missing = target_length - tf.shape(seq)[1]
    padded_seq = tf.pad(seq, paddings=[[0, 0], [0, missing], [0, 0]], constant_values=0)
    return padded_seq


In [None]:
def contrastive_loss(y_true, y_pred, margin=1):
    """Contrastive loss over a batch of pair scores.

    ``y_pred`` is used directly (no distance is computed here): pairs with
    ``y_true == 1`` contribute ``y_pred ** 2`` and pairs with ``y_true == 0``
    contribute ``max(margin - y_pred, 0) ** 2``.

    Args:
        y_true: Pair labels used as weights for the two terms.
        y_pred: Per-pair score produced by the model.
        margin: Threshold below which ``y_true == 0`` pairs are penalised.

    Returns:
        The mean of the combined per-pair terms.
    """
    positive_term = y_true * tf.square(y_pred)
    # Only scores that fall short of the margin produce a non-zero penalty.
    shortfall = tf.maximum(margin - y_pred, 0)
    negative_term = (1 - y_true) * tf.square(shortfall)
    return tf.reduce_mean(positive_term + negative_term)

# Must be available by name when the saved model is loaded, because it is
# registered in `custom_objects` for `load_model`.
def cosine_similarity(vectors):
    """Cosine similarity between two batches of vectors.

    Implemented with ``tf`` ops rather than the Keras backend alias ``K`` so
    the function only depends on the ``tensorflow`` import in the first cell,
    not on an import that happens in a later cell.

    Args:
        vectors: A pair ``(x, y)`` of tensors compared along their last axis.

    Returns:
        The sum over the last axis of the element-wise product of the
        L2-normalised inputs, with that axis kept as size 1.
    """
    x, y = vectors
    x = tf.math.l2_normalize(x, axis=-1)
    y = tf.math.l2_normalize(y, axis=-1)
    return tf.reduce_sum(x * y, axis=-1, keepdims=True)


In [None]:
# Source recording in M4A format; it is decoded and converted to WAV below.
file_name_m4a = "/content/Achtee-irfan.m4a"
# Destination of the WAV conversion; this is the file later fed to the model.
file_name_wav = "/content/Achtee-irfan.wav"
# Second WAV file that the recording is compared against.
base_file = "/content/Q1201502.wav"

In [None]:
import soundfile as sf
import audioread
import numpy as np

def load_m4a(filename):
    """Decode an audio file with audioread into 16-bit PCM samples.

    Args:
        filename: Path handed to ``audioread.audio_open``.

    Returns:
        ``(data, samplerate)``. ``data`` is a 1-D int16 array for
        single-channel audio (as before) and a ``(frames, channels)`` array
        for multi-channel audio, so the channel layout is no longer flattened
        into one interleaved stream. ``samplerate`` is the value reported by
        the opened file.
    """
    with audioread.audio_open(filename) as f:
        channels = max(int(f.channels), 1)

        # Join the raw bytes before interpreting them: this copes with a
        # stream that yields no chunks and with a chunk that ends in the
        # middle of a sample or frame.
        raw = b"".join(bytes(chunk) for chunk in f)

        # Drop a trailing partial frame so the reshape below is always valid.
        frame_bytes = 2 * channels
        raw = raw[:len(raw) - len(raw) % frame_bytes]

        # '<i2' = 16-bit little-endian signed samples; copy() returns a
        # writable array, as the previous hstack-based version did.
        data = np.frombuffer(raw, dtype='<i2').copy()
        if channels > 1:
            # Samples are interleaved per frame: one row per frame.
            data = data.reshape(-1, channels)
        return data, f.samplerate

# Decode the M4A recording into PCM samples plus its sample rate
data, samplerate = load_m4a(file_name_m4a)

# Store the samples as a 16-bit PCM WAV file through soundfile
sf.write(
    file=file_name_wav,
    data=data,
    samplerate=samplerate,
    subtype='PCM_16',
    format='WAV',
)


In [None]:
# Compare the converted recording with the base file using the saved Siamese model.

# Number of frames every input is padded to; pad_sequence reads this
# module-level name.
max_length = 402


def _load_and_prepare(path):
    """Run one WAV file through load -> mel features -> padding; return every stage."""
    waveform, rate = load_audio_file(path)
    features = preprocess_audio_mfps(waveform, rate)
    return waveform, rate, features, pad_sequence(features)


file_path = file_name_wav
audio, sample_rate, processed_audio, processed_padded = _load_and_prepare(file_path)

file_path_base = base_file
(audio_base, sample_rate_base,
 processed_audio_base, processed_padded_base) = _load_and_prepare(file_path_base)

from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K  # functions defined in earlier cells may look up `K` at call time

# The saved model references these functions by name, so both have to be
# registered when it is deserialised.
siamese_model = load_model(
    'best_model_5-9.h5',
    custom_objects={
        'contrastive_loss': contrastive_loss,
        'cosine_similarity': cosine_similarity,
    },
)

# The model is called with the two padded inputs as a pair.
output = siamese_model([processed_padded, processed_padded_base])

# Show the raw model output for the pair.
print("Model output:", output.numpy())

Model output: [[0.76158553]]


In [None]:
# TODO: color-code which letter or sound is being pronounced incorrectly.