In [245]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Input, LSTM, Dense, TimeDistributed, Flatten
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt

# Ensure GPU is used
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    print(f"Using GPU: {physical_devices[0]}")
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
else:
    print("No GPU found. Ensure CUDA and cuDNN are correctly installed.")

# Constants
SAMPLE_RATE = 22050  # Sample rate for audio
N_MELS = 128         # Number of mel bands
DURATION = 2         # Fixed audio duration in seconds
HOP_LENGTH = 512     # Hop length for spectrogram
INPUT_SHAPE = (None, 128, 216, 3) # Input shape for MobileNetV2 (3 channels)
INPUT_SIZE = 82944
LSTM_UNITS = 128     # LSTM units

# Preprocessing Function
def preprocess_audio(file_path, sample_rate=SAMPLE_RATE, duration=DURATION):
    """Load audio file and convert to mel spectrogram."""
    y, sr = librosa.load(file_path, sr=sample_rate, duration=duration)
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS, hop_length=HOP_LENGTH)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
    
    # Pad or truncate the spectrogram to match the desired shape (N_MELS, 216)
    target_length = 216  # Set the desired number of time steps
    if mel_spectrogram_db.shape[1] < target_length:
        # Pad the spectrogram with zeros if it's shorter
        pad_width = target_length - mel_spectrogram_db.shape[1]
        mel_spectrogram_db = np.pad(mel_spectrogram_db, ((0, 0), (0, pad_width)), mode='constant')
    elif mel_spectrogram_db.shape[1] > target_length:
        # Truncate the spectrogram if it's longer
        mel_spectrogram_db = mel_spectrogram_db[:, :target_length]
    
    # Convert to 3 channels by repeating the single channel spectrogram
    mel_spectrogram_db_3_channels = np.repeat(mel_spectrogram_db[:, :, np.newaxis], 3, axis=-1)
    
    return mel_spectrogram_db_3_channels

# Dataset Loader with Debugging
def load_dataset(file_paths):
    """Load dataset and preprocess into spectrograms."""
    spectrograms = []
    for file_path in file_paths:
        print(f"Loading file: {file_path}")  # Debugging: print the file being processed
        if file_path.endswith((".mp3", ".wav", ".flac")):  # Check for multiple formats
            if os.path.exists(file_path):
                spectrogram = preprocess_audio(file_path)
                print(f"Spectrogram shape for {file_path}: {spectrogram.shape}")  # Debugging: check shape
                spectrograms.append(spectrogram)
            else:
                print(f"File does not exist: {file_path}")
        else:
            print(f"Skipping unsupported file type: {file_path}")
    
    # Convert list to numpy array
    return np.array(spectrograms)

# MobileNetV2 for Feature Extraction
def build_feature_extractor(input_shape):
    """Build MobileNetV2 feature extractor."""
    base_model = MobileNetV2(input_shape=input_shape, include_top=False, weights="imagenet")
    base_model.trainable = False  # Freeze weights
    return base_model

# Full Model
def build_model(input_shape, lstm_units):
    """Build the full model."""
    input_layer = Input(shape=(None, 128, 216, 3))
   
    # Feature extraction with MobileNetV2
    feature_extractor = build_feature_extractor(input_shape)
    features = TimeDistributed(feature_extractor)(input_layer)  # Apply feature extractor for each time step
    flattened_features = TimeDistributed(Flatten())(features)   # Flatten features for LSTM
   
    # Temporal modeling with LSTM
    lstm_output = LSTM(lstm_units, return_sequences=False)(flattened_features)
   
    # Output embedding
    output_layer = Dense(128, activation="relu", name="embedding")(lstm_output)
   
    model = Model(inputs=input_layer, outputs=output_layer)
    return model

# Example Workflow
dataset_path = "cv-corpus-7.0-singleword/ta"  # Update this to the location of your dataset
audio_files = os.path.join(dataset_path, "clips")
metadata_file = os.path.join(dataset_path, "validated.tsv")  # Replace if different

# Load metadata
metadata = pd.read_csv(metadata_file, sep="\t")
metadata["path"] = metadata["path"].apply(lambda x: os.path.join(audio_files, x))

# Print first few paths for debugging
print(metadata["path"].head())

# # Load spectrograms (pass individual file paths to load_dataset)
# spectrograms = load_dataset(metadata["path"].values)

# # Normalize Spectrograms
# spectrograms = spectrograms / np.max(spectrograms)

# # Add time dimension (even if only 1 time step)
# spectrograms = np.expand_dims(spectrograms, axis=1)

# # Build and Compile Model

# model = build_model((128, 216, 3), LSTM_UNITS)  # Adjust input shape to (time_steps, height, width, channels)
# model.compile(optimizer="adam", loss="mse")
# model.summary()

# Load spectrograms (pass individual file paths to load_dataset)
spectrograms = load_dataset(metadata["path"].values)

# Normalize Spectrograms
spectrograms = spectrograms / np.max(spectrograms)

# Add time dimension (even if only 1 time step)
spectrograms = np.expand_dims(spectrograms, axis=1)

# Build and Compile Model
model = build_model((128, 216, 3), LSTM_UNITS)  # Adjust input shape to (time_steps, height, width, channels)
model.compile(optimizer="adam", loss="mse")
model.summary()

# Generate Embeddings
embeddings = model.predict(spectrograms, batch_size=8)  # Adjust batch size if needed
print("Embeddings:", embeddings.shape)
print("\nEmbeddings:", embeddings)



No GPU found. Ensure CUDA and cuDNN are correctly installed.
0    cv-corpus-7.0-singleword/ta\clips\common_voice...
1    cv-corpus-7.0-singleword/ta\clips\common_voice...
2    cv-corpus-7.0-singleword/ta\clips\common_voice...
3    cv-corpus-7.0-singleword/ta\clips\common_voice...
4    cv-corpus-7.0-singleword/ta\clips\common_voice...
Name: path, dtype: object
Loading file: cv-corpus-7.0-singleword/ta\clips\common_voice_ta_22693174.mp3
Spectrogram shape for cv-corpus-7.0-singleword/ta\clips\common_voice_ta_22693174.mp3: (128, 216, 3)
Loading file: cv-corpus-7.0-singleword/ta\clips\common_voice_ta_22064618.mp3
Spectrogram shape for cv-corpus-7.0-singleword/ta\clips\common_voice_ta_22064618.mp3: (128, 216, 3)
Loading file: cv-corpus-7.0-singleword/ta\clips\common_voice_ta_21711106.mp3
Spectrogram shape for cv-corpus-7.0-singleword/ta\clips\common_voice_ta_21711106.mp3: (128, 216, 3)
Loading file: cv-corpus-7.0-singleword/ta\clips\common_voice_ta_21714254.mp3
Spectrogram shape for cv-corpu

  base_model = MobileNetV2(input_shape=input_shape, include_top=False, weights="imagenet")


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 125ms/step
Embeddings: (252, 128)

Embeddings: [[0.03184887 0.         0.         ... 0.         0.21454336 0.21564873]
 [0.18957488 0.         0.01428142 ... 0.         0.69044936 0.        ]
 [0.13938804 0.04130739 0.         ... 0.         0.5604585  0.02380553]
 ...
 [0.1133021  0.         0.         ... 0.         0.45789328 0.        ]
 [0.         0.         0.02190065 ... 0.00975258 0.21257503 0.05762592]
 [0.02095847 0.         0.         ... 0.         0.7436626  0.        ]]


In [282]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, TimeDistributed, Flatten
from tensorflow.keras.models import Model
import numpy as np
import random
import librosa
from sklearn.preprocessing import LabelEncoder

# Define MAML loss
def maml_loss(y_true, y_pred):
    """MAML loss function: Negative log-likelihood loss."""
    y_true = tf.cast(y_true, tf.int32)  # Ensure y_true is in integer format
    return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred))

# Preprocess labels into integers
def preprocess_labels(labels):
    """Convert string labels to integers."""
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(labels)

# Generate spectrogram from audio file
def generate_spectrogram(file_path, n_mels=128, hop_length=512, n_fft=2048, duration=None):
    audio, sr = librosa.load(file_path, sr=None, duration=duration)
    mel_spectrogram = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels
    )
    log_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
    log_spectrogram = (log_spectrogram - log_spectrogram.min()) / (log_spectrogram.max() - log_spectrogram.min())
    log_spectrogram = np.stack([log_spectrogram] * 3, axis=-1)
    return tf.convert_to_tensor(log_spectrogram, dtype=tf.float32)

# Preprocess a batch of audio files into spectrogram tensors
def preprocess_batch(file_paths):
    tensors = []
    for file_path in file_paths:
        spectrogram = generate_spectrogram(file_path)
        resized_spectrogram = tf.image.resize(spectrogram, (128, 216))
        tensors.append(resized_spectrogram)
    batch_tensor = tf.stack(tensors)
    return tf.expand_dims(batch_tensor, axis=1)  # Adds a temporal dimension

# MAML training step: Apply gradient updates to the model parameters
def maml_train_step(model, support_set, query_set, learning_rate=0.01):
    support_set_tensors = preprocess_batch(support_set["files"])
    query_set_tensors = preprocess_batch(query_set["files"])
    
    support_labels = preprocess_labels(support_set['labels'])
    query_labels = preprocess_labels(query_set['labels'])
    
    support_labels = tf.convert_to_tensor(support_labels)
    query_labels = tf.convert_to_tensor(query_labels)
    
    with tf.GradientTape() as tape:
        support_embeddings = model(support_set_tensors)
        support_loss = maml_loss(support_labels, support_embeddings)
    
    gradients = tape.gradient(support_loss, model.trainable_variables)
    optimizer = tf.keras.optimizers.Adam(learning_rate)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return support_loss, query_set_tensors, query_labels

# Prototypical Networks loss function
def prototypical_loss(query_embeddings, query_labels, prototypes):
    distances = np.linalg.norm(query_embeddings[:, np.newaxis] - prototypes, axis=-1)
    predictions = tf.nn.softmax(-distances)
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=query_labels, logits=predictions))
    return loss

# Build the model (MobileNetV2 + LSTM)
def build_model(input_shape, lstm_units):
    input_layer = Input(shape=(None, 128, 216, 3))
    feature_extractor = build_feature_extractor(input_shape)
    features = TimeDistributed(feature_extractor)(input_layer)
    flattened_features = TimeDistributed(Flatten())(features)
    lstm_output = LSTM(lstm_units, return_sequences=False)(flattened_features)
    output_layer = Dense(128, activation="relu", name="embedding")(lstm_output)
    model = Model(inputs=input_layer, outputs=output_layer)
    return model

# Meta-Training Loop (MAML)
def meta_train(model, dataset, num_tasks, num_shots, num_query, meta_iterations=10):
    dataset = list(dataset)
    available_tasks = len(dataset)
    if num_tasks > available_tasks:
        num_tasks = available_tasks
    
    for iteration in range(meta_iterations):
        tasks = random.sample(dataset, num_tasks)
        meta_loss = 0
        total_accuracy = 0  # Initialize total accuracy
        
        for task in tasks:
            support_set = task['support']
            query_set = task['query']
            
            task_loss, query_set_tensors, query_labels = maml_train_step(model, support_set, query_set)
            meta_loss += task_loss
            
            # Compute query embeddings
            query_embeddings = model(query_set_tensors)
            
            # Calculate accuracy
            predicted_labels = np.argmax(query_embeddings, axis=-1)
            accuracy = np.mean(np.equal(predicted_labels, query_labels))
            total_accuracy += accuracy

        # Average loss and accuracy over tasks
        meta_loss /= num_tasks
        total_accuracy /= num_tasks
        
        print(f"Iteration {iteration + 1}/{meta_iterations}, Meta Loss: {meta_loss:.4f}, Accuracy: {total_accuracy:.4f}")

# Prepare the dataset for few-shot learning
def prepare_few_shot_dataset(audio_files, labels, num_shots, num_query):
    dataset = []
    unique_labels = np.unique(labels)
    
    for label in unique_labels:
        class_files = [f for f, l in zip(audio_files, labels) if l == label]
        
        if len(class_files) >= num_shots + num_query:
            sampled_files = random.sample(class_files, num_shots + num_query)
            support_set = {"files": sampled_files[:num_shots], "labels": [label] * num_shots}
            query_set = {"files": sampled_files[num_shots:], "labels": [label] * num_query}
            dataset.append({"support": support_set, "query": query_set})
        else:
            num_shots_adjusted = max(1, len(class_files) // 2)
            num_query_adjusted = len(class_files) - num_shots_adjusted
            sampled_files = class_files
            support_set = {"files": sampled_files[:num_shots_adjusted], "labels": [label] * num_shots_adjusted}
            query_set = {"files": sampled_files[num_shots_adjusted:], "labels": [label] * num_query_adjusted}
            dataset.append({"support": support_set, "query": query_set})

    return dataset

# Prepare dataset
audio_files = metadata["path"].values
labels = metadata["sentence"].apply(lambda x: x.split()[0]).values

class_to_keyword, keyword_to_class = create_class_to_keyword_mapping(labels)
num_shots = 5
num_query = 15

dataset = prepare_few_shot_dataset(audio_files, labels, num_shots, num_query)
num_tasks = len(dataset)

# Check if there are tasks available
if num_tasks > 0:
    model = build_model((128, 216, 3), lstm_units=128)
    model.compile(optimizer="adam", loss="mse", metrics=['accuracy'])
    
    # Meta-training
    meta_train(model, dataset, num_tasks, num_shots, num_query)
else:
    print("No tasks available for meta-training.")


  base_model = MobileNetV2(input_shape=input_shape, include_top=False, weights="imagenet")


Iteration 1/10, Meta Loss: 0.7100, Accuracy: 1.0000
Iteration 2/10, Meta Loss: 0.0010, Accuracy: 1.0000
Iteration 3/10, Meta Loss: 0.0000, Accuracy: 1.0000
Iteration 4/10, Meta Loss: 0.0000, Accuracy: 1.0000
Iteration 5/10, Meta Loss: 0.0000, Accuracy: 1.0000
Iteration 6/10, Meta Loss: 0.0000, Accuracy: 1.0000
Iteration 7/10, Meta Loss: 0.0000, Accuracy: 1.0000
Iteration 8/10, Meta Loss: 0.0000, Accuracy: 1.0000
Iteration 9/10, Meta Loss: 0.0000, Accuracy: 1.0000
Iteration 10/10, Meta Loss: 0.0000, Accuracy: 1.0000


In [268]:
import os
import glob
import librosa
import numpy as np

def process_audio(file_path, sr=16000, n_mels=64, n_fft=2048, hop_length=512, win_length=2048):
    """
    Load an audio file and convert it into a Mel spectrogram.
    
    Args:
    - file_path: Path to the audio file.
    - sr: Sampling rate.
    - n_mels: Number of Mel bands.
    - n_fft: Number of FFT points.
    - hop_length: Hop length for the STFT.
    - win_length: Window length for the STFT.

    Returns:
    - Mel spectrogram of the audio.
    """
    # Load the audio file using librosa
    audio, _ = librosa.load(file_path, sr=sr)
    
    # Generate the Mel spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels, n_fft=n_fft,
                                                     hop_length=hop_length, win_length=win_length)
    
    # Convert the Mel spectrogram to decibels (log scale)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
    
    # Optionally, resize the spectrogram to a fixed shape (e.g., 128x216)
    mel_spectrogram_resized = np.resize(mel_spectrogram_db, (128, 216))
    
    # Add the channel dimension (for 3-channel input)
    mel_spectrogram_resized = np.expand_dims(mel_spectrogram_resized, axis=-1)  # Adds a channel dimension (e.g., (128, 216, 1))
    mel_spectrogram_resized = np.repeat(mel_spectrogram_resized, 3, axis=-1)  # Repeat the single channel 3 times to simulate RGB
    
    return mel_spectrogram_resized

# Function to load and preprocess these files
def load_dataset(file_paths):
    dataset = []
    for file in file_paths:
        # Process each audio file and generate spectrogram
        spectrogram = process_audio(file)
        dataset.append(spectrogram)
    return np.array(dataset)  # Convert to numpy array for consistency
# Example: Define the compute_prototypes function (ensure it's defined somewhere in your code)
def compute_prototypes(embeddings, labels, num_classes):
    prototypes = []
    for i in range(num_classes):
        class_embeddings = embeddings[labels == i]  # Select embeddings for each class
        if class_embeddings.size > 0:  # Ensure embeddings exist for the class
            prototypes.append(np.mean(class_embeddings, axis=0))  # Mean embedding
        else:
            prototypes.append(np.zeros(embeddings.shape[1]))  # Placeholder prototype
    return np.array(prototypes)


def few_shot_inference(model, query_set, prototypes):
    predictions = []
    loss = 0.0
    
    # Pass the entire query set to the model
    query_embeddings = model.predict(query_set)  # Shape: (batch_size, embedding_dim)
    
    # Compute distances and predictions for each query embedding
    for query_embedding in query_embeddings:
        distances = np.linalg.norm(prototypes - query_embedding, axis=1)  # Euclidean distance
        prediction = np.argmin(distances)  # Closest prototype
        predictions.append(prediction)
        loss += np.min(distances)  # Sum of distances as the loss
        #print("query_embedding",query_embedding)
        #print("distances",distances)
    return predictions, loss

def few_shot_inference_with_keywords(model, query_set, prototypes, class_to_keyword):
    """
    Perform few-shot inference and convert numeric predictions to keywords.
    
    Args:
    - model: Trained model for embeddings.
    - query_set: Query dataset.
    - prototypes: Class prototypes.
    - class_to_keyword: Mapping from class indices to keywords.

    Returns:
    - predictions: List of predicted keywords.
    - loss: Total loss for the query set.
    """
    predictions = []
    loss = 0.0

    # Get embeddings for the query set
    query_embeddings = model.predict(query_set)  # Shape: (batch_size, embedding_dim)
    
    for query_embedding in query_embeddings:
        # Compute distances and find the closest prototype
        distances = np.linalg.norm(prototypes - query_embedding, axis=1)
        predicted_index = np.argmin(distances)  # Closest prototype index
        predictions.append(class_to_keyword[predicted_index])  # Map to keyword
        loss += np.min(distances)  # Sum of distances as the loss

    return predictions, loss


# Assuming this is the path where your audio files are stored
audio_folder_path = "cv-corpus-7.0-singleword/taFEW"  # Update this to the location of your dataset

# Get all .mp3 files in the folder
audio_files = glob.glob(os.path.join(audio_folder_path, "*.mp3"))

# Print the list of audio files to verify
print("Found audio files:", audio_files)

# Load the query set for the new keywords
file_path =  preprocess_batch(audio_files)
#print("file",file_path.shape)
new_keyword_query_set = load_dataset(audio_files)
#print(new_keyword_query_set.shape)
num_classes = len(np.unique(labels))  

# Example: Ensure the embeddings and labels are aligned
# Add a sequence dimension (e.g., sequence length = 1 for single input sequences)
new_keyword_query_set = np.expand_dims(new_keyword_query_set, axis=1)  # Shape: (batch_size, sequence_length, 128, 216, 3)
#print("new_keyword_query_set",new_keyword_query_set)
embeddings = model(new_keyword_query_set)  # Get embeddings for the audio files

# Ensure embeddings are numpy arrays
embeddings = np.array(embeddings)
# Print shapes to debug

# Example: Ensure labels match the length of embeddings
# If the embeddings are outputting a different number of samples, adjust labels accordingly
labels = labels[:len(embeddings)]  # This is an example of truncating labels to match embeddings

# If embeddings are not 2D (samples, embedding_dim), check how many samples are being returned
# This can help identify if some embeddings were skipped

# Make sure embeddings and labels have the same length
#assert len(embeddings) == len(labels), "Mismatch between number of embeddings and labels"

# Compute prototypes
prototypes = compute_prototypes(embeddings, labels, num_classes)
print("Shape of embeddings:", embeddings.shape)
print("Shape of labels:", len(labels))
print("Query Set Shape:", new_keyword_query_set.shape)
print("Prototypes Shape:", prototypes.shape)

predicted_keywords, loss = few_shot_inference_with_keywords(model, new_keyword_query_set, prototypes, class_to_keyword)

print(f"Predicted Keywords: {predicted_keywords}, Loss: {loss}")
#நான்கு ஒன்று ஒன்று

Found audio files: ['cv-corpus-7.0-singleword/taFEW\\common_voice_ta_21689633.mp3']
Shape of embeddings: (1, 128)
Shape of labels: 1
Query Set Shape: (1, 1, 128, 216, 3)
Prototypes Shape: (14, 128)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step
Predicted Keywords: ['ஃபயர்ஃபாக்ஸ்'], Loss: 20.850296020507812


In [253]:
import os
import glob
import librosa
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

def process_audio(file_path, sr=16000, n_mels=64, n_fft=2048, hop_length=512, win_length=2048):
    """
    Load an audio file and convert it into a Mel spectrogram.
    
    Args:
    - file_path: Path to the audio file.
    - sr: Sampling rate.
    - n_mels: Number of Mel bands.
    - n_fft: Number of FFT points.
    - hop_length: Hop length for the STFT.
    - win_length: Window length for the STFT.

    Returns:
    - Mel spectrogram of the audio.
    """
    # Load the audio file using librosa
    audio, _ = librosa.load(file_path, sr=sr)
    
    # Generate the Mel spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels, n_fft=n_fft,
                                                     hop_length=hop_length, win_length=win_length)
    
    # Convert the Mel spectrogram to decibels (log scale)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
    
    # Optionally, resize the spectrogram to a fixed shape (e.g., 128x216)
    mel_spectrogram_resized = np.resize(mel_spectrogram_db, (128, 216))
    
    # Add the channel dimension (for 3-channel input)
    mel_spectrogram_resized = np.expand_dims(mel_spectrogram_resized, axis=-1)  # Adds a channel dimension (e.g., (128, 216, 1))
    mel_spectrogram_resized = np.repeat(mel_spectrogram_resized, 3, axis=-1)  # Repeat the single channel 3 times to simulate RGB
    
    return mel_spectrogram_resized

def load_dataset(file_paths):
    """Preprocess a batch of audio files into spectrogram tensors."""
    dataset = []
    for file in file_paths:
        spectrogram = process_audio(file)
        dataset.append(spectrogram)
    return np.array(dataset)  # Convert to numpy array for consistency

def compute_prototypes(embeddings, labels, num_classes):
    """Compute the prototypes for each class as the mean of embeddings."""
    prototypes = []
    for i in range(num_classes):
        class_embeddings = embeddings[labels == i]  # Select embeddings for each class
        if class_embeddings.size > 0:
            prototypes.append(np.mean(class_embeddings, axis=0))  # Mean embedding
        else:
            prototypes.append(np.zeros(embeddings.shape[1]))  # Placeholder for empty class
    return np.array(prototypes)

def few_shot_inference_with_keywords(model, query_set, prototypes, class_to_keyword):
    """
    Perform few-shot inference and convert numeric predictions to keywords.
    
    Args:
    - model: Trained model for embeddings.
    - query_set: Query dataset.
    - prototypes: Class prototypes.
    - class_to_keyword: Mapping from class indices to keywords.

    Returns:
    - predictions: List of predicted keywords.
    - loss: Total loss for the query set.
    """
    predictions = []
    loss = 0.0

    # Get embeddings for the query set
    query_embeddings = model.predict(query_set)  # Shape: (batch_size, embedding_dim)
    
    for query_embedding in query_embeddings:
        # Compute distances and find the closest prototype
        distances = np.linalg.norm(prototypes - query_embedding, axis=1)
        predicted_index = np.argmin(distances)  # Closest prototype index
        predictions.append(class_to_keyword[predicted_index])  # Map to keyword
        loss += np.min(distances)  # Sum of distances as the loss

    return predictions, loss

def prepare_dataset(audio_folder_path, labels, num_shots, num_query):
    """Prepare the few-shot dataset."""
    audio_files = glob.glob(os.path.join(audio_folder_path, "*.mp3"))
    class_to_keyword, keyword_to_class = create_class_to_keyword_mapping(labels)
    
    dataset = prepare_few_shot_dataset(audio_files, labels, num_shots, num_query)
    
    return dataset, class_to_keyword, keyword_to_class

def create_class_to_keyword_mapping(labels):
    """Creates a mapping from class indices to human-readable keywords."""
    label_encoder = LabelEncoder()
    label_indices = label_encoder.fit_transform(labels)
    
    class_to_keyword = {idx: label for idx, label in enumerate(label_encoder.classes_)}
    keyword_to_class = {label: idx for idx, label in class_to_keyword.items()}
    
    return class_to_keyword, keyword_to_class

# Assuming your dataset is in this folder
audio_folder_path = "cv-corpus-7.0-singleword/ta"  # Update this to the location of your dataset
#labels = ["label1", "label2", "label3"]  # Example labels
num_shots = 5
num_query = 15

# Prepare dataset
dataset, class_to_keyword, keyword_to_class = prepare_dataset(audio_folder_path, labels, num_shots, num_query)

# Load the query set (new audio files)
new_keyword_query_set = load_dataset(audio_folder_path)  # Load new query set
new_keyword_query_set = np.expand_dims(new_keyword_query_set, axis=1)  # Shape: (batch_size, sequence_length, 128, 216, 3)

# Assuming you already have a trained model
model = build_model((128, 216, 3), LSTM_UNITS)  # LSTM_UNITS should be defined
model.compile(optimizer="adam", loss="mse")

# Get embeddings for the query set
embeddings = model.predict(new_keyword_query_set)  # Get embeddings for the audio files

# Compute prototypes for each class
num_classes = len(np.unique(labels))
prototypes = compute_prototypes(embeddings, labels, num_classes)

# Perform few-shot inference and convert numeric predictions to keywords
predicted_keywords, loss = few_shot_inference_with_keywords(model, new_keyword_query_set, prototypes, class_to_keyword)

print(f"Predicted Keywords: {predicted_keywords}, Loss: {loss}")


Class 'ஒன்று' has 0 files.
Total tasks in dataset: 1


  audio, _ = librosa.load(file_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: 'c'