<a href="https://colab.research.google.com/github/ratnesh003/HCLTech-Tasks/blob/main/Silver%20Badge%20Assignments/Assignment%202/HCLTech_ML_Assignment_2_Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 2

#### write a python program to find similar audio files to the one you upload. Store many audio files and compare the one you upload with the stored ones. Using torch audio framework or any other appropriate framework

### Installing the Dependencies

In [1]:
!pip install torchaudio torch librosa soundfile scikit-learn
!pip install torchcodec




### Creating folder for the Dataset

In [2]:
import shutil
import os

DATASET_PATH = "./librispeech"

if os.path.exists(DATASET_PATH):
    shutil.rmtree(DATASET_PATH)

os.makedirs(DATASET_PATH, exist_ok=True)


### Downloading the Audio Dataset

In [3]:
import torchaudio

dataset = torchaudio.datasets.LIBRISPEECH(
    root=DATASET_PATH,
    url="test-clean",
    download=True
)


100%|██████████| 331M/331M [00:04<00:00, 71.6MB/s]


### Loading the model in GPU

In [4]:
import torch

bundle = torchaudio.pipelines.WAV2VEC2_BASE
model = bundle.get_model()

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

model.eval()


Wav2Vec2Model(
  (feature_extractor): FeatureExtractor(
    (conv_layers): ModuleList(
      (0): ConvLayerBlock(
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
      )
      (1-4): 4 x ConvLayerBlock(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
      )
      (5-6): 2 x ConvLayerBlock(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
      )
    )
  )
  (encoder): Encoder(
    (feature_projection): FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (pos_conv_embed): ConvolutionalPositionalEmbedding(
        (conv): ParametrizedConv1d(
          768, 768, kernel_size=(128,), stride=(1,), padding=(64,), groups=16
          (parametriza

### Audio Preprocessing function

In [5]:
def preprocess_audio(waveform, sample_rate):
    if sample_rate != bundle.sample_rate:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, bundle.sample_rate
        )
    return waveform


### Extract Fixed-Length Audio Embeddings

In [6]:
def extract_embedding(waveform):
    import torch
    waveform = waveform.to(device)
    with torch.no_grad():
        features, _ = model(waveform)
        embedding = features.mean(dim=1)  # temporal average
    return embedding.squeeze()

### Build Audio Embedding Database

In [7]:
import torch
import tqdm as tqdm

audio_embeddings = []
audio_waveforms = []
audio_sample_rates = []

MAX_FILES = len(dataset)

for idx in tqdm.tqdm(range(MAX_FILES), desc="Building audio embedding database"):
    waveform, sr, _, _, _, _ = dataset[idx]

    waveform = preprocess_audio(waveform, sr)
    emb = extract_embedding(waveform)

    audio_embeddings.append(emb)
    audio_waveforms.append(waveform)
    audio_sample_rates.append(bundle.sample_rate)

audio_embeddings = torch.stack(audio_embeddings)

Building audio embedding database: 100%|██████████| 2620/2620 [01:49<00:00, 24.00it/s]


### Upload Query Audio

In [8]:
from google.colab import files

uploaded = files.upload()
query_audio_path = list(uploaded.keys())[0]

Saving audio.wav to audio (1).wav


### Process Query Audio

In [9]:
query_waveform, query_sr = torchaudio.load(query_audio_path)
query_waveform = preprocess_audio(query_waveform, query_sr)
query_embedding = extract_embedding(query_waveform)


### Compute Similarity & Retrieve Top Matches

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import Audio, display

import numpy as np

similarities = cosine_similarity(
    query_embedding.unsqueeze(0).cpu().numpy(),
    audio_embeddings.cpu().numpy()
)[0]

print("Query Audio: \n")
display(Audio(query_waveform.squeeze().numpy(), rate=bundle.sample_rate))

top_k = 5
top_indices = np.argsort(similarities)[::-1][:top_k]

print("\n" + "-" * 50 + "\n\n\n")

print("Top similar audio files:\n")

for rank, idx in enumerate(top_indices, start=1):
    print(f"Rank {rank} | Similarity Score: {similarities[idx]:.4f} \n")
    display(
        Audio(
            audio_waveforms[idx].squeeze().numpy(),
            rate=audio_sample_rates[idx]
        )
    )
    print("\n" + "-" * 50 + "\n")

Query Audio: 




--------------------------------------------------



Top similar audio files:

Rank 1 | Similarity Score: 1.0000 




--------------------------------------------------

Rank 2 | Similarity Score: 0.9837 




--------------------------------------------------

Rank 3 | Similarity Score: 0.9813 




--------------------------------------------------

Rank 4 | Similarity Score: 0.9744 




--------------------------------------------------

Rank 5 | Similarity Score: 0.9740 




--------------------------------------------------

