# Libraries and Tools Installation



In [None]:
!pip install git+https://github.com/speechbrain/speechbrain.git@develop
!pip install datasets==3.5.0
!pip install speechbrain transformers torchaudio datasets tqdm

Collecting git+https://github.com/speechbrain/speechbrain.git@develop
  Cloning https://github.com/speechbrain/speechbrain.git (to revision develop) to /tmp/pip-req-build-yizwuobr
  Running command git clone --filter=blob:none --quiet https://github.com/speechbrain/speechbrain.git /tmp/pip-req-build-yizwuobr
  Resolved https://github.com/speechbrain/speechbrain.git to commit 96b00ca4652f723688635c0cd40bf2366451a046
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hyperpyyaml (from speechbrain==1.0.3)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.9->speechbrain==1.0.3)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.9->speechbrain==1.0.3)
  Downloading nvidia_cuda_runt

In [None]:
# !pip uninstall -y transformers
# !pip install transformers==4.40.1  # Or latest stable version
!pip install torchcodec



Collecting torchcodec
  Downloading torchcodec-0.6.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (10 kB)
Downloading torchcodec-0.6.0-cp311-cp311-manylinux_2_28_x86_64.whl (1.4 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchcodec
Successfully installed torchcodec-0.6.0


In [None]:
!pip freeze > requirements.txt

In [None]:
# Step 1: Remove preinstalled FFmpeg
!apt remove ffmpeg -y

# Step 2: Download and extract FFmpeg 7.0.2 static build
!wget -q https://johnvansickle.com/ffmpeg/releases/ffmpeg-release-amd64-static.tar.xz
!tar -xf ffmpeg-release-amd64-static.tar.xz
!cp ffmpeg-*-amd64-static/ffmpeg ffmpeg-*-amd64-static/ffprobe /usr/local/bin/

# Step 3: Confirm the new version is active
!ffmpeg -version


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following packages will be REMOVED:
  ffmpeg
0 upgraded, 0 newly installed, 1 to remove and 35 not upgraded.
After this operation, 2,288 kB disk space will be freed.
(Reading database ... 126284 files and directories currently installed.)
Removing ffmpeg (7:4.4.2-0ubuntu0.22.04.1) ...
Processing triggers for man-db (2.10.2-1) ...
ffmpeg version 7.0.2-static https://johnvansickle.com/ffmpeg/  Copyright (c) 2000-2024 the FFmpeg developers
built with gcc 8 (Debian 8.3.0-6)
configuration: --enable-gpl --enable-version3 --enable-static --disable-debug --disable-ffplay --disable-indev=sndio --disable-outdev=sndio --cc=gcc --enable-fontconfig --enable-frei0r --enable-gnutls --enable-gmp --enable-libgme --enable-gray --enable-libaom --enable-libfribidi --enable-libass --enable-libvmaf --enable-libfreetype --enable-libmp3lame --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopen

# Importing Libraries

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from speechbrain.inference.classifiers import EncoderClassifier
from transformers import AutoModel, AutoFeatureExtractor, Wav2Vec2FeatureExtractor, WavLMForXVector
from datasets import load_dataset, Audio
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import numpy as np
import os
import torch.nn.functional as F


In [None]:
!rm -rf /root/.cache/huggingface/datasets/UBC-NLP*


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store a Hugging Face token in the notebook itself: saved notebooks are
# routinely shared or committed. Read it from the HF_TOKEN environment variable
# and fall back to a hidden interactive prompt when it is not set.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)


# Model Architecture

## ECAPA_WavLM_Fusion Model

This class implements a fusion model that combines features from two pretrained speech models:
- **ECAPA-TDNN** (from SpeechBrain)
- **WavLM-Base** (from Microsoft)

### Workflow
1. **Feature Extraction**
   - **ECAPA**: Takes raw waveforms (`wavs`) as input and outputs a 256-dimensional speaker/language embedding.
   - **WavLM**: Takes the feature-extracted waveform (`wavlm_input_values`) and outputs frame-level features of size 768 (the WavLM-Base hidden size), which are mean-pooled over time to get a single vector.

2. **Projection Layers**
   - ECAPA embeddings → projected to 256 dimensions.
   - WavLM pooled embeddings → projected from 768 to 256 dimensions.

3. **Feature Fusion**
   - Concatenates the projected ECAPA and WavLM features into a 512-dimensional vector.
   - Applies layer normalization.

4. **Classification Head**
   - Fully-connected layers reduce the fused feature vector to the number of target classes (`num_classes`).
   - Outputs raw logits and predicted class indices.




In [None]:
class ECAPA_WavLM_Fusion(nn.Module):
    """Classifier that fuses ECAPA-TDNN and WavLM-Base utterance embeddings.

    Both pretrained encoders are loaded with ``requires_grad=False``. Each
    encoder's utterance-level embedding is projected to 256 dimensions, the two
    projections are concatenated (512), normalised with a parameter-free layer
    norm, and classified by a small MLP.

    Args:
        num_classes: Size of the output logits vector.
    """

    def __init__(self, num_classes=8):
        super(ECAPA_WavLM_Fusion, self).__init__()

        # Pretrained ECAPA language-ID encoder from SpeechBrain, frozen at start.
        self.ecapa = EncoderClassifier.from_hparams(
            source="speechbrain/lang-id-voxlingua107-ecapa",
            savedir="tmp/ecapa"
        )
        for param in self.ecapa.parameters():
            param.requires_grad = False

        # Pretrained WavLM-Base encoder, frozen at start.
        self.wavlm = AutoModel.from_pretrained("microsoft/wavlm-base")
        for param in self.wavlm.parameters():
            param.requires_grad = False

        # ECAPA embeddings are 256-wide for this checkpoint (not the 192 of the
        # speaker-verification variants); WavLM-Base hidden states are 768-wide.
        self.ecapa_proj = nn.Linear(256, 256)
        self.wavlm_proj = nn.Linear(768, 256)
        self.classifier = nn.Sequential(
            nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.3), nn.Linear(256, num_classes)
        )

    def forward(self, wavs, wavlm_input_values):
        """Compute class logits and hard predictions for a batch.

        Args:
            wavs: Zero-padded raw waveforms, shape ``[B, T]`` (do not add a
                channel dimension).
            wavlm_input_values: WavLM feature-extractor ``input_values`` for
                the same utterances.

        Returns:
            Tuple ``(logits, preds)``: ``logits`` has shape
            ``[B, num_classes]`` and ``preds`` is its argmax over classes.
        """
        # Only build an autograd graph through ECAPA when it is actually being
        # fine-tuned. An unconditional no_grad here would silently keep ECAPA
        # frozen even after its parameters are unfrozen and handed to the
        # optimizer. An enclosing torch.no_grad() (evaluation) is respected.
        ecapa_trainable = torch.is_grad_enabled() and any(
            param.requires_grad for param in self.ecapa.parameters()
        )
        with torch.set_grad_enabled(ecapa_trainable):
            ecapa_output = self.ecapa.encode_batch(wavs)  # [B, 1, 256]
        ecapa_feats = self.ecapa_proj(ecapa_output.squeeze(1))  # [B, 256]

        wavlm_out = self.wavlm(wavlm_input_values).last_hidden_state  # [B, frames, 768]
        wavlm_pooled = torch.mean(wavlm_out, dim=1)  # [B, 768], mean over frames
        wavlm_feats = self.wavlm_proj(wavlm_pooled)  # [B, 256]

        fused = torch.cat([ecapa_feats, wavlm_feats], dim=-1)  # [B, 512]
        # Functional layer norm: no learnable scale/bias, so no extra state_dict keys.
        fused = F.layer_norm(fused, fused.shape[1:])
        logits = self.classifier(fused)  # [B, num_classes]

        preds = torch.argmax(logits, dim=1)
        return logits, preds

# Training Dataset

In [None]:

countries = ['Algeria', 'Egypt', 'Jordan', 'Mauritania', 'Morocco', 'Palestine', 'UAE', 'Yemen']



ds = load_dataset("UBC-NLP/NADI2025_subtask1_SLID")
ds.set_format("torch")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

train-00000-of-00004.parquet:   0%|          | 0.00/456M [00:00<?, ?B/s]

train-00001-of-00004.parquet:   0%|          | 0.00/416M [00:00<?, ?B/s]

train-00002-of-00004.parquet:   0%|          | 0.00/469M [00:00<?, ?B/s]

train-00003-of-00004.parquet:   0%|          | 0.00/459M [00:00<?, ?B/s]

validation-00000-of-00004.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

validation-00001-of-00004.parquet:   0%|          | 0.00/402M [00:00<?, ?B/s]

validation-00002-of-00004.parquet:   0%|          | 0.00/462M [00:00<?, ?B/s]

validation-00003-of-00004.parquet:   0%|          | 0.00/448M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12900 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12700 [00:00<?, ? examples/s]

In [None]:
ds

DatasetDict({
    train: Dataset({
        features: ['ID', 'audio', 'country'],
        num_rows: 12900
    })
    validation: Dataset({
        features: ['ID', 'audio', 'country'],
        num_rows: 12700
    })
})

In [None]:
sample = ds["train"][0]
print(type(sample["audio"]))  # Should be dict
print(sample["audio"].keys())  # Should show: dict_keys(['path', 'array', 'sampling_rate'])
print(sample["audio"]["array"][:5])  # Waveform samples


<class 'dict'>
dict_keys(['path', 'array', 'sampling_rate'])
tensor([-0.0047, -0.0134, -0.0146, -0.0172, -0.0194])


In [None]:
labels2id = {key: idx for idx, key in enumerate(countries)}
id2labels = {idx: key for key, idx in labels2id.items()}

feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/wavlm-base")

def collate_fn(samples):
    """Collate dataset rows into padded model inputs and integer labels.

    Args:
        samples: Iterable of dataset rows; each row is read through
            ``sample["audio"]["array"]`` and ``sample["country"]``.

    Returns:
        ``(padded, input_values, labels)`` where ``padded`` is the zero-padded
        waveform batch ``[B, T]`` for ECAPA, ``input_values`` is the WavLM
        feature-extractor output, and ``labels`` is a ``long`` tensor of class
        ids. Returns ``None`` when no sample in the batch was usable.
    """
    arrays = []
    labels = []

    for sample in samples:
        try:
            waveform = sample["audio"]["array"]
            if not isinstance(waveform, torch.Tensor):
                waveform = torch.tensor(waveform, dtype=torch.float32)
            if waveform.numel() == 0:
                continue
            # Resolve the label BEFORE appending anything: if the lookup fails,
            # neither list grows, so waveforms and labels stay aligned.
            label = labels2id[sample["country"]]
        except Exception as e:
            print(f"Skipping due to error: {e}")
            continue
        arrays.append(waveform)
        labels.append(label)

    if len(arrays) == 0:
        return None

    # ECAPA consumes the zero-padded raw waveforms directly.
    padded = pad_sequence(arrays, batch_first=True)  # [B, T]

    # WavLM: let the feature extractor pad the batch itself. It accepts NumPy
    # arrays, which avoids converting every sample to a Python list of floats.
    arrays_for_wavlm = [arr.detach().cpu().numpy() for arr in arrays]
    inputs = feature_extractor(
        arrays_for_wavlm, sampling_rate=16000, return_tensors="pt", padding=True
    )

    labels = torch.tensor(labels, dtype=torch.long)

    return padded, inputs["input_values"], labels





preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

In [None]:
trainloader = DataLoader(ds['train'], shuffle=True, collate_fn=collate_fn, batch_size=4)
valloader = DataLoader(ds['validation'], shuffle=False, collate_fn=collate_fn, batch_size=4)

In [None]:
trainloader

<torch.utils.data.dataloader.DataLoader at 0x79bdd24db250>

## Training Setup


### Optimizer
- **AdamW** optimizer is used for better weight decay handling in transformer-like architectures.
- Different learning rates are assigned:
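  - **ECAPA & WavLM**: frozen at first and therefore not part of the initial optimizer below; once unfrozen in the training loop they are added with `1e-5` (small LR to avoid disrupting pretrained weights)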
  - **Projection layers** (`ecapa_proj`, `wavlm_proj`) and **classifier**: `1e-4` (larger LR for newly added trainable layers)
- **Weight decay**: `1e-2` for regularization.

### Loss Function
- **CrossEntropyLoss** is used for multi-class classification.
- Works directly with raw logits output from the model.


In [None]:
from torch.optim import AdamW

# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ECAPA_WavLM_Fusion().to(device)
# Initial optimizer: only the newly added layers (classifier head and the two
# projection layers) are listed, each with lr 1e-4. The pretrained encoders
# are frozen in the model constructor and are not part of this optimizer.
optimizer = AdamW([
    {"params": model.classifier.parameters(), "lr": 1e-4},
    {"params": model.ecapa_proj.parameters(), "lr": 1e-4},
    {"params": model.wavlm_proj.parameters(), "lr": 1e-4},
], weight_decay=1e-2)
# Cross-entropy over the raw logits returned by the model.
loss_fn = nn.CrossEntropyLoss()

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/lang-id-voxlingua107-ecapa' if not cached


hyperparams.yaml: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--lang-id-voxlingua107-ecapa/snapshots/0253049ae131d6a4be1c4f0d8b0ff483a0f8c8e9/hyperparams.yaml' -> '/content/tmp/ecapa/hyperparams.yaml'
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered parameter transfer hook for _load
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load_if_possible
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in tmp/ecapa.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/lang-id-voxlingua107-ecapa' if not cached


embedding_model.ckpt:   0%|          | 0.00/84.5M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--lang-id-voxlingua107-ecapa/snapshots/0253049ae131d6a4be1c4f0d8b0ff483a0f8c8e9/embedding_model.ckpt' -> '/content/tmp/ecapa/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/tmp/ecapa/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/lang-id-voxlingua107-ecapa' if not cached


classifier.ckpt:   0%|          | 0.00/763k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--lang-id-voxlingua107-ecapa/snapshots/0253049ae131d6a4be1c4f0d8b0ff483a0f8c8e9/classifier.ckpt' -> '/content/tmp/ecapa/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["classifier"] = /content/tmp/ecapa/classifier.ckpt
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/lang-id-voxlingua107-ecapa' if not cached


label_encoder.txt: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--lang-id-voxlingua107-ecapa/snapshots/0253049ae131d6a4be1c4f0d8b0ff483a0f8c8e9/label_encoder.txt' -> '/content/tmp/ecapa/label_encoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["label_encoder"] = /content/tmp/ecapa/label_encoder.ckpt
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, classifier, label_encoder
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): embedding_model -> /content/tmp/ecapa/embedding_model.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): classifier -> /content/tmp/ecapa/classifier.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): label_encoder -> /content/tmp/ecapa/label_encoder.ckpt
DEBUG:speechbrain.dataio.encoder:Loaded categorical encoding from /content/tmp/ec

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

## Evaluation Functions

- **`llr(logits)`** – Computes pairwise log-likelihood ratios between classes, zeroing diagonals.  
- **`compute_actual_cost(...)`** – NIST SRE cost function: calculates FPR, FNR, and weighted cost using `beta`.  
- **`compute_ave_cost(logits, labels)`** – Converts logits to LLRs, averages per-class FPR/FNR to get overall cost.  
- **`eval_loop(model, loader, device)`** – Runs model evaluation, collects accuracy, average cost, FPR, and FNR.


In [None]:
#Calculate log likelihood ratios
def llr(logits):
  """Convert logits to per-class log-likelihood ratios.

  For each sample and class ``i`` the result is
  ``-log(mean_{j != i} exp(logits[j] - logits[i]))``, i.e. the class score
  against the average of all competing classes.

  Args:
    logits: Tensor of shape ``[B, C]`` with ``C >= 2``.

  Returns:
    Tensor of shape ``[B, C]``.
  """
  classes = logits.shape[1]
  # dif[b, i, j] = logits[b, j] - logits[b, i], built by broadcasting so the
  # number of classes is taken from the input instead of being hard-coded.
  dif = logits.unsqueeze(dim=1) - logits.unsqueeze(dim=2)
  # A class is not its own competitor: zero the i == j terms before summing.
  diagonal = torch.eye(classes, dtype=torch.bool, device=logits.device)
  e = torch.exp(dif).masked_fill(diagonal, 0)
  return -torch.log(torch.sum(e, dim=-1) / (classes - 1))



#Actual Cost Function from the NIST Speech Recognition Evaluation tool package

def compute_actual_cost(scores, labels, p_target, c_miss=1, c_fa=1):
    """Threshold scores at ``log(beta)`` and return the detection cost.

    Args:
        scores: NumPy array of detection scores.
        labels: NumPy array of 0/1 target indicators, same shape as ``scores``.
        p_target: Prior probability of a target trial.
        c_miss: Cost of a miss.
        c_fa: Cost of a false alarm.

    Returns:
        ``(cost, fpr, fnr)`` with ``cost = fnr + beta * fpr``. A rate is NaN
        when its denominator (number of non-targets / targets) is zero.
    """
    beta = c_fa * (1 - p_target) / (c_miss * p_target)
    threshold = np.log(beta)

    accepted = (scores >= threshold).astype('i')
    rejected = 1 - accepted
    nontargets = 1 - labels

    target_count = np.sum(labels)
    false_alarms = np.sum(accepted * nontargets)
    nontarget_count = np.sum(nontargets)
    misses = np.sum(rejected * labels)

    if nontarget_count > 0:
        fpr = false_alarms / nontarget_count
    else:
        fpr = np.nan

    if target_count > 0:
        fnr = misses / target_count
    else:
        fnr = np.nan

    return fnr + beta * fpr, fpr, fnr

def compute_ave_cost(logits, labels, num_l = 8):
  """Average the per-class false-positive / false-negative rates.

  Samples are grouped by true class; for each group the LLR scores are
  thresholded by ``compute_actual_cost`` (``p_target=0.5``) and the resulting
  rates are summed and divided by ``num_l``.

  Args:
    logits: Tensor ``[N, num_l]`` of model logits.
    labels: Integer tensor ``[N]`` of true class ids in ``[0, num_l)``.
    num_l: Number of classes; also the divisor of the average, so a class
      with no samples contributes zero to the sums.

  Returns:
    ``(cost, fpr, fnr)`` with ``cost = fpr + fnr``.
  """
  llratio = llr(logits).detach().cpu().numpy()
  labels = labels.detach().cpu().numpy()

  # Sort into a NEW array: `.numpy()` shares memory with the tensor, so an
  # in-place sort would silently reorder the caller's labels.
  order = labels.argsort()
  labels = labels[order]
  llratio = llratio[order]
  one_hot = np.eye(num_l)[labels]

  # A new class starts right AFTER each position where consecutive sorted
  # labels differ (hence the +1); the last group ends at len(labels) so the
  # final sample is included.
  starts = np.where(labels[:-1] != labels[1:])[0] + 1
  bounds = np.concatenate(([0], starts, [len(labels)]))

  fprs = []
  fnrs = []
  for first, stop in zip(bounds[:-1], bounds[1:]):
    _, fpr, fnr = compute_actual_cost(llratio[first:stop], one_hot[first:stop], 0.5)
    fprs.append(fpr)
    fnrs.append(fnr)

  fpr = sum(fprs)/num_l
  fnr = sum(fnrs)/num_l
  cost = fpr+fnr
  return cost, fpr, fnr





def eval_loop(model, loader, device):
    """Evaluate ``model`` on ``loader`` and return accuracy and cost metrics.

    The model is put in eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards, so calling this from a
    training loop does not silently disable dropout for the rest of training.

    Args:
        model: Module whose forward returns ``(logits, preds)``.
        loader: Iterable of ``(wavs, wavlm_inputs, labels)`` batches; ``None``
            batches are skipped.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, cost, fpr, fnr)``; accuracy is a percentage.

    Raises:
        ValueError: If the loader yields no usable batch.
    """
    was_training = model.training
    model.eval()
    total_correct = 0
    total_samples = 0
    logits_list = []
    labels_list = []

    try:
        with torch.no_grad():
            for batch in loader:
                if batch is None:
                    continue

                wavs, wavlm_inputs, labels = batch
                wavs = wavs.to(device)
                wavlm_inputs = wavlm_inputs.to(device)
                labels = labels.to(device)

                logits, preds = model(wavs, wavlm_inputs)

                logits_list.append(logits.cpu())
                labels_list.append(labels.cpu())

                total_correct += (preds.cpu() == labels.cpu()).sum().item()
                total_samples += len(labels)
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    if total_samples == 0:
        raise ValueError("eval_loop received no usable batches from the loader")

    logits = torch.cat(logits_list, dim=0)
    labels = torch.cat(labels_list, dim=0)

    cost, fpr, fnr = compute_ave_cost(logits, labels)
    accuracy = total_correct / total_samples * 100
    return accuracy, cost, fpr, fnr

## Training Loop

- Uses **phased LR scheduling**:
  1. **Warmup** → **Constant LR** while pretrained layers are frozen.
  2. After `frozen_steps` (12k), unfreeze all parameters and switch to **Linear Warmup → Constant → Cosine Decay**.
- **Gradient clipping** (`max_norm=1.0`) for stability.
- Logs average loss every `logging_steps` (100 steps) and runs validation every `val_steps` (5k steps).


In [None]:
from tqdm import tqdm
from torch.optim import AdamW
from torch.optim.lr_scheduler import (
    LinearLR, ConstantLR, CosineAnnealingLR, SequentialLR
)

# === Training Config ===
# All counts below are in optimizer steps (one step per batch), not epochs.
max_steps = 40_000      # training stops once `step` reaches this value
logging_steps = 100     # interval for refreshing the progress-bar loss / lr
val_steps = 5_000       # interval for running validation
frozen_steps = 12_000   # encoders are unfrozen when step + 1 reaches this value
step = 0                # global step counter, advanced by the training loop
avg_loss = 0.0          # running loss sum, reset at every logging interval
epoch = 0


# === Scheduler Before Unfreeze (up to step 12_000) ===
scheduler1 = LinearLR(optimizer, start_factor=1/3, total_iters=3000)  # Warmup: 0 → 3000
scheduler2 = ConstantLR(optimizer, factor=1.0, total_iters=9000)      # Constant: 3000 → 12,000

# Chain the two phases; the single milestone is where scheduler2 takes over.
# NOTE(review): `verbose` is reported as deprecated in recent PyTorch
# releases — confirm against the installed version before relying on it.
scheduler = SequentialLR(
    optimizer,
    [scheduler1, scheduler2],
    milestones=[3000],
    verbose=True
)
# === Helper to Print Trainable Params ===
def print_trainable_parameters(model):
    """Print every parameter with ``requires_grad=True`` and the grand total.

    Args:
        model: Module whose ``named_parameters()`` are inspected.
    """
    print("Trainable Parameters:")
    trainable = [
        (name, param.numel())
        for name, param in model.named_parameters()
        if param.requires_grad
    ]
    for name, count in trainable:
        print(f"  {name:<40} | {count:,} parameters")
    total = sum(count for _, count in trainable)
    print(f"\nTotal trainable parameters: {total:,}")

# === Training Loop ===
model.train()
print_trainable_parameters(model)
epoch = 0

while step < max_steps:
    epoch += 1
    print(f"\n=== Epoch {epoch} ===")
    pbar = tqdm(trainloader, desc=f"Training Step {step}/{max_steps}")

    for batch in pbar:
        # The collate function returns None when no sample in the batch was usable.
        if batch is None:
            continue

        wavs, wavlm_inputs, labels = batch
        wavs = wavs.to(device)
        wavlm_inputs = wavlm_inputs.to(device)
        labels = labels.to(device)

        logits, preds = model(wavs, wavlm_inputs)
        loss = loss_fn(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        avg_loss += loss.item()

        if step % logging_steps == 0 and step != 0:
            pbar.set_postfix(loss=avg_loss / logging_steps, lr=scheduler.get_last_lr()[0])
            avg_loss = 0.0

        if step % val_steps == 0 and step != 0:
            acc, cost, fpr, fnr = eval_loop(model, valloader, device)
            print(f"[VAL @ step {step}] Accuracy: {acc:.2f}%, Cost: {cost:.4f}, FPR: {fpr:.4f}, FNR: {fnr:.4f}")
            # Validation switches the model to eval mode; switch back so that
            # dropout stays active for the remaining training steps.
            model.train()

        # === UNFREEZE MODEL AFTER frozen_steps ===
        if step + 1 == frozen_steps:
            for param in model.parameters():
                param.requires_grad = True
            print(">>> Unfroze all model parameters!")

            # Fresh optimizer: pretrained encoders get a 10x smaller LR than
            # the newly added projection / classifier layers.
            optimizer = AdamW([
                {"params": model.ecapa.parameters(), "lr": 1e-5},
                {"params": model.wavlm.parameters(), "lr": 1e-5},
                {"params": model.classifier.parameters(), "lr": 1e-4},
                {"params": model.ecapa_proj.parameters(), "lr": 1e-4},
                {"params": model.wavlm_proj.parameters(), "lr": 1e-4},
            ], weight_decay=1e-2)

            # === Scheduler After Unfreeze ===
            # Warmup (4k) -> constant (14k) -> cosine decay (10k) covers the
            # 28k steps left after unfreezing at 12k.
            post_warmup_iters = 4000
            post_constant_iters = 14000
            scheduler3 = LinearLR(optimizer, start_factor=1/10, end_factor=1.0, total_iters=post_warmup_iters)
            scheduler4 = ConstantLR(optimizer, factor=1.0, total_iters=post_constant_iters)
            scheduler5 = CosineAnnealingLR(optimizer, T_max=10000)

            # SequentialLR counts milestones from ITS OWN creation, not from
            # the global training step, so they must be relative offsets
            # (4k, 18k). Absolute values (16k, 30k) would postpone the cosine
            # phase past the end of training so that it never runs.
            scheduler = SequentialLR(
                optimizer,
                [scheduler3, scheduler4, scheduler5],
                milestones=[post_warmup_iters, post_warmup_iters + post_constant_iters],
            )


            print_trainable_parameters(model)

        if step >= max_steps:
            break
        step += 1


Trainable Parameters:
  ecapa_proj.weight                        | 65,536 parameters
  ecapa_proj.bias                          | 256 parameters
  wavlm_proj.weight                        | 196,608 parameters
  wavlm_proj.bias                          | 256 parameters
  classifier.0.weight                      | 131,072 parameters
  classifier.0.bias                        | 256 parameters
  classifier.3.weight                      | 2,048 parameters
  classifier.3.bias                        | 8 parameters

Total trainable parameters: 396,040

=== Epoch 1 ===


Training Step 0/40000:   0%|          | 0/3225 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Training Step 0/40000: 100%|██████████| 3225/3225 [11:51<00:00,  4.53it/s, loss=1.43, lr=0.0001]



=== Epoch 2 ===


Training Step 3225/40000:  55%|█████▌    | 1776/3225 [12:59<44:37:16, 110.86s/it, loss=1.26, lr=0.0001]

[VAL @ step 5000] Accuracy: 66.57%, Cost: 0.2821, FPR: 0.1569, FNR: 0.1252


Training Step 3225/40000: 100%|██████████| 3225/3225 [16:05<00:00,  3.34it/s, loss=0.709, lr=0.0001]



=== Epoch 3 ===


Training Step 6450/40000: 100%|██████████| 3225/3225 [06:56<00:00,  7.75it/s, loss=0.683, lr=0.0001]



=== Epoch 4 ===


Training Step 9675/40000:  10%|█         | 326/3225 [07:08<67:17:51, 83.57s/it, loss=0.657, lr=0.0001]

[VAL @ step 10000] Accuracy: 76.59%, Cost: 0.2019, FPR: 0.0855, FNR: 0.1164


Training Step 9675/40000:  72%|███████▏  | 2325/3225 [11:24<02:14,  6.69it/s, loss=0.58, lr=0.0001]

>>> Unfroze all model parameters!
Trainable Parameters:
  ecapa.mods.embedding_model.blocks.0.conv.conv.weight | 307,200 parameters
  ecapa.mods.embedding_model.blocks.0.conv.conv.bias | 1,024 parameters
  ecapa.mods.embedding_model.blocks.0.norm.norm.weight | 1,024 parameters
  ecapa.mods.embedding_model.blocks.0.norm.norm.bias | 1,024 parameters
  ecapa.mods.embedding_model.blocks.1.tdnn1.conv.conv.weight | 1,048,576 parameters
  ecapa.mods.embedding_model.blocks.1.tdnn1.conv.conv.bias | 1,024 parameters
  ecapa.mods.embedding_model.blocks.1.tdnn1.norm.norm.weight | 1,024 parameters
  ecapa.mods.embedding_model.blocks.1.tdnn1.norm.norm.bias | 1,024 parameters
  ecapa.mods.embedding_model.blocks.1.res2net_block.blocks.0.conv.conv.weight | 49,152 parameters
  ecapa.mods.embedding_model.blocks.1.res2net_block.blocks.0.conv.conv.bias | 128 parameters
  ecapa.mods.embedding_model.blocks.1.res2net_block.blocks.0.norm.norm.weight | 128 parameters
  ecapa.mods.embedding_model.blocks.1.res2ne

Training Step 9675/40000: 100%|██████████| 3225/3225 [15:09<00:00,  3.55it/s, loss=0.509, lr=2.8e-6]



=== Epoch 5 ===


Training Step 12900/40000:  65%|██████▌   | 2102/3225 [13:48<20:09:00, 64.59s/it, loss=0.494, lr=7.75e-6]

[VAL @ step 15000] Accuracy: 77.59%, Cost: 0.2049, FPR: 0.0655, FNR: 0.1394


Training Step 12900/40000: 100%|██████████| 3225/3225 [18:30<00:00,  2.90it/s, loss=0.42, lr=1e-5]



=== Epoch 6 ===


Training Step 16125/40000: 100%|██████████| 3225/3225 [13:27<00:00,  3.99it/s, loss=0.624, lr=1e-5]



=== Epoch 7 ===


Training Step 19350/40000:  20%|██        | 651/3225 [07:51<66:26:01, 92.91s/it, loss=0.414, lr=1e-5]

[VAL @ step 20000] Accuracy: 82.20%, Cost: 0.1654, FPR: 0.0435, FNR: 0.1219


Training Step 19350/40000: 100%|██████████| 3225/3225 [18:40<00:00,  2.88it/s, loss=0.17, lr=1e-5]



=== Epoch 8 ===


Training Step 22575/40000:  75%|███████▌  | 2426/3225 [15:17<20:33:10, 92.60s/it, loss=0.356, lr=1e-5]

[VAL @ step 25000] Accuracy: 88.86%, Cost: 0.1043, FPR: 0.0242, FNR: 0.0801


Training Step 22575/40000: 100%|██████████| 3225/3225 [18:38<00:00,  2.88it/s, loss=0.359, lr=1e-5]



=== Epoch 9 ===


Training Step 25800/40000: 100%|██████████| 3225/3225 [13:32<00:00,  3.97it/s, loss=0.309, lr=1e-5]



=== Epoch 10 ===


Training Step 29025/40000:  30%|███       | 976/3225 [09:21<58:29:54, 93.64s/it, loss=0.101, lr=1e-5]

[VAL @ step 30000] Accuracy: 90.22%, Cost: 0.0989, FPR: 0.0189, FNR: 0.0800


Training Step 29025/40000: 100%|██████████| 3225/3225 [18:43<00:00,  2.87it/s, loss=0.155, lr=1e-5]



=== Epoch 11 ===


Training Step 32250/40000:  85%|████████▌ | 2752/3225 [16:41<8:37:36, 65.66s/it, loss=0.144, lr=1e-5] 

[VAL @ step 35000] Accuracy: 90.68%, Cost: 0.0964, FPR: 0.0174, FNR: 0.0791


Training Step 32250/40000: 100%|██████████| 3225/3225 [18:43<00:00,  2.87it/s, loss=0.108, lr=1e-5]



=== Epoch 12 ===


Training Step 35475/40000: 100%|██████████| 3225/3225 [13:33<00:00,  3.96it/s, loss=0.106, lr=1e-5]



=== Epoch 13 ===


Training Step 38700/40000:  40%|████      | 1300/3225 [10:43<15:52,  2.02it/s, loss=0.134, lr=1e-5]

[VAL @ step 40000] Accuracy: 89.43%, Cost: 0.1088, FPR: 0.0186, FNR: 0.0902





In [None]:
torch.save(model.state_dict(), "fusion_model_last.pt")

In [None]:
from google.colab import files
files.download("fusion_model_last.pt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Submission

In [None]:
def write_logits(o, filename):
    """Append one tab-separated line per row of ``o`` to ``filename``.

    Args:
        o: Iterable of rows; every element of a row must support ``.item()``.
        filename: Path of the file opened in append mode.
    """
    with open(filename, "a") as fp:
        for row in o:
            cells = [str(value.item()) for value in row]
            fp.write("\t".join(cells).strip() + "\n")

def write_preds(o, filename):
    """Append one line per element of ``o`` to ``filename``.

    Args:
        o: Iterable whose elements support ``.item()``.
        filename: Path of the file opened in append mode.
    """
    with open(filename, "a") as fp:
        fp.writelines(str(pred.item()).strip() + "\n" for pred in o)

def submission_writer(model, loader, device):
    """Run inference over ``loader`` and write ``logits.tsv`` / ``predictions.tsv``.

    Both output files are emptied first, so re-running the cell produces a
    clean submission instead of appending a second copy of every row.

    Args:
        model: Module whose forward returns ``(logits, preds)``.
        loader: Iterable of batches whose first two elements are the raw
            waveforms and the WavLM input values. A trailing labels element
            (validation batches) is ignored, so both labelled and unlabelled
            loaders work.
        device: Device the batch tensors are moved to.
    """
    model.eval()

    # The writer helpers open their file in append mode; start from empty files.
    for path in ('logits.tsv', 'predictions.tsv'):
        open(path, "w").close()

    for batch in tqdm(loader):
        if batch is None:
            # A collate function returns None when the whole batch was unusable.
            print("Skipping an empty batch; the output has no rows for its samples.")
            continue

        wavs, wavlm_input_values = batch[0], batch[1]

        wavs = wavs.to(device)
        wavlm_input_values = wavlm_input_values.to(device)

        with torch.no_grad():
            logits, preds = model(wavs, wavlm_input_values)

        write_logits(logits.to('cpu'), 'logits.tsv')
        write_preds(preds.to('cpu'), 'predictions.tsv')


In [None]:
submission_writer(model, valloader, device)

100%|██████████| 3175/3175 [04:01<00:00, 13.17it/s]


In [None]:
!zip submission.zip logits.tsv predictions.tsv

  adding: logits.tsv (deflated 57%)
  adding: predictions.tsv (deflated 94%)


In [None]:
import csv

# Imported here as well so this cell also runs on a fresh kernel; the recorded
# run of the cell below failed with "NameError: name 'torch' is not defined".
import torch

def read_logits(filename):
  """Load a tab-separated logits file into a float tensor.

  Args:
    filename: Path to a file with one row of tab-separated numbers per line.

  Returns:
    Tensor of shape ``[rows, columns]``. Blank lines are ignored; a file with
    no data rows yields an empty tensor.
  """
  with open(filename, "r", newline="") as fp:
    reader = csv.reader(fp, delimiter="\t")
    # csv.reader yields [] for blank lines; skip them instead of failing later.
    rows = [[float(value) for value in line] for line in reader if line]
  # Build the tensor once instead of concatenating one tensor per row.
  return torch.tensor(rows, dtype=torch.float32)



In [None]:
read_logits("logits.tsv")

NameError: name 'torch' is not defined

# Testing Set Phase

In [None]:
from datasets import load_dataset

countries = ['Algeria', 'Egypt', 'Jordan', 'Mauritania', 'Morocco', 'Palestine', 'UAE', 'Yemen']
ds = load_dataset("UBC-NLP/NADI2025_subtask1_ADI_Test")



ds.set_format("torch")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
sample = ds["test"][0]
print(type(sample["audio"]))  # Should be dict
print(sample["audio"].keys())  # Should show: dict_keys(['path', 'array', 'sampling_rate'])
print(sample["audio"]["array"][:5])  # Waveform samples


<class 'dict'>
dict_keys(['array', 'sample_rate'])
tensor([0.0216, 0.0276, 0.0227, 0.0141, 0.0008])


In [None]:
feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/wavlm-base")

def collate_fn_test(samples):
    """Collate unlabelled test rows into padded model inputs.

    Args:
        samples: Iterable of dataset rows; each row is read through
            ``sample["audio"]["array"]``.

    Returns:
        ``(padded, input_values)`` where ``padded`` is the zero-padded waveform
        batch ``[B, T]`` and ``input_values`` is the WavLM feature-extractor
        output. Returns ``None`` when no sample in the batch was usable.
        Samples that are empty or raise while being read are dropped, so a
        batch may contain fewer rows than ``samples``.
    """
    arrays = []

    for sample in samples:
        try:
            waveform = sample["audio"]["array"]
            if not isinstance(waveform, torch.Tensor):
                waveform = torch.tensor(waveform, dtype=torch.float32)
            if waveform.numel() == 0:
                continue
            arrays.append(waveform)
        except Exception as e:
            print(f"Skipping due to error: {e}")
            continue

    if len(arrays) == 0:
        return None

    # Zero-padded raw waveforms for the ECAPA branch.
    padded = pad_sequence(arrays, batch_first=True)  # [B, T]

    # The feature extractor accepts NumPy arrays, which avoids converting every
    # sample to a Python list of floats.
    arrays_for_wavlm = [arr.detach().cpu().numpy() for arr in arrays]
    inputs = feature_extractor(
        arrays_for_wavlm, sampling_rate=16000, return_tensors="pt", padding=True
    )

    return padded, inputs["input_values"]


testloader = DataLoader(ds['test'], shuffle=False, collate_fn=collate_fn_test, batch_size=4)

In [None]:
testloader

<torch.utils.data.dataloader.DataLoader at 0x78d513c10b10>

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Rebuild the fusion model and load fine-tuned weights for inference.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ECAPA_WavLM_Fusion()
# NOTE(review): the training section saves "fusion_model_last.pt", while this
# loads "fusion_model_final.pt" from Google Drive — confirm that this is the
# intended checkpoint.
model.load_state_dict(torch.load("/content/drive/MyDrive/fusion_model_final.pt", map_location=device))
model.to(device)
# Inference only: switch the model to eval mode.
model.eval()

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/tmp/ecapa/hyperparams.yaml'
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered parameter transfer hook for _load
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load_if_possible
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in tmp/ecapa.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/tmp/ecapa/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/tmp/ecapa/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch classifi

ECAPA_WavLM_Fusion(
  (ecapa): EncoderClassifier(
    (mods): ModuleDict(
      (compute_features): Fbank(
        (compute_STFT): STFT()
        (compute_fbanks): Filterbank()
        (compute_deltas): Deltas()
        (context_window): ContextWindow()
      )
      (mean_var_norm): InputNormalization()
      (embedding_model): ECAPA_TDNN(
        (blocks): ModuleList(
          (0): TDNNBlock(
            (conv): Conv1d(
              (conv): Conv1d(60, 1024, kernel_size=(5,), stride=(1,))
            )
            (activation): ReLU()
            (norm): BatchNorm1d(
              (norm): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            )
            (dropout): Dropout1d(p=0.0, inplace=False)
          )
          (1): SERes2NetBlock(
            (tdnn1): TDNNBlock(
              (conv): Conv1d(
                (conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
              )
              (activation): ReLU()
              (norm)

## Test-Time Augmentation (TTA)

Applies speed perturbation, Gaussian noise, band-pass filtering, and time masking.  
All outputs match original length.  
**Note:** These augmentations reduced performance.


In [None]:
import torch
import torch.nn.functional as Fnn
import torchaudio
import torchaudio.functional as AF
import torchaudio.transforms as T

def match_length(wave, target_len):
    """
    Pad or truncate a waveform along its last axis to `target_len` samples.

    Args:
        wave: Tensor whose last dimension is the one to resize (e.g. [T] or [C, T]).
        target_len: Desired size of the last dimension.

    Returns:
        A tensor with the same leading dimensions as `wave` and a last
        dimension of exactly `target_len`. Longer inputs lose their tail,
        shorter inputs are right-padded with zeros, and an input that
        already has the target length is returned unchanged.
    """
    current_len = wave.shape[-1]
    if current_len > target_len:
        # Slice the last axis explicitly: `wave[:target_len]` would cut the
        # first axis instead whenever `wave` has more than one dimension.
        return wave[..., :target_len]
    if current_len < target_len:
        return Fnn.pad(wave, (0, target_len - current_len))
    return wave

def apply_tta(waveform, sample_rate=16000):
    """
    Build the test-time-augmentation variants of a single waveform.

    A 2-D input has its first axis squeezed ([1, T] -> [T]) before use.
    The returned list contains, in this order:
      1. the (squeezed) input itself,
      2. two copies resampled to 0.95x and 1.05x of `sample_rate` and then
         padded/truncated back to the input length,
      3. a copy with additive Gaussian noise,
      4. a band-pass-filtered copy,
      5. a time-masked copy.
    """
    signal = waveform.squeeze(0) if waveform.ndim == 2 else waveform
    n_samples = signal.shape[-1]

    # Variant 0 is the untouched signal.
    variants = [signal]

    # Speed perturbation: resample to a nearby rate, then restore the length.
    for factor in (0.95, 1.05):
        new_rate = int(sample_rate * factor)
        stretched = AF.resample(signal, sample_rate, new_rate)
        variants.append(match_length(stretched, n_samples))

    # Additive Gaussian noise whose scale is drawn uniformly from 0.001..0.005.
    sigma = torch.FloatTensor(1).uniform_(0.001, 0.005).item()
    variants.append(signal + sigma * torch.randn_like(signal))

    # Band-pass biquad around a randomly drawn centre frequency (300..2999).
    centre_freq = int(torch.randint(300, 3000, (1,)).item())
    variants.append(AF.band_biquad(signal, sample_rate, centre_freq, Q=0.8))

    # Time masking is designed for spectrograms; here it is applied to the raw
    # waveform viewed as [1, T], with a small mask parameter.
    time_mask = T.TimeMasking(time_mask_param=30)
    variants.append(time_mask(signal.unsqueeze(0)).squeeze(0))

    return variants


# Submission - Trying with augmentation

In [None]:
# def submission_writer(model, loader, device, feature_extractor, use_tta=True, temperature=1.5):
#     model.eval()
#     open("logits.tsv", "w").close()
#     open("predictions.tsv", "w").close()

#     for batch in tqdm(loader, desc="Predicting with TTA" if use_tta else "Standard prediction"):
#         if batch is None:
#             continue

#         if len(batch) == 3:
#             wavs, _, _ = batch
#         else:
#             wavs, _ = batch

#         for waveform in wavs:
#             waveform = waveform.cpu()
#             waveform = (waveform - waveform.mean()) / (waveform.std() + 1e-7)  # Normalize input

#             if use_tta:
#                 tta_waveforms = apply_tta(waveform)
#             else:
#                 tta_waveforms = [waveform]

#             all_logits = []
#             for w in tta_waveforms:
#                 ecapa_input = w.unsqueeze(0).to(device)
#                 inputs = feature_extractor([w.tolist()], sampling_rate=16000, return_tensors="pt", padding=True)
#                 wavlm_input = inputs["input_values"].to(device)

#                 with torch.no_grad():
#                     logits, _ = model(ecapa_input, wavlm_input)
#                     all_logits.append(logits)

#             avg_logits = torch.mean(torch.stack(all_logits), dim=0)

#             # Apply temperature scaling
#             scaled_logits = avg_logits / temperature
#             pred = torch.argmax(scaled_logits, dim=1)

#             # Write scaled logits and prediction
#             with open("logits.tsv", "a") as f1:
#                 f1.write("\t".join(f"{v.item():.6f}" for v in scaled_logits.squeeze()) + "\n")
#             with open("predictions.tsv", "a") as f2:
#                 f2.write(f"{pred.item()}\n")




# submission_writer(model, testloader, device, feature_extractor)


Predicting with TTA: 100%|██████████| 1567/1567 [30:11<00:00,  1.16s/it]


# Submission without augmentation

This version was used for the **final submission**

In [None]:
def submission_writer(model, loader, device, feature_extractor):
    """
    Run inference over `loader` and write the submission files.

    For every waveform, one line is written to each of:
      * `logits.tsv`      - the logits, tab-separated, 6 decimal places;
      * `predictions.tsv` - the argmax class index.
    Both files are created/truncated when the function starts.

    Args:
        model: Called as `model(ecapa_input, wavlm_input)`; the first element
            of the returned pair is used as the logits.
        loader: Iterable of batches. `None` batches are skipped; otherwise the
            first element of the batch is iterated as the waveforms and the
            remaining elements are ignored.
        device: Device the model inputs are moved to.
        feature_extractor: Called with a one-element list holding the waveform
            as a Python list (`sampling_rate=16000`, `return_tensors="pt"`,
            `padding=True`); its `"input_values"` entry is the second model input.
    """
    model.eval()

    # Open both outputs once for the whole run instead of re-opening them in
    # append mode for every waveform; the `with` block also guarantees they
    # are flushed and closed if inference raises part-way through.
    with open("logits.tsv", "w") as logits_file, open("predictions.tsv", "w") as preds_file:
        for batch in tqdm(loader, desc="Standard prediction"):
            if batch is None:
                continue

            # Batches are (wavs, x) or (wavs, x, y); only the waveforms are used.
            wavs = batch[0]

            for waveform in wavs:
                waveform = waveform.cpu()
                # Per-waveform zero-mean / unit-variance normalization.
                waveform = (waveform - waveform.mean()) / (waveform.std() + 1e-7)

                ecapa_input = waveform.unsqueeze(0).to(device)
                inputs = feature_extractor(
                    [waveform.tolist()],
                    sampling_rate=16000,
                    return_tensors="pt",
                    padding=True,
                )
                wavlm_input = inputs["input_values"].to(device)

                with torch.no_grad():
                    logits, _ = model(ecapa_input, wavlm_input)

                pred = torch.argmax(logits, dim=1)

                # Drop only the batch axis: a bare `squeeze()` would also
                # collapse a single-class output to a 0-d tensor, which
                # cannot be iterated.
                row = "\t".join(f"{v.item():.6f}" for v in logits.squeeze(0))
                logits_file.write(row + "\n")
                preds_file.write(f"{pred.item()}\n")


# Run prediction: writes logits.tsv and predictions.tsv for every waveform yielded by `testloader`
submission_writer(model, testloader, device, feature_extractor)


In [None]:
# Bundle both TSV outputs into submission.zip (an existing archive is updated in place)
!zip submission.zip logits.tsv predictions.tsv

updating: logits.tsv (deflated 55%)
updating: predictions.tsv (deflated 75%)
