#  DoRA Fine-Tuning and Evaluation Notebook
This notebook contains the full workflow for the **Bonus Question** of Speech Understanding Assignment 3.

We:
- Reproduce results from the SpeechGLUE paper on SST-2 and MRPC
- Fine-tune a DoRA-based model on SNIPS (intent classification)
- Transfer the encoder to SST-2 and MRPC using new classifier heads


In [None]:
!pip install datasets transformers gtts torchaudio librosa pandas scikit-learn --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from datasets import load_dataset

sst2 = load_dataset("glue", "sst2")
mrpc = load_dataset("glue", "mrpc")

snips = load_dataset("sonos-nlu-benchmark/snips_built_in_intents")

print("SST-2:", sst2['train'][0])
print("MRPC:", mrpc['train'][0])
print("SNIPS:", snips['train'][0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/6.65k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/328 [00:00<?, ? examples/s]

SST-2: {'sentence': 'hide new secretions from the parental units ', 'label': 0, 'idx': 0}
MRPC: {'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}
SNIPS: {'text': "Share my location with Hillary's sister", 'label': 5}


##  Step 1: Convert Text to Speech with gTTS
We use Google TTS to synthesize audio files from the text of SST-2, MRPC, and SNIPS.

In [None]:
from gtts import gTTS
import os
from pathlib import Path
import pandas as pd
import re
from time import sleep

def clean_text(text):
    text = text.replace("’", "'").replace("‘", "'")
    text = text.replace("Jo's", "Jo")
    text = text.replace("Airbnb", "your stay")
    text = re.sub(r"[^a-zA-Z0-9\s\.\?\!]", "", text)
    return text.strip()

def safe_tts(text, path, retries=3):
    for attempt in range(retries):
        try:
            tts = gTTS(text)
            tts.save(path)
            return True
        except Exception as e:
            print(f"Retry {attempt+1} failed for: {text} -> {e}")
            sleep(1)
    return False

def generate_audio(dataset, field, label_field, output_dir, max_samples=None):
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    records = []
    for i, sample in enumerate(dataset):
        if max_samples and i >= max_samples:
            break
        raw_text = sample[field]
        text = clean_text(raw_text)
        label = sample[label_field]
        filename = f"{i}.wav"
        path = os.path.join(output_dir, filename)

        if safe_tts(text, path):
            records.append({"path": path, "label": label})
            print(f" Saved: {text}")
        else:
            print(f" Failed completely for: {text}")

    return pd.DataFrame(records)

df_sst2 = generate_audio(sst2['train'], 'sentence', 'label', 'audio_sst2', 500)
df_mrpc = generate_audio(mrpc['train'], 'sentence1', 'label', 'audio_mrpc', 500)
df_snips = generate_audio(snips['train'], 'text', 'label', 'audio_snips', 328)

df_sst2.to_csv("metadata_sst2.csv", index=False)
df_mrpc.to_csv("metadata_mrpc.csv", index=False)
df_snips.to_csv("metadata_snips.csv", index=False)

 Saved: hide new secretions from the parental units
 Saved: contains no wit  only labored gags
 Saved: that loves its characters and communicates something rather beautiful about human nature
 Saved: remains utterly satisfied to remain the same throughout
 Saved: on the worst revengeofthenerds clichs the filmmakers could dredge up
 Saved: that s far too tragic to merit such superficial treatment
 Saved: demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small  personal film with an emotional wallop .
 Saved: of saucy
 Saved: a depressed fifteenyearold s suicidal poetry
 Saved: are more deeply thought through than in most  rightthinking  films
 Saved: goes to absurd lengths
 Saved: for those moviegoers who complain that  they do nt make movies like they used to anymore
 Saved: the part where nothing s happening
 Saved: saw how bad this movie was
 Saved: lend some dignity to a dumb story
 Saved: the greatest musicians
 Saved: cold movie
 S

##  Step 2: Load Pre-extracted Features
Here we load the audio features extracted using Wav2Vec2 for SNIPS, SST-2, and MRPC.

In [None]:
import torch
import torchaudio
from transformers import Wav2Vec2Model, Wav2Vec2Processor
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(device)
model.eval()

def extract_features_from_csv(csv_path, out_path):
    df = pd.read_csv(csv_path)
    features = []
    labels = []

    for i, row in tqdm(df.iterrows(), total=len(df), desc=f"Extracting from {csv_path}"):
        path = row["path"]
        label = row["label"]

        try:
            waveform, sr = torchaudio.load(path)
            waveform = waveform.squeeze(0)

            if sr != 16000:
                waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)

            inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
            with torch.no_grad():
                out = model(inputs.input_values.to(device)).last_hidden_state.mean(dim=1)

            features.append(out.squeeze().cpu())
            labels.append(label)

        except Exception as e:
            print(f" Failed: {path}, error: {e}")

    features = torch.stack(features)
    labels = torch.tensor(labels)

    torch.save({"features": features, "labels": labels}, out_path)
    print(f" Saved: {out_path}")

extract_features_from_csv("metadata_sst2.csv", "features_sst2.pt")
extract_features_from_csv("metadata_mrpc.csv", "features_mrpc.pt")
extract_features_from_csv("metadata_snips.csv", "features_snips.pt")

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Extracting from metadata_sst2.csv:   0%|          | 0/500 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

Extracting from metadata_sst2.csv: 100%|██████████| 500/500 [00:18<00:00, 26.96it/s]


 Saved: features_sst2.pt


Extracting from metadata_mrpc.csv: 100%|██████████| 500/500 [00:32<00:00, 15.26it/s]


 Saved: features_mrpc.pt


Extracting from metadata_snips.csv: 100%|██████████| 328/328 [00:09<00:00, 33.07it/s]

 Saved: features_snips.pt





##  Step 3: Prepare PyTorch Dataset + DataLoader

In [None]:
from torch.utils.data import DataLoader, TensorDataset

def get_dataloader(pt_file, batch_size=32, shuffle=True):
    data = torch.load(pt_file)
    features = data["features"]
    labels = data["labels"]
    dataset = TensorDataset(features, labels)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return loader

train_loader_sst2 = get_dataloader("features_sst2.pt", batch_size=32)
train_loader_mrpc = get_dataloader("features_mrpc.pt", batch_size=32)
train_loader_snips = get_dataloader("features_snips.pt", batch_size=32)

batch = next(iter(train_loader_sst2))
print("SST-2 Sample batch shape:", batch[0].shape, "Labels:", batch[1].shape)

SST-2 Sample batch shape: torch.Size([32, 768]) Labels: torch.Size([32])


##  Step 4: Define the DoRA Model
We define a Decomposed Rank-Adaptive (DoRA) classifier with a low-rank adapter layer.

In [None]:
import torch.nn as nn

class DoRALinear(nn.Module):
    def __init__(self, in_features, out_features, rank=4):
        super(DoRALinear, self).__init__()
        self.linear = nn.Linear(in_features, out_features)
        self.dora_A = nn.Linear(in_features, rank, bias=False)
        self.dora_B = nn.Linear(rank, out_features, bias=False)

    def forward(self, x):
        return self.linear(x) + self.dora_B(self.dora_A(x))

class DoRAClassifier(nn.Module):
    def __init__(self, input_dim=768, num_classes=2, rank=4, dropout=0.1):
        super(DoRAClassifier, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.classifier = DoRALinear(input_dim, num_classes, rank)

    def forward(self, x):
        x = self.dropout(x)
        return self.classifier(x)

##  Step 5: Load Pre-extracted Features
Here we load the audio features extracted using Wav2Vec2 for SNIPS, SST-2, and MRPC.

In [None]:
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

snips_data = torch.load("features_snips.pt")
X = snips_data["features"]
y = snips_data["labels"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

train_loader_snips = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_loader_snips = DataLoader(TensorDataset(X_test, y_test), batch_size=32, shuffle=False)

class DoRALinear(nn.Module):
    def __init__(self, in_features, out_features, rank=16):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)
        self.dora_A = nn.Linear(in_features, rank, bias=False)
        self.dora_B = nn.Linear(rank, out_features, bias=False)

    def forward(self, x):
        return self.linear(x) + self.dora_B(self.dora_A(x))

class DoRAClassifierImproved(nn.Module):
    def __init__(self, input_dim=768, num_classes=10, rank=16, hidden_dim=256):
        super().__init__()
        self.dropout = nn.Dropout(0.3)
        self.dora1 = DoRALinear(input_dim, hidden_dim, rank)
        self.relu = nn.ReLU()
        self.out = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.dropout(x)
        x = self.relu(self.dora1(x))
        return self.out(x)

model = DoRAClassifierImproved().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.5)
criterion = nn.CrossEntropyLoss()

epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    all_preds, all_labels = [], []

    for features, labels in train_loader_snips:
        features, labels = features.to(device), labels.to(device)

        logits = model(features)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f} - Accuracy: {acc:.4f}")
    scheduler.step()

Epoch 1/20 - Loss: 20.3743 - Accuracy: 0.1756
Epoch 2/20 - Loss: 19.7579 - Accuracy: 0.2137
Epoch 3/20 - Loss: 19.4022 - Accuracy: 0.2328
Epoch 4/20 - Loss: 19.5344 - Accuracy: 0.2443
Epoch 5/20 - Loss: 19.3568 - Accuracy: 0.2366
Epoch 6/20 - Loss: 19.2508 - Accuracy: 0.2748
Epoch 7/20 - Loss: 19.0181 - Accuracy: 0.2634
Epoch 8/20 - Loss: 18.8969 - Accuracy: 0.2863
Epoch 9/20 - Loss: 18.6463 - Accuracy: 0.3053
Epoch 10/20 - Loss: 18.3484 - Accuracy: 0.2977
Epoch 11/20 - Loss: 19.0164 - Accuracy: 0.3053
Epoch 12/20 - Loss: 18.4555 - Accuracy: 0.3053
Epoch 13/20 - Loss: 18.2204 - Accuracy: 0.2863
Epoch 14/20 - Loss: 17.9982 - Accuracy: 0.3206
Epoch 15/20 - Loss: 18.3907 - Accuracy: 0.3206
Epoch 16/20 - Loss: 18.1364 - Accuracy: 0.3130
Epoch 17/20 - Loss: 18.1379 - Accuracy: 0.3321
Epoch 18/20 - Loss: 17.8394 - Accuracy: 0.3550
Epoch 19/20 - Loss: 17.8183 - Accuracy: 0.3282
Epoch 20/20 - Loss: 17.7228 - Accuracy: 0.3359


##  Step 6: Create New Classifier Heads for Transfer
We freeze the encoder and define new heads for SST-2 and MRPC.

In [None]:
for param in model.dora1.parameters():
    param.requires_grad = False

head_sst2 = nn.Linear(256, 2).to(device)
head_mrpc = nn.Linear(256, 2).to(device)

##  Step 7: Train New Head on SST-2

In [None]:
optimizer_sst2 = torch.optim.AdamW(head_sst2.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(10):
    head_sst2.train()
    total_loss, all_preds, all_labels = 0, [], []

    for features, labels in train_loader_sst2:
        features, labels = features.to(device), labels.to(device)
        with torch.no_grad():
            encoded = model.dora1(features)
        logits = head_sst2(encoded)
        loss = criterion(logits, labels)

        optimizer_sst2.zero_grad()
        loss.backward()
        optimizer_sst2.step()

        total_loss += loss.item()
        all_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    print(f"[SST-2] Epoch {epoch+1} - Loss: {total_loss:.4f} - Accuracy: {acc:.4f}")

[SST-2] Epoch 1 - Loss: 11.1505 - Accuracy: 0.4940
[SST-2] Epoch 2 - Loss: 11.0396 - Accuracy: 0.5460
[SST-2] Epoch 3 - Loss: 11.0733 - Accuracy: 0.5140
[SST-2] Epoch 4 - Loss: 11.0424 - Accuracy: 0.5360
[SST-2] Epoch 5 - Loss: 11.0811 - Accuracy: 0.5140
[SST-2] Epoch 6 - Loss: 11.0560 - Accuracy: 0.5300
[SST-2] Epoch 7 - Loss: 11.1782 - Accuracy: 0.4940
[SST-2] Epoch 8 - Loss: 11.0696 - Accuracy: 0.4940
[SST-2] Epoch 9 - Loss: 11.0624 - Accuracy: 0.5400
[SST-2] Epoch 10 - Loss: 11.0132 - Accuracy: 0.5360


##  Step 8: Train Classifier on MRPC (Transfer Learning)
We train a 2-class classifier on MRPC using the same frozen encoder.

In [None]:
optimizer_mrpc = torch.optim.AdamW(head_mrpc.parameters(), lr=1e-3)

for epoch in range(10):
    head_mrpc.train()
    total_loss, all_preds, all_labels = 0, [], []

    for features, labels in train_loader_mrpc:
        features, labels = features.to(device), labels.to(device)
        with torch.no_grad():
            encoded = model.dora1(features)
        logits = head_mrpc(encoded)
        loss = criterion(logits, labels)

        optimizer_mrpc.zero_grad()
        loss.backward()
        optimizer_mrpc.step()

        total_loss += loss.item()
        all_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    print(f"[MRPC] Epoch {epoch+1} - Loss: {total_loss:.4f} - Accuracy: {acc:.4f}")

[MRPC] Epoch 1 - Loss: 10.2635 - Accuracy: 0.6920
[MRPC] Epoch 2 - Loss: 10.0245 - Accuracy: 0.6920
[MRPC] Epoch 3 - Loss: 9.9195 - Accuracy: 0.6920
[MRPC] Epoch 4 - Loss: 9.8232 - Accuracy: 0.6920
[MRPC] Epoch 5 - Loss: 9.9436 - Accuracy: 0.6920
[MRPC] Epoch 6 - Loss: 9.9606 - Accuracy: 0.6920
[MRPC] Epoch 7 - Loss: 9.8825 - Accuracy: 0.6920
[MRPC] Epoch 8 - Loss: 9.9738 - Accuracy: 0.6920
[MRPC] Epoch 9 - Loss: 9.9024 - Accuracy: 0.6920
[MRPC] Epoch 10 - Loss: 9.8747 - Accuracy: 0.6920
