<a href="https://colab.research.google.com/github/senudidinaya/Smart-Agri-Suite/blob/main/AI/Cultivatior%20Intent%20Module/Gate1_Audio_Text_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Gate-1 Voice + Text Intent Classification Notebook**

In [1]:
# Setup and Imports
# Install compatible versions.
# transformers is pinned exactly; huggingface_hub only has a lower bound and
# accelerate is unpinned, so those two resolve to whatever pip selects at run time.
!pip -q install --upgrade --no-cache-dir "transformers==4.39.3" "huggingface_hub>=0.23" accelerate

# Imports (after pip cell finishes running)
import torch, numpy as np
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from transformers import AutoTokenizer, AutoModel, pipeline

In [2]:
# Select the compute device once: CUDA when torch reports it, otherwise CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Audio encoder: XLSR-53 with a plain feature extractor (no tokenizer/processor).
fe = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
xlsr = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53")
xlsr = xlsr.to(device).eval()  # inference only

# Text encoder: BERT base (uncased), also frozen in eval mode.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = AutoModel.from_pretrained("bert-base-uncased")
bert = bert.to(device).eval()

# Speech-to-text: Whisper base via the HF pipeline (device index 0 = first GPU, -1 = CPU).
asr = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base",
    device=0 if torch.cuda.is_available() else -1,
)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Audio + Text Featurizer
import os, hashlib
import torch.nn.functional as F
import soundfile as sf
import torchaudio

# Directory for cached text embeddings; featurize() stores one `<md5-of-audio>.npy`
# file here per audio file so transcription + BERT run only once per file.
TEXT_CACHE = "/content/features/text_embeds_cache"
os.makedirs(TEXT_CACHE, exist_ok=True)

# Content hash of an audio file (used as the text-embedding cache key)
def audio_hash(path):
    """Return the hex MD5 digest of the bytes stored at ``path``.

    The file is consumed in fixed-size chunks, so the digest is the same as
    hashing the whole content at once.
    """
    digest = hashlib.md5()
    with open(path, "rb") as stream:
        for chunk in iter(lambda: stream.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Use Whisper to get ASR transcript
def transcribe_whisper(path):
    """Transcribe the audio file at ``path`` with the Whisper ASR pipeline.

    Args:
        path: Path to an audio file readable by the ``asr`` pipeline.

    Returns:
        The value of the ``"text"`` field of the pipeline result.
    """
    # The decoding language is a generation option, so it is forwarded through
    # `generate_kwargs` rather than passed as a direct pipeline argument.
    result = asr(path, generate_kwargs={"language": "en"})
    return result["text"]

# Sentence embedding from BERT (average over the token axis)
@torch.no_grad()
def bert_embed(text):
    """Encode ``text`` with BERT and average the last hidden states over tokens.

    Input is truncated to 64 tokens. The pooled tensor is squeezed and returned
    as a NumPy array on the CPU.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
    encoded = encoded.to(device)
    hidden = bert(**encoded).last_hidden_state
    # Move the token axis last so the 1-D average pool collapses it to length 1.
    pooled = F.adaptive_avg_pool1d(hidden.transpose(1, 2), 1)
    return pooled.squeeze().cpu().numpy()  # (768,)

@torch.no_grad()
def xlsr_embed(audio, sr=16000):
    """Embed a waveform with XLSR and average the last hidden states over time.

    Args:
        audio: Waveform samples accepted by the feature extractor ``fe``.
        sr: Sampling rate of ``audio`` in Hz. Defaults to 16000, matching the
            rate used by ``load_wav`` and ``featurize`` (the previous default
            of 1600 was a typo).

    Returns:
        The time-averaged hidden state, squeezed, as a NumPy array on the CPU.
    """
    inputs = fe(audio, sampling_rate=sr, return_tensors="pt").input_values.to(device)
    h = xlsr(inputs).last_hidden_state
    # Move the frame axis last so the 1-D average pool collapses it to length 1.
    return F.adaptive_avg_pool1d(h.transpose(1, 2), 1).squeeze().cpu().numpy()

def load_wav(path, sr=16000):
    """Read an audio file as a mono waveform at ``sr`` Hz.

    Args:
        path: Audio file readable by ``soundfile``.
        sr: Target sampling rate in Hz.

    Returns:
        1-D NumPy array of samples (mono), resampled when the file's rate
        differs from ``sr``.
    """
    y, s = sf.read(path)
    # Mix down to mono FIRST: soundfile returns multi-channel data as
    # (frames, channels), while resample() works along the last axis, so
    # resampling before the mixdown would resample across channels, not time.
    if y.ndim > 1:
        y = y.mean(-1)
    if s != sr:
        wave = torch.tensor(y).float().unsqueeze(0)  # (1, time)
        y = torchaudio.functional.resample(wave, s, sr).squeeze(0).numpy()
    return y

# Full featurizer: XLSR audio embedding concatenated with BERT text embedding
def featurize(path, sr=16000):
    """Build the combined audio + text feature vector for one audio file.

    The text embedding is cached on disk under ``TEXT_CACHE``, keyed by the
    MD5 of the audio file, so transcription and BERT run only on a cache miss.
    """
    waveform = load_wav(path, sr).astype(np.float32)
    audio_vec = xlsr_embed(waveform, sr)  # (1024,)

    cache_file = os.path.join(TEXT_CACHE, f"{audio_hash(path)}.npy")
    if not os.path.exists(cache_file):
        # Cache miss: transcribe, embed, then persist for later runs.
        text_vec = bert_embed(transcribe_whisper(path))
        np.save(cache_file, text_vec)
    else:
        text_vec = np.load(cache_file)

    return np.concatenate([audio_vec, text_vec])  # (1024 + 768 = 1792,)

In [5]:
# Download & Extract MELD Dataset
KAGGLE_USERNAME = "senudirupasinghe"
KAGGLE_KEY = "7780e1bc02634783fb08137fa45db94e"

!pip install kaggle --upgrade --quiet
!mkdir -p /root/.kaggle
!echo '{"username":"<KAGGLE_USERNAME>","key":"<KAGGLE_KEY>"}' > /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

!kaggle datasets download -d zaber666/meld-dataset -p /content/ --unzip

Dataset URL: https://www.kaggle.com/datasets/zaber666/meld-dataset
License(s): CC0-1.0
Downloading meld-dataset.zip to /content
100% 11.0G/11.0G [02:33<00:00, 185MB/s]
100% 11.0G/11.0G [02:33<00:00, 77.0MB/s]


In [6]:
# Extract CSV paths and audio mapping
import os, glob, pandas as pd
# Root of the unzipped dataset. This matches the location the manifest cell
# reads its CSVs from ("/content/MELD-RAW/MELD.Raw/..."); the previous value
# "/content/MELD.Raw" pointed at a different directory.
MELD_ROOT = "/content/MELD-RAW/MELD.Raw"
AUDIO_OUT = "/content/audio/meld"

# Index mp4 files by lower-cased file name for O(1) lookup.
# NOTE(review): keys are basenames only, so if the same file name occurs in
# more than one sub-directory the path found last overwrites the earlier one.
mp4_index = {}
for p in glob.glob(os.path.join(MELD_ROOT, "**", "*.mp4"), recursive=True):
    mp4_index[os.path.basename(p).lower()] = p

# Extract audio from MP4 to WAV
import subprocess, soundfile as sf
os.makedirs(AUDIO_OUT, exist_ok=True)
def mp4_to_wav(dialogue_id, utterance_id):
    """Extract 16 kHz mono 16-bit WAV audio for one utterance clip.

    Note: a function with the same name is defined again in the next cell and
    replaces this one once that cell runs.

    Args:
        dialogue_id: Dialogue ID used in the clip name ``dia{id}_utt{id}.mp4``.
        utterance_id: Utterance ID used in the clip name.

    Returns:
        Path of the WAV file, or ``None`` when the clip is not in
        ``mp4_index`` or ffmpeg could not produce the audio.
    """
    mp4 = mp4_index.get(f"dia{dialogue_id}_utt{utterance_id}.mp4")
    if not mp4: return None
    out_path = os.path.join(AUDIO_OUT, f"{dialogue_id}_{utterance_id}.wav")
    # Reuse an earlier conversion, but not an empty leftover file.
    if os.path.exists(out_path) and os.path.getsize(out_path) > 0:
        return out_path
    # Convert into a temporary file and rename on success, so a failed or
    # interrupted ffmpeg run never leaves a truncated WAV under the final name.
    tmp_path = out_path + ".part"
    result = subprocess.run(["ffmpeg","-y","-i",mp4,"-vn","-ac","1","-ar","16000","-sample_fmt","s16","-f","wav",tmp_path],
                            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    if result.returncode != 0 or not os.path.exists(tmp_path):
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        return None
    os.replace(tmp_path, out_path)
    return out_path


In [7]:
import glob
import subprocess

# Output directory for the extracted WAV files.
AUDIO_OUT = "/content/audio/meld"
os.makedirs(AUDIO_OUT, exist_ok=True)

# One-off lookup table: lower-cased mp4 file name -> full path.
# NOTE(review): keys are basenames only, so if the same file name occurs in
# more than one sub-directory the path found last overwrites the earlier one.
mp4_index = {
    os.path.basename(video).lower(): video
    for video in glob.glob("/content/MELD-RAW/MELD.Raw/**/*.mp4", recursive=True)
}

def mp4_to_wav(dialogue_id, utterance_id):
    """Extract 16 kHz mono 16-bit WAV audio for one utterance clip.

    Args:
        dialogue_id: Dialogue ID used in the clip name ``dia{id}_utt{id}.mp4``.
        utterance_id: Utterance ID used in the clip name.

    Returns:
        Path of the WAV file, or ``None`` when the clip is not in
        ``mp4_index`` or ffmpeg could not produce the audio.
    """
    name = f"dia{dialogue_id}_utt{utterance_id}.mp4".lower()
    src = mp4_index.get(name)
    if not src:
        return None
    dst = os.path.join(AUDIO_OUT, f"{dialogue_id}_{utterance_id}.wav")
    # Reuse an earlier conversion, but not an empty leftover file.
    if os.path.exists(dst) and os.path.getsize(dst) > 0:
        return dst
    # Convert into a temporary file and rename on success, so a failed or
    # interrupted ffmpeg run never leaves a truncated WAV under the final name.
    tmp = dst + ".part"
    result = subprocess.run([
        "ffmpeg", "-y", "-i", src, "-vn",
        "-ac", "1", "-ar", "16000", "-sample_fmt", "s16", "-f", "wav", tmp
    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    if result.returncode != 0 or not os.path.exists(tmp):
        if os.path.exists(tmp):
            os.remove(tmp)
        return None
    os.replace(tmp, dst)
    return dst

In [8]:
# Manifest builder: emotion CSV -> (wav path, intent label) CSV
def build_manifest(csv_path, out_path, lang="en", domain="general"):
    """Write a manifest CSV with columns ``path``, ``label``, ``lang``, ``domain``.

    Each row of ``csv_path`` is mapped from its ``Emotion`` value to one of
    three intent labels (0 = engaged, 1 = hesitating, 2 = rejecting). Rows with
    an unmapped emotion, or for which ``mp4_to_wav`` returns nothing, are skipped.
    """
    # Flat emotion -> label lookup.
    emotion_to_label = {
        "neutral": 0, "joy": 0, "happy": 0, "excited": 0,      # engaged
        "sadness": 1, "fear": 1, "surprise": 1,                # hesitating
        "anger": 2, "disgust": 2, "contempt": 2,               # rejecting
    }
    utterances = pd.read_csv(csv_path)
    wav_paths, wav_labels = [], []
    for _, utt in utterances.iterrows():
        intent = emotion_to_label.get(utt["Emotion"].lower().strip())
        if intent is None:
            continue
        wav = mp4_to_wav(utt["Dialogue_ID"], utt["Utterance_ID"])
        if wav:
            wav_paths.append(wav)
            wav_labels.append(intent)
    manifest = pd.DataFrame({
        "path": wav_paths,
        "label": wav_labels,
        "lang": lang,
        "domain": domain
    })
    manifest.to_csv(out_path, index=False)

# One manifest per MELD split: (annotation CSV, manifest to write).
for _emo_csv, _manifest_out in (
    ("/content/MELD-RAW/MELD.Raw/train/train_sent_emo.csv", "/content/manifest_meld_train.csv"),
    ("/content/MELD-RAW/MELD.Raw/dev_sent_emo.csv", "/content/manifest_meld_dev.csv"),
    ("/content/MELD-RAW/MELD.Raw/test_sent_emo.csv", "/content/manifest_meld_test.csv"),
):
    build_manifest(_emo_csv, _manifest_out)

In [9]:
# Rebuild combined feature arrays (audio + text)
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

def build_npz(manifest_csv, out_npz):
    """Featurize every file in a manifest and save the result as an ``.npz``.

    Args:
        manifest_csv: CSV with at least ``path`` and ``label`` columns.
        out_npz: Destination file; receives ``X`` (float32 features, one row
            per successfully featurized file) and ``y`` (int64 labels).

    Raises:
        ValueError: If no file could be featurized, so there is nothing to save.

    Files that fail are reported and skipped (best effort); a summary count is
    printed at the end.
    """
    man = pd.read_csv(manifest_csv)
    X, y, failed = [], [], []
    for p, lab in tqdm(zip(man["path"], man["label"]), total=len(man), desc=f"Featurize {os.path.basename(manifest_csv)}"):
        try:
            x = featurize(p)
            label = int(lab)
        except Exception as e:
            failed.append(p)
            print(f"[!] Failed: {p} -> {e}")
            continue
        X.append(x); y.append(label)
    if failed:
        print(f"[!] {len(failed)} of {len(man)} files could not be featurized")
    if not X:
        raise ValueError(f"No features could be extracted for {manifest_csv}; nothing to save")
    X = np.stack(X).astype(np.float32)
    y = np.array(y, dtype=np.int64)
    # Do not rely on another cell having created the output directory.
    out_dir = os.path.dirname(out_npz)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    np.savez(out_npz, X=X, y=y)
    print(out_npz, X.shape, y.shape)

# Rebuild the feature archives for every split (may take 10+ minutes depending on GPU/CPU).
for _split in ("train", "dev", "test"):
    build_npz(f"/content/manifest_meld_{_split}.csv", f"/content/features/meld_{_split}.npz")

Featurize manifest_meld_train.csv: 100%|██████████| 9988/9988 [09:57<00:00, 16.72it/s]


/content/features/meld_train.npz (9988, 1792) (9988,)


Featurize manifest_meld_dev.csv: 100%|██████████| 1109/1109 [01:04<00:00, 17.18it/s]


/content/features/meld_dev.npz (1109, 1792) (1109,)


Featurize manifest_meld_test.csv: 100%|██████████| 2609/2609 [02:31<00:00, 17.20it/s]


/content/features/meld_test.npz (2609, 1792) (2609,)


In [11]:
import torch.nn as nn, torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler
from sklearn.metrics import f1_score

def train_model(train_npz, dev_npz, save_path):
    """Train a small MLP head on precomputed features, keeping the best checkpoint.

    Args:
        train_npz: ``.npz`` file with arrays ``X`` (features) and ``y`` (integer
            class labels); three classes are assumed.
        dev_npz: ``.npz`` file with the same layout, evaluated after every epoch.
        save_path: File that receives the ``state_dict`` of the epoch with the
            best dev macro-F1.

    Runs at most 60 epochs and stops early after 6 epochs without improvement.
    Prints the dev macro-F1 per epoch and the best value at the end.
    """
    import os

    # Open each archive once and close it again (np.load keeps the file open).
    with np.load(train_npz) as data:
        Xtr, ytr = data["X"], data["y"]
    with np.load(dev_npz) as data:
        Xdv, ydv = data["X"], data["y"]

    tr = TensorDataset(torch.from_numpy(Xtr), torch.from_numpy(ytr))
    dv = TensorDataset(torch.from_numpy(Xdv), torch.from_numpy(ydv))

    # Inverse-frequency class weights (a count of 0 is treated as 1 to avoid division by zero).
    counts = np.bincount(ytr, minlength=3).astype(np.float32)
    class_w = (counts.sum() / (3.0 * np.maximum(counts, 1)))
    cw = torch.tensor(class_w, dtype=torch.float32, device=device)

    # NOTE(review): the same class weights drive BOTH the sampler and the loss,
    # so class imbalance is compensated twice — confirm this is intended.
    sample_w = class_w[ytr]
    sampler = WeightedRandomSampler(weights=torch.from_numpy(sample_w), num_samples=len(sample_w), replacement=True)

    train_loader = DataLoader(tr, batch_size=64, sampler=sampler)
    dev_loader = DataLoader(dv, batch_size=128, shuffle=False)

    class Head(nn.Module):
        """Two-hidden-layer MLP classifier with dropout."""
        def __init__(self, d_in=1792, d_h=256, d_h2=64, n_out=3):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(d_in, d_h), nn.ReLU(), nn.Dropout(0.2),
                nn.Linear(d_h, d_h2), nn.ReLU(), nn.Dropout(0.2),
                nn.Linear(d_h2, n_out)
            )
        def forward(self, x): return self.net(x)

    # Take the input width from the data instead of assuming 1792.
    model = Head(d_in=Xtr.shape[1]).to(device)
    crit = nn.CrossEntropyLoss(weight=cw)
    opt = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)

    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Start below any reachable F1 so the first epoch always writes a checkpoint.
    best_f1, wait, patience = -1.0, 0, 6
    for ep in range(60):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = crit(model(xb), yb)
            loss.backward()
            opt.step()

        model.eval()
        preds, gold = [], []
        with torch.no_grad():
            for xb, yb in dev_loader:
                # argmax over logits; softmax would not change the winning class.
                pb = model(xb.to(device)).argmax(1).cpu().numpy()
                preds.extend(pb); gold.extend(yb.numpy())
        f1 = f1_score(gold, preds, average="macro")
        print(f"Epoch {ep:02d}  dev Macro-F1 = {f1:.3f}")
        if f1 > best_f1:
            best_f1, wait = f1, 0
            torch.save(model.state_dict(), save_path)
        else:
            wait += 1
        if wait >= patience:
            print("Early stop.")
            break

    print("Best dev Macro-F1:", round(best_f1, 3))

In [12]:
# Fit the classification head on the MELD features and keep the best checkpoint.
train_model(
    train_npz="/content/features/meld_train.npz",
    dev_npz="/content/features/meld_dev.npz",
    save_path="/content/features/head_best.pt",
)

Epoch 00  dev Macro-F1 = 0.164
Epoch 01  dev Macro-F1 = 0.207
Epoch 02  dev Macro-F1 = 0.229
Epoch 03  dev Macro-F1 = 0.229
Epoch 04  dev Macro-F1 = 0.230
Epoch 05  dev Macro-F1 = 0.228
Epoch 06  dev Macro-F1 = 0.234
Epoch 07  dev Macro-F1 = 0.231
Epoch 08  dev Macro-F1 = 0.229
Epoch 09  dev Macro-F1 = 0.245
Epoch 10  dev Macro-F1 = 0.264
Epoch 11  dev Macro-F1 = 0.235
Epoch 12  dev Macro-F1 = 0.284
Epoch 13  dev Macro-F1 = 0.255
Epoch 14  dev Macro-F1 = 0.281
Epoch 15  dev Macro-F1 = 0.295
Epoch 16  dev Macro-F1 = 0.302
Epoch 17  dev Macro-F1 = 0.275
Epoch 18  dev Macro-F1 = 0.268
Epoch 19  dev Macro-F1 = 0.268
Epoch 20  dev Macro-F1 = 0.296
Epoch 21  dev Macro-F1 = 0.302
Epoch 22  dev Macro-F1 = 0.311
Epoch 23  dev Macro-F1 = 0.313
Epoch 24  dev Macro-F1 = 0.303
Epoch 25  dev Macro-F1 = 0.298
Epoch 26  dev Macro-F1 = 0.307
Epoch 27  dev Macro-F1 = 0.292
Epoch 28  dev Macro-F1 = 0.313
Epoch 29  dev Macro-F1 = 0.318
Epoch 30  dev Macro-F1 = 0.318
Epoch 31  dev Macro-F1 = 0.320
Epoch 32