<a href="https://colab.research.google.com/github/senudidinaya/Smart-Agri-Suite/blob/main/AI/Cultivatior%20Intent%20Module/Gate1_Audio_Text_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Gate-1 Voice + Text Intent Classification Notebook**

In [8]:
# Setup and Imports
# ✅ Install compatible versions
!pip -q install --upgrade --no-cache-dir "transformers==4.39.3" "huggingface_hub>=0.23" accelerate

# ✅ Imports (after pip cell finishes running)
import torch, numpy as np
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from transformers import AutoTokenizer, AutoModel, pipeline

In [9]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
# Audio: Use Wav2Vec2 Feature Extractor instead of Processor
fe = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
xlsr = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53").to(device).eval()

# Text: BERT base
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = AutoModel.from_pretrained("bert-base-uncased").to(device).eval()

# ASR: Whisper base
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=0 if torch.cuda.is_available() else -1)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Audio + Text Featurizer
import os, hashlib
import torch.nn.functional as F

TEXT_CACHE = "/content/features/text_embeds_cache"
os.makedirs(TEXT_CACHE, exist_ok=True)

# Util to generate a hash for each audio file (used as cache key)
def audio_hash(path):
    with open(path, "rb") as f:
        return hashlib.md5(f.read()).hexdigest()

# Use Whisper to get ASR transcript
def transcribe_whisper(path):
    return asr(path)["text"]

# Get text embedding using BERT
@torch.no_grad()
def bert_embed(text):
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64).to(device)
    h = bert(**tokens).last_hidden_state
    return F.adaptive_avg_pool1d(h.transpose(1,2), 1).squeeze().cpu().numpy()  # (768,)

# Complete featurizer: XLSR audio + BERT text
def featurize(path, sr=16000):
    audio = load_wav(path, sr).astype(np.float32)
    a_feat = xlsr_embed(audio, sr)  # (1024,)

    # Cache logic
    key = os.path.join(TEXT_CACHE, f"{audio_hash(path)}.npy")
    if os.path.exists(key):
        t_feat = np.load(key)
    else:
        text = transcribe_whisper(path)
        t_feat = bert_embed(text)
        np.save(key, t_feat)

    return np.concatenate([a_feat, t_feat])  # (1024 + 768 = 1792,)

In [15]:
# Download & Extract MELD Dataset
KAGGLE_USERNAME = "senudirupasinghe"
KAGGLE_KEY = "7780e1bc02634783fb08137fa45db94e"

!pip install kaggle --upgrade --quiet
!mkdir -p /root/.kaggle
!echo '{"username":"<KAGGLE_USERNAME>","key":"<KAGGLE_KEY>"}' > /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

!kaggle datasets download -d zaber666/meld-dataset -p /content/ --unzip

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.2/85.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m256.4/256.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.3/159.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDataset URL: https://www.kaggle.com/datasets/zaber666/meld-dataset
License(s): CC0-1.0
Downloading meld-dataset.zip to /content
100% 11.0G/11.0G [03:47<00:00, 46.7MB/s]
100% 11.0G/11.0G [03:47<00:00, 51.8MB/s]


In [14]:
# Rebuild combined feature arrays (audio + text)
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

def build_npz(manifest_csv, out_npz):
    man = pd.read_csv(manifest_csv)
    X, y = [], []
    for p, lab in tqdm(zip(man["path"], man["label"]), total=len(man), desc=f"Featurize {os.path.basename(manifest_csv)}"):
        try:
            x = featurize(p)
            X.append(x); y.append(int(lab))
        except Exception as e:
            print(f"[!] Failed: {p} -> {e}")
    X = np.stack(X).astype(np.float32)
    y = np.array(y, dtype=np.int64)
    np.savez(out_npz, X=X, y=y)
    print(out_npz, X.shape, y.shape)

# Trigger rebuilds (may take 10+ mins depending on GPU/CPU)
build_npz("/content/manifest_meld_train.csv", "/content/features/meld_train.npz")
build_npz("/content/manifest_meld_dev.csv",   "/content/features/meld_dev.npz")
build_npz("/content/manifest_meld_test.csv",  "/content/features/meld_test.npz")

FileNotFoundError: [Errno 2] No such file or directory: '/content/manifest_meld_train.csv'