In [2]:
# === üéôÔ∏è Minimal Colab demo to test best.pt on real voice input ===
!pip install -q gradio librosa transformers soundfile torch

import gradio as gr, torch, librosa, numpy as np
from transformers import AutoTokenizer, AutoModel

# --- Choose the compute device once; everything below reuses it ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Load text embedding model (tokenizer + encoder, inference mode) ---
TEXT_ENCODER_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tok = AutoTokenizer.from_pretrained(TEXT_ENCODER_NAME)
txt_model = AutoModel.from_pretrained(TEXT_ENCODER_NAME)
txt_model = txt_model.eval().to(device)

print("Device:", device)

# --- Define your model architecture (same as training) ---
class SimpleRegressor(torch.nn.Module):
    """Feed-forward regressor: two hidden ReLU layers with dropout, one scalar output.

    Args:
        in_dim: Size of the input feature vector (default 300 + 74).
        hidden: Width of both hidden layers.
        dropout: Dropout probability applied after each hidden activation.

    The layers live in ``self.net`` (a ``torch.nn.Sequential``); the linear
    layers sit at indices 0, 3 and 6, which fixes the parameter names that
    appear in a saved state dict.
    """

    def __init__(self, in_dim=300+74, hidden=256, dropout=0.1):
        super().__init__()
        layers = []
        # Two identical hidden stages: Linear -> ReLU -> Dropout.
        for fan_in in (in_dim, hidden):
            layers.append(torch.nn.Linear(fan_in, hidden))
            layers.append(torch.nn.ReLU())
            layers.append(torch.nn.Dropout(dropout))
        # Output head: a single regression value per input row.
        layers.append(torch.nn.Linear(hidden, 1))
        self.net = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stack and return the result of the last layer."""
        return self.net(x)

# --- Load your trained model weights ---
# NOTE(review): torch.load unpickles the file, so only point this at a checkpoint you trust.
model = SimpleRegressor()
ckpt = torch.load("best.pt", map_location=device)
# The weights are read from the "model" entry of the checkpoint.
model.load_state_dict(ckpt["model"])
model = model.to(device)
model.eval()  # inference mode: disables dropout
print("‚úÖ Loaded best.pt successfully!")

# --- Define inference function ---
# Feature sizes the regressor input is split into (SimpleRegressor: in_dim = 300 + 74).
_TEXT_DIM = 300
_AUDIO_DIM = 74


def _fit_length(vec, size):
    """Return `vec` as a float32 vector of exactly `size` values (truncated or zero-padded)."""
    fitted = np.zeros(size, dtype=np.float32)
    keep = min(size, vec.shape[0])
    fitted[:keep] = vec[:keep]
    return fitted


def voice_to_emotion(audio, text):
    """Score one audio clip (plus optional transcript) and describe the result.

    Args:
        audio: Path of the audio file to analyse, or None when nothing was provided.
        text: Optional transcript; None, empty or whitespace-only means "no text".

    Returns:
        A human-readable string: either a prompt asking for (non-empty) audio,
        or the sentiment label together with the raw score.
    """
    if audio is None:
        return "Please record or upload an audio clip."

    # 1) Audio features: mean over time of each MFCC coefficient.
    y, sr = librosa.load(audio, sr=16000)
    if y.size == 0:
        # Nothing to extract features from; avoid failing inside the MFCC computation.
        return "The audio clip is empty. Please record or upload it again."
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=_AUDIO_DIM)
    audio_feat = np.mean(mfcc, axis=1)

    # 2) Text features (optional): mean-pooled token embeddings, zeros when no text.
    transcript = text or ""  # the textbox value may be None
    if transcript.strip():
        inputs = tok(transcript, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            text_emb = txt_model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()[0]
    else:
        text_emb = np.zeros(_TEXT_DIM, dtype=np.float32)

    # The encoder's hidden size (384 for all-MiniLM-L6-v2) differs from the 300
    # text slots of the regressor; without this step any non-empty transcript
    # fails with a matrix shape mismatch in the first Linear layer.
    # NOTE(review): truncating/padding only reconciles the shape -- confirm which
    # text features best.pt was trained on and use the same encoder here.
    text_emb = _fit_length(text_emb, _TEXT_DIM)

    # 3) Concatenate (text first, then audio) and predict.
    x = np.concatenate([text_emb, audio_feat]).astype(np.float32)
    x = torch.tensor(x).unsqueeze(0).to(device)
    with torch.no_grad():
        pred = model(x).cpu().item()

    # 4) Interpret score: > 0.6 positive, > 0.4 neutral, otherwise negative.
    emo = "üòä Positive" if pred > 0.6 else "üòê Neutral" if pred > 0.4 else "üòû Negative"
    return f"{emo}  (score = {pred:.2f})"

# --- Gradio interface for live testing ---
# Input widgets, in the order voice_to_emotion receives them: audio first
# (configured with type="filepath"), then the optional transcript.
audio_input = gr.Audio(label="üéôÔ∏è Speak or Upload Audio", type="filepath")
transcript_input = gr.Textbox(label="‚úçÔ∏è Transcript (optional)")

demo = gr.Interface(
    fn=voice_to_emotion,
    inputs=[audio_input, transcript_input],
    outputs="text",
    title="Bimodal Voice + Text Emotion Demo",
)

# share=True asks Gradio for a temporary public URL in addition to the local app.
demo.launch(share=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Device: cpu
‚úÖ Loaded best.pt successfully!
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f452e5c0def55b3fc7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


