In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset archive is read from under this mount point.
drive.mount('/content/drive')

In [None]:
!unzip "/content/drive/MyDrive/Datasets/audio_speech_actors_01-24.zip" -d /content/ravdess

In [None]:
!pip install transformers datasets torchaudio librosa evaluate -q

In [None]:
!pip install -U transformers accelerate -q

In [None]:
import os
import torch
import librosa
import numpy as np
import pandas as pd

from datasets import Dataset
from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2ForSequenceClassification,
    TrainingArguments,
    Trainer
)

In [None]:
# Two-digit emotion code (as it appears in the file names) -> emotion name.
# Codes are consecutive and start at "01", so they are generated from the ordered names.
emotion_map = {
    f"{code:02d}": name
    for code, name in enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        ),
        start=1,
    )
}

In [None]:
import os
import pandas as pd

data_path = "/content/ravdess/audio_speech_actors_01-24"

filepaths = []
emotions = []
skipped = []  # .wav files whose name does not carry a known emotion code

for root, dirs, files in os.walk(data_path):
    # os.walk yields directories and files in arbitrary, filesystem-dependent order.
    # Sorting (dirs in place, so the traversal itself is ordered) makes the row order of
    # the DataFrame -- and therefore any seeded train/test split built from it -- reproducible.
    dirs.sort()
    for filename in sorted(files):
        # Ignore hidden files (e.g. "._*.wav" archive artefacts) and anything that is not a .wav.
        if filename.startswith(".") or not filename.endswith(".wav"):
            continue

        # File names are dash-separated numeric fields; the THIRD field is the emotion code.
        parts = filename.split("-")
        emotion_code = parts[2] if len(parts) > 2 else None

        if emotion_code not in emotion_map:
            # Report instead of crashing with a bare KeyError/IndexError on a stray file.
            skipped.append(os.path.join(root, filename))
            continue

        filepaths.append(os.path.join(root, filename))
        emotions.append(emotion_map[emotion_code])

if skipped:
    print(
        f"Skipped {len(skipped)} .wav file(s) without a recognised emotion code, "
        f"e.g. {skipped[0]}"
    )

# Fail fast: an empty frame would otherwise only surface later as an obscure model/label error.
if not filepaths:
    raise FileNotFoundError(
        f"No usable .wav files found under {data_path!r}; "
        "check that the archive was extracted to that location."
    )

df = pd.DataFrame({
    "path": filepaths,
    "emotion": emotions
})

df.head()

In [None]:
# Emotion names are sorted so that integer ids follow alphabetical order.
label_list = sorted(df["emotion"].unique())

# Build id -> name first, then invert it to get name -> id.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

# Integer class id for every row, used as the training target.
df["label"] = df["emotion"].map(label2id)

print(label2id)
df.head()

In [None]:
# Quick sanity check: dataset size and number of clips per emotion.
n_samples = len(df)
emotion_counts = df["emotion"].value_counts()

print("Total samples:", n_samples)
print("\nClass distribution:")
print(emotion_counts)

In [None]:
from transformers import AutoFeatureExtractor, Wav2Vec2ForSequenceClassification

model_name = "facebook/wav2vec2-base"

# The feature extractor and the model weights are loaded from the same checkpoint name.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)

# Classification-head settings: one output per emotion, plus both directions of the
# name <-> id mapping so they are stored alongside the model configuration.
classifier_settings = dict(
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name, **classifier_settings)

In [None]:
def preprocess_function(example, sampling_rate=16000, max_duration_s=5):
    """Load one audio file and convert it into fixed-length model inputs.

    Args:
        example: Mapping with a "path" entry (audio file to load) and a
            "label" entry (integer class id).
        sampling_rate: Rate in Hz the audio is resampled to on load and that
            is reported to the feature extractor. Defaults to 16000, the
            value this notebook previously hard-coded.
        max_duration_s: Clip length in seconds. Every waveform is padded or
            truncated to ``sampling_rate * max_duration_s`` samples.
            Defaults to 5.

    Returns:
        dict with "input_values" (the single processed waveform, batch
        dimension removed) and "labels" (the example's class id).
    """
    # Load and resample in one step so the waveform rate matches what we tell the extractor.
    speech, _ = librosa.load(example["path"], sr=sampling_rate)

    inputs = feature_extractor(
        speech,
        sampling_rate=sampling_rate,
        padding="max_length",
        truncation=True,
        max_length=int(sampling_rate * max_duration_s),
    )

    return {
        "input_values": inputs["input_values"][0],  # remove batch dim
        "labels": example["label"]
    }

In [None]:
from datasets import Dataset

# DataFrame -> Dataset -> 80/20 train/test split (fixed seed) -> audio features.
# The "path" column is dropped once each file has been turned into input values.
dataset = (
    Dataset.from_pandas(df[["path", "label"]])
    .train_test_split(test_size=0.2, seed=42)
    .map(preprocess_function, remove_columns=["path"])
)

# Return torch tensors, and only the two columns named here.
dataset.set_format(type="torch", columns=["input_values", "labels"])

In [None]:
# Freeze the convolutional feature encoder so its weights receive no gradient updates during fine-tuning.
model.freeze_feature_encoder()

In [None]:
import evaluate
import numpy as np

# Load the "accuracy" metric once at notebook level; compute_metrics reuses this object on each call.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``accuracy`` metric.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``, unpacked positionally.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max class ids
        compared against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.asarray(scores).argmax(axis=-1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import TrainingArguments

# Hyperparameters for the fine-tuning run, collected in one mapping and then
# unpacked into TrainingArguments.
training_kwargs = dict(
    # Output locations.
    output_dir="./emotion_model",
    logging_dir="./logs",
    # Evaluate and checkpoint once per epoch; the best checkpoint is reloaded when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=5e-6,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Mixed-precision training.
    fp16=True,
)

training_args = TrainingArguments(**training_kwargs)

In [None]:
from transformers import Trainer

# Wire the model, hyperparameters, data splits and metric callback into one Trainer.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": dataset["train"],
    "eval_dataset": dataset["test"],
    "compute_metrics": compute_metrics,
}

trainer = Trainer(**trainer_kwargs)

In [None]:
# Goal: train only the classification head plus the last 2 transformer encoder layers.
# This cell must run before trainer.train() so the updated requires_grad flags are in
# place when training starts.

# Freeze CNN feature extractor
model.freeze_feature_encoder()

# Freeze the rest of the pretrained backbone as well. Without this step every transformer
# layer is still trainable, and the "unfreeze" loop below would change nothing.
model.freeze_base_model()

# Unfreeze last 2 transformer encoder layers.
# Indices come from the model config instead of being hard-coded; the trailing "." keeps
# e.g. "layers.1." from also matching "layers.10." / "layers.11." on deeper models.
num_layers = model.config.num_hidden_layers
trainable_markers = tuple(
    f"encoder.layers.{idx}." for idx in range(max(num_layers - 2, 0), num_layers)
)

for name, param in model.named_parameters():
    if any(marker in name for marker in trainable_markers):
        param.requires_grad = True

# Sanity check: confirm that only part of the network is trainable.
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
n_total = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {n_trainable:,} / {n_total:,}")

In [None]:
# Run the fine-tuning loop with the model, data splits and arguments that were given to `trainer`.
trainer.train()

In [None]:
save_path = "/content/emotion_wav2vec2_model"

# Save the model and the feature extractor into the same directory so that both
# can later be loaded from one path.
for artifact in (model, feature_extractor):
    artifact.save_pretrained(save_path)

print("Model saved to:", save_path)

In [None]:
import shutil

# Bundle the saved model directory into /content/emotion_wav2vec2_model.zip
# (make_archive appends the ".zip" extension to the base name).
archive_base = "/content/emotion_wav2vec2_model"
shutil.make_archive(base_name=archive_base, format="zip", root_dir=save_path)

print("Zipped successfully")

In [None]:
from google.colab import files

# Send the zipped model from the Colab VM to the local machine via a browser download.
files.download("/content/emotion_wav2vec2_model.zip")