In [1]:
!pip install -q datasets


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

# prediction model

emotions = ['anger', 'fear', 'happy', 'neutral', 'sad', 'surprise']


In [17]:
import os
import random
import numpy as np
import librosa
import torch
import torch.nn as nn
import torch.nn.functional as F
from IPython.display import Audio
from datasets import load_dataset
from transformers import AutoConfig, Wav2Vec2FeatureExtractor, HubertPreTrainedModel, HubertModel

# Define model parameters
model_name_or_path = "xmj2002/hubert-base-ch-speech-emotion-recognition"
duration = 6
sample_rate = 16000

# Load the configuration for the model
config = AutoConfig.from_pretrained(model_name_or_path)

# Emotion classification function
def id2class(id):
    if id == 0:
        return "angry"
    elif id == 1:
        return "fear"
    elif id == 2:
        return "happy"
    elif id == 3:
        return "neutral"
    elif id == 4:
        return "sad"
    else:
        return "surprise"

# Define the prediction function
# def predict(audio_array, processor, model):
#     speech = processor(audio_array, padding="max_length", truncation=True, max_length=duration * sample_rate,
#                        return_tensors="pt", sampling_rate=sample_rate).input_values
#     with torch.no_grad():
#         logit = model(speech)
#     score = F.softmax(logit, dim=1).detach().cpu().numpy()[0]
#     emotion_id = torch.argmax(logit).cpu().numpy()
#     emotion_class = id2class(emotion_id)
#     print(f"Predicted Emotion: {emotion_class} \t Confidence: {score[emotion_id]:.4f}")

def predict(audio_array, processor, model):
    # Prepare the input array for the model
    speech = processor(audio_array, padding="max_length", truncation=True, max_length=duration * sample_rate,
                       return_tensors="pt", sampling_rate=sample_rate).input_values

    with torch.no_grad():
        logit = model(speech)

    # Get the softmax scores
    score = F.softmax(logit, dim=1).detach().cpu().numpy()[0]
    emotion_id = torch.argmax(logit).cpu().numpy()

    # Convert ID to emotion class
    emotion_class = id2class(emotion_id)

    # Return both the predicted emotion and confidence score
    return emotion_class, score[emotion_id]


# Define the Hubert Classification Head
class HubertClassificationHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.classifier_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_class)

    def forward(self, x):
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

# Define the Hubert model for speech classification
class HubertForSpeechClassification(HubertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.hubert = HubertModel(config)
        self.classifier = HubertClassificationHead(config)
        self.init_weights()

    def forward(self, x):
        outputs = self.hubert(x)
        hidden_states = outputs[0]
        x = torch.mean(hidden_states, dim=1)
        x = self.classifier(x)
        return x

# # Load the dataset
# ds = load_dataset("hf-internal-testing/librispeech_asr_dummy")

# # Get the first entry from the validation dataset
# first_entry = ds['validation'][0]

# # Access the audio array and sampling rate
# audio_array = first_entry['audio']['array']
# sampling_rate = first_entry['audio']['sampling_rate']

# Resample the audio array if necessary
if sampling_rate != sample_rate:
    print(f"Resampling from {sampling_rate}Hz to {sample_rate}Hz")
    audio_array = librosa.resample(audio_array, orig_sr=sampling_rate, target_sr=sample_rate)

# Initialize the processor and model
processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
model = HubertForSpeechClassification.from_pretrained(model_name_or_path)
model.eval()

# Send the audio array to the model for prediction
# predict(audio_array, processor, model)


config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

HubertForSpeechClassification(
  (hubert): HubertModel(
    (feature_extractor): HubertFeatureEncoder(
      (conv_layers): ModuleList(
        (0): HubertGroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x HubertNoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x HubertNoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): HubertFeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder

# predicting for whole dataset

In [11]:
import json
import numpy as np

def load_json_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return data
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {e.msg}")
        print(f"Line {e.lineno}, Column {e.colno}, Position {e.pos}")
        raise  # Re-raise the exception to stop execution if the file is invalid

def load_audio_data(file_path):
    data = load_json_file(file_path)
    cleaned_data = []  # List to store valid data entries

    for item in data:
        try:
            if 'array' in item:
                # Convert the list back to a numpy array if needed
                item['array'] = np.array(item['array'])
            else:
                print(f"Missing 'array' key in item: {item}")
                continue  # Skip this item if 'array' is missing

            cleaned_data.append(item)  # Add valid item to cleaned data list

        except Exception as e:
            print(f"Error processing item: {item}. Skipping this entry. Error: {e}")
            continue  # Skip this item and continue with the next

    return cleaned_data

try:
    data = load_audio_data("/content/audio_array_data.json")
    print(f"Loaded {len(data)} valid items from the JSON file.")
except Exception as e:
    print(f"Failed to load audio data: {e}")


Loaded 250 valid items from the JSON file.


In [15]:
print(data)

[{'filename': '1184-121024-0021.wav', 'array': array([-71, -74, -75, ..., 125, 120, 111]), 'sampling_rate': 16000}, {'filename': '1184-121024-0037.wav', 'array': array([ -76,  -90, -100, ...,   20,   -5,    2]), 'sampling_rate': 16000}, {'filename': '1184-121024-0090.wav', 'array': array([-157, -167, -171, ...,  -40,  -37,  -43]), 'sampling_rate': 16000}, {'filename': '1184-121024-0158.wav', 'array': array([-671, -746, -809, ...,    7,    4,    1]), 'sampling_rate': 16000}, {'filename': '1184-121024-0159.wav', 'array': array([ -1,   1,   1, ..., -59, -63, -63]), 'sampling_rate': 16000}, {'filename': '1184-121024-0160.wav', 'array': array([ -60,  -67,  -71, ..., -239, -244, -244]), 'sampling_rate': 16000}, {'filename': '1184-121024-0162.wav', 'array': array([111, 133, 134, ...,  -3, -13, -16]), 'sampling_rate': 16000}, {'filename': '1184-121024-0172.wav', 'array': array([ 51,  71,  79, ..., -49, -46, -42]), 'sampling_rate': 16000}, {'filename': '1184-121024-0173.wav', 'array': array([ -

In [19]:
import librosa
import numpy as np
import pandas as pd

predictions = []

# Define the target sample rate (replace with actual sample rate)
sample_rate = 16000

for entry in data:
    audio_array = entry['array']
    sampling_rate = entry['sampling_rate']

    # Resample if necessary
    if sampling_rate != sample_rate:
        audio_array = librosa.resample(audio_array, orig_sr=sampling_rate, target_sr=sample_rate)

    try:
        # Ensure audio array is in the expected dtype (float64 or Double)
        audio_array = np.array(audio_array, dtype=np.float64)

        # Get prediction
        output = predict(audio_array, processor, model)

        if output is not None:  # Check if output is valid
            emotion_class, confidence = output
            print(f"{emotion_class}, {confidence}")

            # Append to predictions list
            predictions.append({"emotion": emotion_class, "confidence": confidence})

    except Exception as e:
        # Use a different key that exists in the entry if 'uri' doesn't
        entry_info = entry.get('audio', {}).get('path', 'unknown')
        print(f"Error processing entry: {entry_info}, Error: {e}")

# Create a DataFrame from the predictions list
df_ser = pd.DataFrame(predictions)

# Display the DataFrame with predictions
df_ser


sad, 0.865588366985321
sad, 0.9996588230133057
sad, 0.9632764458656311
sad, 0.9996325969696045
sad, 0.9964584708213806
sad, 0.7992238998413086
fear, 0.705068051815033
surprise, 0.9953951239585876
surprise, 0.7996217608451843
sad, 0.9997286200523376
sad, 0.8939249515533447
neutral, 0.4817354679107666
surprise, 0.9164060950279236
fear, 0.5557313561439514
sad, 0.6041005253791809
surprise, 0.9664352536201477
surprise, 0.7975367307662964
sad, 0.982225239276886
sad, 0.9755983352661133
surprise, 0.577741801738739
sad, 0.5689936280250549
sad, 0.9932034015655518
fear, 0.9895918965339661
happy, 0.47840452194213867
sad, 0.8648197650909424
surprise, 0.9989944100379944
sad, 0.8974634408950806
sad, 0.8647295236587524
surprise, 0.9954219460487366
sad, 0.99437016248703
fear, 0.5023998618125916
fear, 0.8208234310150146
sad, 0.37295690178871155
sad, 0.48124536871910095
surprise, 0.4129791557788849
fear, 0.777337908744812
surprise, 0.5611509084701538
sad, 0.6548876762390137
sad, 0.8796119093894958
sad, 0

Unnamed: 0,emotion,confidence
0,sad,0.865588
1,sad,0.999659
2,sad,0.963276
3,sad,0.999633
4,sad,0.996458
...,...,...
245,angry,0.999910
246,surprise,0.999901
247,surprise,0.975903
248,neutral,0.685876


In [20]:
df_ser.to_csv("ser_output.csv")