In [7]:
import os
import torch
import torchaudio
import pandas as pd
import numpy as np
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer,Wav2Vec2Processor
from model import EmotionModel
from sklearn.model_selection import train_test_split
from dataset import RecolaDataset
from torch import autocast
from torch.cuda.amp import GradScaler

In [37]:
# Dataset location (one sub-folder per actor is expected underneath it).
Ravdess = "./archive/"

# Preprocessing / optimisation settings used by the cells below.
max_length = 8_000  # Max audio length in samples
batch_size, learning_rate, num_epochs = 8, 1e-4, 10

In [9]:
def collect_ravdess_files(root):
    """Scan a RAVDESS root folder and return parallel lists (emotion codes, file paths).

    Parameters
    ----------
    root : str
        Folder containing one sub-folder per actor. As in the rest of this
        notebook, it is expected to end with a path separator ("./archive/").

    Returns
    -------
    (list[int], list[str])
        The emotion code parsed from each file name and the matching path.

    Notes
    -----
    * Only sub-directories are descended into and only ``.wav`` files are kept,
      so other files stored under ``root`` (this notebook later writes
      ``train.csv`` / ``test.csv`` there) no longer break a re-run.
    * Listings are sorted because ``os.listdir`` returns entries in arbitrary
      order, which would make the seeded train/test split irreproducible.
    """
    emotions, paths = [], []
    for actor_dir in sorted(os.listdir(root)):
        actor_path = root + actor_dir
        if not os.path.isdir(actor_path):
            continue
        for file_name in sorted(os.listdir(actor_path)):
            stem, extension = os.path.splitext(file_name)
            if extension.lower() != ".wav":
                continue
            # File names are dash-separated fields; the third field is the emotion code.
            fields = stem.split('-')
            emotions.append(int(fields[2]))
            paths.append(actor_path + '/' + file_name)
    return emotions, paths


file_emotion, file_path = collect_ravdess_files(Ravdess)

# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])
print(len(emotion_df))

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])
df = pd.concat([emotion_df, path_df], axis=1)

1440


In [10]:
# Print the distinct emotion codes, then display how many files carry each code.
unique_emotions = df["Emotions"].unique()
print("Labels: ", unique_emotions)
print()
df.groupby("Emotions")[["Path"]].count()

Labels:  [1 2 3 4 5 6 7 8]



Unnamed: 0_level_0,Path
Emotions,Unnamed: 1_level_1
1,96
2,192
3,192
4,192
5,192
6,192
7,192
8,192


In [11]:
import torchaudio
import librosa
import IPython.display as ipd
import numpy as np

# Draw one random row and read its file path and emotion code.
idx = np.random.randint(0, len(df))
sample = df.iloc[idx]
path, label = sample["Path"], sample["Emotions"]

print(f"ID Location: {idx}")
print(f"      Label: {label}")
print()

# Load the file, keep the first channel, resample to 16 kHz and embed a player.
waveform, sr = torchaudio.load(path)
first_channel = waveform[0].numpy().squeeze()
speech = librosa.resample(np.asarray(first_channel), orig_sr=sr, target_sr=16_000)
ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000)

ID Location: 1268
      Label: 2



In [12]:
save_path = "./archive"

# Draw ONE split and use it everywhere. It is stratified on the emotion code so
# every class keeps the same proportion in both parts (the class counts above are
# not uniform: code 1 has half as many files as the others).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df["Emotions"])

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Persist the split as tab-separated files so it can be reloaded later.
train_df.to_csv(f"{save_path}/train.csv", sep="\t", encoding="utf-8", index=False)
test_df.to_csv(f"{save_path}/test.csv", sep="\t", encoding="utf-8", index=False)

# Plain Python lists of the same split. Previously these came from a second,
# unstratified split with a different seed, so they did not match the saved CSVs.
train_paths, train_labels = train_df["Path"].to_list(), train_df["Emotions"].to_list()
val_paths, val_labels = test_df["Path"].to_list(), test_df["Emotions"].to_list()

print(train_df.shape)
print(test_df.shape)


(1152, 2)
(288, 2)


In [31]:
# Reuse the stratified split held in train_df / test_df (the one written to
# train.csv / test.csv) instead of drawing a second, unstratified split with a
# different seed; otherwise the samples used for training differ from the saved split.
train_paths, train_labels = train_df['Path'].to_list(), train_df['Emotions'].to_list()
val_paths, val_labels = test_df['Path'].to_list(), test_df['Emotions'].to_list()


In [13]:
# Loading the created dataset using datasets
from datasets import load_dataset, load_metric

# Map each split name to the tab-separated file holding it.
data_files = dict(
    train="archive/train.csv",
    validation="archive/test.csv",
)

dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
train_dataset, eval_dataset = dataset["train"], dataset["validation"]

print(train_dataset)
print(eval_dataset)

Downloading and preparing dataset csv/default to C:/Users/saipr/.cache/huggingface/datasets/csv/default-eeef983550d595f9/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files: 100%|██████████| 2/2 [00:00<00:00, 1980.78it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 55.40it/s]
                                                             

Dataset csv downloaded and prepared to C:/Users/saipr/.cache/huggingface/datasets/csv/default-eeef983550d595f9/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 46.31it/s]

Dataset({
    features: ['Emotions', 'Path'],
    num_rows: 1152
})
Dataset({
    features: ['Emotions', 'Path'],
    num_rows: 288
})





In [14]:
# Column holding the audio file path (model input) and the emotion code (target).
input_column, output_column = "Path", "Emotions"

In [15]:
# Distinct emotion codes found in the training split, sorted for determinism.
label_list = sorted(train_dataset.unique(output_column))
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 8 classes: [1, 2, 3, 4, 5, 6, 7, 8]


In [16]:
from transformers import AutoConfig, Wav2Vec2Processor

# Pretrained checkpoint to start from, and the pooling mode stored on the config below.
model_name, pooling_mode = "facebook/wav2vec2-large-960h", "mean"

In [17]:
# config
# Two-way mapping between the emotion codes and contiguous class indices.
label2id = dict(zip(label_list, range(len(label_list))))
id2label = dict(enumerate(label_list))

config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    finetuning_task="wav2vec2_clf",
)
# Extra attribute attached to the config object.
config.pooling_mode = pooling_mode

In [18]:
# Load the pretrained processor and read the sampling rate its feature extractor expects.
processor = Wav2Vec2Processor.from_pretrained(model_name)
feature_extractor = processor.feature_extractor
target_sampling_rate = feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

The target sampling rate: 16000


In [60]:

# Build one RecolaDataset per split (training first, then validation).
# NOTE: `train_dataset` is rebound here; before this cell it held the CSV-loaded dataset.
train_dataset, val_dataset = (
    RecolaDataset(paths, labels, processor, max_length)
    for paths, labels in ((train_paths, train_labels), (val_paths, val_labels))
)

# One example per batch; only the training loader is shuffled.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=1)
val_loader = DataLoader(dataset=val_dataset, batch_size=1)

In [57]:
# import torchaudio


# def label_to_id(label, label_list):
#     #label_list = train_dataset.unique(output_column)
#     if len(label_list) > 0:
#         return label_list.index(label) if label in label_list else -1

#     return label

# def speech_file_to_array_fn(path):
#     speech_array, sampling_rate = torchaudio.load(path)
#     resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
#     speech = resampler(speech_array).squeeze().numpy()
#     return speech
# def preprocess_function(examples):
#     from transformers import Wav2Vec2Processor
#     import torchaudio
 
#     label_list =[1, 2, 3, 4, 5, 6, 7, 8]
#     speech_list =[]
#     for path in examples['Path']:
#         speech_array, sampling_rate = torchaudio.load(path)
#         resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
#         speech = resampler(speech_array).squeeze().numpy()
#         speech_list.append(speech)
#     target_list=[]
#     for label in examples['Emotions']:
#         if len(label_list) > 0:
#             label_list.index(label) if label in label_list else -1
#         target_list.append(label)
  
#     processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h",)
#     result = processor(speech_list, sampling_rate=16000)
#     result["labels"] = target_list

#     return result
# train_dataset = train_dataset.map(
#     preprocess_function,
#     batch_size=,
#     batched=True,
#     num_proc=4
# )
# eval_dataset = eval_dataset.map(
#     preprocess_function,
#     batch_size=100,
#     batched=True,
#     num_proc=4
# )

In [63]:
# Pass the config built above so that num_labels, the label <-> id maps and
# pooling_mode actually reach the model (and any checkpoint config saved from it);
# without it the config object is constructed but never used.
model = EmotionModel.from_pretrained(model_name, config=config)

Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing EmotionModel: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing EmotionModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EmotionModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EmotionModel were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['classifier.out_proj.bias', 'wav2vec2.masked_spec_embed', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
loss_fn = torch.nn.BCEWithLogitsLoss()


scaler = GradScaler()

In [65]:
for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_loss = 0
        total_correct = 0
        total_examples = 0
        for batch in train_loader:
            input_values = batch['input_values'].to(device)
            #attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device,dtype = torch.float32)
           

            with autocast(device_type='cuda',dtype=torch.float16):
                outputs = model(input_values=input_values)
                loss = loss_fn(outputs, labels)

            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            
            scaler.update()
            train_loss += loss.item()
            predicted_labels = torch.argmax(outputs, dim=1)
            correct = torch.sum(predicted_labels == labels).item()
            total_correct += correct
            total_examples += labels.size(0)
            print("train loss:", train_loss)

        #train_loss /= len(train_loader)
        train_loss /= len(train_loader)
        train_accuracy = total_correct / total_examples
        # Validation
        model.eval()
        val_loss = 0.0

        val_loss = 0
        total_correct = 0
        total_examples = 0
        with torch.no_grad():
                for batch in val_loader:
                    input_values = batch['input_values'].to(device)
                    labels = batch['labels'].to(device,dtype = torch.float32)

                    outputs = model(input_values=input_values)
                    loss = loss_fn(outputs, labels)


                    predicted_labels = torch.argmax(outputs, dim=1)
                    correct = torch.sum(predicted_labels == labels).item()
                    total_correct += correct
                    total_examples += labels.size(0)

                    val_loss += loss.item()

                val_loss /= len(val_loader)
                val_accuracy = total_correct / total_examples

        print(f'Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f} - Train Accuracy: {train_accuracy:.4f} - Val Accuracy: {val_accuracy:.4f}')
        checkpoint_path = os.path.join("./", f'epoch_{epoch+1}_{val_loss}_{val_accuracy}.pt')
        torch.save(model.state_dict(), checkpoint_path)


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


ValueError: Target size (torch.Size([1])) must be the same as input size (torch.Size([1, 8]))