In [33]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import torchaudio
from sklearn.model_selection import train_test_split
import os
import sys
import librosa
import IPython.display as ipd
from datasets import load_dataset
from transformers import AutoConfig, Wav2Vec2Processor

In [16]:
emotion_mapping = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

data = []

dataset_path = Path("../dataset")

for path in tqdm(dataset_path.glob("**/*.wav")):
    name = str(path).split('\\')[-1].split('.')[0]
    label_code = name.split('-')[2]  # L'émotion est le 3ème segment du nom de fichier
    
    emotion = emotion_mapping.get(label_code, "unknown")

    try:
        # Chargement du fichier pour vérifier qu'il est valide
        s = torchaudio.load(path)
        data.append({
            "name": name,
            "path": str(path),  # Convertir en chaîne de caractères
            "emotion": emotion
        })
    except Exception as e:
        # Fichiers corrompus ignorés
        pass


1440it [00:01, 1311.53it/s]


In [17]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,name,path,emotion
0,03-01-01-01-01-01-01,..\dataset\Actor_01\03-01-01-01-01-01-01.wav,neutral
1,03-01-01-01-01-02-01,..\dataset\Actor_01\03-01-01-01-01-02-01.wav,neutral
2,03-01-01-01-02-01-01,..\dataset\Actor_01\03-01-01-01-02-01-01.wav,neutral
3,03-01-01-01-02-02-01,..\dataset\Actor_01\03-01-01-01-02-02-01.wav,neutral
4,03-01-02-01-01-01-01,..\dataset\Actor_01\03-01-02-01-01-01-01.wav,calm


In [18]:
print(f"Step 0: {len(df)}")

# Vérifier l'existence des fichiers
df["status"] = df["path"].apply(lambda path: True if os.path.exists(path) else None)

# Supprimer les fichiers qui n'existent pas
df = df.dropna(subset=["path"])

# Supprimer la colonne 'status' après la vérification
df = df.drop("status", axis=1)

# Afficher le nombre de fichiers restants après nettoyage
print(f"Step 1: {len(df)}")

# Mélanger les données aléatoirement
df = df.sample(frac=1)

# Réinitialiser l'index pour garder un ordre propre
df = df.reset_index(drop=True)

# Afficher les premières lignes du DataFrame
df.head()

Step 0: 1440
Step 1: 1440


Unnamed: 0,name,path,emotion
0,03-01-08-01-01-01-21,..\dataset\Actor_21\03-01-08-01-01-01-21.wav,surprised
1,03-01-08-02-01-02-21,..\dataset\Actor_21\03-01-08-02-01-02-21.wav,surprised
2,03-01-08-01-01-02-16,..\dataset\Actor_16\03-01-08-01-01-02-16.wav,surprised
3,03-01-02-02-02-01-07,..\dataset\Actor_07\03-01-02-02-02-01-07.wav,calm
4,03-01-05-02-02-02-04,..\dataset\Actor_04\03-01-05-02-02-02-04.wav,angry


In [19]:
print("Labels: ", df["emotion"].unique())
print()
df.groupby("emotion").count()[["path"]]

Labels:  ['surprised' 'calm' 'angry' 'fearful' 'sad' 'neutral' 'disgust' 'happy']



Unnamed: 0_level_0,path
emotion,Unnamed: 1_level_1
angry,192
calm,192
disgust,192
fearful,192
happy,192
neutral,96
sad,192
surprised,192


In [25]:
save_path = "../dataset_csv"

# Diviser les données en entraînement et test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df["emotion"])

# Réinitialiser l'index
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Sauvegarder les DataFrames en CSV
train_df.to_csv(f"{save_path}/train.csv", sep="\t", encoding="utf-8", index=False)
test_df.to_csv(f"{save_path}/test.csv", sep="\t", encoding="utf-8", index=False)

# Afficher les tailles des jeux de données
print(train_df.shape)
print(test_df.shape)

(1152, 3)
(288, 3)


In [30]:
data_files = {
    "train": "../dataset_csv/train.csv", 
    "validation": "../dataset_csv/test.csv",
}

# Charger le dataset
dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

# Extraire les jeux de données d'entraînement et de validation
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

# Afficher les informations sur les datasets
print(train_dataset)
print(eval_dataset)

Generating train split: 1152 examples [00:00, 28582.64 examples/s]
Generating validation split: 288 examples [00:00, 59405.90 examples/s]

Dataset({
    features: ['name', 'path', 'emotion'],
    num_rows: 1152
})
Dataset({
    features: ['name', 'path', 'emotion'],
    num_rows: 288
})





In [39]:
input_column = "path"
output_column = "emotion"

In [32]:
label_list = train_dataset.unique(output_column)

# Trier les étiquettes
label_list.sort()

# Calculer le nombre de classes
num_labels = len(label_list)

# Afficher les résultats
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 8 classes: ['angry', 'calm', 'disgust', 'fearful', 'happy', 'neutral', 'sad', 'surprised']


In [34]:
model_name_or_path = "lighteternal/wav2vec2-large-xlsr-53-greek"
pooling_mode = "mean"

In [35]:
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id={label: i for i, label in enumerate(label_list)},
    id2label={i: label for i, label in enumerate(label_list)},
    finetuning_task="wav2vec2_clf",
)
setattr(config, 'pooling_mode', pooling_mode)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [36]:
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path,)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")



The target sampling rate: 16000


# Preprocess Data

In [37]:
def speech_file_to_array_fn(path):
    speech_array, sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech

def label_to_id(label, label_list):

    if len(label_list) > 0:
        return label_list.index(label) if label in label_list else -1

    return label

def preprocess_function(examples):
    speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
    target_list = [label_to_id(label, label_list) for label in examples[output_column]]

    result = processor(speech_list, sampling_rate=target_sampling_rate)
    result["labels"] = list(target_list)

    return result

In [40]:
train_dataset = train_dataset.map(
    preprocess_function,
    batch_size=100,
    batched=True,
    num_proc=4
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batch_size=100,
    batched=True,
    num_proc=4
)

Map (num_proc=4):   0%|          | 0/1152 [00:02<?, ? examples/s]


NameError: name 'input_column' is not defined