In [48]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchaudio
import torchvision.transforms as transforms
import numpy as np
import os
import pathlib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from PIL import Image
import tqdm

import xgboost as xgb

EPOCHS = 100

# PyTorch

In [14]:
class AudioDataset(Dataset):
    def __init__(self, dataset_path: str) -> None:
        self.dataset_path = pathlib.Path(dataset_path)
        self.file_paths = []
        self.labels = []

        self.bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
        self.model = self.bundle.get_model()
        self.sample_rate = self.bundle.sample_rate

        self.labels_meaning = {
            0: 'нейтрально',
            1: 'спокойно',
            2: 'счастливо',
            3: 'грустно',
            4: 'сердито',
            5: 'напуганно',
            6: 'недовольно',
            7: 'удивлённо'
        }

        for elem in os.listdir(self.dataset_path):
            dirpath = self.dataset_path.joinpath(elem)
            for audio in os.listdir(dirpath):
                audiopath = dirpath.joinpath(audio)
                self.file_paths.append(audiopath)
                emotion = int(audio.split('-')[2])-1
                self.labels.append(emotion)


    def __len__(self) -> int:
        return len(self.labels)
    

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        waveform, sample_rate = torchaudio.load(self.file_paths[idx])
        if sample_rate != self.bundle.sample_rate:
            waveform = torchaudio.functional.resample(waveform, sample_rate, self.sample_rate)

        label = torch.tensor(self.labels[idx]).type(torch.long)

        with torch.inference_mode():
            features, _ = self.model.extract_features(waveform)
        features = torch.squeeze(torch.concatenate(features, axis=1))
        features = transforms.ToPILImage()(features).resize((760, 1500))
        features = transforms.ToTensor()(features)
        
        return features, label

In [15]:
train_dataset = AudioDataset('./train')
test_dataset = AudioDataset('./test')

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)

In [16]:
class SpeechEmotionClassifier(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=256, kernel_size=3, padding=1),
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm2d(num_features=512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm2d(num_features=512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),

            nn.AdaptiveAvgPool2d(output_size=(2,2)),
            nn.Flatten(),
            nn.Linear(512*2*2, 8),
            nn.Softmax()
        )

    def forward(self, x: torch.Tensor):
        return self.model(x)

In [17]:
model = SpeechEmotionClassifier()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

In [18]:
for epoch in tqdm.tqdm(range(EPOCHS)):
    train_loss = 0
    for features, labels in train_dataloader:
        optimizer.zero_grad()
        y_pred = model(features)
        loss = loss_fn(y_pred, labels)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    print(f'Эпоха: {epoch}. Ошибка: {train_loss}')
        
        

  input = module(input)
  0%|          | 0/100 [34:26<?, ?it/s]


RuntimeError: stack expects each tensor to be equal size, but got [1, 1500, 760] at entry 0 and [2, 1500, 760] at entry 1

# XGBoost

In [43]:
X, y = [], []

In [44]:
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
bundlemodel = bundle.get_model()
sample_rate = bundle.sample_rate

In [45]:
root = pathlib.Path('./train')
for elem in tqdm.tqdm(os.listdir(root)):
    dirpath = root.joinpath(elem)
    for audio in os.listdir(dirpath):
        audiopath = dirpath.joinpath(audio)
        waveform, sample_rate = torchaudio.load(audiopath)
        if sample_rate != bundle.sample_rate:
            waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

        with torch.inference_mode():
            features, _ = bundlemodel.extract_features(waveform)
        features = [feature[0] for feature in features]
        features = np.concatenate(features, axis=0)
        features = Image.fromarray(features, mode='L').resize((760, 1500))
        features = np.array(features)
        X.append(features)

        emotion = int(audio.split('-')[2])-1
        y.append(emotion)

100%|██████████| 24/24 [11:22<00:00, 28.43s/it]


In [52]:
X = np.array(X)
X = X.reshape((X.shape[0], -1))

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

In [58]:
params = {
    "objective": "multi:softmax",
    "num_class": 8,
    "max_depth": 3,
    "learning_rate": 0.1,
    "n_estimators": 100
}

model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train)

In [59]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.4212962962962963
