# Простой классификатор

## Обучите классификатор музыкального жанра

Обучите классификатор музыкального жанра на датасете GTZAN с помощью кода в лекции.

Можете взять датасет с сайта Kaggle:

https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification

(Это не официальный источник, в нем поврежден файл `jazz.00054.wav`, официальный недоступен)


Можете взять эти данные:
https://storage.yandexcloud.net/aiueducation/Content/base/l12/genres.zip



Примечание: т.к. аудио разной длины, можно взять небольшой кусочек 10-15 секунд случайным образом из фрагмента.

Ваше решение:

## Загрузка данных

In [None]:
!wget https://storage.yandexcloud.net/aiueducation/Content/base/l12/genres.zip  # Download the dataset archive into Colab

--2025-01-21 05:13:48--  https://storage.yandexcloud.net/aiueducation/Content/base/l12/genres.zip
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1226497462 (1.1G) [application/x-zip-compressed]
Saving to: ‘genres.zip’


2025-01-21 05:15:16 (13.4 MB/s) - ‘genres.zip’ saved [1226497462/1226497462]



In [None]:
!unzip -qq genres.zip # Unpack the zip archive

In [None]:
!ls genres # The data is located in the genres folder

blues  classical  country  disco  hiphop  jazz	metal  pop  reggae  rock


## Альтернативная версия

In [None]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import transforms
import torchaudio
from tqdm.auto import tqdm

In [None]:
def normalize_spectrogram(spec, eps=1e-8):
    """Standardize a spectrogram to zero mean and unit standard deviation.

    The statistics are taken over the whole input at once (``spec.mean()`` and
    ``spec.std()`` without an axis), not per channel or per frequency bin.

    Args:
        spec: Object exposing ``mean()`` and ``std()`` and supporting
            arithmetic with their results (e.g. a ``torch.Tensor``).
        eps: Smallest divisor allowed. Used instead of the standard deviation
            when the latter is not strictly greater than ``eps``.

    Returns:
        ``(spec - mean) / std`` with the divisor bounded below by ``eps``.
    """
    mean = spec.mean()
    std = spec.std()
    # A constant input (e.g. pure silence) has zero spread; dividing by it
    # would fill the result with NaN/inf. ``not std > eps`` also catches a
    # NaN standard deviation, because any comparison with NaN is False.
    if not std > eps:
        std = eps
    return (spec - mean) / std

# Dataset class that loads the audio files and pre-processes them
class GTZANDataset(Dataset):
    """GTZAN clips served as ``(mel_spectrogram, label)`` pairs.

    ``root_dir`` is expected to contain one sub-directory per genre, each
    holding the audio files of that genre. Files are only listed in
    ``__init__``; audio is decoded lazily in ``__getitem__``.

    Attributes set by ``__init__``:
        root_dir: The directory passed in.
        transform: Optional callable applied to every mel spectrogram.
        genre_labels: Sorted genre directory names; a label is an index
            into this list.
        data: File paths, ordered by genre and then by file name.
        labels: Genre index for each entry of ``data``.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # os.listdir() returns entries in arbitrary order, so sort to keep the
        # genre -> index mapping reproducible between runs and machines.
        # Non-directory entries (stray files in the root) are not genres.
        self.genre_labels = sorted(
            name
            for name in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, name))
        )
        self.data = []
        self.labels = []

        for i, genre_label in enumerate(self.genre_labels):
            genre_dir = os.path.join(root_dir, genre_label)
            # Sorted for the same reason: a seeded split downstream should
            # always see the samples in the same order.
            for filename in sorted(os.listdir(genre_dir)):
                self.data.append(os.path.join(genre_dir, filename))
                self.labels.append(i)

    def __len__(self):
        """Return the number of audio files found."""
        return len(self.data)

    def __getitem__(self, index):
        """Load one clip and return ``(mel_spectrogram, label)``."""
        audio_path = self.data[index]
        label = self.labels[index]

        waveform, sr = torchaudio.load(audio_path, normalize=True)
        # Build the mel filter bank for the file's real sample rate instead of
        # relying on MelSpectrogram's default sample_rate.
        mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=sr)(waveform)

        if self.transform:
            mel_spectrogram = self.transform(mel_spectrogram)

        return mel_spectrogram, label

# Load and pre-process the data
root_dir = "genres"  # Path to the genres folder that holds the data
# Each mel spectrogram is standardized and then resized to 128 x 128.
transform = transforms.Compose([
    normalize_spectrogram,
    transforms.Resize((128, 128))
])
dataset = GTZANDataset(root_dir, transform=transform)

# 80/20 train/test split with a fixed seed.
# NOTE(review): the Dataset object itself (not an index array) is handed to
# train_test_split; presumably every item is decoded and transformed right
# here and the resulting splits are plain in-memory lists — confirm.
train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=42)

# Only the training loader shuffles.
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
test_loader = DataLoader(test_set, batch_size=32, shuffle=False)



In [None]:
# Genre classifier model
class MusicGenreClassifier(nn.Module):
    """Two-stage CNN that maps a 1-channel spectrogram image to class logits.

    Each stage is a 3x3 convolution, ReLU and 2x2 max pooling. The first
    dense layer is sized for 64 feature maps of 30 x 30, which is what a
    128 x 128 input shrinks to (128 -> 126 -> 63 -> 61 -> 30 per side), so
    other input sizes will not fit ``fc1``.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        flattened_features = 64 * 30 * 30
        self.fc1 = nn.Linear(flattened_features, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return raw (un-normalized) class scores for a batch ``x``."""
        relu = nn.functional.relu
        max_pool = nn.functional.max_pool2d

        # Both convolutional stages share the same conv -> ReLU -> pool shape.
        for conv in (self.conv1, self.conv2):
            x = max_pool(relu(conv(x)), kernel_size=2)

        batch = x.size(0)
        flat = x.view(batch, -1)
        hidden = relu(self.fc1(flat))
        return self.fc2(hidden)

# Instantiate the model and define the loss function and the optimizer
num_classes = len(dataset.genre_labels)  # one output per genre directory found
model = MusicGenreClassifier(num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Model training
num_epochs = 20

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in tqdm(range(num_epochs)):
    train_loss = 0.0
    model.train()

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        # Scale the batch loss by the batch size so that the division below
        # yields a per-sample average over the epoch.
        train_loss += loss.item() * inputs.size(0)

    train_loss /= len(train_loader.dataset)

    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss}")

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch 1/20, Training Loss: 2.1023389959335326
Epoch 2/20, Training Loss: 1.3843897151947022
Epoch 3/20, Training Loss: 0.8460321414470673
Epoch 4/20, Training Loss: 0.5034598463773727
Epoch 5/20, Training Loss: 0.2524362772703171
Epoch 6/20, Training Loss: 0.1283048892021179
Epoch 7/20, Training Loss: 0.0782527656853199
Epoch 8/20, Training Loss: 0.0565951497014612
Epoch 9/20, Training Loss: 0.04374402040150017
Epoch 10/20, Training Loss: 0.05091000819578767
Epoch 11/20, Training Loss: 0.04965374423190951
Epoch 12/20, Training Loss: 0.10116989929229021
Epoch 13/20, Training Loss: 0.12002754792571067
Epoch 14/20, Training Loss: 0.12351254105567933
Epoch 15/20, Training Loss: 0.033706165309995414
Epoch 16/20, Training Loss: 0.020863168076612055
Epoch 17/20, Training Loss: 0.018405548254959284
Epoch 18/20, Training Loss: 0.013083869910333305
Epoch 19/20, Training Loss: 0.01482038661139086
Epoch 20/20, Training Loss: 0.016781528457067908


In [None]:
# Evaluate the model on the test set
model.eval()
correct_predictions = 0
total_predictions = 0

# No gradients are needed for evaluation.
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        # Predicted class = index of the largest score in each row.
        _, predicted = torch.max(outputs, 1)

        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

# Fraction of test samples classified correctly.
accuracy = correct_predictions / total_predictions
print(f"Accuracy on test set: {accuracy}")

Accuracy on test set: 0.475


Значение точности на тестовой выборке довольно низкое, но задача достичь определённого порога Accuracy не ставилась. Можно попробовать увеличить количество эпох, но, скорее всего, необходимо поработать с моделью. В целом обучение классификатора работает. Кроме того, причина может крыться в качестве исходных данных.

# Более продвинутый классификатор

## Обучите классификатор музыкального жанра используя спектральные признаки
Используйте на входе сети не исходные аудио-сигналы, а признаки: MEL, MFCC, LFCC и прочее. Признаки нужно вычислить из аудио с помощью кода (не брать готовые из kaggle датасета)

Ваше решение:

### Импорт

In [None]:
import os
import torch
import torchaudio
import torchvision
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms as T
from torch.functional import F

from pathlib import Path
from tqdm.auto import tqdm
from matplotlib import pyplot as plt

### Данные

In [None]:
# Install the kaggle package; the `kaggle` command is used further down to download the dataset.
%pip install -qq kaggle

In [None]:
import ipywidgets as widgets
import os

def get_and_store_secret(placeholder, output_secret_path):
    """Show a masked input field and save the entered value to a file.

    Displays a password field, a "Save Key" button and an output area. The
    function returns right after displaying them; the file is written only
    when the button is clicked, after which the field and button are closed.

    Args:
        placeholder: Hint text shown inside the empty input field.
        output_secret_path: Destination file; a leading ``~`` is expanded.
            Missing parent directories are created and the file is made
            readable and writable by its owner only.
    """
    password_field = widgets.Password(
        description="Enter Key:",
        placeholder=placeholder,
        disabled=False,
    )

    output = widgets.Output()
    secret_path = os.path.expanduser(output_secret_path)

    def on_button_click(b):
        # The handler body runs inside the output widget's context, as in the
        # original code. NOTE(review): presumably this is what makes errors
        # raised here visible in the notebook — confirm against ipywidgets docs.
        with output:
            token = password_field.value

            # A fresh environment may not have the target directory yet.
            parent_dir = os.path.dirname(secret_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            # Create the file with owner-only permissions from the start so
            # the secret is never readable by other users of the machine.
            fd = os.open(secret_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
            with os.fdopen(fd, "w") as f:
                f.write(token)
            # The mode given to os.open() only applies to newly created
            # files; tighten a pre-existing file explicitly.
            os.chmod(secret_path, 0o600)

            password_field.close()
            button.close()

    button = widgets.Button(description="Save Key")
    button.on_click(on_button_click)

    display(password_field, button, output)

In [None]:
# Prompt for the Kaggle username and store it in a file under ~/.cache.
kaggle_user_path = os.path.expanduser("~/.cache/kaggle-user.txt")
get_and_store_secret("KAGGLE_USERNAME", kaggle_user_path)

Password(description='Enter Key:', placeholder='KAGGLE_USERNAME')

Button(description='Save Key', style=ButtonStyle())

Output()

In [None]:
# Prompt for the Kaggle API key and store it in a file under ~/.cache.
kaggle_key_path = os.path.expanduser("~/.cache/kaggle.token")
get_and_store_secret("KAGGLE_KEY", kaggle_key_path)

Password(description='Enter Key:', placeholder='KAGGLE_KEY')

Button(description='Save Key', style=ButtonStyle())

Output()

In [None]:
# Export the saved credentials as environment variables, with surrounding
# whitespace stripped.
# NOTE(review): presumably the kaggle command below reads KAGGLE_USERNAME and
# KAGGLE_KEY from the environment — confirm.
with open(kaggle_user_path) as f:
    os.environ["KAGGLE_USERNAME"] = f.read().strip()

with open(kaggle_key_path) as f:
    os.environ["KAGGLE_KEY"] = f.read().strip()

In [None]:
# Download the GTZAN dataset archive from Kaggle into ./gtzan-kaggle.
!kaggle datasets download -d andradaolteanu/gtzan-dataset-music-genre-classification -p ./gtzan-kaggle --force

Downloading gtzan-dataset-music-genre-classification.zip to ./gtzan-kaggle
 99% 1.20G/1.21G [00:23<00:00, 222MB/s]
100% 1.21G/1.21G [00:23<00:00, 55.0MB/s]


In [None]:
# The guard below was tried and did not work: `.exists` is written without
# call parentheses, so it is a bound method object (always truthy) and
# `not ...` is therefore always False.
# if not Path("gtzan").exists:
!unzip -qq gtzan-kaggle/gtzan-dataset-music-genre-classification.zip -d gtzan

### Датасет

In [None]:
def time_stretch(waveform, factor):
    """Time-stretch a waveform with ``librosa.effects.time_stretch``.

    Args:
        waveform: Tensor; converted with ``.numpy()`` before the librosa call.
        factor: Stretch rate; cast to ``float`` and passed as ``rate``.

    Returns:
        Whatever ``librosa.effects.time_stretch`` returns for the converted
        samples (the caller wraps it with ``torch.from_numpy``).
    """
    samples = waveform.numpy()
    rate = float(factor)
    return librosa.effects.time_stretch(samples, rate=rate)

def pitch_shift(waveform, sample_rate, n_steps):
    """Pitch-shift a waveform with ``librosa.effects.pitch_shift``.

    Args:
        waveform: Tensor; converted with ``.numpy()`` before the librosa call.
        sample_rate: Sampling rate of ``waveform``, passed as ``sr``.
        n_steps: Shift amount, forwarded unchanged as ``n_steps``.

    Returns:
        Whatever ``librosa.effects.pitch_shift`` returns for the converted
        samples (the caller wraps it with ``torch.from_numpy``).
    """
    samples = waveform.numpy()
    return librosa.effects.pitch_shift(samples, sr=sample_rate, n_steps=n_steps)

def add_noise(waveform, noise_factor):
    """Return ``waveform`` plus scaled standard-normal noise.

    Args:
        waveform: Tensor to perturb; it is not modified in place.
        noise_factor: Multiplier applied to the noise before it is added.

    Returns:
        A new tensor ``waveform + noise_factor * noise`` where ``noise`` has
        the same shape as ``waveform``.
    """
    gaussian = torch.randn_like(waveform)
    scaled = noise_factor * gaussian
    return waveform + scaled


In [None]:
class GTZAN_Dataset(Dataset):
    """GTZAN clips as ``(features, genre_idx)`` pairs with optional augmentation.

    ``root_dir`` is expected to contain one sub-directory per genre. Each item
    is loaded with torchaudio, cropped or zero-padded to 30 seconds,
    optionally augmented (time stretch, pitch shift, additive noise — each
    applied with probability 0.5) and finally passed through ``transform``.

    Args:
        root_dir: Directory holding the genre sub-directories.
        transform: Optional callable applied to the final waveform.
        augmentation: Enable the random waveform augmentations.
    """

    def __init__(self, root_dir, transform=None, augmentation=False):
        self.root_dir = root_dir
        self.transform = transform
        self.augmentation = augmentation
        # Only sub-directories are genres; a stray file in the root would
        # otherwise become a bogus class and break the listing below.
        self.genres = sorted(
            name
            for name in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, name))
        )
        candidates = [
            os.path.join(self.root_dir, genre, fname)
            for genre in self.genres
            for fname in os.listdir(os.path.join(self.root_dir, genre))
        ]
        # jazz.00054.wav is corrupted in the Kaggle copy of the dataset.
        self.filepaths = sorted(p for p in candidates if "jazz.00054.wav" not in p)

    def __len__(self):
        """Return the number of usable audio files."""
        return len(self.filepaths)

    def __getitem__(self, idx):
        """Load, crop, optionally augment and transform one clip."""
        audio_path = self.filepaths[idx]
        # The parent directory name is the genre. basename/dirname honours the
        # platform's path separator instead of assuming "/".
        genre = os.path.basename(os.path.dirname(audio_path))
        genre_idx = self.genres.index(genre)
        waveform, sample_rate = torchaudio.load(audio_path)

        waveform = self.random_crop(waveform, sample_rate, duration=30)
        if self.augmentation:
            # Apply time stretching
            if torch.rand(1) < 0.5:
                factor = torch.randn([1]) * 0.05 + 1
                waveform = torch.from_numpy(time_stretch(waveform.squeeze(), factor[0])).unsqueeze(0)

            # Apply pitch shifting
            if torch.rand(1) < 0.5:
                # Draw from the torch RNG like the other augmentations: other
                # libraries' RNG state may be duplicated across DataLoader
                # workers. The upper bound is exclusive, so 3 yields the
                # symmetric range -2..2.
                n_steps = torch.randint(-2, 3, (1,)).item()
                waveform = torch.from_numpy(pitch_shift(waveform.squeeze(), sample_rate, n_steps)).unsqueeze(0)

            # Apply additive noise
            if torch.rand(1) < 0.5:
                noise_factor = torch.randn([1]) * 0.005
                waveform = add_noise(waveform, noise_factor[0])

            # Time stretching changes the length, so fix the size again.
            waveform = self.random_crop(waveform, sample_rate, duration=30)

        if self.transform:
            waveform = self.transform(waveform)

        return waveform, genre_idx

    def random_crop(self, waveform, sample_rate, duration):
        """Return ``waveform`` cut or zero-padded to ``duration`` seconds.

        Args:
            waveform: Tensor indexed as ``[channel, sample]``.
            sample_rate: Samples per second of ``waveform``.
            duration: Target length in seconds.

        Returns:
            A random window of the target length when the input is longer,
            the input right-padded with zeros when it is shorter, and the
            input itself when the length already matches.
        """
        audio_length = waveform.size(1)
        crop_length = int(duration * sample_rate)
        if audio_length > crop_length:
            # randint's upper bound is exclusive; +1 keeps the last valid
            # start (a window ending on the final sample) reachable.
            start = torch.randint(0, audio_length - crop_length + 1, (1,)).item()
            waveform = waveform[:, start:start + crop_length]
        elif audio_length < crop_length:
            # new_zeros keeps the padding on the waveform's dtype and device.
            padding = waveform.new_zeros(waveform.size(0), crop_length - audio_length)
            waveform = torch.cat((waveform, padding), dim=1)
        return waveform

In [None]:
def resample_audio(waveform, old_sr, new_sr):
    """Resample ``waveform`` from ``old_sr`` to ``new_sr`` with torchaudio.

    Args:
        waveform: Audio tensor to resample.
        old_sr: Sampling rate ``waveform`` currently has.
        new_sr: Sampling rate of the result.

    Returns:
        Output of ``torchaudio.transforms.Resample`` applied to ``waveform``.
    """
    resampler = torchaudio.transforms.Resample(old_sr, new_sr)
    return resampler(waveform)

def melspectrogram(waveform, sample_rate, n_mels=128):
    """Compute a mel spectrogram of ``waveform`` with torchaudio.

    Args:
        waveform: Audio tensor.
        sample_rate: Sampling rate of ``waveform``.
        n_mels: Number of mel bands.

    Returns:
        Output of ``torchaudio.transforms.MelSpectrogram`` built with the
        given ``sample_rate`` and ``n_mels`` (other settings are left at the
        library defaults).
    """
    to_mel = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate, n_mels=n_mels)
    return to_mel(waveform)


In [None]:
new_sample_rate = 22050

# Waveform -> log-mel features.
transform = torchvision.transforms.Compose([
    # Source rate is hard-coded to 22050 and equals new_sample_rate here.
    lambda x: resample_audio(x, old_sr=22050, new_sr=new_sample_rate),
    lambda x: melspectrogram(x, sample_rate=new_sample_rate),
    # log2 of an all-zero bin is -inf; the clamp bounds the result at -50.
    lambda x: x.log2().clamp(min=-50)
])

dataset = GTZAN_Dataset("gtzan/Data/genres_original", transform=transform, augmentation=False)  # ~80% val acc
# dataset = GTZAN_Dataset("gtzan/Data/genres_original", transform=transform, augmentation=True)  # Very slow

In [None]:
# Number of clips in the dataset (jazz.00054.wav is filtered out by GTZAN_Dataset).
len(dataset)

999

In [None]:
# Sanity check: every item must come out as a 1 x 128 x 3308 feature image,
# otherwise items could not be stacked into batches.
for mel_image, label in tqdm(dataset):
  assert mel_image.shape == torch.Size([1, 128, 3308]), mel_image.shape

  0%|          | 0/999 [00:00<?, ?it/s]



In [None]:
batch_size = 16

# 80/20 train/validation split, reproducible through the seeded generator.
# NOTE(review): both subsets wrap the same `dataset` object, so switching it
# to augmentation=True would presumably augment validation items too — confirm.
train_size = int(0.8 * len(dataset))
valid_size = len(dataset) - train_size
train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [train_size, valid_size], generator=torch.Generator().manual_seed(42))

# Only the training loader shuffles; both use 4 worker processes.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=4)




In [None]:
# Number of batches per epoch in the training and validation loaders.
len(train_loader), len(valid_loader)

(50, 13)

### Обучение модели

In [None]:
from torchvision.models import resnet18

class AudioClassifier(nn.Module):
    """ResNet-18 adapted to single-channel input and ``num_classes`` outputs.

    The backbone is created with ``pretrained=True``. Its first convolution
    is replaced by a freshly created layer that accepts 1 input channel, and
    its final fully connected layer by one with ``num_classes`` outputs.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        backbone = resnet18(pretrained=True)
        # Same geometry as the layer it replaces here (7x7, stride 2,
        # padding 3, no bias) but with a single input channel.
        backbone.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        feature_dim = backbone.fc.in_features
        backbone.fc = nn.Linear(feature_dim, num_classes)
        self.resnet = backbone

    def forward(self, x):
        """Return the backbone's class scores for a batch ``x``."""
        return self.resnet(x)

In [None]:
num_classes = 10  # number of genre classes, fixed by hand
model = AudioClassifier(num_classes)

# Cross-entropy loss on the logits, Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 151MB/s]


In [None]:
def train(model, dataloader, criterion, optimizer, device):
    """Run one training epoch and return its mean loss and accuracy.

    Args:
        model: Module to optimize; switched to training mode.
        dataloader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss callable ``criterion(outputs, labels)``. It is assumed
            to return the mean loss over the batch (the default reduction of
            ``nn.CrossEntropyLoss``).
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, accuracy)`` where both are averaged over samples, so a
        shorter last batch does not get extra weight.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    correct = 0
    total = 0

    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        batch_samples = labels.size(0)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Turn the batch mean back into a batch sum so that the final
        # division gives a true per-sample average.
        loss_sum += loss.item() * batch_samples
        _, predicted = outputs.max(1)
        total += batch_samples
        correct += predicted.eq(labels).sum().item()

    if total == 0:
        raise ValueError("dataloader yielded no samples")
    return loss_sum / total, correct / total

def validate(model, dataloader, criterion, device):
    """Evaluate the model on a dataloader without updating it.

    Args:
        model: Module to evaluate; switched to evaluation mode.
        dataloader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss callable ``criterion(outputs, labels)``. It is assumed
            to return the mean loss over the batch (the default reduction of
            ``nn.CrossEntropyLoss``).
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, accuracy)`` where both are averaged over samples, so a
        shorter last batch does not get extra weight.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            batch_samples = labels.size(0)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Turn the batch mean back into a batch sum so that the final
            # division gives a true per-sample average.
            loss_sum += loss.item() * batch_samples
            _, predicted = outputs.max(1)
            total += batch_samples
            correct += predicted.eq(labels).sum().item()

    if total == 0:
        raise ValueError("dataloader yielded no samples")
    return loss_sum / total, correct / total


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 30

# One training pass followed by one validation pass per epoch; the metrics
# are only printed.
for epoch in tqdm(range(num_epochs)):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    valid_loss, valid_acc = validate(model, valid_loader, criterion, device)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Valid Loss: {valid_loss:.4f}, Valid Acc: {valid_acc:.4f}")


  0%|          | 0/30 [00:00<?, ?it/s]

Epoch 1/30
Train Loss: 1.8820, Train Acc: 0.3842
Valid Loss: 3.2197, Valid Acc: 0.3850
Epoch 2/30
Train Loss: 1.4194, Train Acc: 0.4856
Valid Loss: 1.5438, Valid Acc: 0.5600
Epoch 3/30
Train Loss: 1.2228, Train Acc: 0.5632
Valid Loss: 1.2182, Valid Acc: 0.6250
Epoch 4/30
Train Loss: 1.1455, Train Acc: 0.6033
Valid Loss: 2.2091, Valid Acc: 0.4400
Epoch 5/30
Train Loss: 1.0493, Train Acc: 0.6370
Valid Loss: 4.0036, Valid Acc: 0.3700
Epoch 6/30
Train Loss: 0.9004, Train Acc: 0.6946
Valid Loss: 1.2679, Valid Acc: 0.6050
Epoch 7/30
Train Loss: 0.8648, Train Acc: 0.7109
Valid Loss: 0.8934, Valid Acc: 0.7600
Epoch 8/30
Train Loss: 0.8671, Train Acc: 0.7096
Valid Loss: 2.1160, Valid Acc: 0.5000
Epoch 9/30
Train Loss: 0.7601, Train Acc: 0.7372
Valid Loss: 0.8937, Valid Acc: 0.7200
Epoch 10/30
Train Loss: 0.6225, Train Acc: 0.7947
Valid Loss: 1.5946, Valid Acc: 0.5650
Epoch 11/30
Train Loss: 0.6830, Train Acc: 0.7597
Valid Loss: 0.9985, Valid Acc: 0.6700
Epoch 12/30
Train Loss: 0.6198, Train Acc