In [2]:
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
import librosa

if torch.cuda.is_available():
    print(f"Количество доступных GPU: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA не доступна.")

# --- Настройки ---
audio_folder_noise = './dataset/hr_bot_noise'
json_file_noise = './dataset/annotation/hr_bot_noise.json'
audio_folder_clear = './dataset/hr_bot_synt'
json_file_clear = './dataset/annotation/hr_bot_synt.json'
model_output_dir = "./trained"
os.makedirs(model_output_dir, exist_ok=True)

# --- Гиперпараметры ---
input_dim = 128  # Размерность MFCC
hidden_dim = 2048  # Увеличение размерности скрытых слоев
output_dim = 30  # Количество классов
batch_size = 4  # Размер пакета
learning_rate = 0.000001
num_epochs = 1  # Увеличиваем количество эпох для более глубокого обучения

Количество доступных GPU: 1
GPU 0: Tesla T4


In [3]:
# --- Функции ---
def convert_texts_to_labels(texts):
    unique_texts = list(set(texts))
    text_to_index = {text: index for index, text in enumerate(unique_texts)}
    labels = [text_to_index[text] for text in texts]
    return torch.tensor(labels, dtype=torch.long)

def extract_mfcc(audio_path):
    """Извлекает MFCC из аудиофайла."""
    try:
        waveform, sample_rate = librosa.load(audio_path, sr=16000)
        mfcc = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=input_dim)
        return mfcc.T  # Транспонируем для удобства
    except Exception as e:
        print(f"Error loading audio: {audio_path}, Error: {e}")
        return None  

def pad_collate(batch):
    mfccs, texts = zip(*batch)
    mfccs = [m for m in mfccs if m is not None]
    if len(mfccs) == 0:
        return None, None
    mfccs_padded = pad_sequence(mfccs, batch_first=True, padding_value=0.0)
    return mfccs_padded, texts

In [4]:
from speech2label_classes import SpeechDataset, LSTMSpeechRecognizer

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Создание датасета на шумных данных
full_dataset_noise = SpeechDataset(audio_folder_noise, json_file_noise, transform=extract_mfcc)

# Определение размеров подвыборок для шумных данных
train_size = int(0.8 * len(full_dataset_noise))
val_size = len(full_dataset_noise) - train_size

# Разделение на обучающую и проверочную выборки
train_dataset, val_dataset = random_split(full_dataset_noise, [train_size, val_size])

# Создание загрузчиков данных
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_collate)

# Создание и подготовка модели
model = LSTMSpeechRecognizer(input_dim, hidden_dim, output_dim).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)  # L2 регуляризация
criterion = nn.CrossEntropyLoss()

In [6]:
train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()  # Включаем режим обучения
    total_loss = 0
    correct = 0
    total = 0

    for batch_idx, (mfcc, texts) in enumerate(train_loader):
        if mfcc is None:
            continue
        
        mfcc = mfcc.to(device)
        labels = convert_texts_to_labels(texts).to(device)

        optimizer.zero_grad()
        outputs = model(mfcc)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        if batch_idx % 10 == 0:
            print(f"Epoch: {epoch+1}/{num_epochs}, Batch: {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}")

    epoch_loss = total_loss / len(train_loader)
    epoch_accuracy = correct / total
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_accuracy)
    print(f"Epoch: {epoch+1}/{num_epochs}, Train Loss: {epoch_loss:.4f}, Train Accuracy: {epoch_accuracy:.4f}")

    # --- Валидация ---
    model.eval()  # Включаем режим оценки
    val_loss = 0
    val_correct = 0
    val_total = 0

    with torch.no_grad():  # Отключаем градиенты для валидации
        for batch_idx, (mfcc, texts) in enumerate(val_loader):
            if mfcc is None:
                continue
            
            mfcc = mfcc.to(device)
            labels = convert_texts_to_labels(texts).to(device)

            outputs = model(mfcc)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_epoch_loss = val_loss / len(val_loader)
    val_epoch_accuracy = val_correct / val_total
    val_losses.append(val_epoch_loss)
    val_accuracies.append(val_epoch_accuracy)
    print(f"Epoch: {epoch+1}/{num_epochs}, Validation Loss: {val_epoch_loss:.4f}, Validation Accuracy: {val_epoch_accuracy:.4f}")

Epoch: 1/1, Batch: 1/215, Loss: 3.4175
Epoch: 1/1, Batch: 11/215, Loss: 3.4475
Epoch: 1/1, Batch: 21/215, Loss: 3.4409
Epoch: 1/1, Batch: 31/215, Loss: 3.3868
Epoch: 1/1, Batch: 41/215, Loss: 3.3203
Epoch: 1/1, Batch: 51/215, Loss: 3.3844
Epoch: 1/1, Batch: 61/215, Loss: 3.2622
Epoch: 1/1, Batch: 71/215, Loss: 3.2676
Epoch: 1/1, Batch: 81/215, Loss: 3.1825
Epoch: 1/1, Batch: 91/215, Loss: 3.2799
Epoch: 1/1, Batch: 101/215, Loss: 3.1673
Epoch: 1/1, Batch: 111/215, Loss: 3.2350
Epoch: 1/1, Batch: 121/215, Loss: 3.0704
Epoch: 1/1, Batch: 131/215, Loss: 3.0753
Epoch: 1/1, Batch: 141/215, Loss: 3.0667
Epoch: 1/1, Batch: 151/215, Loss: 3.0720
Epoch: 1/1, Batch: 161/215, Loss: 2.9616
Epoch: 1/1, Batch: 171/215, Loss: 3.0152
Epoch: 1/1, Batch: 181/215, Loss: 2.8813
Epoch: 1/1, Batch: 191/215, Loss: 2.8677
Epoch: 1/1, Batch: 201/215, Loss: 2.9331
Epoch: 1/1, Batch: 211/215, Loss: 2.8314
Epoch: 1/1, Train Loss: 3.1829, Train Accuracy: 0.2047
Epoch: 1/1, Validation Loss: 2.9131, Validation Accura

In [7]:
torch.save(model.state_dict(), os.path.join(model_output_dir, "lstm_speech.pth"))