In [18]:
from speech2label_classes import LSTMSpeechRecognizer, SpeechDataset
import os
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
import librosa

# Directory the trained checkpoint is read from; the loading cell joins it
# with the file name "lstm_speech.pth".
model_output_dir = "./trained"

In [19]:
# --- Hyperparameters ---
# The three dimensions below are passed to the model constructor, so they have
# to agree with the tensor shapes stored in the checkpoint that is loaded next.
input_dim = 128  # number of MFCC coefficients per frame (used as n_mfcc)
hidden_dim = 2048  # hidden size passed to the model
output_dim = 30  # number of classes
batch_size = 1  # batch size used by the DataLoader
# NOTE(review): learning_rate and num_epochs are not used by the evaluation
# code visible in this notebook; they are kept in case other cells read them.
learning_rate = 0.000001
num_epochs = 3

device = torch.device("cpu")
model = LSTMSpeechRecognizer(input_dim, hidden_dim, output_dim).to(device)

checkpoint_path = os.path.join(model_output_dir, "lstm_speech.pth")
# map_location remaps tensors that were saved from another device (e.g. CUDA)
# onto `device`, so the checkpoint loads on a CPU-only machine.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs and avoids executing arbitrary pickled code.
model.load_state_dict(
    torch.load(checkpoint_path, map_location=device, weights_only=True)
)
model.eval()  # switch the model to evaluation mode

  model.load_state_dict(torch.load(os.path.join(model_output_dir, "lstm_speech.pth")))


LSTMSpeechRecognizer(
  (lstm): LSTM(128, 2048, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=4096, out_features=2048, bias=True)
  (fc2): Linear(in_features=2048, out_features=30, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [20]:
def convert_texts_to_labels(texts, text_to_index=None):
    """Convert a sequence of text labels into a tensor of integer class indices.

    Args:
        texts: Iterable of hashable labels (typically strings).
        text_to_index: Optional mapping from label to class index. When given,
            it is used as-is, which keeps indices stable across batches and
            runs. When omitted, a vocabulary is built from ``texts`` alone by
            numbering the distinct labels in sorted order.

    Returns:
        A 1-D ``torch.long`` tensor with one index per element of ``texts``.

    Raises:
        KeyError: If ``text_to_index`` is given and a label is missing from it.
        TypeError: If no mapping is given and the labels cannot be sorted.

    NOTE(review): without ``text_to_index`` the vocabulary only covers the
    labels of this single call, so the indices are relative to the batch (a
    batch with one distinct label always maps to 0). Pass the mapping used at
    training time to get labels that are comparable with the model's outputs.
    """
    texts = list(texts)
    if text_to_index is None:
        # Sorting makes the numbering deterministic; iterating a bare set of
        # strings yields an order that can change between interpreter runs.
        text_to_index = {
            text: index for index, text in enumerate(sorted(set(texts)))
        }
    labels = [text_to_index[text] for text in texts]
    return torch.tensor(labels, dtype=torch.long)

def extract_mfcc(audio_path):
    """Compute MFCC features for one audio file.

    The file is loaded with librosa at a 16 kHz sample rate and ``input_dim``
    coefficients are computed. The matrix returned by librosa is transposed
    before being handed back.

    Any exception raised while loading or computing features is reported on
    stdout and ``None`` is returned instead of propagating the error.
    """
    features = None
    try:
        signal, rate = librosa.load(audio_path, sr=16000)
        coefficients = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=input_dim)
        features = coefficients.T  # transposed for convenience downstream
    except Exception as e:
        print(f"Error loading audio: {audio_path}, Error: {e}")
    return features

# --- Тестирование модели ---
def test_model(model, test_loader, criterion):
    model.eval()  # Переключаем модель в режим оценки
    test_loss = 0
    test_correct = 0
    test_total = 0

    with torch.no_grad():  # Отключаем градиенты для тестирования
        for batch_idx, (mfcc, texts) in enumerate(test_loader):
            if mfcc is None:
                continue

            mfcc = mfcc.to(device)
            labels = convert_texts_to_labels(texts).to(device)

            outputs = model(mfcc)
            loss = criterion(outputs, labels)

            test_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()

    test_epoch_loss = test_loss / len(test_loader)
    test_epoch_accuracy = test_correct / test_total
    print(f"Test Loss: {test_epoch_loss:.4f}, Test Accuracy: {test_epoch_accuracy:.4f}")
    
def pad_collate(batch):
    """Collate ``(mfcc, text)`` samples into a zero-padded batch.

    Samples whose ``mfcc`` is ``None`` are dropped together with their text,
    so the returned features and texts always stay aligned one-to-one.

    Args:
        batch: Sequence of ``(mfcc, text)`` pairs, where ``mfcc`` is a tensor
            accepted by ``pad_sequence`` or ``None``.

    Returns:
        ``(mfccs_padded, texts)``: the features stacked batch-first and padded
        with ``0.0``, plus a tuple of the matching texts. ``(None, None)`` is
        returned when the batch is empty or contains no usable sample.
    """
    usable = [(mfcc, text) for mfcc, text in batch if mfcc is not None]
    if not usable:
        return None, None
    mfccs, texts = zip(*usable)
    mfccs_padded = pad_sequence(list(mfccs), batch_first=True, padding_value=0.0)
    return mfccs_padded, texts

# Loss function handed to the evaluation routine.
criterion = nn.CrossEntropyLoss()

# Locations of the evaluation data: the annotation JSON and the audio folder.
json_file_clear = './dataset/annotation/hr_bot_synt.json'
audio_folder_clear = './dataset/hr_bot_synt'

In [21]:

# Build the test set from the clean data.
full_test_dataset = SpeechDataset(audio_folder_clear, json_file_clear, transform=extract_mfcc)

# Evaluate on at most the first 20 items. Clamping to the dataset size avoids
# an IndexError on smaller datasets.
# NOTE(review): assumes SpeechDataset implements __len__ — confirm.
num_test_samples = min(20, len(full_test_dataset))
test_dataset = [full_test_dataset[i] for i in range(num_test_samples)]
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_collate)

# Run the evaluation.
test_model(model, test_loader, criterion)

Test Loss: 0.0222, Test Accuracy: 1.0000
