In [1]:
import librosa
import numpy as np

def process_audio(file_path, target_sr=22050):
    # Загрузить аудио файл, конвертируя его в моно
    y, sr = librosa.load(file_path, sr=target_sr, mono=True)
    
    # Нормализация громкости: приводим амплитуду к диапазону [-1, 1]
    y = y / np.max(np.abs(y))
    
    return y

In [2]:
import soundfile as sf
from IPython.display import Audio, display
import uuid 

def save_and_display_audio(audio_vector, sample_rate=22050):
    output_path = f'/tmp/{uuid.uuid4().hex}.wav'
    
    sf.write(output_path, audio_vector, sample_rate)
    
    # Воспроизвести аудио файл в Jupyter Notebook
    display(Audio(output_path, autoplay=True))

In [3]:
file_path = 'dataset/17a.mp3'
audio_vector = process_audio(file_path)
save_and_display_audio(audio_vector)

In [4]:
file_path_b = 'dataset/17b.wav'
audio_vector_b = process_audio(file_path_b)
save_and_display_audio(audio_vector_b)

In [5]:
import whisper

model = whisper.load_model('base')

In [6]:
import re

def transcribe_audio(audio_vector, sample_rate, language='ja'):
    temp_path = 'temp_audio.wav'
    sf.write(temp_path, audio_vector, sample_rate)
    result = model.transcribe(temp_path, language=language, word_timestamps=True)
    return result['segments']

def split_text_into_sentences_and_timestamps(segments):
    sentences = []
    current_sentence = ""
    sentence_start = None

    for segment in segments:
        for word_info in segment['words']:
            word = word_info['word']
            if sentence_start is None:
                sentence_start = word_info['start']
            
            current_sentence += word

            if re.match(r'.*[。．.?]', word):
                sentence_end = word_info['end']
                sentences.append((current_sentence.strip(), sentence_start, sentence_end))
                current_sentence = ""
                sentence_start = None

    if current_sentence:
        sentences.append((current_sentence.strip(), sentence_start, word_info['end']))
    
    return sentences
        

def get_sentence_audio_pairs(audio_vector, sample_rate, sentences):
    pairs = []
    for sentence, start, end in sentences:
        start_sample = int(start * sample_rate)
        end_sample = int((end + 0.1) * sample_rate)
        sentence_audio_vector = audio_vector[start_sample:end_sample]
        trimmed_audio, _ = librosa.effects.trim(sentence_audio_vector)
        pairs.append((sentence, trimmed_audio))
    return pairs


def process_audio_vec(audio_vector, sample_rate=22050, language='ja'):
    segments = transcribe_audio(audio_vector, sample_rate, language)
    sentences = split_text_into_sentences_and_timestamps(segments)
    pairs = get_sentence_audio_pairs(audio_vector, sample_rate, sentences)
    return pairs

In [7]:
audio_vector_splited = process_audio_vec(audio_vector)

In [8]:
text, audio = audio_vector_splited[2]
text

'どうしましたか?'

In [9]:
save_and_display_audio(audio)

Требуемый результат получен, теперь масштабируем подход. Нужно получить список нормализованых аудио векторов из всей обучающей выборки.

In [38]:
import os
import pandas as pd
import librosa

data_path = './dataset/jvs_ver1/'
audio_vectors = []

def load_data():
    data = []
    for speaker in os.listdir(data_path):
        speaker_dir = os.path.join(data_path, speaker)
        if not os.path.isdir(speaker_dir):
            continue
        
        for subfolder in ['nonpara30', 'parallel100']:
            wav_path = os.path.join(speaker_dir, subfolder, 'wav24kHz16bit')
            transcripts_path = os.path.join(speaker_dir, subfolder, 'transcripts_utf8.txt')
            
            if not os.path.isdir(wav_path) or not os.path.exists(transcripts_path):
                continue
            
            with open(transcripts_path, 'r', encoding='utf-8') as f:
                transcripts = f.readlines()
            
            for line in transcripts:
                file_name, text = line.strip().split(':')
                audio_path = os.path.join(wav_path, f'{file_name}.wav')
            
                if os.path.exists(audio_path):
                    audio_vec = process_audio(audio_path)
                    audio_vectors.append(audio_vec)

In [39]:
load_data()

In [12]:
save_and_display_audio(audio_vectors[100])

В датасете каждый файл - одно предложение, разбивать не требуется.

In [34]:
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Функция для извлечения MFCC с нормализацией
def extract_mfcc(audio_vector, sr=22050, n_mfcc=30, n_frames=400):
    mfcc = librosa.feature.mfcc(y=audio_vector, sr=sr, n_mfcc=n_mfcc)
    if mfcc.shape[1] > n_frames:
        mfcc = mfcc[:, :n_frames]
    else:
        mfcc = np.pad(mfcc, ((0, 0), (0, n_frames - mfcc.shape[1])), mode='constant')
    mfcc = (mfcc - np.mean(mfcc)) / np.std(mfcc)  # Нормализация
    return mfcc.T

mfcc_features = np.array([extract_mfcc(audio) for audio in audio_vectors])

# Параметры модели
timesteps = 400
num_features = 30
latent_dim = 64

# Encoder
inputs = Input(shape=(timesteps, num_features))
encoded = LSTM(latent_dim, return_sequences=False)(inputs)
encoded = Dropout(0.2)(encoded)

# Decoder
decoded = RepeatVector(timesteps)(encoded)
decoded = LSTM(num_features, return_sequences=True)(decoded)

# Autoencoder model
autoencoder = Model(inputs, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Callback для ранней остановки
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Обучение модели с использованием ранней остановки
autoencoder.fit(mfcc_features, mfcc_features,
                epochs=100,
                batch_size=32,
                validation_split=0.1,
                callbacks=[early_stopping])



Epoch 1/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - loss: 0.7857 - val_loss: 0.7346
Epoch 2/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 19ms/step - loss: 0.7301 - val_loss: 0.7341
Epoch 3/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - loss: 0.7284 - val_loss: 0.7332
Epoch 4/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - loss: 0.7278 - val_loss: 0.7323
Epoch 5/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - loss: 0.7277 - val_loss: 0.7318
Epoch 6/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 19ms/step - loss: 0.7263 - val_loss: 0.7314
Epoch 7/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 19ms/step - loss: 0.7274 - val_loss: 0.7310
Epoch 8/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - loss: 0.7274 - val_loss: 0.7309
Epoch 9/100
[1m366/366

KeyboardInterrupt: 

In [40]:
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Функция для извлечения MFCC с нормализацией
def extract_mfcc(audio_vector, sr=22050, n_mfcc=30, n_frames=400):
    mfcc = librosa.feature.mfcc(y=audio_vector, sr=sr, n_mfcc=n_mfcc)
    if mfcc.shape[1] > n_frames:
        mfcc = mfcc[:, :n_frames]
    else:
        mfcc = np.pad(mfcc, ((0, 0), (0, n_frames - mfcc.shape[1])), mode='constant')
    mfcc = (mfcc - np.mean(mfcc)) / np.std(mfcc)  # Нормализация
    return mfcc.T

mfcc_features = np.array([extract_mfcc(audio) for audio in audio_vectors])

# Параметры модели
timesteps = 400
num_features = 30
latent_dim = 32  # Уменьшение размерности скрытого состояния

# Encoder
inputs = Input(shape=(timesteps, num_features))
encoded = LSTM(latent_dim, return_sequences=False)(inputs)
encoded = Dropout(0.2)(encoded)
encoded = Dense(latent_dim, activation='relu')(encoded)

# Decoder
decoded = RepeatVector(timesteps)(encoded)
decoded = LSTM(latent_dim, return_sequences=True)(decoded)
decoded = Dropout(0.2)(decoded)
decoded = TimeDistributed(Dense(num_features))(decoded)

# Autoencoder model
autoencoder = Model(inputs, decoded)
autoencoder.compile(optimizer='rmsprop', loss='mse')

# Callback для ранней остановки
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Обучение модели с использованием ранней остановки
autoencoder.fit(mfcc_features, mfcc_features,
                epochs=100,
                batch_size=32,
                validation_split=0.1,
                callbacks=[early_stopping])

Epoch 1/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 46ms/step - loss: 0.5863 - val_loss: 0.4693
Epoch 2/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 40ms/step - loss: 0.4570 - val_loss: 0.4676
Epoch 3/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 41ms/step - loss: 0.4544 - val_loss: 0.4186
Epoch 4/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 41ms/step - loss: 0.3985 - val_loss: 0.4663
Epoch 5/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 41ms/step - loss: 0.3631 - val_loss: 0.4427
Epoch 6/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 41ms/step - loss: 0.3485 - val_loss: 0.3162
Epoch 7/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 41ms/step - loss: 0.3364 - val_loss: 0.3064
Epoch 8/100
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 40ms/step - loss: 0.3258 - val_loss: 0.3169
Epoch 9/100
[1m

<keras.src.callbacks.history.History at 0x7ad6d08aa8f0>

In [41]:
autoencoder.save('best.keras')

In [44]:
# Encoder model to get latent space representations
encoder = Model(inputs, encoded)

# Получение представлений латентного пространства для обучающих данных
latent_representations = encoder.predict(mfcc_features)

# Функция для определения сходства нового MFCC
def get_similarity(new_audio_vector):
    new_mfcc = extract_mfcc(new_audio_vector)
    new_mfcc = np.expand_dims(new_mfcc, axis=0)
    new_latent_representation = encoder.predict(new_mfcc)
    similarities = np.linalg.norm(latent_representations - new_latent_representation, axis=1)
    return similarities

similarities = get_similarity(audio_vectors[100])
print("Сходства с обучающими данными:", np.mean(similarities))

[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
Сходства с обучающими данными: 2.399384


In [45]:
similarities = get_similarity(audio_vectors[10])
print("Сходства с обучающими данными:", np.mean(similarities))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Сходства с обучающими данными: 3.5928688


In [46]:
similarities = get_similarity(audio_vectors[110])
print("Сходства с обучающими данными:", np.mean(similarities))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Сходства с обучающими данными: 3.3693738


In [66]:
similarities = get_similarity(process_audio('./dataset/17a.mp3'))
print("Сходства с обучающими данными:", np.median(similarities))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Сходства с обучающими данными: 4.8612237


In [65]:
similarities = get_similarity(process_audio('./dataset/17b.wav'))
print("Сходства с обучающими данными:", np.median(similarities))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Сходства с обучающими данными: 5.045302


In [50]:
similarities = get_similarity(process_audio('./dataset/test/My Recording - 1-A.mp3'))
print("Сходства с обучающими данными:", np.mean(similarities))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Сходства с обучающими данными: 3.8693738


In [57]:
import os

directory = './dataset/test/'
file_list = [os.path.join(directory, f) for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]

In [59]:
results = []
for file in file_list:
    similarities = get_similarity(process_audio('./dataset/test/My Recording - 1-A.mp3'))
    results.append(np.mean(similarities))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30

In [60]:
np.mean(results)

3.8693736

In [61]:
similarities = get_similarity(process_audio('./ru.wav'))
print("Сходства с обучающими данными:", np.mean(similarities))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Сходства с обучающими данными: 2.8648171
