In [6]:
import librosa
import numpy as np

def process_audio(file_path, target_sr=22050):
    # Загрузить аудио файл, конвертируя его в моно
    y, sr = librosa.load(file_path, sr=target_sr, mono=True)
    
    # Нормализация громкости: приводим амплитуду к диапазону [-1, 1]
    y = y / np.max(np.abs(y))
    
    return y

In [7]:
import soundfile as sf
from IPython.display import Audio, display
import uuid 

def save_and_display_audio(audio_vector, sample_rate=22050):
    output_path = f'/tmp/{uuid.uuid4().hex}.wav'
    
    sf.write(output_path, audio_vector, sample_rate)
    
    # Воспроизвести аудио файл в Jupyter Notebook
    display(Audio(output_path, autoplay=True))

In [8]:
import os
import pandas as pd
import librosa

data_path = './dataset/jvs_ver1/'
audio_vectors = []

def load_data():
    data = []
    for speaker in os.listdir(data_path):
        speaker_dir = os.path.join(data_path, speaker)
        if not os.path.isdir(speaker_dir):
            continue
        
        for subfolder in ['nonpara30', 'parallel100']:
            wav_path = os.path.join(speaker_dir, subfolder, 'wav24kHz16bit')
            transcripts_path = os.path.join(speaker_dir, subfolder, 'transcripts_utf8.txt')
            
            if not os.path.isdir(wav_path) or not os.path.exists(transcripts_path):
                continue
            
            with open(transcripts_path, 'r', encoding='utf-8') as f:
                transcripts = f.readlines()
            
            for line in transcripts:
                file_name, text = line.strip().split(':')
                audio_path = os.path.join(wav_path, f'{file_name}.wav')
            
                if os.path.exists(audio_path):
                    audio_vec = process_audio(audio_path)
                    audio_vectors.append(audio_vec)

In [6]:
load_data()

In [7]:
import whisper
from collections import defaultdict
import numpy as np
from tqdm import tqdm

def recognize_words(audio_vectors, sample_rate=16000):
    # Загрузим модель whisper
    model = whisper.load_model("base")
    
    # Словарь для хранения результатов
    word_to_audio_map = defaultdict(list)
    
    # Проходим по всем аудио векторам с отображением прогресса
    for audio_vector in tqdm(audio_vectors, desc="Processing audio files"):
        # Распознаем текст из аудио вектора
        result = model.transcribe(audio_vector, language='ja', word_timestamps=True)
        
        # Получаем временные метки для слов
        segments = result["segments"]
        
        # Проходим по каждому сегменту и нарезаем аудио на слова
        for segment in segments:
            for word_info in segment["words"]:
                word = word_info["word"].lower()
                start_time = word_info["start"]
                end_time = word_info["end"]
                
                # Вычисляем индексы для нарезки массива
                start_index = int(start_time * sample_rate)
                end_index = int(end_time * sample_rate)
                
                # Выделяем фрагмент аудио
                word_audio_fragment = audio_vector[start_index:end_index]
                
                # Добавляем фрагмент в мапу
                word_to_audio_map[word].append(word_audio_fragment)
    
    return dict(word_to_audio_map)

In [8]:
word_to_pronounces = recognize_words(audio_vectors)

Processing audio files: 100%|██████████████████████████████████████████████████████| 12997/12997 [1:02:09<00:00,  3.49it/s]


In [11]:
import re

def clean_word(word):
    # Удаляем пунктуацию из слова
    return re.sub(r'[^\w\s]', '', word)

def get_top_keys_by_value_count(word_to_audio_map, top_n=10):
    # Фильтруем слова, длина которых больше одного символа после удаления пунктуации
    filtered_map = {k: v for k, v in word_to_audio_map.items() if len(clean_word(k)) > 2}
    
    # Сортируем ключи словаря по длине списков значений в порядке убывания
    sorted_items = sorted(filtered_map.items(), key=lambda item: len(item[1]), reverse=True)
    
    # Возвращаем топ N ключей вместе с количеством значений
    return [(key, len(values)) for key, values in sorted_items[:top_n]]


In [33]:
import re
from collections import defaultdict

def clean_key(key):
    # Удаляем знаки препинания с помощью регулярного выражения
    return re.sub(r'[、。！？]', '', key)

def merge_values(mapping):
    new_mapping = defaultdict(list)
    
    for key, values in mapping.items():
        cleaned_key = clean_key(key)
        new_mapping[cleaned_key].extend(values)
    
    return dict(new_mapping)

In [34]:
merged_word_to_pronounces = merge_values(word_to_pronounces)

In [36]:
top_count = get_top_keys_by_value_count(merged_word_to_pronounces)
top_count

[('という', 255),
 ('ように', 203),
 ('かった', 159),
 ('ところ', 141),
 ('100', 119),
 ('ください', 113),
 ('そして', 111),
 ('ません', 106),
 ('います', 93),
 ('あります', 84)]

In [3]:
import pickle

In [41]:
%store word_to_pronounces

Stored 'word_to_pronounces' (dict)


  db[ 'autorestore/' + arg ] = obj


In [14]:
with open('word_to_pronounces.dump', 'wb') as outp:  # Overwrites any existing file.
        pickle.dump(word_to_pronounces, outp, pickle.HIGHEST_PROTOCOL)

In [4]:
with open('word_to_pronounces.dump', 'rb') as f:
    word_to_pronounces = pickle.load(f)

In [9]:
save_and_display_audio(word_to_pronounces['ください。'][31])

In [37]:
top_count

[('という', 255),
 ('ように', 203),
 ('かった', 159),
 ('ところ', 141),
 ('100', 119),
 ('ください', 113),
 ('そして', 111),
 ('ません', 106),
 ('います', 93),
 ('あります', 84)]

In [38]:
keys = [pair[0] for pair in top_count]
keys

['という', 'ように', 'かった', 'ところ', '100', 'ください', 'そして', 'ません', 'います', 'あります']

In [39]:
import psycopg2

def insert_words_and_get_ids(words):
    conn = psycopg2.connect(
        dbname="ml_app",
        user="ml_app",
        password="ml_app",
        host="10.0.0.7",
        port="5432"
    )
    try:
        with conn.cursor() as cur:
            # Вставляем строки в таблицу и получаем их ID
            insert_query = "INSERT INTO words (word) VALUES (%s) RETURNING id;"
            ids = []
            for word in words:
                cur.execute(insert_query, (word,))
                word_id = cur.fetchone()[0]
                ids.append(word_id)
            conn.commit()
            return ids
    finally:
        conn.close()

In [40]:
ids = insert_words_and_get_ids(keys)
print(ids)

[13, 14, 15, 16, 17, 18, 19, 20, 21, 22]


In [41]:
word_to_id = list(zip(keys, ids))
word_to_id

[('という', 13),
 ('ように', 14),
 ('かった', 15),
 ('ところ', 16),
 ('100', 17),
 ('ください', 18),
 ('そして', 19),
 ('ません', 20),
 ('います', 21),
 ('あります', 22)]

In [42]:
keys

['という', 'ように', 'かった', 'ところ', '100', 'ください', 'そして', 'ません', 'います', 'あります']

In [50]:
values = [word_to_pronounces.get(key, '') for key in keys]

In [55]:
save_and_display_audio(values[4][3])

In [62]:
X_train = []
y_train = []
for id in ids:
    for pronounciations in values:
        for pronounciation in pronounciations:
            X_train.append(pronounciation)
            y_train.append(id)

In [63]:
X_train

[array([ 0.00055234,  0.0003801 ,  0.00050302, ...,  0.00398974,
        -0.00094819, -0.00198005], dtype=float32),
 array([-0.42930055, -0.44190362, -0.38354644, ...,  0.09844543,
         0.16704592,  0.21627201], dtype=float32),
 array([0.40668812, 0.40481833, 0.42078426, ..., 0.02289606, 0.04315449,
        0.0459672 ], dtype=float32),
 array([-0.28139156, -0.24121189, -0.19554256, ...,  0.00406151,
         0.02203902,  0.04783314], dtype=float32),
 array([-0.11341335, -0.10820135, -0.11050954, ..., -0.02292737,
        -0.02134912, -0.01845973], dtype=float32),
 array([-0.13623218, -0.11698574, -0.0604715 , ...,  0.01047974,
         0.01037463,  0.0108691 ], dtype=float32),
 array([-0.02549177, -0.02331359, -0.02687275, ...,  0.23921484,
         0.2533119 ,  0.24624383], dtype=float32),
 array([-0.00031369,  0.00058683, -0.00110646, ...,  0.11488179,
        -0.01030469, -0.04689192], dtype=float32),
 array([-0.01866904, -0.01601225, -0.01380831, ...,  0.08090752,
         0.10

In [199]:
y_train

array([0, 0, 0, ..., 9, 9, 9])

In [108]:
save_and_display_audio(X_train[5])

In [130]:
import librosa
import numpy as np

def extract_mfcc(audio, n_mfcc=30):
    mfccs = librosa.feature.mfcc(y=audio, sr=22050, n_mfcc=n_mfcc, hop_length=100)
    return mfccs.T  # Транспонируем, чтобы размерности были (временные_шаги, n_mfcc)

In [143]:
X_mfcc = [extract_mfcc(audio) for audio in X_train]



In [144]:
np.mean(list(map(len, X_mfcc)))

67.25106382978723

In [146]:
indexes_to_remove = [i for i, lst in enumerate(X_mfcc) if len(lst) < 25]
indexes_to_remove

[438,
 549,
 560,
 1143,
 1254,
 1265,
 1848,
 1959,
 1970,
 2553,
 2664,
 2675,
 3258,
 3369,
 3380,
 3963,
 4074,
 4085,
 4668,
 4779,
 4790,
 5373,
 5484,
 5495,
 6078,
 6189,
 6200,
 6783,
 6894,
 6905]

In [147]:
filtered_X_mfcc = [lst for i, lst in enumerate(X_mfcc) if i not in indexes_to_remove]
filtered_y_train = [item for i, item in enumerate(y_train) if i not in indexes_to_remove]

In [148]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_mfcc_padded = pad_sequences(filtered_X_mfcc, maxlen=70, padding='post', dtype='float32')
X_mfcc_padded

array([[[-3.11396362e+02,  4.13363075e+01, -1.89788008e+00, ...,
          9.10219383e+00, -8.48279667e+00, -3.00759411e+00],
        [-2.68394226e+02,  4.08668289e+01, -1.05667651e+00, ...,
          1.02267151e+01, -8.76066875e+00, -3.39358115e+00],
        [-2.34883499e+02,  4.20312576e+01,  1.51365805e+00, ...,
          1.12950554e+01, -7.83208084e+00, -2.94048309e+00],
        ...,
        [-2.22813324e+02,  1.32052917e+02,  8.13613663e+01, ...,
         -2.57662868e+00, -9.06222439e+00,  7.79875183e+00],
        [-2.34511368e+02,  1.32179901e+02,  8.46189499e+01, ...,
         -2.49985719e+00, -1.03147945e+01,  8.48649406e+00],
        [-2.48567276e+02,  1.31726318e+02,  8.74352112e+01, ...,
         -2.46732974e+00, -1.14276056e+01,  9.18448830e+00]],

       [[-3.60391357e+02,  1.16798714e+02,  4.62773094e+01, ...,
          2.66633835e+01,  8.03816319e+00, -6.80010509e+00],
        [-3.64363251e+02,  1.15293808e+02,  4.77469864e+01, ...,
          2.78656921e+01,  1.06652832e

In [155]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_mfcc_padded_reshaped = X_mfcc_padded.reshape(-1, X_mfcc_padded.shape[-1])
X_mfcc_padded_scaled = scaler.fit_transform(X_mfcc_padded_reshaped).reshape(X_mfcc_padded.shape)

In [183]:
X = X_mfcc_padded_scaled

In [202]:
y = list(map(lambda x: x-13, filtered_y_train))

In [203]:
y

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [205]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

embedding_dim = 128
max_sequence_length = 100 
num_classes = 10 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

y_train = tf.keras.utils.to_categorical(y_train, num_classes=10)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=10)

model = Sequential()
model.add(Embedding(input_dim=len(y_train), output_dim=embedding_dim, input_length=max_sequence_length))
model.add(LSTM(units=64, return_sequences=False))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])



In [206]:
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Оценка модели
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

Epoch 1/5


ValueError: Input 0 of layer "lstm" is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: (None, 70, 30, 128)