In [2]:
from pathlib import Path

from datasets import load_dataset, Audio

# Load the dataset, cast its 'audio' column to Audio(decode=True) and keep only
# the 'train' split.
# NOTE(review): the output saved with this cell ends in DatasetGenerationError -
# confirm that 'yodas2_ru000_16k' resolves to a readable dataset.
yodas = (
    load_dataset('yodas2_ru000_16k')
    .cast_column('audio', Audio(decode=True))
)['train']

class SimpleTextSearcher:
    def __init__(self):
        self._dict = {}
    def add(self, id: str, text: str):
        self._dict[id] = text
    def find(self, query: str, max_count: int | None = None) -> list[str]:
        return [id for id, text in self._dict.items() if query in text][:max_count]

# Build the search index from the transcript files. Each file stem is parsed as
# an integer id (it is later used to index into `yodas`).
searcher = SimpleTextSearcher()
# glob() yields files in an unspecified, filesystem-dependent order; sort first
# so the same 100 files are indexed on every run.
for text_file in sorted(Path('yodas_search/yodas_texts').glob('*.txt'))[:100]:
    # Explicit encoding so the result does not depend on the platform locale.
    # Assumes the transcripts are stored as UTF-8 - TODO confirm.
    searcher.add(id=int(text_file.stem), text=text_file.read_text(encoding='utf-8'))

Resolving data files:   0%|          | 0/63 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/63 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetGenerationError: An error occurred while generating the dataset

In [None]:
import gradio as gr


# Layout of the results area: `n_tabs` tabs with `results_per_tab` result rows
# each, so at most n_tabs * results_per_tab hits are shown per query.
n_tabs = 20
results_per_tab = 3
    
with gr.Blocks(fill_height=True, theme=gr.themes.Origin()) as demo:

    # Top row: query box and collected-ids box on the left, transcription on the right.
    with gr.Row(equal_height=True):
        with gr.Column(scale=3):
            query = gr.Textbox(label="Query", autofocus=True)
            collected = gr.Textbox(label="Collected audios", lines=4)
        # NOTE(review): no handler in this block writes to `transcription`.
        transcription = gr.Textbox(label="Transcription", scale=7, lines=9, interactive=False)

    def add_result(query: str, collected: str, id: str) -> gr.Textbox:
        """Append one result id, tagged with the query, to the collected list.

        Args:
            query: Current content of the query box.
            collected: Current content of the "Collected audios" box.
            id: Value of the hidden label that sits next to the clicked button.

        Returns:
            A Textbox update holding the comma-separated list with the new
            entry appended.
        """
        new_line = f'{id} (q="{query}")'
        return gr.Textbox(value=f'{collected}, {new_line}' if len(collected) else new_line)

    # Pre-build a fixed grid of hidden result rows; `search` toggles them later.
    # The flat list holds (button, id label, audio) triples, in the same order
    # in which `search` returns its updates.
    search_result_elements = []
    for tab_idx in range(n_tabs):
        with gr.Tab(f"{tab_idx * results_per_tab + 1}-{(tab_idx + 1) * results_per_tab}"):
            for _ in range(results_per_tab):
                with gr.Row(variant='compact', equal_height=True):
                    with gr.Column(scale=1, min_width=0):
                        add_button = gr.Button(visible=False, min_width=0)
                        result_id = gr.Label(visible=False, min_width=0, container=False)
                    # The original passed a dangling `waveform_options=` with no
                    # value (a syntax error); the argument is dropped here.
                    audio_player = gr.Audio(visible=False, scale=15, editable=False)
                    search_result_elements += [add_button, result_id, audio_player]
                    add_button.click(add_result, inputs=[query, collected, result_id], outputs=collected)

    debug_log = gr.Textbox(label="Debug log", lines=3, interactive=False)

    def search(query: str, debug_log: str) -> list:
        """Run a text search and produce updates for every result row.

        Args:
            query: Substring to search the indexed texts for.
            debug_log: Current content of the debug log, which is extended.

        Returns:
            One (Button, Label, Audio) update triple per result slot - visible
            and filled for hits, hidden for unused slots - followed by the
            extended debug log text.
        """
        max_results = results_per_tab * n_tabs
        found_ids = searcher.find(query, max_count=max_results)[:max_results]
        debug_log += f'Search query: "{query}", found {len(found_ids)} results:\n'

        returned_elements = []
        for found_id in found_ids:
            sample = yodas[found_id]
            youtube_id = sample['video_id']
            audio = sample['audio']
            audio_path = audio['path']
            # Duration of this sample (not of the first dataset row).
            audio_len = len(audio['array']) / audio['sampling_rate']
            # NOTE(review): assumes `audio_path` points to an existing local file.
            filesize = Path(audio_path).stat().st_size
            debug_log += (
                f'id={found_id} | {audio_len:.0f} sec | {filesize / 1024**2:.0f} MB'
                f' | https://www.youtube.com/watch?v={youtube_id}'
                '\n'
            )
            returned_elements += [
                gr.Button(visible=True, value="Add"),
                gr.Label(visible=False, value=found_id),  # to save audio id
                gr.Audio(visible=True, value=audio_path, label=f'id {found_id}, youtube_id {youtube_id}'),
            ]

        # Hide every slot that has no hit so stale results do not stay on screen.
        for _ in range(max_results - len(found_ids)):
            returned_elements += [
                gr.Button(visible=False),
                gr.Label(visible=False),
                gr.Audio(visible=False),
            ]

        return returned_elements + [debug_log]

    query.submit(search, inputs=[query, debug_log], outputs=search_result_elements + [debug_log])

# Let Gradio serve files from the current user's Hugging Face cache directory
# instead of a hardcoded absolute home path.
# NOTE(review): presumably the dataset's audio files live there; adjust if the
# cache has been relocated (e.g. via HF_HOME).
# NOTE(review): share=True requests a public share link - confirm this is intended.
demo.launch(share=True, allowed_paths=[str(Path.home() / '.cache' / 'huggingface')])