# Notebook #1: Generation of synthetic audio

This notebook is designed to generate synthetic audio recordings based on voices from a reference dataset.

In [None]:
import os
import json
from pathlib import Path
from datetime import datetime

import torch
from TTS.api import TTS

In [None]:
# Pick the compute device once: the GPU when PyTorch can see one,
# otherwise the CPU (with a warning).
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
    print('Warning: CUDA is not available!')

## 1. Configs

In [None]:
# Root directory of the reference dataset (checked below to exist).
ref_data_path = Path('./data/google_speech_command_v002')
# Root directory for the generated output (created below if missing).
output_data_path = Path('./data/Synt-RuSC/raw/')

# Fail early with an explicit exception if the reference data is missing.
# A plain `assert` would be stripped when Python runs with -O.
if not ref_data_path.is_dir():
    raise NotADirectoryError(
        f'Reference dataset directory not found: {ref_data_path} '
        '(check "ref_data_path")'
    )

# Create the output directory, including any missing parents.
output_data_path.mkdir(parents=True, exist_ok=True)

# Word tables for Target commands generation.
# Each row is (English command, Russian word); the two lists derived below
# are index-aligned.
_target_command_pairs = [
    ('yes', 'да'),
    ('no', 'нет'),
    ('up', 'вверх'),
    ('down', 'вниз'),
    ('left', 'налево'),
    ('right', 'направо'),
    ('on', 'включи'),
    ('off', 'выключи'),
    ('stop', 'стоп'),
    ('go', 'иди'),
    ('zero', 'ноль'),
    ('one', 'один'),
    ('two', 'два'),
    ('three', 'три'),
    ('four', 'четыре'),
    ('five', 'пять'),
    ('six', 'шесть'),
    ('seven', 'семь'),
    ('eight', 'восемь'),
    ('nine', 'девять'),
    ('backward', 'назад'),
    ('forward', 'вперед'),
    ('follow', 'следуй'),
    ('learn', 'изучай'),
    ('visual', 'наблюдай'),
]
target_commands = [en for en, _ in _target_command_pairs]
target_commands_ru = [ru for _, ru in _target_command_pairs]

# Word tables for Consonant words generation.
# Each row is (Russian word, reference command, English name); the three
# lists derived below are index-aligned.
_non_command_triples = [
    ('создай', 'learn', 'create'),
    ('зарыдай', 'visual', 'cry'),
    ('сверх', 'up', 'over'),
    ('разлад', 'backward', 'discord'),
    ('вред', 'forward', 'harm'),
    ('гибнет', 'no', 'dies'),
    ('гвозди', 'go', 'nails'),
    ('ржавее', 'right', 'rustier'),
    ('исключи', 'off', 'exclude'),
    ('девиз', 'down', 'motto'),
    ('беда', 'yes', 'grief'),
    ('новее', 'left', 'newer'),
    ('стучи', 'on', 'knock'),
    ('сдуй', 'follow', 'blow_off'),
]
non_commands_ru = [ru for ru, _, _ in _non_command_triples]
non_commands_ref = [ref for _, ref, _ in _non_command_triples]
non_commands_en = [en for _, _, en in _non_command_triples]

In [None]:
# Using nemo manifests to get file lists.

def _read_manifest_paths(manifest_path, data_root,
                         manifest_root='./google_speech_recognition_v2'):
    """Return the audio file paths listed in a JSON-lines manifest.

    Every non-blank line of ``manifest_path`` is parsed as a JSON object and
    its ``'audio_filepath'`` value is taken. Occurrences of ``manifest_root``
    in that value are replaced with ``str(data_root)`` so the paths point
    into the local copy of the dataset.

    Args:
        manifest_path: Path of the manifest file to read.
        data_root: Local dataset root substituted for ``manifest_root``.
        manifest_root: Path prefix used inside the manifest entries.

    Returns:
        List of rewritten audio file paths, in manifest order.
    """
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(manifest_path, 'r', encoding='utf-8') as f:
        return [
            json.loads(line)['audio_filepath'].replace(manifest_root,
                                                       str(data_root))
            for line in f
            if line.strip()
        ]


testing_list = _read_manifest_paths(
    ref_data_path / 'test_manifest.json', ref_data_path)
validation_list = _read_manifest_paths(
    ref_data_path / 'validation_manifest.json', ref_data_path)
training_list = _read_manifest_paths(
    ref_data_path / 'train_manifest.json', ref_data_path)

## 2. Generation of synthetic audio

Target commands generation:
- 15 words with the basic commands: 'Yes', 'No', 'Up', 'Down', 'Left',
'Right', 'On', 'Off', 'Stop', 'Go', 'Backward', 'Forward', 'Follow',
'Learn' and 'Visual';
- 10 words for digits: from 'Zero' to 'Nine'

Consonant words generation:
- 14 Non-command, consonant words: создай ('create'), зарыдай ('cry'), сверх ('over'), разлад ('discord'), вред ('harm'), гибнет ('dies'), гвозди ('nails'), ржавее ('rustier'), исключи ('exclude'), девиз ('motto'), беда ('grief'), новее ('newer'), стучи ('knock'), сдуй ('blow off').

In [None]:
# Init TTS model

# The synthetic audio is generated with the xtts-v2 model, see
# https://github.com/coqui-ai/TTS  or  https://huggingface.co/coqui/XTTS-v2
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2",
          gpu=(device == 'cuda'))
# Move the model to the device selected earlier in the notebook.
tts = tts.to(device)

In [None]:
# Timestamp of this run, used in the name of the skipped-files log.
# Formatted as 'YYYY-MM-DD HH-MM-SS' (no ':') so it is usable in a file name.
process_start = datetime.now().strftime('%Y-%m-%d %H-%M-%S')

# Labels of the reference files whose synthesis failed during this run.
skipped_files = []


def _synthesize(text, speaker_wav, out_file, skip_label):
    """Synthesize ``text`` in Russian using ``speaker_wav`` as voice reference.

    The audio is written to ``out_file``. If synthesis raises, the error is
    printed, ``skip_label`` is appended to ``skipped_files`` and the whole
    list is rewritten to ``skipped_files_<process_start>.json``, so the log
    is up to date even if the run is interrupted later.

    Args:
        text: Text to synthesize.
        speaker_wav: Path of the reference recording passed to the TTS model.
        out_file: Destination path of the generated audio file.
        skip_label: Entry recorded in ``skipped_files`` on failure.
    """
    try:
        tts.tts_to_file(
            text=text,
            speaker_wav=speaker_wav,
            language='ru',
            file_path=out_file,
        )
    except Exception as err:
        # `Exception` rather than a bare `except`, so a keyboard/kernel
        # interrupt still stops the run instead of being logged as a skip.
        print(f'Skipped {skip_label}: {err!r}')
        skipped_files.append(skip_label)
        with open(f'skipped_files_{process_start}.json', 'w') as f:
            json.dump(skipped_files, f)


for subset, subset_files_list in zip(['train', 'test', 'validation'],
                                     [training_list, testing_list, validation_list]):
    print('subset', subset)
    subset_dir = output_data_path / subset
    subset_dir.mkdir(parents=True, exist_ok=True)

    # Target commands generation
    print('Target commands generation')
    for command_name, command_ru_name in zip(target_commands, target_commands_ru):
        print('command_name', command_name, command_ru_name)

        command_dir = subset_dir / command_name
        command_dir.mkdir(exist_ok=True)

        for fPath in subset_files_list:
            # The last two path components are <command folder>/<file name>.
            ref_file = Path(fPath)
            fpath_cmd, fName = ref_file.parent.name, ref_file.name
            out_file = command_dir / fName

            # Use only reference recordings of this command, and skip files
            # that already exist so an interrupted run can be resumed.
            if fpath_cmd != command_name or out_file.exists():
                continue

            _synthesize(command_ru_name, fPath, out_file, fPath)
    print()

    # Consonant words generation
    print('Consonant words generation')
    for word_name, ref_word_name, en_word_name in zip(
            non_commands_ru, non_commands_ref, non_commands_en):
        print('word_name', word_name, ref_word_name, en_word_name)

        word_dir = subset_dir / en_word_name
        word_dir.mkdir(exist_ok=True)

        for fPath in subset_files_list:
            ref_file = Path(fPath)
            fpath_cmd, fName = ref_file.parent.name, ref_file.name
            out_file = word_dir / fName

            # Use only recordings of the paired reference command; in the
            # train subset only files whose name contains 'nohash_0' are
            # used. Already generated files are skipped.
            if (fpath_cmd != ref_word_name
                    or (subset == 'train' and 'nohash_0' not in fName)
                    or out_file.exists()):
                continue

            _synthesize(word_name, fPath, out_file,
                        fPath + f'({en_word_name})')
