# Pre-Process Speech Dataset for *Efficient Speech* Model Training
## Important: This notebook requires a GPU with CUDA available!

This notebook demonstrates how to preprocess a speech dataset into the format that EfficientSpeech expects for training.

### **Dataset Specifications**
#### Input Dataset format:
This notebook assumes your dataset is a **folder** of mono 22050 Hz `.wav` files, where each audio file has a transcription text file with the same base name.

* `MyDataset`:  folder
  - `speaker_001.wav`: an audio file
  - `speaker_001.txt`: text transcription of speaker_001.wav
  - ...
  - `speaker_999.wav`
  - `speaker_999.txt`

#### Output Dataset format:
The output for training is in this format:
* `content/output_dataset`:  folder
  - `configs/MyDataset`: folder
    - `preprocess.yaml`: Only this file is necessary to train EfficientSpeech models
    - `model.yaml`
    - `train.yaml`
  - `preprocessed_data/MyDataset`: folder
    - `duration`: folder
    - `energy`: folder
    - `mel`: folder
    - `pitch`: folder
    - `TextGrid/universal`: folder of .TextGrid files
    - `speakers.json`
    - `stats.json`
    - `train.txt`
    - `val.txt`     
  - `raw_data/universal`: folder
    - `speaker_001.wav`
    - `speaker_001.txt`
    - ...
    - `speaker_999.wav`
    - `speaker_999.txt`

### Links
EfficientSpeech repository: https://github.com/roatienza/efficientspeech  
FastSpeech2 repository: https://github.com/ming024/FastSpeech2  
Montreal Forced Aligner Tutorial: https://eleanorchodroff.com/mfa_tutorial.html

### Configuration Settings
##### Dataset
* dataset_name: The name of your dataset
* dataset_path: A directory with the raw audio files + text transcriptions. The text and audio file names should match.
* speaker_name: One of 'universal', 'LJSpeech'
* val_size: The size of your validation set. (default: 512)  
  - 0 < *val_size* < total audio files. 
  - Example: For FastSpeech2, LJSpeech config has 13,100 audio files with a *val_size* of 512.

##### Output
* output_path: Where to save the working files
* output_zip_path: Where to save the finished dataset as a .zip file

##### MFA (Montreal Forced Aligner) Settings
* text_file_extension: the file format extension of the text transcription files.
* corpus_name: the name of the corpus metadata file (default: 'metadata.csv')
* lexicon_path: the lexicon/dictionary to use when running MFA.
* dictionary_file: the lexicon/dictionary to use when preprocessing dataset  
* allow_overwrite_existing_corpus: Enable to allow overwriting existing `corpus_name` file.
* acoustic_model: MFA acoustic model (default: 'english_us_arpa')
* dictionary_model: MFA dictionary model, passed to `mfa align` together with `acoustic_model`

In [None]:
import os
import yaml
from yaml import CLoader as Loader, CDumper as Dumper
import librosa
import numpy as np
from scipy.io import wavfile
from tqdm import tqdm
import pandas as pd
from preprocessor import Preprocessor
from tqdm.auto import tqdm
import shutil
from utils import get_yaml_path, get_yaml_contents, write_yaml
import subprocess
import sys

tqdm.pandas()

In [None]:

def create_data_df(transcript_path, data_path, speaker_id=None):
    """Load a pipe-separated transcript file into a DataFrame.

    Every entry of the ``wav_filename`` column is joined onto ``data_path``
    so that it points at the audio file on disk.

    Args:
        transcript_path: Path to the ``|``-separated transcript file. It must
            contain a ``wav_filename`` column, and a ``client_id`` column
            when ``speaker_id`` is given.
        data_path: Directory prepended to every ``wav_filename`` entry.
        speaker_id: When not ``None``, keep only the rows whose ``client_id``
            equals this value.

    Returns:
        The full DataFrame when ``speaker_id`` is ``None``; otherwise the rows
        of the requested speaker, re-indexed from 0.
    """
    data_df = pd.read_csv(transcript_path, sep="|")
    data_df["wav_filename"] = data_df["wav_filename"].apply(
        lambda name: os.path.join(data_path, name)
    )
    # Compare against None explicitly: a falsy but valid id such as 0 must
    # still filter the rows instead of silently returning every speaker.
    if speaker_id is None:
        return data_df
    return data_df[data_df["client_id"] == speaker_id].reset_index(drop=True)


def create_dataset(transcript_path, data_path, speaker_id, out_path, max_samples=100):
    """Copy one speaker's audio and transcripts into a fresh corpus folder.

    For every selected row the wav file is copied into ``out_path`` and its
    transcript is written next to it as a ``.lab`` file with the same base
    name. The selected rows are also saved as ``metadata.csv`` in ``out_path``.

    Args:
        transcript_path: Path to the ``|``-separated transcript file.
        data_path: Directory the ``wav_filename`` entries are relative to.
        speaker_id: ``client_id`` of the speaker to keep (``None`` keeps all).
        out_path: Destination folder. WARNING: if it already exists it is
            deleted and recreated.
        max_samples: Maximum number of rows to export, taken from the top of
            the table. ``None`` exports every row. Defaults to 100, the cap
            that used to be hard-coded.
    """
    data_df = create_data_df(
        transcript_path=transcript_path, data_path=data_path, speaker_id=speaker_id
    )
    if max_samples is not None:
        data_df = data_df[:max_samples]

    # Start from an empty destination so stale files never leak into the corpus.
    if os.path.isdir(out_path):
        shutil.rmtree(out_path)
    os.makedirs(out_path)

    for _, row in tqdm(data_df.iterrows(), total=len(data_df)):
        audio_source = str(row["wav_filename"])
        audio_dst = os.path.join(out_path, os.path.basename(audio_source))
        shutil.copy(audio_source, audio_dst)
        # Swap only the file extension. A plain str.replace("wav", "lab")
        # would also rewrite any "wav" inside directory or file names.
        transcript_dst = os.path.splitext(audio_dst)[0] + ".lab"
        # Transcripts may contain non-ASCII text; do not rely on the
        # platform default encoding.
        with open(transcript_dst, "w", encoding="utf-8") as f:
            f.write(row["transcript"])
    data_df.to_csv(os.path.join(out_path, "metadata.csv"), index=False)


def prepare_align(config, data_path=None):
    """Resample and peak-normalise every ``.wav`` file in a folder, in place.

    Each file is loaded at the configured sampling rate, scaled so that its
    largest absolute sample equals ``max_wav_value``, converted to 16-bit
    integers and written back over the original file.

    Args:
        config: Preprocessing config. Reads
            ``config["preprocessing"]["audio"]["sampling_rate"]`` and
            ``config["preprocessing"]["audio"]["max_wav_value"]``, plus
            ``config["path"]["corpus_path"]`` when ``data_path`` is ``None``.
        data_path: Folder containing the wav files. Defaults to the corpus
            path from ``config``.
    """
    sampling_rate = config["preprocessing"]["audio"]["sampling_rate"]
    max_wav_value = config["preprocessing"]["audio"]["max_wav_value"]
    if data_path is None:
        data_path = config["path"]["corpus_path"]
    for filename in tqdm(os.listdir(data_path)):
        if os.path.splitext(filename)[1] != ".wav":
            continue
        wav_path = os.path.join(data_path, filename)
        wav, _ = librosa.load(wav_path, sr=sampling_rate)
        # Vectorised peak search; an empty clip has no peak at all.
        peak = np.max(np.abs(wav)) if wav.size else 0.0
        # A silent (all-zero) or empty clip cannot be scaled: dividing by a
        # zero peak would fill the signal with NaNs before the int16 cast.
        if peak > 0:
            wav = wav / peak * max_wav_value
        wavfile.write(wav_path, sampling_rate, wav.astype(np.int16))

In [None]:
# --- Source corpus -----------------------------------------------------------
# Dataset label; selects the config directory and the preprocessed-data subfolder.
DATASET_NAME = "CML_Polish"
# Root folder of the source corpus; wav paths from the transcript are joined onto it.
DATASET_ROOT = "../cml-tts/"
TRAIN_PATH = DATASET_ROOT
# Pipe-separated transcript table read by create_data_df.
TRAIN_TRANSCRIPT = os.path.join(DATASET_ROOT, "train.csv")
SPEAKER_ID = 6892  # client_id to keep; 10K+ samples for this ID, best to train TTS
# Folder holding this dataset's YAML configs.
CONFIG_DIR = os.path.join("./config", DATASET_NAME)
# Root of everything this notebook writes.
OUTPUT_PATH = "../output_dataset"

# --- MFA models ---------------------------------------------------------------
# Both names are passed as positional arguments to `mfa align` further down.
ACOUSTIC_MODEL = "polish_mfa"
DICTIONARY_MODEL = "polish_mfa"
# Subfolder name used under raw_data/ and TextGrid/.
SPEAKER_NAME = "universal"

# --- Derived output locations ---------------------------------------------------
PREPROCESSED_DATA_PATH = os.path.join(OUTPUT_PATH, "preprocessed_data", DATASET_NAME)
RAW_PATH = os.path.join(OUTPUT_PATH, "raw_data")
RAW_DATA_SPEAKER_PATH = os.path.join(RAW_PATH, SPEAKER_NAME)
# The MFA corpus is the per-speaker raw-data folder itself (same path, second name).
CORPUS_PATH = RAW_DATA_SPEAKER_PATH
# Where `mfa align` writes its output.
TEXTGRID_DIR = os.path.join(PREPROCESSED_DATA_PATH, "TextGrid", SPEAKER_NAME)
# NOTE(review): presumably the parsed contents of the "preprocess" YAML found in
# CONFIG_DIR -- confirm against utils.get_yaml_contents. prepare_align reads
# CONFIG["preprocessing"]["audio"] from it.
CONFIG = get_yaml_contents(CONFIG_DIR, "preprocess")

## Create directory structure

In [None]:
# Make sure every working directory exists before anything is written to it;
# directories that are already present are left untouched.
for required_dir in (
    OUTPUT_PATH,
    CORPUS_PATH,
    PREPROCESSED_DATA_PATH,
    RAW_DATA_SPEAKER_PATH,
    TEXTGRID_DIR,
):
    os.makedirs(required_dir, exist_ok=True)

## Create dataset and apply wav preprocessing

In [None]:
# Export the selected speaker's clips and transcripts into the raw-data
# folder, then resample/normalise the copied wav files in place.
create_dataset(
    transcript_path=TRAIN_TRANSCRIPT,
    data_path=TRAIN_PATH,
    speaker_id=SPEAKER_ID,
    out_path=RAW_DATA_SPEAKER_PATH,
)
prepare_align(config=CONFIG, data_path=RAW_DATA_SPEAKER_PATH)

## Run MFA forced alignment

In [None]:
# Human-readable form of the command, printed for reference only.
align_cmd_opts = f"{CORPUS_PATH} {DICTIONARY_MODEL} {ACOUSTIC_MODEL} {TEXTGRID_DIR}"
command = f"mfa align --clean --single_speaker {align_cmd_opts}"
print(command)

# Build argv as a list instead of splitting the display string, so that
# paths containing whitespace are passed to MFA as single arguments.
mfa_argv = [
    "mfa",
    "align",
    "--clean",
    "--single_speaker",
    CORPUS_PATH,
    DICTIONARY_MODEL,
    ACOUSTIC_MODEL,
    TEXTGRID_DIR,
]

# Stream MFA's stdout to the notebook and mirror the raw bytes into test.log.
with open("test.log", "wb") as f:
    with subprocess.Popen(mfa_argv, stdout=subprocess.PIPE) as process:
        for line in process.stdout:
            # Decode for display: notebook stdout replacements do not always
            # expose a binary `.buffer`, so write text instead of bytes.
            sys.stdout.write(line.decode("utf-8", errors="replace"))
            # `f` is already a binary file object (it has no `.buffer`).
            f.write(line)

# Leaving the Popen context waits for the process, so returncode is set here.
# Fail loudly rather than continuing with missing or partial alignments.
if process.returncode != 0:
    raise subprocess.CalledProcessError(process.returncode, mfa_argv)

## Build dataset

In [None]:

# Instantiate the project's Preprocessor with the loaded config, the raw-data
# root and the preprocessed-data root, then run it.
# NOTE(review): build_from_path presumably consumes the wav/.lab files under
# RAW_PATH and the TextGrid alignments under PREPROCESSED_DATA_PATH to produce
# the training features -- confirm against preprocessor.py.
preprocessor = Preprocessor(
    CONFIG, raw_path=RAW_PATH, preprocessed_path=PREPROCESSED_DATA_PATH
)
preprocessor.build_from_path()