# Model training notebook for **EfficientSpeech: An On-Device Text to Speech Model**
## *This notebook requires a GPU for training.*  

  The goal of this notebook is to demonstrate training new checkpoints for EfficientSpeech.

  The dataset format used for training is FastSpeech2. Please use the preprocess_dataset notebook to prepare your dataset.
  

#### Links
Official EfficientSpeech repository: https://github.com/roatienza/efficientspeech  
Paper: https://ieeexplore.ieee.org/abstract/document/10094639





#  
---



# Mount drive

In [None]:
from google.colab import drive

# Google Drive holds the dataset archive that is extracted later and receives
# the checkpoints copied out at the end of the notebook.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive




#  
---



# Configuration options
#### Make sure to configure the settings in the `Configuration Settings` section below before running further cells.

##### Dataset
* dataset_name: name of the dataset (default 'MyDataset')
* dataset_location: absolute path to the prepared dataset folder (default: '/content/output_dataset')
* speaker_name: name of the speaker in raw_data folder (default: 'universal')
* config_dir: absolute path to EfficientSpeech configuration directory
* lexicon_path: absolute path to a .txt file with the lexicon/dictionary the dataset is prepared for (defaults to `librispeech-lexicon.txt`)

##### Output
* output_dir: A path to save all generated .ckpt files + logs to. A folder with your dataset name will be created in this folder.
* infer_device: Device used for inference after training. One of 'cuda', 'cpu' (default: 'cuda')

##### Model training options
* accelerator: One of `cpu`, `gpu`
* devices: Will be mapped to either `gpus`, `tpu_cores`, `num_processes` or `ipus`, based on the accelerator type, per pytorch-lightning documentation.
* batch_size: (default: 128)
* num_workers: (default: 4)
* precision: (default: 16-mixed)
* model_size_to_train: One of 'tiny', 'small', 'base' (default: tiny)
* max_epochs: The number of epochs to stop training at (default: 5000)

In [None]:
# Dataset parameters
dataset_name = 'MyDataset' #@param {type:"string"}
dataset_location = '/content/output_dataset' #@param {type:"string"}
speaker_name = 'universal' #@param {type:'string'}

config_dir = '/content/efficientspeech/config' #@param {type:"string"}
output_dir = '/content/drive/MyDrive/saved_checkpoints' #@param {type:"string"}
lexicon_path = '/content/efficientspeech/lexicon/librispeech-lexicon.txt' #@param {type:"string"}

# Model training options.
# Every value below is turned into a command line flag when cmd_line_opts is
# assembled at the end of this cell.

# Accelerator type (this notebook is written for a GPU runtime)
accelerator = 'gpu' #@param {type:'string'} ['gpu', 'cpu']

# Devices
devices = 1 #@param {type:'integer'}

# Num Workers
num_workers = 4 #@param {type:'integer'}

# Precision
precision = '16-mixed' #@param {type:'string'} ["bf16-mixed", "16-mixed", "16", "32", "64"]

# Batch size (128 is default)
batch_size = 128 #@param [16, 32, 64, 128]

# Architecture flags for each model size.
# The tiny options are spelled out explicitly to address this error:
# RuntimeError: Calculated padded input size per channel: (4). Kernel size: (5). Kernel size can't be greater than actual input size
model_size_to_train = "tiny" #@param ["tiny", "small", "base"]
_MODEL_SIZE_OPTS = {
  'small': ' --head 1 --reduction 1 --expansion 2  --kernel-size 5 --n-blocks 3 --block-depth 3',
  'base': ' --head 2 --reduction 1 --expansion 2  --kernel-size 5 --n-blocks 3 --block-depth 3',
  'tiny': ' --head 1 --reduction 4 --expansion 1 --kernel-size 3 --n-blocks 2 --block-depth 2',
}
# Any value other than "small" or "base" selects the tiny options
model_opts = _MODEL_SIZE_OPTS.get(model_size_to_train, _MODEL_SIZE_OPTS['tiny'])

# Max epochs
max_epochs = 5000 #@param {type:"integer"}

# Inference device
infer_device = 'cuda' #@param {type:"string"} ["cuda", "cpu"]

# Assemble the option string; every fragment carries its own leading space
cmd_line_opts = ''.join([
  f' --accelerator {accelerator}',
  f' --devices {devices}',
  f' --num_workers {num_workers}',
  f' --precision {precision}',
  f' --batch-size {batch_size}',
  model_opts,
  f' --max_epochs {max_epochs}',
  f' --infer-device {infer_device}',
])

# Print the assembled option string so it can be reviewed before training starts
!echo Command line arguments: $cmd_line_opts

Command line arguments: --accelerator gpu --devices 1 --num_workers 4 --precision 16-mixed --batch-size 128 --head 1 --reduction 4 --expansion 1 --kernel-size 3 --n-blocks 2 --block-depth 2 --max_epochs 5000 --infer-device cuda




#  
---



# Unzip Preprocessed dataset
If you used the prepare_dataset notebook to prepare your dataset, this will extract it to the default location (`/content/output_dataset`).
After extraction, the directory structure will look like this:
* `/content/output_dataset`
  - `configs`
  - `raw_data`
  - `preprocessed_data`

In [None]:
import os

%cd /content/
zip_file_location = '/content/drive/MyDrive/output_dataset/MyDataset.zip' #@param
!unzip -u $zip_file_location -d /

# Sanity check - Check if folders are named what we expect
expected_folders = [os.path.join(dataset_location, 'preprocessed_data', dataset_name),
                  os.path.join(dataset_location, 'raw_data', speaker_name)]
for folder_name in expected_folders:
  assert os.path.exists(folder_name), f'No folder named {folder_name} exists, please check directory structure is correct and folder exists'

/content
Archive:  /content/drive/MyDrive/output_dataset/MyDataset.zip
   creating: /content/output_dataset/
   creating: /content/output_dataset/raw_data/
   creating: /content/output_dataset/raw_data/universal/
  inflating: /content/output_dataset/raw_data/universal/p303_008.wav  
  inflating: /content/output_dataset/raw_data/universal/p303_004.lab  
  inflating: /content/output_dataset/raw_data/universal/p303_003.wav  
  inflating: /content/output_dataset/raw_data/universal/p303_005.wav  
  inflating: /content/output_dataset/raw_data/universal/p303_005.lab  
  inflating: /content/output_dataset/raw_data/universal/p303_001.wav  
  inflating: /content/output_dataset/raw_data/universal/p303_003.lab  
  inflating: /content/output_dataset/raw_data/universal/p303_004.wav  
  inflating: /content/output_dataset/raw_data/universal/metadata.csv  
  inflating: /content/output_dataset/raw_data/universal/p303_007.wav  
  inflating: /content/output_dataset/raw_data/universal/p303_007.lab  
  infl

# Setup dependencies


In [None]:
%cd /content

# Delete existing
## !rm -rf /content/efficientspeech

# Clone repository (Note: this is my fork with additional training options)
!git clone https://github.com/roatienza/efficientspeech

# Make training config directory
dataset_config_dir = os.path.join(config_dir, dataset_name)
!mkdir $dataset_config_dir

# Download model files
!mkdir /content/efficientspeech/checkpoints
!wget --continue -nv -O /content/efficientspeech/checkpoints/base_eng_4M.ckpt  https://github.com/roatienza/efficientspeech/releases/download/pytorch2.0.1/base_eng_4M.ckpt 
!wget --continue -nv -O /content/efficientspeech/checkpoints/small_eng_952k.ckpt  https://github.com/roatienza/efficientspeech/releases/download/pytorch2.0.1/small_eng_952k.ckpt
!wget --continue -nv -O /content/efficientspeech/checkpoints/tiny_eng_266k.ckpt  https://github.com/roatienza/efficientspeech/releases/download/pytorch2.0.1/tiny_eng_266k.ckpt 

/content
Cloning into 'efficientspeech'...
remote: Enumerating objects: 1511, done.[K
remote: Counting objects: 100% (162/162), done.[K
remote: Compressing objects: 100% (94/94), done.[K
remote: Total 1511 (delta 87), reused 122 (delta 58), pack-reused 1349[K
Receiving objects: 100% (1511/1511), 5.03 MiB | 21.66 MiB/s, done.
Resolving deltas: 100% (983/983), done.
2023-06-01 02:37:07 URL:https://objects.githubusercontent.com/github-production-release-asset-2e65be/483135884/d61e6948-debe-4924-ad39-32ba0f29a53b?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230601%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230601T023706Z&X-Amz-Expires=300&X-Amz-Signature=cdbc78d3e885d2989ac7e2a11fb8bdecba35df996ca7b2663f90aa7a298d2f0d&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=483135884&response-content-disposition=attachment%3B%20filename%3Dbase_eng_4M.ckpt&response-content-type=application%2Foctet-stream [51366419/51366419] -> "/content/efficientspeech/checkp

In [None]:
# Optional installs, left disabled (TPU client, torch-xla, wandb). Reference:
# https://pytorch-lightning.readthedocs.io/en/1.2.10/advanced/tpu.html#tpu-terminology
#!pip install cloud-tpu-client==0.10 # https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
#!pip install torch-xla --index-url https://pip.repos.neuron.amazonaws.com
#!pip install wandb

# Install requirements: TensorBoard for the dashboard cell further down,
# then the dependency list shipped with the cloned repository
!pip install tensorboard
!pip install -r /content/efficientspeech/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightning>=2.0.2 (from -r /content/efficientspeech/requirements.txt (line 3))
  Downloading lightning-2.0.2-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchmetrics>=0.11.4 (from -r /content/efficientspeech/requirements.txt (line 4))
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unidecode (from -r /content/efficientspeech/requirements.txt (line 5))
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m25.9 MB/s[0m eta [36m0:00:

## YAML helper functions

In [None]:
import os
import yaml


# YAML helper functions
def get_yaml_path(name, config_dir):
  """Return the path of the `<name>.yaml` file located in `config_dir`.

  Args:
    name: Base name of the YAML file, without the `.yaml` extension.
    config_dir: Directory that contains (or will contain) the file.
  """
  yaml_filename = name + '.yaml'
  return os.path.join(config_dir, yaml_filename)


def get_yaml_contents(name, config_dir):
  """Read `<name>.yaml` from `config_dir` and return the parsed YAML document.

  Args:
    name: Base name of the YAML file, without the `.yaml` extension.
    config_dir: Directory that contains the file.
  """
  yaml_path = get_yaml_path(name, config_dir)
  with open(yaml_path, 'r') as yaml_file:
    raw_text = yaml_file.read()
    return yaml.safe_load(raw_text)
            

def write_yaml(name, contents, config_dir):
  """Serialize `contents` as YAML into `<name>.yaml` inside `config_dir`.

  An existing file with the same name is overwritten.

  Args:
    name: Base name of the YAML file, without the `.yaml` extension.
    contents: Object to serialize with `yaml.dump`.
    config_dir: Directory the file is written to.
  """
  # config_dir must be forwarded: get_yaml_path takes two required arguments,
  # so calling it with only the name raises TypeError.
  with open(get_yaml_path(name, config_dir), 'w') as f:
    f.write(yaml.dump(contents))

Read existing preprocess.yaml and make a new preprocess.yaml configuration file with the paths we expect in this notebook

In [None]:
import os
import yaml
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper


existing_config_path = os.path.join(dataset_location, 'configs', dataset_name)
dataset_config_dir = os.path.join(config_dir, dataset_name)

# preprocess.yaml - update paths and add field to text
print(f'Reading preprocess.yaml from "{existing_config_path}"')
pp = get_yaml_contents('preprocess', existing_config_path)
pp['dataset'] = dataset_name
pp['path']['corpus_path'] = f"{dataset_location}/corpus"
pp['path']['lexicon_path'] = lexicon_path
pp['path']['raw_path'] = f"{dataset_location}/raw_data"
pp['path']['preprocessed_path'] = f"{dataset_location}/preprocessed_data/{dataset_name}"

print(f'Writing preprocess.yaml in "{dataset_config_dir}/preprocess.yaml":')

with open(os.path.join(dataset_config_dir, 'preprocess.yaml'), 'w') as f:
  f.write(yaml.dump(pp))
print('\n')

!cat $dataset_config_dir\/preprocess.yaml


# pp_config = f"""dataset: "{dataset_name}"

# path:
#   corpus_path: "{dataset_location}/corpus"
#   lexicon_path: "{lexicon_path}"
#   raw_path: "{dataset_location}/raw_data"
#   preprocessed_path: "{dataset_location}/preprocessed_data"

# preprocessing:
#   val_size: 64
#   text:
#     text_cleaners: ["english_cleaners"]
#     language: "en"
#     max_length: 4096
#   audio:
#     sampling_rate: 22050
#     max_wav_value: 32768.0
#   stft:
#     filter_length: 1024
#     hop_length: 256
#     win_length: 1024
#   mel:
#     n_mel_channels: 80
#     mel_fmin: 0
#     mel_fmax: 8000 # please set to 8000 for HiFi-GAN vocoder, set to null for MelGAN vocoder
#   pitch:
#     feature: "phoneme_level" # support 'phoneme_level' or 'frame_level'
#     normalization: True
#   energy:
#     feature: "phoneme_level" # support 'phoneme_level' or 'frame_level'
#     normalization: True
# """

# # Write config to file
# with open(f'{dataset_config_dir}/preprocess.yaml', mode='w') as f:
#   f.write(pp_config.expandtabs())

Reading preprocess.yaml from "/content/output_dataset/configs/MyDataset"
Writing preprocess.yaml in "/content/efficientspeech/config/MyDataset/preprocess.yaml":


dataset: MyDataset
path:
  corpus_path: /content/output_dataset/corpus
  lexicon_path: /content/efficientspeech/lexicon/librispeech-lexicon.txt
  preprocessed_path: /content/output_dataset/preprocessed_data/MyDataset
  raw_path: /content/output_dataset/raw_data
preprocessing:
  audio:
    max_wav_value: 32768.0
    sampling_rate: 22050
  energy:
    feature: phoneme_level
    normalization: true
  mel:
    mel_fmax: 8000
    mel_fmin: 0
    n_mel_channels: 80
  pitch:
    feature: phoneme_level
    normalization: true
  stft:
    filter_length: 1024
    hop_length: 256
    win_length: 1024
  text:
    language: en
    max_length: 4096
    text_cleaners:
    - english_cleaners
  val_size: 64




#  
---



# Train a checkpoint 


### Launch TensorBoard

In [None]:
# Load the TensorBoard notebook extension, then reload it
# (presumably so that re-running this cell starts from a fresh extension state)
%load_ext tensorboard
%reload_ext tensorboard
# Open the dashboard on the lightning_logs folder, the same folder the
# inference and checkpoint-copy cells read runs from
%tensorboard --logdir /content/efficientspeech/lightning_logs/

### Run training

In [None]:
import os
# Train
pp_config_path = os.path.join(dataset_config_dir, 'preprocess.yaml')
pp_config_arg = f'--preprocess-config {pp_config_path}'
training_opts = ' '.join([pp_config_arg, cmd_line_opts])

print(f'Running training with arguments: {training_opts}')

%cd /content/efficientspeech/
!python /content/efficientspeech/train.py $training_opts

Running training with arguments: --preprocess-config /content/efficientspeech/config/MyDataset/preprocess.yaml  --accelerator gpu --devices 1 --num_workers 4 --precision 16-mixed --batch-size 128 --head 1 --reduction 4 --expansion 1 --kernel-size 3 --n-blocks 2 --block-depth 2 --max_epochs 5000 --infer-device cuda
/content/efficientspeech
Removing weight norm...
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type        | Params
--------------------------------------------
0 | phoneme2mel | Phoneme2Mel | 266 K 
1 | hifigan     | Generator   | 925 K 
--------------------------------------------
266 K     Trainable params
926 K     Non-trainable params
1.2 M     Total params
4.770     Total estimated model params size (MB)
2023-06-01 02:53:51.811205: I tensorflow/core/platform/cpu_

# Run inference on latest trained checkpoint

In [None]:
from IPython.display import Audio, display
import os

sentence = 'The quick brown fox jumped over the lazy dog.' #@param {type:'string'}

%cd /content/efficientspeech/

# Get latest run checkpoint
latest_run_folder = !ls -td -- lightning_logs/* | head -n 1
latest_run_folder = latest_run_folder[0]
latest_run_name = os.path.basename(latest_run_folder)
ckpt_folder = os.path.join(latest_run_folder, 'checkpoints')
latest_ckpt = !ls -td -- $ckpt_folder\/* | head -n 1
latest_ckpt = os.path.abspath(latest_ckpt[0])
latest_ckpt_name = os.path.basename(latest_ckpt)
# Output wav 
output_wav_name = latest_run_name + '.wav'

print(f'Found checkpoint "{latest_ckpt}')

# Run inference with latest checkpoint
inference_args = f'--checkpoint {latest_ckpt} {model_opts} ' \
  f'--infer-device {infer_device} --text "{sentence}" ' \
  f'--wav-filename {output_wav_name}'
print(f'Running inference with arguments: {inference_args}')
!python demo.py $inference_args

# Display inference result
output_wav_path = os.path.join('/content/efficientspeech/outputs', output_wav_name)
print(f'\nInference result: {output_wav_path}')
display(Audio(os.path.abspath(output_wav_path)))



#  
---



# *VERY IMPORTANT* - Copy all checkpoints to your drive

In [None]:
import os
import shutil

output_folder_path = os.path.join(output_dir, dataset_name)
print(f'Copying checkpoints to "{output_folder_path}"')

# Create the target directory if it doesn't exist
if not os.path.exists(output_folder_path):
   os.makedirs(output_folder_path)
   print(f'Created folder "{output_folder_path}"')

# Iterate through all subdirectories in the log directory
source_dir = '/content/efficientspeech/lightning_logs'
!cp -v -r $source_dir\/* $output_folder_path

Copying checkpoints to "/content/drive/MyDrive/saved_checkpoints/MyDataset"
'/content/efficientspeech/lightning_logs/version_0/events.out.tfevents.1685587239.44c9d25ae1f2.2131.0' -> '/content/drive/MyDrive/saved_checkpoints/MyDataset/version_0/events.out.tfevents.1685587239.44c9d25ae1f2.2131.0'
'/content/efficientspeech/lightning_logs/version_0/hparams.yaml' -> '/content/drive/MyDrive/saved_checkpoints/MyDataset/version_0/hparams.yaml'
'/content/efficientspeech/lightning_logs/version_1/events.out.tfevents.1685587278.44c9d25ae1f2.2502.0' -> '/content/drive/MyDrive/saved_checkpoints/MyDataset/version_1/events.out.tfevents.1685587278.44c9d25ae1f2.2502.0'
'/content/efficientspeech/lightning_logs/version_1/hparams.yaml' -> '/content/drive/MyDrive/saved_checkpoints/MyDataset/version_1/hparams.yaml'
'/content/efficientspeech/lightning_logs/version_2/events.out.tfevents.1685587993.44c9d25ae1f2.10103.0' -> '/content/drive/MyDrive/saved_checkpoints/MyDataset/version_2/events.out.tfevents.1685587



#  
---

