In [2]:
import warnings
warnings.filterwarnings("ignore")

import argparse
import logging
import os
import sys
import time


from collections import defaultdict, OrderedDict

import matplotlib
import numpy as np
import soundfile as sf
import torch
import yaml

from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader
from tqdm import tqdm

import sys
sys.path.append("../../cuhksz-phd/sho_util/pyfiles/")
from basic import plot_spectrogram
from sound import play_audio

sys.path.append("../")
from pyfiles.utils import Dict2Obj
from pyfiles.dataset import Parallelo2oVCMelDataset, ParallelArcticDataset

import seq2seq_vc
import seq2seq_vc.models
import seq2seq_vc.losses
import seq2seq_vc.trainers
import seq2seq_vc.collaters

from seq2seq_vc.losses import GuidedMultiHeadAttentionLoss

# from seq2seq_vc.datasets import ParallelVCMelDataset
from torch.utils.data import Dataset

from seq2seq_vc.utils import read_hdf5
from seq2seq_vc.utils.types import str_or_none

# set to avoid matplotlib error in CLI environment
import matplotlib

matplotlib.use("Agg")
import matplotlib.pyplot as plt

from seq2seq_vc.schedulers.warmup_lr import WarmupLR

# Registry of available LR schedulers, keyed by the lowercase name that is
# looked up from the config's "scheduler_type" entry further down.
scheduler_classes = dict(warmuplr=WarmupLR)

import joblib
import glob
# Load the pre-computed ARCTIC and VCTK data splits from the working directory.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so these
# .npy files must come from a trusted source.
datasplitARCTIC, datasplitVCTK = (
    list(np.load(split_path, allow_pickle=True))
    for split_path in ("./data_split_ARCTIC.npy", "./data_split_VCTK.npy")
)

# Combine the first three entries of both corpora index-by-index with `+`
# (presumably the train/valid/test partitions -- confirm against the files).
datasplit = [datasplitARCTIC[part] + datasplitVCTK[part] for part in range(3)]

In [3]:
# Dataset Variables
# `setup` selects one experiment. Every branch defines:
#   src_dir   -- source data directory (str) or list of directories
#   trg_dir   -- target data directory (str) or list of directories; when lists
#                are used, both always have the same length
#   setupname -- short tag that becomes part of the output directory name
# An unknown `setup` raises ValueError instead of silently leaving the three
# variables undefined (or stale from a previous run of this cell).
setup = 0

# Shared path fragments, factored out so every directory is spelled only once.
DATASET_ROOT = "/mntcephfs/lab_data/shoinoue/Dataset/"
COSYVOICE_ROOT = "/mntcephfs/data/audiow/shoinoue/Dataset/CosyVoice/"

CMU_SLT_DIR = DATASET_ROOT + "CMU-ARCTIC/SLT/"
PDAST_SLT_ENGLISH_DIR = DATASET_ROOT + "PD-AST/SLT/English/"
PDAST_SLT_HINDI_DIR = DATASET_ROOT + "PD-AST/SLT/Hindi/"
PDAST_SLT_KOREAN_DIR = DATASET_ROOT + "PD-AST/SLT/Korean/"

L2_ARCTIC_SPEAKERS = ["ASI", "RRBI", "SVBI", "TNI"]
CMU_ARCTIC_SPEAKERS = ["SLT", "BDL", "EEY", "RMS", "AEW", "CLB", "LJM", "LNH"]

if setup == 0:  ### One-to-one Accent Addition ###
    src_dir = CMU_SLT_DIR
    trg_dir = PDAST_SLT_HINDI_DIR
    setupname = "gt2syn"

elif setup == 1:  ### Many-to-Many Accent Removal ###
    spks_dataset = list(L2_ARCTIC_SPEAKERS)
    src_dir = [f"{DATASET_ROOT}L2-ARCTIC/{spk}/" for spk in spks_dataset]
    trg_dir = [f"{COSYVOICE_ROOT}{spk}/English/" for spk in spks_dataset]
    setupname = "gt2synAR"

elif setup == 2:  ### Many-to-Many Accent Addition with mixed input (syn and gt) ###
    spks_dataset = list(L2_ARCTIC_SPEAKERS)
    rep_num = len(spks_dataset)
    src_dir = [f"{COSYVOICE_ROOT}{spk}/English/" for spk in spks_dataset]
    trg_dir = [f"{DATASET_ROOT}L2-ARCTIC/{spk}/" for spk in spks_dataset]
    # Append the SLT pair once per speaker (rep_num copies in total).
    src_dir += [CMU_SLT_DIR] * rep_num
    trg_dir += [PDAST_SLT_HINDI_DIR] * rep_num
    setupname = "mix2mixAA"

elif setup == 3:  ### Many-to-Many Accent Addition with Synthetic output data using CosyVoice ###
    spks_dataset = list(CMU_ARCTIC_SPEAKERS)
    rep_num = len(spks_dataset)
    src_dir = [f"{DATASET_ROOT}CMU-ARCTIC/{spk}/" for spk in spks_dataset]
    trg_dir = [f"{COSYVOICE_ROOT}{spk}/ASI/" for spk in spks_dataset]
    # Append the SLT pair once per speaker (rep_num copies in total).
    src_dir += [CMU_SLT_DIR] * rep_num
    trg_dir += [PDAST_SLT_HINDI_DIR] * rep_num
    setupname = "gt2synAAcosy"

elif setup == 4:  ### Many-to-Many Accent Removal with Synthetic output data using CosyVoice ###
    spks_dataset = list(CMU_ARCTIC_SPEAKERS)
    rep_num = len(spks_dataset)
    src_dir = [f"{COSYVOICE_ROOT}{spk}/ASI/" for spk in spks_dataset]
    trg_dir = [f"{DATASET_ROOT}CMU-ARCTIC/{spk}/" for spk in spks_dataset]
    # Same SLT pair as setup 3, with source and target swapped.
    src_dir += [PDAST_SLT_HINDI_DIR] * rep_num
    trg_dir += [CMU_SLT_DIR] * rep_num
    setupname = "syn2gtARcosy"

elif setup == 5:  ### One-to-one Accent Addition from syn to syn ###
    src_dir = PDAST_SLT_ENGLISH_DIR
    trg_dir = PDAST_SLT_HINDI_DIR
    setupname = "syn2syn"

elif setup == 6:  ### One-to-one Accent Addition from mix to syn ###
    src_dir = [CMU_SLT_DIR, PDAST_SLT_ENGLISH_DIR]
    trg_dir = [PDAST_SLT_HINDI_DIR] * len(src_dir)
    setupname = "mix2syn"

elif setup == 7:  ### One-to-one Accent Addition Korean ###
    src_dir = CMU_SLT_DIR
    trg_dir = PDAST_SLT_KOREAN_DIR
    setupname = "gt2synKorean"

elif setup == 8:  ### One-to-one Accent Addition (Korean) from syn to syn ###
    src_dir = PDAST_SLT_ENGLISH_DIR
    trg_dir = PDAST_SLT_KOREAN_DIR
    setupname = "syn2synKorean"

elif setup == 9:  ### One-to-one Accent Addition (Korean) from mix to syn ###
    src_dir = [CMU_SLT_DIR, PDAST_SLT_ENGLISH_DIR]
    trg_dir = [PDAST_SLT_KOREAN_DIR] * len(src_dir)
    setupname = "mix2synKorean"

elif setup == 10:  ### One-to-one Accent Addition, additional vctk
    src_dir = [
        CMU_SLT_DIR,
        PDAST_SLT_ENGLISH_DIR,
        DATASET_ROOT + "PD-AST/SLT_add/English/",
    ]
    trg_dir = [
        PDAST_SLT_HINDI_DIR,
        PDAST_SLT_HINDI_DIR,
        DATASET_ROOT + "PD-AST/SLT_add/Hindi/",
    ]
    setupname = "mix2synVCTK1hr"

else:
    raise ValueError(f"Unknown setup {setup!r}; expected an integer from 0 to 10.")

# Load the pre-fitted feature scalers, keyed by feature name.
# NOTE(review): joblib.load is pickle-based; only load trusted .save files.
scaler = {}
for feature_key, scaler_filename in (
    ("80mel", "ckpts/scalers/LibriTTS-R_80mel.save"),
    ("mel", "ckpts/scalers/LibriTTS-R_16000.save"),
    ("wavlm", "ckpts/scalers/LibriTTS-R_wavlm.save"),
):
    scaler[feature_key] = joblib.load(scaler_filename)

In [6]:
# Experiment knobs: model size tag, conditioning type, and the
# [input feature, output feature] names.
size = "small"
conditiontype = "nocondition"
inputoutput = ["80mel", "80mel"]

# Suffix for the output directory; empty when no size tag is given.
if size:
    sizename = "_" + size
else:
    sizename = ""

# Stand-in for the command-line arguments of a training script.
args = Dict2Obj(
    {
        "rank": 0,
        "outdir": f"/mntcephfs/lab_data/shoinoue/Models/trained_models/AC_01/ckpts_16000/{conditiontype}_{setupname}_{''.join(inputoutput)}{sizename}/",
        "config_path": f"./../egs/l2-arctic/cascade/conf/{size}m2mvtn.melmel.yaml",
        "init_checkpoint": "",
        "resume": "",
        "distributed": False,
    }
)

# load main config
# NOTE(review): yaml.Loader can construct arbitrary Python objects; this is
# only acceptable because the config file is a trusted, local file.
with open(args.config_path) as f:
    config = yaml.load(f, Loader=yaml.Loader)
# Merge the argument values into the config so later code reads one mapping.
config.update(vars(args))

# Customization: notebook-level overrides of the YAML values.
config["batch_size"] = 64
config["model_params"]["conditiontype"] = conditiontype
config["optimizer_params"]["lr"] = 0.00008
config["train_max_steps"] = 100000
# Use 80-dimensional input/output layers when the matching feature is "80mel".
for dim_key, feature_name in zip(("idim", "odim"), inputoutput):
    if feature_name == "80mel":
        config["model_params"][dim_key] = 80

In [7]:
# Select the CUDA device given by args.rank and enable cuDNN autotuning.
device = torch.device("cuda")
torch.backends.cudnn.benchmark = True
torch.cuda.set_device(args.rank)
# exist_ok=True creates the output directory atomically with respect to other
# processes, replacing the racy "check, then create" pattern.
os.makedirs(args.outdir, exist_ok=True)
    
### Dataset Preparation ###
# Loader name -> split keyword handed to ParallelArcticDataset.
# Train is built before dev, as dict order is insertion order.
split_keyword = {"train": "train", "dev": "valid"}
dataset = {
    loader_name: ParallelArcticDataset(
        src_dir,
        trg_dir,
        datasplit,
        scaler,
        keyword,
        input_output=inputoutput,
        noembedding=True,
    )
    for loader_name, keyword in split_keyword.items()
}

# Collater class is resolved by name from the config (default ARM2MVCCollater).
collater_class = getattr(
    seq2seq_vc.collaters, config.get("collater_type", "ARM2MVCCollater")
)
collater = collater_class()

# No custom samplers: both loaders rely on DataLoader's own shuffling.
sampler = {"train": None, "dev": None}
data_loader = {
    loader_name: DataLoader(
        dataset=dataset[loader_name],
        shuffle=True,
        collate_fn=collater,
        batch_size=config["batch_size"],
        num_workers=config["num_workers"],
        sampler=sampler[loader_name],
        pin_memory=config["pin_memory"],
    )
    for loader_name in ("train", "dev")
}

### Model Preparation ###
# Model class is resolved by name from the config (default M2MVTN) and the
# instance is moved to the selected device.
model_class = getattr(seq2seq_vc.models, config.get("model_type", "M2MVTN"))
model = model_class(**config["model_params"]).to(device)

# Loss functions: one instance per entry of config["criterions"], keyed by the
# loss class name. A missing or empty entry is a configuration error.
criterion_config = config.get("criterions", None)
if not criterion_config:
    raise ValueError("Please specify criterions in the config file.")
criterion = {
    loss_name: getattr(seq2seq_vc.losses, loss_name)(**loss_kwargs)
    for loss_name, loss_kwargs in criterion_config.items()
}

### optimizers and schedulers ###
# Optimizer class is looked up in torch.optim by name; "Adam" is the default
# to keep compatibility with configs that do not set optimizer_type.
optimizer_class = getattr(torch.optim, config.get("optimizer_type", "Adam"))
optimizer = optimizer_class(
    model.parameters(),
    **config["optimizer_params"],
)

# Scheduler class comes from the local scheduler_classes registry. Fail with a
# clear message for an unknown name instead of the opaque
# "'NoneType' object is not callable" that a bare .get() would lead to.
scheduler_type = config.get("scheduler_type", "warmuplr")
scheduler_class = scheduler_classes.get(scheduler_type)
if scheduler_class is None:
    raise ValueError(
        f"Unsupported scheduler_type {scheduler_type!r}; "
        f"available: {sorted(scheduler_classes)}"
    )
scheduler = scheduler_class(
    optimizer=optimizer,
    **config["scheduler_params"],
)

### define trainer ###
# Trainer class is resolved by name from the config (default ARM2MVCTrainer).
trainer_class = getattr(
    seq2seq_vc.trainers, config.get("trainer_type", "ARM2MVCTrainer")
)
# Training starts from step/epoch zero and without a vocoder.
trainer_kwargs = dict(
    steps=0,
    epochs=0,
    data_loader=data_loader,
    sampler=sampler,
    model=model,
    vocoder=None,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    config=config,
    device=device,
)
trainer = trainer_class(**trainer_kwargs)

# load pretrained parameters from checkpoint (skipped for an empty path)
if len(args.init_checkpoint) > 0:
    trainer.load_trained_modules(
        args.init_checkpoint,
        init_mods=config["init-mods"],
    )

# resume from checkpoint (skipped for an empty path)
if len(args.resume) > 0:
    trainer.load_checkpoint(args.resume)

# freeze modules if necessary; the config entry must be a plain list
freeze_mods = config.get("freeze-mods", None)
if freeze_mods is not None:
    assert type(freeze_mods) is list
    trainer.freeze_modules(freeze_mods)

932 /mntcephfs/lab_data/shoinoue/Dataset/CMU-ARCTIC/SLT/
932 /mntcephfs/lab_data/shoinoue/Dataset/PD-AST/SLT/English/
1439 /mntcephfs/lab_data/shoinoue/Dataset/PD-AST/SLT_add/English/
100 /mntcephfs/lab_data/shoinoue/Dataset/CMU-ARCTIC/SLT/
100 /mntcephfs/lab_data/shoinoue/Dataset/PD-AST/SLT/English/
34 /mntcephfs/lab_data/shoinoue/Dataset/PD-AST/SLT_add/English/


In [8]:
# Run training; whether it finishes, fails, or is interrupted, always write a
# checkpoint named after the step count reached.
try:
    trainer.run()
finally:
    final_checkpoint_path = os.path.join(
        config["outdir"], f"checkpoint-{trainer.steps}steps.pkl"
    )
    trainer.save_checkpoint(final_checkpoint_path)
    logging.info(f"Successfully saved checkpoint @ {trainer.steps}steps.")

[train]:   0%|          | 5/100000 [00:06<30:15:17,  1.09s/it]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x155550311dc0>>
Traceback (most recent call last):
  File "/mntcephfs/lab_data/shoinoue/miniconda3/envs/cuhk/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 

KeyboardInterrupt



In [None]:
# Disabled scratch cell: decodes one source and one target feature back to
# audio with a Vocos vocoder and plays it, as a listening check of the data.
# NOTE(review): the snippet is stale relative to the cells above --
#   * `dataset` is a dict keyed by "train"/"dev", so `dataset[0]` would need
#     to become e.g. `dataset["train"][0]`;
#   * it inverse-transforms with scaler["mel"] while the features configured
#     above are "80mel" -- confirm which scaler applies before re-enabling.
# from pyfiles.feature_extractor import get_vocos
# ### Vocoder Preparation ###
# data_dir = "/mntcephfs/lab_data/shoinoue/"
# # fs = 24000
# fs = 16000

# if fs==24000:
#     config_path = f"{data_dir}Models/trained_models/vocos/24k/config.yaml"
#     model_path = f"{data_dir}Models/trained_models/vocos/24k/pytorch_model.bin"
# elif fs==16000:
#     config_path = f"{data_dir}Models/trained_models/vocos/vocos16k_noncausal_tealab/config16k.yaml"
#     model_path = f"{data_dir}Models/trained_models/vocos/vocos16k_noncausal_tealab/vocos16k_noncausal_last.ckpt"
# vocoder = get_vocos(config_path, model_path, fs)

# mel = dataset[0]["src_feat"]
# mel = scaler["mel"].inverse_transform(mel)
# y = vocoder.decode(torch.tensor(mel.T).unsqueeze(0)).cpu().numpy()
# play_audio(y, fs)

# mel = dataset[0]["trg_feat"]
# mel = scaler["mel"].inverse_transform(mel)
# y = vocoder.decode(torch.tensor(mel.T).unsqueeze(0)).cpu().numpy()
# play_audio(y, fs)