In [13]:
%matplotlib inline
import sys
import matplotlib.pyplot as plt
import IPython.display as ipd
from scipy.io.wavfile import write

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from emo_data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from emo_models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence


def get_text(text, hps):
    """Turn a raw sentence into a ``torch.LongTensor`` of symbol ids.

    The sentence is converted with ``text_to_sequence`` using the cleaners
    named in ``hps.data.text_cleaners``. If ``hps.data.add_blank`` is truthy,
    the sequence is additionally passed through ``commons.intersperse`` with
    the value ``0`` before being wrapped in a tensor.
    """
    token_ids = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        token_ids = commons.intersperse(token_ids, 0)
    return torch.LongTensor(token_ids)

def load_data_from_json(file_path):
    """Load a JSON object and convert each of its values to a ``torch.tensor``.

    Args:
        file_path: Path to a JSON file whose top-level value is an object
            (mapping of keys to values accepted by ``torch.tensor``).

    Returns:
        dict: The parsed keys, in file order, each mapped to
        ``torch.tensor(value)``.
    """
    # Decode explicitly as UTF-8: without it, open() falls back to the platform
    # locale, which can corrupt non-ASCII keys on some systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        raw_entries = json.load(file)
    return {key: torch.tensor(value) for key, value in raw_entries.items()}

def ensure_directory_exists(dir_path):
    """Create ``dir_path`` and any missing parent directories.

    Does nothing if the directory already exists. Raises ``FileExistsError``
    if the path exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory in between.
    os.makedirs(dir_path, exist_ok=True)

## LJ Speech

In [14]:
# Choose the pretrained model and the checkpoint step to evaluate.
# Keep exactly one `model = ...` line and one `step = ...` line uncommented.
# model = 'ljs_emo_add_ave'
# model = 'ljs_emo_add_last'
# model = 'ljs_emo_add_pca'
# model = 'ljs_emo_add_eis_word'
# model = 'ljs_emo_add_eis_sentence'
# model = 'ljs_emo_add_bert_cls'
# model = 'onehour_ljs_emo_add_ave'
model = 'onehour_ljs_emo_add_last'
# model = 'onehour_ljs_emo_add_pca'
# model = 'onehour_ljs_emo_add_eis_word'
# model = 'onehour_ljs_emo_add_eis_sentence'

# model = 'librif_emo_add_ave'
# model = 'librif_emo_add_last'
# model = 'librif_emo_add_pca'
# model = 'librif_emo_add_eis_word'
# model = 'librif_emo_add_eis_sentence'
# model = 'librif_emo_add_bert_cls'

# model = 'emovdb_emo_add_ave'
# model = 'emovdb_emo_add_last'
# model = 'emovdb_emo_add_pca'
# model = 'emovdb_emo_add_eis_word'
# model = 'emovdb_emo_add_eis_sentence'
# model = 'emovdb_emo_add_bert_cls'

# step = 'G_50000'
# step = "G_100000"
# step = "G_150000"
# step = "G_200000"
step = "G_250000"
# step = "G_300000"


# Paths are built by plain concatenation, so `common_dir` and `log_dir`
# must keep their trailing slash.
common_dir = '/data/vitsGPT/vits/'
log_dir = f'{common_dir}emo_vits/logs/'
save_dir = f'{log_dir}{model}/{step}/source_model_test_wav'

# Maps each model name to the JSON file (under `{common_dir}filelists/`)
# that holds the per-sentence semantic embeddings used at test time.
model_to_test_text_sem_dic = {
    'ljs_emo_add_ave': 'ljs_text_sem_ave_5120.json',
    'ljs_emo_add_last': 'ljs_text_sem_last_5120.json',
    'ljs_emo_add_pca': 'ljs_text_sem_pca_5120.json',
    'ljs_emo_add_eis_word': 'ljs_text_sem_eis_word_5120.json',
    'ljs_emo_add_eis_sentence': 'ljs_text_sem_eis_sentence_5120.json',
    'ljs_emo_add_bert_cls': 'ljs_text_bert_cls_768.json',
    'onehour_ljs_emo_add_ave': 'ljs_text_sem_ave_5120.json',
    'onehour_ljs_emo_add_last': 'ljs_text_sem_last_5120.json',
    'onehour_ljs_emo_add_pca': 'ljs_text_sem_pca_5120.json',
    'onehour_ljs_emo_add_eis_word': 'ljs_text_sem_eis_word_5120.json',
    'onehour_ljs_emo_add_eis_sentence': 'ljs_text_sem_eis_sentence_5120.json',
    'librif_emo_add_ave': 'librif_text_sem_ave_5120.json',
    'librif_emo_add_last': 'librif_text_sem_last_5120.json',
    'librif_emo_add_pca': 'librif_text_sem_pca_5120.json',
    'librif_emo_add_eis_word': 'librif_text_sem_eis_word_5120.json',
    'librif_emo_add_eis_sentence': 'librif_text_sem_eis_sentence_5120.json',
    'librif_emo_add_bert_cls': 'librif_text_bert_cls_768.json',
    'emovdb_emo_add_ave': 'emovdb_text_sem_ave_5120.json',
    'emovdb_emo_add_last': 'emovdb_text_sem_last_5120.json',
    'emovdb_emo_add_pca': 'emovdb_text_sem_pca_5120.json',
    'emovdb_emo_add_eis_word': 'emovdb_text_sem_eis_word_5120.json',
    'emovdb_emo_add_eis_sentence': 'emovdb_text_sem_eis_sentence_5120.json',
    'emovdb_emo_add_bert_cls': 'emovdb_text_bert_cls_768.json',
}

# Validate the selection before touching the filesystem, so a mistyped model
# name fails immediately with a useful message instead of first creating a
# stray output directory.
if model not in model_to_test_text_sem_dic:
    raise KeyError(
        f"Unknown model {model!r}; add it to model_to_test_text_sem_dic "
        f"or choose one of: {sorted(model_to_test_text_sem_dic)}"
    )
test_text_sem_dic_file = model_to_test_text_sem_dic[model]

hps = utils.get_hparams_from_file(f"{log_dir}{model}/config.json")
sem_embedding = hps.data.sem_embedding
print(f"sem_embedding: {sem_embedding}")

# Sentence text -> semantic embedding tensor for the chosen model.
test_text_sem_dic = load_data_from_json(f"{common_dir}filelists/{test_text_sem_dic_file}")
# print(test_text_sem_dic['The fourth and fifth days passed without any developments.'])

# Create the output directory only after every input has loaded successfully.
ensure_directory_exists(save_dir)

sem_embedding: /data/vitsGPT/vits/filelists/ljs_audio_sem_last_5120.pt


In [15]:
# Build the synthesizer from the hyper-parameters loaded into `hps`.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
)
net_g = net_g.cuda()
net_g.eval()

# Restore the weights for the selected step; no optimizer is passed (None).
# Assigning to `_` keeps the return value from being echoed as cell output.
_ = utils.load_checkpoint(f"{log_dir}{model}/{step}.pth", net_g, None)

In [16]:
# Synthesize one wav per sentence in `test_text_sem_dic`; the dictionary keys
# are the sentence texts and the values are their semantic embeddings.
keys = list(test_text_sem_dic.keys())
# Lower this to synthesize only the first N sentences; the slice in the loop
# below clamps it to len(keys).
range_limit = len(keys)
print(f"range_limit: {range_limit}")
m = 5  # number of leading samples that are also played inline

for i, key in enumerate(keys[:range_limit]):
    s = get_text(key, hps)  # the key itself is the text to synthesize
    print(key)
    with torch.no_grad():
        x_tst = s.cuda().unsqueeze(0)
        x_tst_lengths = torch.LongTensor([s.size(0)]).cuda()
        emb_sem = test_text_sem_dic[key].cuda()  # embedding tensor looked up by sentence text
        print(f"x_tst: {x_tst.shape}")
        print(f"x_tst_lengths: {x_tst_lengths}")
        print(f"emb_sem: {emb_sem.shape}")
        # `.detach()` replaces the discouraged `.data` attribute.
        audio = net_g.infer(
            x_tst,
            x_tst_lengths,
            noise_scale=.667,
            noise_scale_w=0.8,
            length_scale=1,
            emb_sem=emb_sem,
        )[0][0, 0].detach().cpu().float().numpy()
    if i < m:
        ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
    write(f"{save_dir}/output_emo_{i}.wav", hps.data.sampling_rate, audio)


range_limit: 500
Mrs. De Mohrenschildt thought that Oswald,
x_tst: torch.Size([1, 89])
x_tst_lengths: tensor([89], device='cuda:0')
emb_sem: torch.Size([5120])


The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.
x_tst: torch.Size([1, 311])
x_tst_lengths: tensor([311], device='cuda:0')
emb_sem: torch.Size([5120])


Between the hours of eight and nine p.m. they were occupied with the children in the bedrooms located at the extreme east end of the house.
x_tst: torch.Size([1, 297])
x_tst_lengths: tensor([297], device='cuda:0')
emb_sem: torch.Size([5120])


The prisoner had nothing to deal with but wooden panels, and by dint of cutting and chopping he got both the lower panels out.
x_tst: torch.Size([1, 263])
x_tst_lengths: tensor([263], device='cuda:0')
emb_sem: torch.Size([5120])


Under these circumstances, unnatural as they are, with proper management, the bean will thrust forth its radicle and its plumule;
x_tst: torch.Size([1, 273])
x_tst_lengths: tensor([273], device='cuda:0')
emb_sem: torch.Size([5120])


Oswald demonstrated his thinking in connection with his return to the United States by preparing two sets of identical questions of the type which he might have thought
x_tst: torch.Size([1, 355])
x_tst_lengths: tensor([355], device='cuda:0')
emb_sem: torch.Size([5120])
it is not possible to state with scientific certainty that a particular small group of fibers come from a certain piece of clothing
x_tst: torch.Size([1, 275])
x_tst_lengths: tensor([275], device='cuda:0')
emb_sem: torch.Size([5120])
has confidence in the dedicated Secret Service men who are ready to lay down their lives for him
x_tst: torch.Size([1, 211])
x_tst_lengths: tensor([211], device='cuda:0')
emb_sem: torch.Size([5120])
Since these agencies are already obliged constantly to evaluate the activities of such groups,
x_tst: torch.Size([1, 211])
x_tst_lengths: tensor([211], device='cuda:0')
emb_sem: torch.Size([5120])
Jeanne De Mohrenschildt said, quote,
x_tst: torch.Size([1, 77])
x_tst_lengths: tensor([77], device=