In [13]:
%matplotlib inline
import sys
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from sem_data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from sem_models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write


def get_text(text, hps):
    """Convert a raw sentence into the id tensor fed to the synthesizer.

    Args:
        text: The sentence to synthesize.
        hps: Hyper-parameter object; this function reads
            ``hps.data.text_cleaners`` and ``hps.data.add_blank``.

    Returns:
        A ``torch.LongTensor`` built from the id sequence returned by
        ``text_to_sequence`` (presumably 1-D, one id per symbol — confirm
        against ``text.text_to_sequence``).
    """
    symbol_ids = text_to_sequence(text, hps.data.text_cleaners)
    # When add_blank is set, id 0 is interspersed via commons.intersperse
    # (presumably a blank token placed around every symbol — TODO confirm).
    symbol_ids = commons.intersperse(symbol_ids, 0) if hps.data.add_blank else symbol_ids
    return torch.LongTensor(symbol_ids)

## LJ Speech

In [14]:
# Load the hyper-parameters from the JSON config; later cells read
# hps.data.*, hps.train.* and hps.model from this object.
hps = utils.get_hparams_from_file(
    "./configs/small_ljs_emo_pca.json"
)

In [15]:
# Use the GPU when one is available; otherwise fall back to CPU so this cell
# no longer crashes on machines without CUDA.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the generator from the loaded hyper-parameters.
# NOTE(review): the 2nd/3rd positional arguments look like the number of
# spectrogram bins and the training segment length in frames — confirm
# against sem_models.SynthesizerTrn.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(DEVICE)
net_g.eval()  # switch the module to evaluation mode for inference

# Root directory that holds one sub-directory of checkpoints per training run.
LOG_ROOT = "/data/vitsGPT/vits/logs"

# Training runs tried so far: run name -> (checkpoint file, listening note).
# The notes are the author's informal remarks (a number, presumably a
# subjective score, followed by an impression).
# Overall ranking noted by the author: ave, pca > last > eis word > eis sentence
CHECKPOINTS = {
    # "add" variants
    "ljs_emo_add_eis_word": ("G_2700000.pth", "1 singing"),
    "ljs_emo_add_eis_sentence": ("G_700000.pth", "1 natural quick"),
    "ljs_emo_add_last": ("G_100000.pth", "5 singing"),
    "ljs_emo_add_pca": ("G_500000.pth", "5 singing"),
    "small_ljs_emo_add_pca": ("G_26000.pth", "5 singing"),
    # "att" variants
    "ljs_emo_att_eis_word": ("G_66000.pth", "-5 noise"),
    "ljs_emo_att_eis_sentence": ("G_80000.pth", "1 don't know good or bad"),
    "ljs_emo_att_last": ("G_232000.pth", "1 singing"),
    "ljs_emo_att_ave": ("G_414000.pth", "-5 no sound ?"),
    "ljs_emo_att_pca": ("G_298000.pth", "-2 drunk robot"),
    # "selfless" variants
    "ljs_emo_selfless_eis_word": ("G_200000.pth", "-2 robot"),
    "ljs_emo_selfless_eis_sentence": ("G_104000.pth", "-5 noise"),
    "ljs_emo_selfless_last": ("G_214000.pth", "-1 omitting"),
    "ljs_emo_selfless_ave": ("G_100000.pth", "5 singing"),
    "ljs_emo_selfless_pca": ("G_400000.pth", "5 singing"),
}

# Change this key to load a different run.
# NOTE(review): other runs were presumably trained with a different config
# than the one loaded into `hps`; confirm the config matches before switching.
ACTIVE_RUN = "small_ljs_emo_add_pca"

checkpoint_file, _listening_note = CHECKPOINTS[ACTIVE_RUN]
# Third argument is None here (presumably the optimizer slot, which inference
# does not need — confirm against utils.load_checkpoint).
_ = utils.load_checkpoint(f"{LOG_ROOT}/{ACTIVE_RUN}/{checkpoint_file}", net_g, None)

In [19]:
# Test sentences for probing how the model renders emotion, keyed by a short
# id. Pick one with PROMPT_KEY below instead of commenting lines in and out.
# The trailing comment on each entry is the emotion the sentence targets.
EMOTION_PROMPTS = {
    # Third-person sentences, one emotion each
    "joy": "The room erupted in jubilant cheers as the final whistle blew, marking their team's unexpected victory.",  # Joy/Happiness
    "envy": "A pang of envy struck her as she watched her peers celebrate their accomplishments, wondering when her moment would come.",  # Envy/Jealousy
    "remorse": "The weight of remorse was almost unbearable as he realized the pain his thoughtless words had inflicted.",  # Remorse/Guilt
    "hope": "She felt a sudden surge of hope, like a ray of sunlight piercing through a stormy sky, when she received the uplifting news.",  # Hope/Optimism
    "suspicion": "Suspicion clouded his judgment, every gesture and word from his friend now seeming like a potential deceit.",  # Suspicion/Distrust
    # Strong and direct emotions
    "anger_direct": "I am furious with you.",  # Anger
    "joy_direct": "I feel so joyful right now.",  # Joy
    "despair_direct": "I am completely devastated.",  # Despair
    "gratitude_direct": "I am overwhelmed with gratitude.",  # Gratitude
    "fear_losing_you": "I'm terrified of losing you.",  # Fear
    "fear_school": "I'm terrified of going to school.",  # Fear
    "fear_going_out": "I'm terrified of going out.",  # Fear
    # Strong but indirect emotions
    "inner_conflict": "The storm inside me rages quietly.",  # Inner conflict
    "private_joy": "My heart sings a tune only I can hear.",  # Personal joy
    "hidden_pain": "Behind my laughter, there's a pain you'll never understand.",  # Hidden pain
    "resignation": "The weight of the world feels light today, but only because I've become accustomed to its heaviness.",  # Resignation
    "melancholy": "The sun shines, but not for me.",  # Melancholy
    # Rather complex emotions
    "nostalgia": "The nostalgia of our memories brings both a smile to my face and a sting to my heart.",  # Nostalgia, joy, sadness
    "relief_and_grief": "I'm caught between the relief of the end and the grief of letting go.",  # Relief and sadness
    "joy_and_fear": "The joy of your presence is tainted with the fear of your inevitable absence.",  # Joy and anticipatory sadness/fear
    "comfort_and_longing": "I find solace in chaos, for it reminds me of a time when life was unpredictable.",  # Comfort and longing
    "ambivalence": "Your words are a bittersweet symphony; they lift me up and tear me down simultaneously.",  # Ambivalence
    # Short story
    "story": "In a sunny backyard, a playful puppy chased its tail, delighting in the warmth and freedom. But as it wandered off, it found itself on a busy, loud street, filled with confusion and fear. Lost and tired, it sat down, feeling defeated. Just then, a child spotted the puppy, offering it a comforting pat and guiding it back home. By evening, the puppy, once lost, was now snuggled safe and sound, realizing the world was big, but there were always kind souls to help.",
    # Dialogue
    "dialogue": "I landed the job in New York!; That's... great. But what about us?; I've been torn. We built so much here, but this is my dream.; Eyes moistening, I want to be happy for you, but it hurts.; Taking hands, I wish there was a way to have both.; Maybe there is. What if I came with you?; Surprised? Really!; For us, I'd brave the big city.; Tearing up, Together, then. Always.",
}
# Categories noted for later, with no prompts written yet: context, punctuation.

# Key of the sentence to synthesize in this run.
PROMPT_KEY = "suspicion"
s = get_text(EMOTION_PROMPTS[PROMPT_KEY], hps)

# Run inference on whatever device the model already lives on, rather than
# assuming CUDA; with the model on the GPU this is the same device as before.
device = next(net_g.parameters()).device

with torch.no_grad():
    # Add a leading dimension of size 1 to the token tensor `s`.
    x_tst = s.to(device).unsqueeze(0)
    # One-element tensor holding the length of `s` along its first dimension.
    x_tst_lengths = torch.tensor([s.size(0)], dtype=torch.long, device=device)
    synthesized = net_g.infer(
        x_tst,
        x_tst_lengths,
        noise_scale=.667,
        noise_scale_w=0.8,
        length_scale=1,
    )
    # Take element [0, 0] of the first returned item (presumably the waveform
    # of the only batch entry — TODO confirm against SynthesizerTrn.infer) and
    # convert it to a float32 NumPy array on the CPU.
    audio = synthesized[0][0, 0].detach().cpu().float().numpy()

# normalize=False plays the samples as-is instead of rescaling them.
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

# Also save the same samples to disk at the configured sampling rate.
write("output_emo_3.wav", hps.data.sampling_rate, audio)