In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write


def get_text(text, hps):
    """Turn a raw text string into a ``torch.LongTensor`` of sequence IDs.

    The text is passed through ``text_to_sequence`` using the cleaners listed
    in ``hps.data.text_cleaners``. When ``hps.data.add_blank`` is truthy, the
    value ``0`` is interspersed into the sequence via ``commons.intersperse``.

    Args:
        text: Input string to convert.
        hps: Hyperparameter object exposing ``data.text_cleaners`` and
            ``data.add_blank``.

    Returns:
        A ``torch.LongTensor`` built from the resulting ID sequence.
    """
    sequence_ids = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        sequence_ids = commons.intersperse(sequence_ids, 0)
    return torch.LongTensor(sequence_ids)

In [3]:
import os

# Choose which GPU this kernel is allowed to see.
# NOTE(review): the index "2" is machine-specific, and the variable presumably
# has to be set before the first CUDA call to take effect — confirm cell order.
os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})

# Echo the setting so the chosen device is visible in the cell output.
print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES')}")

CUDA_VISIBLE_DEVICES: 2


## LJ Speech

In [4]:
# Load the hyperparameters for the LJ Speech configuration into `hps`.
ljs_config_path = "./configs/ljs_base.json"
hps = utils.get_hparams_from_file(ljs_config_path)

In [5]:
# Build the LJ Speech synthesizer on the GPU in eval mode, then load its
# pretrained weights.
# The two derived sizes are presumably the spectrogram channel count and the
# training segment length in frames — TODO confirm against SynthesizerTrn.
ljs_spec_channels = hps.data.filter_length // 2 + 1
ljs_segment_frames = hps.train.segment_size // hps.data.hop_length
net_g = SynthesizerTrn(
    len(symbols),
    ljs_spec_channels,
    ljs_segment_frames,
    **hps.model,
).cuda().eval()

# Assign to `_` so the return value is not echoed as the cell output.
_ = utils.load_checkpoint("pretrained_files/vits/pretrained_ljs.pth", net_g, None)

In [6]:
# Disabled CPU-only variant of the LJ Speech inference cell, kept for reference.
# If re-enabled, note that `net_g.cpu()` moves the model off the GPU.
#stn_tst = get_text("VITS is Awesome!", hps)
#x_tst = stn_tst.unsqueeze(0).cpu()
#x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cpu()
#audio = net_g.cpu().infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
#ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

In [8]:
# Synthesize the test sentence with the LJ Speech model on the GPU and play it.
# NOTE(review): the saved output of this cell is a ModuleAttributeError
# ('StochasticDurationPredictor' object has no attribute 'cond'); the cause is
# in the model code, which is not visible from this notebook.
stn_tst = get_text("VITS is Awesome!", hps)
net_g = net_g.cuda()  # make sure the model is on the GPU before inference
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).cuda()
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    audio = net_g.infer(
        x_tst,
        x_tst_lengths,
        noise_scale=0.667,
        noise_scale_w=0.8,
        length_scale=1,
    )[0][0, 0].data.cpu().float().numpy()
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

# Report the length of the input ID sequence.
print(stn_tst.size(0))

ModuleAttributeError: 'StochasticDurationPredictor' object has no attribute 'cond'

## VCTK

In [4]:
# Load the hyperparameters for the VCTK configuration; this replaces any
# previously loaded `hps`.
vctk_config_path = "./configs/vctk_base.json"
hps = utils.get_hparams_from_file(vctk_config_path)

In [24]:
# Build the multi-speaker VCTK synthesizer on the GPU in eval mode, then load a
# training checkpoint into it.
# The two derived sizes are presumably the spectrogram channel count and the
# training segment length in frames — TODO confirm against SynthesizerTrn.
vctk_spec_channels = hps.data.filter_length // 2 + 1
vctk_segment_frames = hps.train.segment_size // hps.data.hop_length
net_g = SynthesizerTrn(
    len(symbols),
    vctk_spec_channels,
    vctk_segment_frames,
    n_speakers=hps.data.n_speakers,
    **hps.model,
).cuda().eval()

# Alternative checkpoint (currently disabled):
#   /home/souma/workspace/pretrained_files/vits/pretrained_vctk.pth
# NOTE(review): the checkpoint path below is an absolute, machine-specific path.
# Assign to `_` so the return value is not echoed as the cell output.
_ = utils.load_checkpoint("/home/souma/workspace/logs/vctk_base_speaker_gender_concat/G_203000.pth", net_g, None)

In [25]:
# Disabled CPU-only variant of the VCTK inference cell (sid 4), kept for reference.
# If re-enabled, note that `net_g = net_g.cpu()` moves the model off the GPU.
#stn_tst = get_text("VITS is Awesome!", hps)
#with torch.no_grad():
#    x_tst = stn_tst.unsqueeze(0).cpu()
#    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cpu()
#    sid = torch.LongTensor([4]).cpu()
#    net_g = net_g.cpu()
#    audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
#ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))


In [26]:
# Synthesize the test sentence with `sid` 3 and play the result inline.
stn_tst = get_text("VITS is Awesome!", hps)
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).cuda()
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([3]).cuda()
    audio = net_g.infer(
        x_tst,
        x_tst_lengths,
        sid=sid,
        noise_scale=0.667,
        noise_scale_w=0.8,
        length_scale=1,
    )[0][0, 0].data.cpu().float().numpy()
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

In [27]:
# Synthesize the test sentence with `sid` 0 (no `gid` passed) and play it inline.
stn_tst = get_text("VITS is Awesome!", hps)
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).cuda()
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([0]).cuda()
    # Disabled: gid = torch.LongTensor([1]).cuda()
    # (The original note called this an age; other cells in this notebook use
    # `gid` as a gender ID, so it is presumably gender here too — confirm.)
    audio = net_g.infer(
        x_tst,
        x_tst_lengths,
        sid=sid,
        noise_scale=0.667,
        noise_scale_w=0.8,
        length_scale=1,
    )[0][0, 0].data.cpu().float().numpy()
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

In [31]:
# Synthesize the test sentence with `sid` 2 and `gid` 1, then play it inline.
stn_tst = get_text("VITS is Awesome!", hps)
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).cuda()
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([2]).cuda()
    # The original note called `gid` an age; other cells in this notebook use
    # it as a gender ID (0 = male), so 1 is presumably female — confirm.
    gid = torch.LongTensor([1]).cuda()
    audio = net_g.infer(
        x_tst,
        x_tst_lengths,
        gid=gid,
        sid=sid,
        noise_scale=0.667,
        noise_scale_w=0.8,
        length_scale=1,
    )[0][0, 0].data.cpu().float().numpy()
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

In [12]:
# Synthesize one utterance with an explicit gender ID, play it, and save it as
# a 16-bit PCM WAV file with a download link.
import os
from scipy.io.wavfile import write
from IPython.display import FileLink
import numpy as np

# Generate the audio.
stn_tst = get_text("VITS is Awesome!", hps)
with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    gid = torch.LongTensor([0]).cuda()   # gender ID (0 = male)
    # NOTE(review): no `sid` is passed here although the other VCTK cells pass
    # one — confirm that infer() has a usable default for this model.
    audio = net_g.infer(x_tst, x_tst_lengths, gid=gid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()

# Play the audio inline.
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

# Make sure the output directory exists.
# NOTE(review): this is an absolute, machine-specific path.
output_dir = "/home/souma/workspace/generated_audio"
os.makedirs(output_dir, exist_ok=True)

# Save as a 16-bit PCM WAV file.
output_file = os.path.join(output_dir, "vits_awesome_male.wav")
# Clip to [-1, 1] before scaling: a float sample outside that range would
# overflow in the int16 cast and come out as a corrupted value (audible click)
# instead of saturating.
audio_int16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
write(output_file, hps.data.sampling_rate, audio_int16)

print(f"音声ファイルが保存されました: {output_file}")

# Show a download link. `ipd.display` is the same function as the bare
# `display` that IPython injects, but it does not rely on that injection.
# NOTE(review): FileLink paths are normally relative to the notebook directory;
# an absolute path may render as a dead link — confirm in the target frontend.
ipd.display(FileLink(output_file))

音声ファイルが保存されました: /home/souma/workspace/generated_audio/vits_awesome_male.wav


In [49]:
# Detailed acoustic-feature comparison of the same sentence synthesized with
# gid 0 ("male") and gid 1 ("female"): mean F0, spectral centroid and RMS.
print("=== Acoustic Feature Analysis ===")

# Install librosa on the fly when it is not available.
# NOTE(review): this is an unpinned runtime install; consider moving it to an
# environment file or a pinned `%pip install` cell at the top of the notebook.
try:
    import librosa
except ImportError:
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "librosa"])
    import librosa

import numpy as np
from scipy.io.wavfile import write

stn_tst = get_text("The quick brown fox jumps over the lazy dog", hps)
sr = hps.data.sampling_rate

# Only the synthesis calls need no_grad; the numpy/librosa analysis runs after it.
with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([4]).cuda()

    # Generate the male and female versions. Both noise scales are 0 and the
    # seed is reset before each call, so only `gid` differs between the runs.
    torch.manual_seed(123)
    gid_0 = torch.LongTensor([0]).cuda()
    audio_male = net_g.infer(x_tst, x_tst_lengths, sid=sid, gid=gid_0,
                             noise_scale=0.0, noise_scale_w=0.0, length_scale=1.0)[0][0,0].data.cpu().float().numpy()

    torch.manual_seed(123)
    gid_1 = torch.LongTensor([1]).cuda()
    audio_female = net_g.infer(x_tst, x_tst_lengths, sid=sid, gid=gid_1,
                               noise_scale=0.0, noise_scale_w=0.0, length_scale=1.0)[0][0,0].data.cpu().float().numpy()

# Fundamental frequency (pitch) analysis. Best effort: a failure is reported
# and the remaining analyses still run.
try:
    f0_male, voiced_flag_male, voiced_probs_male = librosa.pyin(audio_male,
                                                               fmin=librosa.note_to_hz('C2'),
                                                               fmax=librosa.note_to_hz('C7'),
                                                               sr=sr)
    # The `> 0` mask drops NaN frames (NaN > 0 is False) before averaging.
    f0_male_mean = np.nanmean(f0_male[f0_male > 0]) if np.any(f0_male > 0) else 0

    f0_female, voiced_flag_female, voiced_probs_female = librosa.pyin(audio_female,
                                                                     fmin=librosa.note_to_hz('C2'),
                                                                     fmax=librosa.note_to_hz('C7'),
                                                                     sr=sr)
    f0_female_mean = np.nanmean(f0_female[f0_female > 0]) if np.any(f0_female > 0) else 0

    print(f"Male audio - Mean F0: {f0_male_mean:.2f} Hz")
    print(f"Female audio - Mean F0: {f0_female_mean:.2f} Hz")
    print(f"F0 difference: {f0_female_mean - f0_male_mean:.2f} Hz")
except Exception as e:
    print(f"F0 analysis failed: {e}")

# Spectral centroid comparison (also best effort).
try:
    spec_centroid_male = np.mean(librosa.feature.spectral_centroid(y=audio_male, sr=sr))
    spec_centroid_female = np.mean(librosa.feature.spectral_centroid(y=audio_female, sr=sr))

    print(f"Male audio - Spectral centroid: {spec_centroid_male:.2f} Hz")
    print(f"Female audio - Spectral centroid: {spec_centroid_female:.2f} Hz")
    print(f"Spectral centroid difference: {spec_centroid_female - spec_centroid_male:.2f} Hz")
except Exception as e:
    print(f"Spectral centroid analysis failed: {e}")

# RMS energy comparison.
rms_male = np.sqrt(np.mean(audio_male**2))
rms_female = np.sqrt(np.mean(audio_female**2))
print(f"Male audio - RMS energy: {rms_male:.6f}")
print(f"Female audio - RMS energy: {rms_female:.6f}")

# Save both versions as 16-bit PCM WAV files for easier comparison.
# Clip to [-1, 1] before scaling: a float sample outside that range would
# overflow in the int16 cast and come out as a corrupted value instead of
# saturating.
write('/tmp/male_voice.wav', sr, (np.clip(audio_male, -1.0, 1.0) * 32767).astype(np.int16))
write('/tmp/female_voice.wav', sr, (np.clip(audio_female, -1.0, 1.0) * 32767).astype(np.int16))
print("\nAudio files saved to:")
print("Male: /tmp/male_voice.wav")
print("Female: /tmp/female_voice.wav")

# Inline playback.
print("\nMale version:")
ipd.display(ipd.Audio(audio_male, rate=sr, normalize=False))

print("Female version:")
ipd.display(ipd.Audio(audio_female, rate=sr, normalize=False))

=== Acoustic Feature Analysis ===




Male audio - Mean F0: 91.71 Hz
Female audio - Mean F0: 91.69 Hz
F0 difference: -0.03 Hz
Male audio - Spectral centroid: 1421.61 Hz
Female audio - Spectral centroid: 1400.68 Hz
Spectral centroid difference: -20.93 Hz
Male audio - RMS energy: 0.069499
Female audio - RMS energy: 0.070873

Audio files saved to:
Male: /tmp/male_voice.wav
Female: /tmp/female_voice.wav

Male version:




Female version:


=== Enhanced Gender Control Test ===
Current gender embedding weights:
Gender 0 (male): tensor([ 1.2968,  0.8954, -0.5869, -0.5459,  0.0126], device='cuda:0',
       grad_fn=<SliceBackward>)
Gender 1 (female): tensor([ 0.5063, -0.0494, -0.1968, -1.6660,  0.7409], device='cuda:0',
       grad_fn=<SliceBackward>)

Testing with enhanced gender difference:
Enhanced Male version:


Enhanced Female version:


Enhanced Male F0: 195.13 Hz
Enhanced Female F0: 162.64 Hz
Enhanced F0 difference: -32.49 Hz




### Voice Conversion

In [16]:
# Wrap the validation file list in a batch-size-1 DataLoader and materialize
# every batch into a list.
# NOTE(review): this eagerly loads the whole validation set, while the
# voice-conversion cell only reads data_list[0] — confirm the full list is needed.
dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
collate_fn = TextAudioSpeakerCollate()
loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    drop_last=True,
    num_workers=8,
    pin_memory=True,
    collate_fn=collate_fn,
)
data_list = [batch for batch in loader]

In [17]:
# Convert the first validation batch from its source `sid` to the target
# `sid`s 1, 2 and 4, then play the original followed by each conversion.
with torch.no_grad():
    # Move every tensor of the first batch to the GPU before unpacking.
    x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [
        tensor.cuda() for tensor in data_list[0]
    ]
    sid_tgt1, sid_tgt2, sid_tgt3 = (
        torch.LongTensor([target_id]).cuda() for target_id in (1, 2, 4)
    )
    # One conversion per target; the generator is consumed by the unpacking
    # while still inside the no_grad block.
    audio1, audio2, audio3 = (
        net_g.voice_conversion(
            spec, spec_lengths, sid_src=sid_src, sid_tgt=target
        )[0][0, 0].data.cpu().float().numpy()
        for target in (sid_tgt1, sid_tgt2, sid_tgt3)
    )

print("Original SID: %d" % sid_src.item())
ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps.data.sampling_rate, normalize=False))
for converted_sid, converted_audio in (
    (sid_tgt1, audio1),
    (sid_tgt2, audio2),
    (sid_tgt3, audio3),
):
    print("Converted SID: %d" % converted_sid.item())
    ipd.display(ipd.Audio(converted_audio, rate=hps.data.sampling_rate, normalize=False))

Original SID: 3


Converted SID: 1


Converted SID: 2


Converted SID: 4
