In [4]:
# Setup and Imports
# NOTE: If kernel died, restart it (Kernel ‚Üí Restart Kernel) and run all cells from the beginning

import os
import sys

# CRITICAL: Add project root to path BEFORE importing project modules
# Try relative path first, fallback to absolute
try:
    project_root = os.path.abspath(os.path.join(os.getcwd(), "..", "..", ".."))
    if not os.path.exists(os.path.join(project_root, "utils", "util.py")):
        raise FileNotFoundError
except:
    # Fallback to absolute path
    project_root = r"C:\Users\a-rao\Amphion"

if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Verify path is set
print(f"Project root: {project_root}")
print(f"Python path includes project root: {project_root in sys.path}")

# Now import standard libraries
import torch
import numpy as np
import soundfile as sf
import shutil
import gc
from typing import List, Tuple

# Now import project modules (after path is set)
try:
    from utils.util import load_config
    from models.tts.metis.audio_tokenizer import AudioTokenizer
    from models.tts.metis.semantic_8d_wrappers import Metis8dEncoder, Metis8dDecoder
    print("‚úì All imports successful")
except ImportError as e:
    print(f"‚ùå Import error: {e}")
    print(f"Current sys.path: {sys.path[:3]}...")
    print(f"Looking for utils in: {os.path.join(project_root, 'utils')}")
    raise

# Memory management helper
def clear_memory():
    """Clear GPU and CPU memory"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

# Directories
test_audios_dir = os.path.join(project_root, "models", "tts", "metis", "test audios")
result_audios_dir = os.path.join(project_root, "models", "tts", "metis", "result audios")
os.makedirs(test_audios_dir, exist_ok=True)
os.makedirs(result_audios_dir, exist_ok=True)

# Device configuration
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load config
cfg_path = os.path.join(project_root, "models", "tts", "metis", "config", "base.json")
cfg = load_config(cfg_path)
print("‚úì Config loaded")
clear_memory()


Project root: c:\Users\a-rao\Amphion
Python path includes project root: True
‚úì All imports successful
Using device: cpu
‚úì Config loaded


In [5]:

# Initialize Models
print("=" * 70)
print("INITIALIZING MODELS")
print("=" * 70)

print("\n1. Loading AudioTokenizer...")
audio_tokenizer = AudioTokenizer(cfg, device)
print("   ‚úì AudioTokenizer initialized")
clear_memory()

print("\n2. Loading Metis8dEncoder...")
encoder = Metis8dEncoder(audio_tokenizer)
print("   ‚úì Metis8dEncoder initialized")
clear_memory()

print("\n3. Loading Metis8dDecoder (this may take a few minutes)...")
decoder = Metis8dDecoder(cfg, audio_tokenizer)
print("   ‚úì Metis8dDecoder initialized")
clear_memory()

print("\n" + "=" * 70)
print("‚úÖ ALL MODELS LOADED")
print("=" * 70)


INITIALIZING MODELS

1. Loading AudioTokenizer...


  WeightNorm.apply(module, name, dim)


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/177M [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/170M [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model_1.safetensors:   0%|          | 0.00/513M [00:00<?, ?B/s]

   ‚úì AudioTokenizer initialized

2. Loading Metis8dEncoder...
   ‚úì Metis8dEncoder initialized

3. Loading Metis8dDecoder (this may take a few minutes)...
Building S2A models...
Downloading S2A model checkpoints (this may take 5-10 minutes)...


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

‚úì Downloaded s2a_model_1layer


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.41G [00:00<?, ?B/s]

‚úì Downloaded s2a_model_full
   ‚úì Metis8dDecoder initialized

‚úÖ ALL MODELS LOADED


In [6]:
# TEST 1: Basic Reconstruction
# Voice ‚Üí 8-D Latents ‚Üí Semantic Codes ‚Üí Reconstruction
print("=" * 70)
print("TEST 1: BASIC RECONSTRUCTION")
print("Voice ‚Üí 8-D Latents ‚Üí Semantic Codes ‚Üí Waveform")
print("=" * 70)

# Input audio
test_audio = os.path.join(test_audios_dir, "prompt.wav")
print(f"\nüìÅ Input: {os.path.basename(test_audio)}")

# Step 1: Encode to 8-D latents
print("\n[STEP 1] Encoding audio to 8-D latents...")
feat_1024d, z_8d = encoder.encode_from_path(test_audio)
print(f"   ‚úì feat_1024d shape: {feat_1024d.shape} (1024-D SSL features)")
print(f"   ‚úì z_8d shape: {z_8d.shape} (8-D continuous latents)")
print(f"   ‚úì z_8d dtype: {z_8d.dtype}")
print(f"   ‚úì z_8d range: [{z_8d.min().item():.4f}, {z_8d.max().item():.4f}]")
clear_memory()

# Step 2: 8-D latents ‚Üí Semantic codes
print("\n[STEP 2] Quantizing 8-D latents to semantic codes...")
semantic_code = decoder._latent8d_to_semantic_ids(z_8d)
print(f"   ‚úì semantic_code shape: {semantic_code.shape} (discrete token IDs)")
print(f"   ‚úì semantic_code dtype: {semantic_code.dtype}")
print(f"   ‚úì semantic_code range: [{semantic_code.min().item()}, {semantic_code.max().item()}] (0-8191)")
clear_memory()

# Step 3: Semantic codes ‚Üí Acoustic codes ‚Üí Waveform
print("\n[STEP 3] Converting semantic codes to acoustic codes...")
acoustic_code = decoder._semantic2acoustic(semantic_code, prompt_acoustic_code=None)
print(f"   ‚úì acoustic_code shape: {acoustic_code.shape} (12 quantizers)")
print(f"   ‚úì acoustic_code dtype: {acoustic_code.dtype}")
print(f"   ‚úì acoustic_code range: [{acoustic_code.min().item()}, {acoustic_code.max().item()}]")
clear_memory()

# Step 4: Acoustic codes ‚Üí Waveform
print("\n[STEP 4] Decoding acoustic codes to waveform...")
wav_reconstructed = decoder.audio_tok.code2wav(acoustic_code)
print(f"   ‚úì wav_reconstructed shape: {wav_reconstructed.shape}")
print(f"   ‚úì wav_reconstructed dtype: {wav_reconstructed.dtype}")
print(f"   ‚úì wav_reconstructed range: [{wav_reconstructed.min():.4f}, {wav_reconstructed.max():.4f}]")
print(f"   ‚úì Sample rate: 24000 Hz")
print(f"   ‚úì Duration: {len(wav_reconstructed) / 24000:.2f} seconds")

# Save result
output_path = os.path.join(result_audios_dir, "test1_basic_reconstruction.wav")
sf.write(output_path, wav_reconstructed, 24000)
print(f"\nüíæ Saved: {os.path.basename(output_path)}")

print("\n" + "=" * 70)
print("‚úÖ TEST 1 COMPLETE")
print("=" * 70)
clear_memory()


TEST 1: BASIC RECONSTRUCTION
Voice ‚Üí 8-D Latents ‚Üí Semantic Codes ‚Üí Waveform

üìÅ Input: prompt.wav

[STEP 1] Encoding audio to 8-D latents...


  return self.encode_waveform_16k(librosa.load(wav_path, sr=16000, mono=True)[0])
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\a-rao\\Amphion\\models\\tts\\metis\\test audios\\prompt.wav'

In [None]:
# TEST 2: Voice Conversion with Acoustic Prompt
# Voice (prompt.wav) ‚Üí 8-D Latents ‚Üí Semantic Codes + Acoustic Prompt (acoustic.wav) ‚Üí Reconstruction
print("=" * 70)
print("TEST 2: VOICE CONVERSION WITH ACOUSTIC PROMPT")
print("Voice (prompt.wav) ‚Üí 8-D Latents ‚Üí Semantic Codes")
print("+ Acoustic Prompt (acoustic.wav) ‚Üí Reconstruction")
print("=" * 70)

# Input files
prompt_audio = os.path.join(test_audios_dir, "prompt.wav")
acoustic_audio = os.path.join(test_audios_dir, "acoustic.wav")
print(f"\nüìÅ Source voice (content): {os.path.basename(prompt_audio)}")
print(f"üìÅ Reference voice (style): {os.path.basename(acoustic_audio)}")

# Step 1: Encode source voice to 8-D latents
print("\n[STEP 1] Encoding source voice to 8-D latents...")
feat_1024d_source, z_8d_source = encoder.encode_from_path(prompt_audio)
print(f"   ‚úì feat_1024d_source shape: {feat_1024d_source.shape}")
print(f"   ‚úì z_8d_source shape: {z_8d_source.shape} (8-D continuous latents)")
print(f"   ‚úì z_8d_source dtype: {z_8d_source.dtype}")
print(f"   ‚úì z_8d_source range: [{z_8d_source.min().item():.4f}, {z_8d_source.max().item():.4f}]")
clear_memory()

# Step 2: Extract acoustic codes from reference voice
print("\n[STEP 2] Extracting acoustic codes from reference voice...")
prompt_acoustic_code = encoder.encode_acoustic_from_path(acoustic_audio)
print(f"   ‚úì prompt_acoustic_code shape: {prompt_acoustic_code.shape} (12 quantizers)")
print(f"   ‚úì prompt_acoustic_code dtype: {prompt_acoustic_code.dtype}")
print(f"   ‚úì prompt_acoustic_code range: [{prompt_acoustic_code.min().item()}, {prompt_acoustic_code.max().item()}]")
clear_memory()

# Step 3: 8-D latents ‚Üí Semantic codes
print("\n[STEP 3] Quantizing 8-D latents to semantic codes...")
semantic_code = decoder._latent8d_to_semantic_ids(z_8d_source)
print(f"   ‚úì semantic_code shape: {semantic_code.shape} (discrete token IDs)")
print(f"   ‚úì semantic_code dtype: {semantic_code.dtype}")
print(f"   ‚úì semantic_code range: [{semantic_code.min().item()}, {semantic_code.max().item()}] (0-8191)")
clear_memory()

# Step 4: Semantic codes + Acoustic prompt ‚Üí Acoustic codes
print("\n[STEP 4] Converting semantic codes to acoustic codes (with acoustic prompt)...")
acoustic_code = decoder._semantic2acoustic(semantic_code, prompt_acoustic_code=prompt_acoustic_code)
print(f"   ‚úì acoustic_code shape: {acoustic_code.shape} (12 quantizers)")
print(f"   ‚úì acoustic_code dtype: {acoustic_code.dtype}")
print(f"   ‚úì acoustic_code range: [{acoustic_code.min().item()}, {acoustic_code.max().item()}]")
clear_memory()

# Step 5: Acoustic codes ‚Üí Waveform
print("\n[STEP 5] Decoding acoustic codes to waveform...")
wav_converted = decoder.audio_tok.code2wav(acoustic_code)
print(f"   ‚úì wav_converted shape: {wav_converted.shape}")
print(f"   ‚úì wav_converted dtype: {wav_converted.dtype}")
print(f"   ‚úì wav_converted range: [{wav_converted.min():.4f}, {wav_converted.max():.4f}]")
print(f"   ‚úì Sample rate: 24000 Hz")
print(f"   ‚úì Duration: {len(wav_converted) / 24000:.2f} seconds")

# Save results
shutil.copy2(prompt_audio, os.path.join(result_audios_dir, "test2_source_prompt.wav"))
shutil.copy2(acoustic_audio, os.path.join(result_audios_dir, "test2_reference_acoustic.wav"))
output_path = os.path.join(result_audios_dir, "test2_voice_converted.wav")
sf.write(output_path, wav_converted, 24000)
print(f"\nüíæ Saved:")
print(f"   - Source: test2_source_prompt.wav")
print(f"   - Reference: test2_reference_acoustic.wav")
print(f"   - Output: {os.path.basename(output_path)}")

print("\n" + "=" * 70)
print("‚úÖ TEST 2 COMPLETE")
print("Expected: Output should have prompt.wav content in acoustic.wav voice")
print("=" * 70)
clear_memory()


In [None]:
# Summary: Dimension Flow
print("=" * 70)
print("DIMENSION FLOW SUMMARY")
print("=" * 70)

print("\nüìä TEST 1: Basic Reconstruction")
print("   Input audio (24kHz) ‚Üí 16kHz for encoding")
print("   ‚Üí feat_1024d: [1, T_ssl, 1024]  (SSL features at ~50Hz)")
print("   ‚Üí z_8d: [1, T_ssl, 8]           (8-D continuous latents)")
print("   ‚Üí semantic_code: [1, T_ssl]     (discrete IDs 0-8191)")
print("   ‚Üí acoustic_code: [1, T_ac, 12]  (12 quantizers)")
print("   ‚Üí waveform: [T_wav]             (24kHz samples)")

print("\nüìä TEST 2: Voice Conversion")
print("   Source audio ‚Üí z_8d_source: [1, T_ssl, 8]")
print("   Reference audio ‚Üí prompt_acoustic_code: [1, T_prompt, 12]")
print("   ‚Üí semantic_code: [1, T_ssl]     (from source)")
print("   ‚Üí acoustic_code: [1, T_ac, 12]  (with reference prompt)")
print("   ‚Üí waveform: [T_wav]             (source content, reference voice)")

print("\n" + "=" * 70)
print("‚úÖ ALL TESTS COMPLETE")
print("=" * 70)
