In [1]:
import torch
import json
import pandas as pd
from diffusers import LTXPipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Model identifier handed to LTXPipeline.from_pretrained below.
model_id = "a-r-r-o-w/LTX-Video-0.9.1-diffusers"
# Weight passed to set_adapters for the "ltx_lora" adapter.
LORA_WEIGHT = 1.0
device="cuda:1"

# Build the pipeline in bfloat16. local_files_only=True is passed, so this
# presumably expects the model files to already be available locally.
pipe = LTXPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16, local_files_only=True)

# Load the LoRA checkpoint under the adapter name "ltx_lora", then activate
# that adapter with LORA_WEIGHT.
lora_path = "/mnt/ssd0/saksham/i2av/ltx_lora_training_i2v_t2v/audioldm_lora/checkpoint-7000"
pipe.load_lora_weights(lora_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="ltx_lora")
pipe.set_adapters("ltx_lora", LORA_WEIGHT)
# ----------
# Move the pipeline to `device`; assigning to `_` keeps the returned object
# from being echoed as the cell's output.
_ = pipe.to(device)

Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.38it/s]
Loading pipeline components...: 100%|██████████| 5/5 [00:02<00:00,  2.46it/s]


In [3]:
# Load the AVSync15 metadata table (CSV) and the audio-caption mapping (JSON).
meta_path = '/mnt/ssd0/saksham/i2av/AVSync15/metadata.csv'
df = pd.read_csv(meta_path)
path = '/mnt/ssd0/saksham/i2av/AVSync15/aud_caption.json'
# Open through a context manager so the file handle is closed as soon as the
# JSON is parsed, instead of lingering until the kernel garbage-collects it.
with open(path) as caption_file:
    data = json.load(caption_file)

In [4]:
# Empty string is passed as the negative prompt.
negative_prompt = ""
# Fixed text prepended to every prompt.
prefix = "sounding object, "

label = 'hammering'
caption = 'Someone is hammering a nail into wood.'#'A baby is crying and making sounds.'

# Final prompt layout: "<prefix><label>, <caption>".
prompt = prefix + f'{label}, {caption}'

# output_type='latent' is requested, so despite its name `video` holds the
# pipeline's latent output; later cells unpack it and decode it to audio.
# NOTE(review): decode_timestep / decode_noise_scale presumably only matter
# when the pipeline decodes with its own VAE — confirm they have any effect
# with output_type='latent'.
# NOTE(review): no generator/seed is passed, so the result presumably differs
# between runs — confirm whether reproducibility is needed.
video = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=128*32,
    height=16*32,
    num_frames=1,
    num_inference_steps=50,
    decode_timestep=0.03,
    decode_noise_scale=0.025,
    output_type='latent'
).frames


100%|██████████| 50/50 [00:11<00:00,  4.31it/s]


In [5]:
def _unpack_latents(
        latents: torch.Tensor, num_frames: int, height: int, width: int, patch_size: int = 1, patch_size_t: int = 1
    ) -> torch.Tensor:
    batch_size = latents.size(0)
    latents = latents.reshape(batch_size, num_frames, height, width, -1, patch_size_t, patch_size, patch_size)
    latents = latents.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(2, 3)
    return latents

def _normalize_latents(
    latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0,
    reverse=False,
) -> torch.Tensor:
    # Normalize latents across the channel dimension [B, C, F, H, W]
    latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
    latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
    if not reverse:
        latents = (latents - latents_mean) * scaling_factor / latents_std
    else:
        latents = latents * latents_std / scaling_factor + latents_mean
    return latents

In [6]:
from ltx_video_lora import load_latent_models

In [7]:
device = "cuda:1"
dtype = torch.bfloat16
# Take the "vae" entry from load_latent_models() and move it to `device` in
# `dtype`; only its latents_mean / latents_std attributes are read later on.
vae = load_latent_models()["vae"].to(device, dtype=dtype)

In [8]:
# Alias for the generation cell's output (latents, since output_type='latent'
# was requested there).
ll = video

In [9]:
# Latent-grid dimensions used to unpack the pipeline output.
# NOTE(review): 16 x 128 matches the height=16*32 / width=128*32 requested
# from the pipeline, presumably divided by a spatial factor of 32 — confirm.
num_frames = 1; height = 16; width = 128
# NOTE(review): `device` is rebound to "cuda" here although the VAE was placed
# on "cuda:1" earlier; _normalize_latents moves the mean/std to the latents'
# device so this still runs, but confirm the device change is intended.
device = "cuda"; dtype = torch.bfloat16
# Pass the named dimensions instead of repeating the literals, so that
# changing them above actually changes the unpacking.
lt = _unpack_latents(ll.to(device, dtype=dtype), num_frames, height, width)
# reverse=True undoes the per-channel normalization using the VAE statistics.
lt = _normalize_latents(lt, vae.latents_mean, vae.latents_std, reverse=True)

In [10]:
# Sanity check: the unpacked latents follow the [B, C, F, H, W] layout.
lt.shape

torch.Size([1, 128, 1, 16, 128])

In [11]:
# Take batch item 0 and its first 8 channels, then reorder the remaining axes
# so the frame axis leads and the last two axes are swapped.
lt1 = lt[0, :8].permute(1, 0, 3, 2)
lt1.shape

torch.Size([1, 8, 128, 16])

In [12]:
import sys

# Put the audioldm_vae/ directory on sys.path so aud_utils can be imported;
# the path is relative, so it resolves against the kernel's working directory.
# NOTE(review): this rebinds the global `path`, which an earlier cell used for
# the caption JSON location.
path = 'audioldm_vae/'
sys.path.append(path)
from aud_utils import AudioLDM_VAE
from IPython.display import Audio

In [13]:
# Construct the AudioLDM VAE wrapper with default arguments; the recorded
# output shows it fetching 9 files during construction.
vae_obj = AudioLDM_VAE()

Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 137769.11it/s]
  WeightNorm.apply(module, name, dim)
  fft_window = pad_center(fft_window, filter_length)
  mel_basis = librosa_mel_fn(


In [14]:
# Cast the latents to float32 before handing them to the audio decoder.
rec_audio = vae_obj.latent_to_audio(lt1.float())

In [15]:
# Render an inline audio player for the decoded waveform.
# NOTE(review): rate=16000 assumes the decoder emits 16 kHz audio — confirm.
Audio(rec_audio, rate=16000)