In [1]:
# setting device on GPU if available, else CPU
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
if torch.cuda.device_count() > 1:
    print("Using ", torch.cuda.device_count(), "GPUs!")

Using device: cuda

TITAN Xp
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
Using  4 GPUs!


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
from loguru import logger
import functools
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from datetime import datetime
import numpy as np

import torchaudio
from torchaudio.functional import resample
from torch.nn.utils.rnn import pad_sequence

import copy
import os
import random
import cv2
import numpy as np
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob
import functools
from pathlib import Path

from accelerate import Accelerator
from accelerate.utils import set_seed

from core.soundstream import SoundStream
from trainer.trainer import SoundStreamTrainer
from datasets.data import SoundDataset, get_dataloader

In [19]:
accelerator = Accelerator(mixed_precision="fp16")

In [5]:
def cycle(dl):
    while True:
        for data in dl:
            yield data

In [46]:

soundstream = SoundStream(
    codebook_size = 1024,
    use_local_attn = True,
    use_mhesa = True,
    rq_num_quantizers = 8,
    attn_window_size = 128,       # local attention receptive field at bottleneck
    attn_depth = 2                # 2 local attention transformer blocks - the soundstream folks were not experts with attention, so i took the liberty to add some. encodec went with lstms, but attention should be better
)


In [7]:
folder = "/srv/share4/sanisetty3/MagnaTagATune/data"
num_outputs = 1
max_length = 2*24000
# max_length = cast_tuple(max_length, num_outputs)
target_sample_hz = 24000
# target_sample_hz = cast_tuple(target_sample_hz)
seq_len_multiple_of = 480
# seq_len_multiple_of = cast_tuple(seq_len_multiple_of, num_outputs)

In [49]:
files = glob(f"{folder}/**/*.mp3" , recursive=True) 

In [50]:
data, sample_hz = torchaudio.load(files[0])

In [51]:
data.shape

torch.Size([1, 465984])

In [8]:
ds = SoundDataset(
            folder,
            max_length = max_length,
            target_sample_hz = target_sample_hz,
            seq_len_multiple_of = seq_len_multiple_of
        )

In [9]:
dl = get_dataloader(ds, batch_size = 2, num_workers = 0, shuffle = True)


In [47]:
soundstream, dl = accelerator.prepare(
    soundstream, dl
)

In [21]:
dl_iter = cycle(dl)

In [22]:
wave, = next(dl_iter)

In [44]:
for nme,param in soundstream.named_parameters():
    print(nme , param.dtype)

encoder.0.conv.weight torch.float32
encoder.0.conv.bias torch.float32
encoder.1.0.fn.0.conv.weight torch.float32
encoder.1.0.fn.0.conv.bias torch.float32
encoder.1.0.fn.2.conv.weight torch.float32
encoder.1.0.fn.2.conv.bias torch.float32
encoder.1.1.fn.0.conv.weight torch.float32
encoder.1.1.fn.0.conv.bias torch.float32
encoder.1.1.fn.2.conv.weight torch.float32
encoder.1.1.fn.2.conv.bias torch.float32
encoder.1.2.fn.0.conv.weight torch.float32
encoder.1.2.fn.0.conv.bias torch.float32
encoder.1.2.fn.2.conv.weight torch.float32
encoder.1.2.fn.2.conv.bias torch.float32
encoder.1.3.conv.weight torch.float32
encoder.1.3.conv.bias torch.float32
encoder.2.0.fn.0.conv.weight torch.float32
encoder.2.0.fn.0.conv.bias torch.float32
encoder.2.0.fn.2.conv.weight torch.float32
encoder.2.0.fn.2.conv.bias torch.float32
encoder.2.1.fn.0.conv.weight torch.float32
encoder.2.1.fn.0.conv.bias torch.float32
encoder.2.1.fn.2.conv.weight torch.float32
encoder.2.1.fn.2.conv.bias torch.float32
encoder.2.2.fn.0

In [16]:
from torch.cuda.amp import autocast, GradScaler

In [48]:
# with autocast():
loss, (recon_loss, multi_spectral_recon_loss, adversarial_loss, feature_loss, all_commitment_loss) = soundstream(wave, return_loss_breakdown = True)


RuntimeError: cuFFT only supports dimensions whose sizes are powers of two when computing in half precision, but got a signal size of[48000]

In [None]:

soundstream = SoundStream(
    codebook_dim = 768,
    codebook_size = 1024,
    rq_num_quantizers = 8,
    attn_window_size = 128,       # local attention receptive field at bottleneck
    attn_depth = 2                # 2 local attention transformer blocks - the soundstream folks were not experts with attention, so i took the liberty to add some. encodec went with lstms, but attention should be better
)


In [None]:
trainer = SoundStreamTrainer(
    soundstream,
    folder = "/srv/share4/sanisetty3/MagnaTagATune/data",
    batch_size = 24,
    grad_accum_every = 2,         # effective batch size of 32
    data_max_length_seconds = 2,  # train on 2 second audio
    num_train_steps = 10000,
    results_folder = "./checkpoints/no_deepspeed/fixed_input_length/",
    
).cuda()