In [1]:
import os
# Run the notebook from the repository root so that project-relative paths
# (e.g. "config/data/main.yaml") and the `src` package imports resolve.
# Set the SAXGPT_ROOT environment variable to use a different checkout; the
# original location remains the default.
os.chdir(os.environ.get("SAXGPT_ROOT", "/home/yuan/Desktop/SaxGPT/"))

In [2]:
import argparse
import json
from datetime import timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import pandas as pd
import torch
import torchaudio
import yaml
from torch import Tensor
from tqdm import tqdm
from transformers import AutoProcessor
from transformers import EncodecModel, EncodecConfig

from src.data.audio_util import trim_wav_file
from src.data.augmentation import AudioAugmenter, augment_examples
from src.data.tokenization import tokenize, detokenize


def clip_valid_windows(
    metadata: List[Dict], stem_dir: Optional[Path] = None
) -> List[Tuple[Tensor, Tensor]]:
    """Clip the lead and backing stems of every video to its valid windows.

    Args:
        metadata: One mapping per video. Each entry must provide ``"video_id"``
            and ``"valid_windows"``; the latter is a JSON string decoding to a
            sequence of windows whose first two items are the start and end,
            interpreted as seconds.
        stem_dir: Directory containing ``sax_<video_id>.wav`` and
            ``rhythm_<video_id>.wav``. When omitted, the notebook-level
            ``stem_path`` global is used, which is what this function read
            before the parameter existed.

    Returns:
        One ``(backing_audio, lead_audio)`` pair per window, in metadata order.
        Each element is whatever ``trim_wav_file`` returns (annotated as a
        tensor here).

    Raises:
        NameError: ``stem_dir`` was omitted, no ``stem_path`` global exists,
            and at least one window had to be clipped.
        KeyError: A metadata entry lacks ``"video_id"`` or ``"valid_windows"``.
    """
    examples = []
    for metadata_entry in tqdm(metadata):
        video_id = metadata_entry["video_id"]
        windows = json.loads(metadata_entry["valid_windows"])
        for window in windows:
            # Resolved lazily so the legacy global is only required once a
            # window is actually clipped (an empty run still returns []).
            stems = Path(stem_path if stem_dir is None else stem_dir)
            # Both stems share the same window, so build the offsets once.
            start = timedelta(seconds=window[0])
            end = timedelta(seconds=window[1])
            lead_audio = trim_wav_file(stems / f"sax_{video_id}.wav", start, end)
            backing_audio = trim_wav_file(
                stems / f"rhythm_{video_id}.wav", start, end
            )
            examples.append((backing_audio, lead_audio))
    return examples

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:

# Load the data-pipeline settings. The path is relative, so the working
# directory must already be the project root.
with open("config/data/main.yaml", "r", encoding="utf-8") as file:
   config = yaml.safe_load(file)

# Read every required setting up front so a missing key is reported here
# rather than deep inside a later cell.
try:
   stem_path_str = config["data_paths"]["stem_dest"]
   metadata_path_str = config["data_paths"]["metadata_path"]
   codes_dest_str = config["data_paths"]["codes_dest"]
   aug_cfg = config["augmentation"]
   encodec_config_override = config["encodec"]
except KeyError as e:
   print(f"Error: Missing key in configuration file: {e}")
   raise

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing when the model is moved.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The model and its processor are loaded from the same checkpoint name.
model = EncodecModel.from_pretrained("facebook/encodec_32khz").to(device)
processor = AutoProcessor.from_pretrained("facebook/encodec_32khz")

In [4]:
# Tokenize a single backing-track stem as a smoke test. The path is relative
# to the project root, like the config path used when loading the settings.
audio, sr = torchaudio.load("data/main/stems/rhythm_3RShrYcbCxs.wav")
# NOTE(review): `sr` is not checked against the model's expected rate here;
# confirm that `tokenize` validates or resamples the input.

# Encoding is inference only. Disabling autograd keeps intermediate
# activations from being retained, which lowers peak memory (the recorded run
# of this cell hit a CUDA out-of-memory error).
with torch.no_grad():
    codes = tokenize(audio, processor, model, chunk_len_s=28.9)
codes.shape



AcceleratorError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [11]:
# Create the destination folder first so the save below does not fail on a
# fresh checkout where it does not exist yet.
Path("notebooks/outputs").mkdir(parents=True, exist_ok=True)

torchaudio.save(
    "notebooks/outputs/chunked_decoded.wav",
    # The codes are moved to the CPU and decoded with a separately loaded
    # model that is never moved to the GPU.
    detokenize(codes.cpu(), EncodecModel.from_pretrained("facebook/encodec_32khz")),
    # NOTE(review): presumably the 32 kHz rate of the encodec_32khz checkpoint; confirm.
    sample_rate=32000,
)

