In [None]:
# Process the audio and split into 1 second chunks as raw bytes

import wave
import math


# Length of one chunk in seconds. Both the chunk size (in frames) and the
# number of chunks are derived from this single value, so they cannot drift apart.
CHUNK_SECONDS = 1.0

audio_path = "lib/audio/6930-75918-0018.wav"

# Raw frame bytes of each chunk, in playback order.
chunks = []

with wave.open(audio_path, "rb") as wf:
    frame_rate = wf.getframerate()
    n_frames = wf.getnframes()
    duration = n_frames / frame_rate  # total length of the recording in seconds

    frames_per_chunk = int(frame_rate * CHUNK_SECONDS)
    if frames_per_chunk < 1:
        raise ValueError(
            f"CHUNK_SECONDS={CHUNK_SECONDS} is shorter than one frame at {frame_rate} Hz"
        )

    # Count chunks from the frame counts rather than from the duration in
    # seconds, so the result stays correct for any CHUNK_SECONDS. The last
    # chunk may be shorter than the others.
    num_chunks = math.ceil(n_frames / frames_per_chunk)

    print(f"Splitting into {num_chunks} chunks")

    for i in range(num_chunks):
        start_frame = i * frames_per_chunk
        wf.setpos(start_frame)
        # readframes returns at most frames_per_chunk frames as a bytes object.
        frames = wf.readframes(frames_per_chunk)
        chunks.append(frames)

print(f"Created {len(chunks)} chunks")



Splitting into 11 chunks
Created 11 chunks


In [None]:
# Making the model

import torch

from lib.bgpt.config import *
from lib.bgpt.utils import bGPTLMHeadModel

from transformers import  GPT2Config


# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Configuration of the patch-level GPT-2 stack. The sizes come from lib.bgpt.config;
# the number of attention heads is chosen as one head per 64 hidden units.
patch_config = GPT2Config(num_hidden_layers=PATCH_NUM_LAYERS,
                          max_length=PATCH_LENGTH,
                          max_position_embeddings=PATCH_LENGTH,
                          hidden_size=HIDDEN_SIZE,
                          n_head=HIDDEN_SIZE//64,
                          vocab_size=1)

# Configuration of the byte-level GPT-2 stack. The vocabulary has 256 + 1 entries
# (presumably the 256 byte values plus one special token; confirm against lib.bgpt).
byte_config = GPT2Config(num_hidden_layers=BYTE_NUM_LAYERS,
                         max_length=PATCH_SIZE+1,
                         max_position_embeddings=PATCH_SIZE+1,
                         hidden_size=HIDDEN_SIZE,
                         n_head=HIDDEN_SIZE//64,
                         vocab_size=256+1)

model = bGPTLMHeadModel(patch_config, byte_config)

model_weights_path = "pretrained/weights-audio.pth"

# NOTE(review): depending on the torch version / weights_only setting, torch.load
# may unpickle arbitrary objects; only load checkpoints from a trusted source.
checkpoint = torch.load(model_weights_path, map_location=device)

# strict=False tolerates key mismatches between the checkpoint and the model.
# Report them instead of ignoring them, so that a wrong or partial checkpoint
# (which would leave some layers randomly initialised) does not go unnoticed.
load_result = model.load_state_dict(checkpoint['model'], strict=False)
if load_result.missing_keys:
    print(f"Warning: {len(load_result.missing_keys)} model key(s) missing from checkpoint: "
          f"{load_result.missing_keys}")
if load_result.unexpected_keys:
    print(f"Warning: {len(load_result.unexpected_keys)} unexpected key(s) in checkpoint: "
          f"{load_result.unexpected_keys}")

model = model.to(device)

# Switch to inference mode; as the last expression this also displays the model.
model.eval()




bGPTLMHeadModel(
  (patch_level_decoder): PatchLevelDecoder(
    (patch_embedding): Linear(in_features=4112, out_features=768, bias=True)
    (base): GPT2Model(
      (wte): Embedding(1, 768)
      (wpe): Embedding(512, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D(nf=2304, nx=768)
            (c_proj): Conv1D(nf=768, nx=768)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=3072, nx=768)
            (c_proj): Conv1D(nf=768, nx=3072)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05

In [None]:
# Feeding the chunks into the model should give us the logits to use for compression.
# This can probably be done with something like:
# 
### output = model(input_patches, input_masks)
### logits = output.logits
#  
# Or something similar (needs a bit of research to confirm).
# An example of this can be found in train-gen.py, line 194:
#
### loss = model(input_patches, input_masks).loss
#
# TODO: figure out the structure of input_patches and input_masks from that example,
# then arrange the audio chunks above (raw bytes) in the same structure before passing
# them to the model and reading the logits.

