In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from imagebind import data
import torch
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType
from cleo.cleoImageBind import CLEOImageBind
from datasets import load_from_disk, load_dataset
import tqdm
import numpy as np
import os

In [None]:
# Load the "train.clean.100" split of the self-contained LibriSpeech ASR dataset.
libri_dataset = load_dataset(
    "patrickvonplaten/librispeech_asr_self_contained",
    split="train.clean.100",
)

In [None]:
# Pretrained ImageBind "huge" checkpoint, handed to the CLEO wrapper below.
ib_model = imagebind_model.imagebind_huge(pretrained=True)

# CLEO model wrapping the Llama-2-7b LLM and the ImageBind model.
# NOTE(review): `host_llm_on_cuda=False` / `audio_gpu="cpu"` presumably keep
# everything on the CPU -- confirm against CLEOImageBind.
cleo_model = CLEOImageBind(
    llm_model_path="/home/models/Llama-2-7b-hf",
    audio_features=1024,  # 1024 if ImageBind
    imageBind_model=ib_model,
    host_llm_on_cuda=False,
    audio_gpu="cpu",
)

In [None]:
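# Disabled draft: compute the token counts with `Dataset.map` instead of an explicit loop.
# NOTE(review): with batched=True, `example["text"]` would be a list of strings, so this
# per-example function needs batched=False (or a list-aware body) before re-enabling -- confirm.
#def get_sentence_length(example):
#    example["sentence_length"] = len(cleo_model.llm_tokenizer.encode(example["text"], add_special_tokens=False))
#    return example

#updated_dataset = dataset.map(get_sentence_length, batched=True, batch_size=1000)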

In [None]:
# Count LLM tokens (without special tokens) for every transcript.
sentences = libri_dataset["text"]
token_counts = [
    len(cleo_model.llm_tokenizer.encode(text, add_special_tokens=False))
    for text in tqdm.tqdm(sentences)
]
sentence_length = np.array(token_counts)

# Attach the counts as a column, then keep only rows with fewer than 20 tokens.
dataset = libri_dataset.add_column("sentence_length", sentence_length)
short_row_indices = np.flatnonzero(sentence_length < 20)
dataset = dataset.select(short_row_indices)

In [None]:
experiment_name = "ImageBind_model"
checkpoint_path = f"/home/CS546-CLEO/models/{experiment_name}/model.pt"

# map_location="cpu": the model above is built with CPU settings, and a checkpoint
# saved from GPU tensors would otherwise fail to deserialize on a CUDA-less host.
# load_state_dict copies the values into the existing parameters either way.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
cleo_model.load_state_dict(
    torch.load(checkpoint_path, map_location="cpu")
)

# Switch to inference mode; the trailing `;` keeps the notebook from printing
# the full module repr that `.eval()` returns.
cleo_model.eval();

In [None]:
from torch.utils.data import Dataset, DataLoader
from scipy.io.wavfile import write as write_wav
import uuid
class CLEODataset(Dataset):
    """Map-style dataset yielding ``(instruction, wav_path, label)`` triples.

    Each item writes the sample's audio array to a freshly named ``.wav`` file
    and returns the path to it together with the shared instruction prompt
    and the lower-cased transcript.

    Note: a new file (random UUID name) is written on *every* access and is
    never removed by this class; callers are responsible for cleaning up
    ``output_dir``.

    Args:
        dataset: Indexable collection whose rows expose ``row["text"]`` (str)
            and ``row["audio"]["array"]`` (sequence of samples).
        instruction: Prompt string returned unchanged with every item.
        output_dir: Directory the ``.wav`` files are written to. It is created
            if it does not exist.
        sample_rate: Sample rate (Hz) written into the ``.wav`` header.
    """

    def __init__(self, dataset, instruction, output_dir="/home/CS546-CLEO/wav_samples", sample_rate=16000):
        self.dataset = dataset
        self.instruction = instruction
        self.output_dir = output_dir
        self.sample_rate = sample_rate
        # write_wav does not create missing directories, so make sure it exists.
        os.makedirs(self.output_dir, exist_ok=True)

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Write the audio of row ``idx`` to disk and return ``(instruction, wav_path, label)``."""
        # Fetch the row once: each lookup on the wrapped dataset may be costly
        # (e.g. audio decoding), so do not index it separately per field.
        example = self.dataset[idx]

        ## Create the label
        label = example["text"].lower()

        ## Save the audio
        file_name = os.path.join(self.output_dir, f"{uuid.uuid4()}.wav")
        audio_samples = np.array(example["audio"]["array"], dtype=np.float32)
        write_wav(file_name, self.sample_rate, audio_samples)

        return self.instruction, file_name, label

# Prompt template shared by every sample.
# NOTE(review): the `<wav>` tag presumably marks where the audio embedding is
# inserted by CLEOImageBind -- confirm against the model code. The exact
# whitespace of this string is part of the prompt; do not reformat it.
instruction = """Repeat back the information that you see below:
<wav>

Information:
"""
# Wrap the length-filtered `dataset` so each item yields (instruction, wav path, label).
cleoDataset = CLEODataset(dataset, instruction)

In [None]:
def generate(instruction, audioPath, label, max_new_tokens=15, top_p=.5, top_k=50, temperature=1.5, repetition_penalty=1.5, do_sample=True):
    """Generate a continuation for one (instruction, audio) pair with the CLEO model.

    Builds a single-sample batch, lets ``cleo_model.__prepare_batch__`` turn it
    into input embeddings and an attention mask, and feeds those to
    ``cleo_model.llm_model.generate``.

    Args:
        instruction: Prompt text for the sample.
        audioPath: Path of the audio file for the sample.
        label: Reference text; forwarded in the batch under ``"labels"``.
        max_new_tokens: Maximum number of tokens to generate.
        top_p: Nucleus-sampling threshold.
        top_k: Top-k sampling cutoff.
        temperature: Sampling temperature.
        repetition_penalty: Penalty applied to repeated tokens.
        do_sample: Enable sampling. Hugging Face ``generate`` only honours
            ``top_p`` / ``top_k`` / ``temperature`` when sampling is on, so it
            is passed explicitly instead of depending on the checkpoint's
            generation config. Pass ``False`` for deterministic decoding.

    Returns:
        Whatever ``cleo_model.llm_model.generate`` returns (indexable; callers
        decode ``output[0]`` with the LLM tokenizer).
    """
    ## Create the batch
    batch = {
        "instructions": [instruction],
        "audio_paths": [[audioPath]],
        "labels": [label]
    }

    # Inference only: avoid building an autograd graph while embedding the batch.
    with torch.no_grad():
        ## Get the embeddings (the returned labels are not needed for generation)
        input_embs, input_attn, _labels = cleo_model.__prepare_batch__(batch)

        output = cleo_model.llm_model.generate(
            inputs_embeds=input_embs,
            attention_mask=input_attn,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
        )
    return output


# Smoke test: run generation on the first dataset item and show the decoded text.
instruction, audioPath, label = cleoDataset[0]
output = generate(instruction=instruction, audioPath=audioPath, label=label)
cleo_model.llm_tokenizer.decode(
    output[0],
    skip_special_tokens=True,
)

In [5]:
import torch.nn as nn
import torch
# Shape sanity check: pass a random (8, 1024) tensor through a single
# Transformer encoder layer and keep input/output around for inspection.
encoder_layer = nn.TransformerEncoderLayer(
    d_model=1024,
    nhead=8,
    batch_first=True,
)
src = torch.rand((8, 1024))
out = encoder_layer(src)

In [6]:
# Shape of the random input fed to the encoder layer.
src.shape

torch.Size([8, 1024])

In [7]:
# Shape of the encoder layer's output, for comparison with the input shape.
out.shape

torch.Size([8, 1024])