In [2]:
import glob
import os
import numpy as np

import torch
import torchvision
import torchvision.transforms as T
from torch.utils.data import DataLoader, Dataset


from PIL import Image
from re import sub

In [3]:
# --- Where the images live ------------------------------------------------
dataset_path = "Data/SubURMP64/images/clean"
train_folder = "train"
val_folder = "trial"

# --- Stored embedding tensor and the file names that index into it --------
embeddingPath = "Embeddings/audio/subURMPClean/train/all_embeddings.pt"
embeddingNamePath = "Embeddings/audio/subURMPClean/train/all_file_names.csv"

# --- Image / loader settings ----------------------------------------------
img_size = 64
batch_size = 4
num_workers = 0

In [114]:
# List every entry directly under the training split folder.
train_split_dir = os.path.join(dataset_path, train_folder)
glob.glob(train_split_dir + "/*")

['Data/SubURMP64/images/clean/train/cello',
 'Data/SubURMP64/images/clean/train/violin',
 'Data/SubURMP64/images/clean/train/trombone',
 'Data/SubURMP64/images/clean/train/bassoon',
 'Data/SubURMP64/images/clean/train/clarinet',
 'Data/SubURMP64/images/clean/train/sax',
 'Data/SubURMP64/images/clean/train/oboe',
 'Data/SubURMP64/images/clean/train/trumpet',
 'Data/SubURMP64/images/clean/train/tuba',
 'Data/SubURMP64/images/clean/train/horn',
 'Data/SubURMP64/images/clean/train/viola',
 'Data/SubURMP64/images/clean/train/flute',
 'Data/SubURMP64/images/clean/train/double_bass']

In [4]:
# Names of the entries under the training split (one folder per instrument), in alphabetical order.
class_paths = glob.glob(f"{dataset_path}/{train_folder}/" + '*')
dir_names = sorted(path.split('/')[-1] for path in class_paths)
dir_names



['bassoon',
 'cello',
 'clarinet',
 'double_bass',
 'flute',
 'horn',
 'oboe',
 'sax',
 'trombone',
 'trumpet',
 'tuba',
 'viola',
 'violin']

In [139]:
# Pick one random training image per instrument and collect its embedding row.
#
# The name list and the embedding tensor are loaded here so this cell runs on a fresh
# kernel; it previously relied on `names` / `embeddingArr` from cells further down.
# NOTE(review): torch.load unpickles the file - only load trusted embedding files.
names = np.loadtxt(embeddingNamePath, delimiter=',', dtype=str)
embeddingArr = torch.load(embeddingPath)

a = []
for instrument in dir_names:
    instrument_dir = f"{dataset_path}/{train_folder}/{instrument}"
    instrument_files = [file for file in os.listdir(instrument_dir) if file.endswith(".jpg")]
    if not instrument_files:
        # np.random.randint(0, 0) would otherwise fail with an unrelated-looking ValueError
        raise FileNotFoundError(f"No .jpg files found in {instrument_dir}")
    sample = instrument_files[np.random.randint(0, len(instrument_files))]

    hits = np.where(names == sample)[0]
    if hits.size == 0:
        # Previously surfaced as a bare IndexError from `[0][0]`
        raise KeyError(f"No embedding is listed for {sample}")
    a.append(embeddingArr[hits[0]])

# One row per instrument, in the order of dir_names.
a = torch.stack(a)

In [147]:
# `a` may be a list of tensors or an already-stacked tensor (the sampling cell ends by
# rebinding `a` to torch.stack(a)). torch.stack is documented to take a sequence of
# tensors, so go through list(a), which yields the rows in either case.
b = torch.stack(list(a))
b.shape[0]

13

In [138]:
# Print the entry of `names` found at each element of `a`.
# NOTE(review): this indexes `names` with the elements of `a`, so it appears to expect `a`
# to hold integer positions; the sampling cell in this notebook stores embedding rows in
# `a` instead - confirm which version of `a` this cell is meant to run against.
for i in a:
    print(names[i])

bassoon00_41000.jpg
cello07_122500.jpg
clarinet06_24700.jpg
double_bass00_54800.jpg
flute00_27000.jpg
horn03_117000.jpg
oboe02_46600.jpg
sax03_30000.jpg
trombone07_10800.jpg
trumpet00_64900.jpg
tuba03_190400.jpg
viola09_28700.jpg
violin03_24100.jpg


In [116]:
# Load the stored embeddings from embeddingPath and display them.
# NOTE(review): torch.load unpickles the file - only load embedding files from a trusted source.
embeddingArr = torch.load(embeddingPath)
embeddingArr

tensor([[ 0.0282, -0.0097,  0.0404,  ...,  0.0205,  0.0225, -0.0015],
        [ 0.0186,  0.0081,  0.0299,  ...,  0.0175,  0.0199, -0.0075],
        [ 0.0140,  0.0025,  0.0328,  ...,  0.0235,  0.0262, -0.0143],
        ...,
        [ 0.0170,  0.0042,  0.0342,  ...,  0.0125,  0.0208, -0.0133],
        [ 0.0193, -0.0198,  0.0453,  ...,  0.0187,  0.0277, -0.0152],
        [ 0.0134,  0.0091,  0.0422,  ...,  0.0106,  0.0230, -0.0014]])

In [14]:

# Count the regular files (sub-directories excluded) in the bassoon training folder.
bassoon_dir = f"{dataset_path}/train/bassoon"
sum(os.path.isfile(os.path.join(dataset_path, "train/bassoon", entry)) for entry in os.listdir(bassoon_dir))




1307

In [22]:



# Training pipeline: resize to 1.25 * img_size, take a random resized crop of side img_size,
# then convert to a tensor and normalise every channel with mean 0.5 and std 0.5.
upscaled_size = img_size + int(.25*img_size)
train_steps = [
    T.Resize(upscaled_size),
    T.RandomResizedCrop(img_size, scale=(0.8, 1.0)),
    T.ToTensor(),
    T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
train_transforms = T.Compose(train_steps)

# Validation pipeline: no random augmentation, same tensor conversion and normalisation.
val_steps = [
    T.Resize(img_size),
    T.ToTensor(),
    T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
val_transforms = T.Compose(val_steps)


In [24]:
# Label-based baseline: ImageFolder takes each image's class from its sub-folder.
train_dataset = torchvision.datasets.ImageFolder(
    root=f"{dataset_path}/{train_folder}",
    transform=train_transforms,
)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
)

In [25]:
# Run through the whole training loader once, keeping every image batch and label batch.
imgs_list, labs_list = [], []
for batch_imgs, batch_labs in train_dataloader:
    imgs_list += [batch_imgs]
    labs_list += [batch_labs]


    

In [26]:
# Show the labels of the first collected batch.
labs_list[0]

tensor([0, 0, 1, 1])

In [27]:
# One learnable 256-dimensional vector per class. The table is sized from the dataset's
# class list instead of a fixed 2, so every label the loader can yield has a row
# (the training split listed above has 13 instrument folders; labels >= 2 would be out of range).
label_emb = torch.nn.Embedding(len(train_dataset.classes), 256)

In [30]:
# Look up the embedding vector for each label of the first batch.
label_emb(labs_list[0])

tensor([[-0.7207,  0.9875,  0.3469,  ..., -0.6233,  1.3831, -1.0459],
        [-0.7207,  0.9875,  0.3469,  ..., -0.6233,  1.3831, -1.0459],
        [-1.0133,  1.3338,  0.4389,  ...,  0.3592,  0.4197, -1.2003],
        [-1.0133,  1.3338,  0.4389,  ...,  0.3592,  0.4197, -1.2003]],
       grad_fn=<EmbeddingBackward0>)

In [None]:
class CustomDataset(Dataset):
    """Image dataset that pairs every ``.jpg`` with a stored embedding row.

    Images are collected one level below ``image_path``
    (``image_path/<class folder>/<file>.jpg``). The embedding returned for an image is the
    row of the tensor loaded from ``args.embedding_path`` whose position equals the
    position of the image's file name in the list read from ``args.name_path``.
    """

    def __init__(self, args, image_path, transform=None):
        """
        Args:
            args: object exposing ``name_path``, ``embedding_path`` and ``img_size``.
            image_path: root folder of the split; a trailing slash is optional.
            transform: optional callable applied to the RGB PIL image.
        """
        self.image_path = image_path
        # Was stored as `self.transfrom`, so __getitem__ failed on `self.transform`.
        self.transform = transform
        self.name_path = args.name_path
        self.image_dim = (args.img_size, args.img_size)

        # NOTE(review): torch.load unpickles the file - only load trusted embedding files.
        self.embedding_array = torch.load(args.embedding_path)
        # atleast_1d: np.loadtxt returns a 0-d array when the file holds a single name.
        self.embedding_names = np.atleast_1d(np.loadtxt(args.name_path, delimiter=',', dtype=str))

        # file name -> row index, built once so each item lookup is a dict access instead
        # of a scan over every name. On duplicate names the first occurrence is used.
        # assumes the name list is one-dimensional - TODO confirm for other CSV layouts
        self._name_to_row = {}
        for row, name in enumerate(self.embedding_names.ravel()):
            self._name_to_row.setdefault(str(name), row)

        # os.path.join keeps the pattern correct with or without a trailing slash;
        # sorting makes the index -> image mapping reproducible across runs.
        self.file_list = sorted(glob.glob(os.path.join(self.image_path, "*")))
        self.data = []
        for class_path in self.file_list:
            self.data.extend(sorted(glob.glob(os.path.join(class_path, "*.jpg"))))

    def __len__(self):
        """Number of ``.jpg`` images found under ``image_path``."""
        return len(self.data)

    def _embedding_for(self, file_name):
        """Return the embedding stored for ``file_name``; raise KeyError if there is none."""
        try:
            row = self._name_to_row[file_name]
        except KeyError:
            # The previous np.where lookup silently produced an empty tensor here.
            raise KeyError(f"No embedding listed for {file_name!r} in {self.name_path}") from None
        return self.embedding_array[row].squeeze()

    def __getitem__(self, idx):
        """Return ``(image, embedding)`` for the image at position ``idx``.

        Raises:
            KeyError: if the image's file name is not present in the name list.
        """
        img_path = self.data[idx]
        # convert() reads the pixel data, so the file can be closed right afterwards.
        # (The earlier `image.Load()` call does not exist on PIL images; the method is `load`.)
        with Image.open(img_path) as handle:
            image = handle.convert("RGB")

        embedding = self._embedding_for(os.path.basename(img_path))

        if self.transform:
            image = self.transform(image)

        return image, embedding

In [3]:
class CustomDataset(Dataset):
    """Image dataset that returns ``(image, embedding)`` pairs.

    Images are collected one level below ``image_path``
    (``image_path/<class folder>/<file>.jpg``). The embedding of an image is the row of
    the tensor stored at ``embeddingPath`` whose position equals the position of the
    image's file name in the list stored at ``namePath``.
    """

    def __init__(self, image_path, embeddingPath, namePath, transform=None):
        """
        Args:
            image_path: root folder of the split (e.g. the trial folder for demonstration,
                the train folder for application); a trailing slash is optional.
            embeddingPath: file holding the embedding tensor, read with ``torch.load``.
            namePath: comma-delimited text file with the file name of each embedding row.
            transform: optional callable applied to the RGB PIL image.
        """
        self.image_path = image_path
        self.transform = transform

        # Gather image paths. os.path.join keeps the pattern correct with or without a
        # trailing slash; sorting makes the index -> image mapping reproducible.
        file_list = sorted(glob.glob(os.path.join(self.image_path, "*")))
        self.data = []
        for class_path in file_list:
            self.data.extend(sorted(glob.glob(os.path.join(class_path, "*.jpg"))))

        # Read the embedding tensor and the associated names.
        # NOTE(review): torch.load unpickles the file - only load trusted embedding files.
        self.embeddingArr = torch.load(embeddingPath)
        # atleast_1d: np.loadtxt returns a 0-d array when the file holds a single name.
        self.fileNames = np.atleast_1d(np.loadtxt(namePath, delimiter=',', dtype=str))
        self.img_dim = (64, 64)

        # file name -> row index, built once so each item lookup is a dict access instead
        # of a scan over every name. On duplicate names the first occurrence is used.
        # assumes the name list is one-dimensional - TODO confirm for other CSV layouts
        self._name_to_row = {}
        for row, name in enumerate(self.fileNames.ravel()):
            self._name_to_row.setdefault(str(name), row)
        self._name_source = namePath  # kept only for error messages

    def __len__(self):
        """Number of ``.jpg`` images found under ``image_path``."""
        return len(self.data)

    def _embedding_for(self, file_name):
        """Return the embedding stored for ``file_name``; raise KeyError if there is none."""
        try:
            row = self._name_to_row[file_name]
        except KeyError:
            # The previous np.where lookup silently produced an empty tensor here.
            raise KeyError(f"No embedding listed for {file_name!r} in {self._name_source}") from None
        return self.embeddingArr[row].squeeze()

    def __getitem__(self, idx):
        """Return ``(img, embedding)`` for the image at position ``idx``.

        Raises:
            KeyError: if the image's file name is not present in the name list.
        """
        img_path = self.data[idx]

        # convert() reads the pixel data, so the file can be closed right afterwards.
        with Image.open(img_path) as handle:
            img = handle.convert("RGB")

        embedding = self._embedding_for(os.path.basename(img_path))

        if self.transform:
            img = self.transform(img)

        return img, embedding

In [4]:

# Every parameter of get_data() should be available on args in the actual implementation
def get_data(img_size, dataset_path, embeddingPath, namePath, train_folder, val_folder, batch_size, num_workers):
    """Build the training and validation dataloaders over image/embedding pairs.

    Args:
        img_size: side length used by the resize / crop transforms.
        dataset_path: folder that contains the train and validation sub-folders.
        embeddingPath: embedding file handed to both CustomDataset instances.
        namePath: file-name list handed to both CustomDataset instances.
        train_folder: sub-folder of ``dataset_path`` holding the training images.
        val_folder: sub-folder of ``dataset_path`` holding the validation images.
        batch_size: training batch size; the validation loader uses twice this value.
        num_workers: worker count passed to both DataLoaders.

    Returns:
        tuple: ``(train_dataloader, val_dataloader)``; the training loader shuffles, the
        validation loader does not.
    """
    def tensor_and_normalise():
        # Shared tail of both pipelines: tensor conversion, then mean 0.5 / std 0.5 per channel.
        return [T.ToTensor(), T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]

    # Training: resize to 1.25 * img_size, then a random resized crop of side img_size.
    train_pipeline = torchvision.transforms.Compose(
        [
            T.Resize(img_size + int(.25*img_size)),
            T.RandomResizedCrop(img_size, scale=(0.8, 1.0)),
        ]
        + tensor_and_normalise()
    )
    # Validation: plain resize, no random augmentation.
    val_pipeline = torchvision.transforms.Compose([T.Resize(img_size)] + tensor_and_normalise())

    train_set = CustomDataset(
        image_path=f"{dataset_path}/{train_folder}/",
        embeddingPath=embeddingPath,
        namePath=namePath,
        transform=train_pipeline,
    )
    val_set = CustomDataset(
        image_path=f"{dataset_path}/{val_folder}/",
        embeddingPath=embeddingPath,
        namePath=namePath,
        transform=val_pipeline,
    )

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(val_set, batch_size=2*batch_size, shuffle=False, num_workers=num_workers)

    return train_loader, val_loader

In [5]:
# Build both loaders from the notebook-level configuration.
train_dataloader, val_dataloader = get_data(
    img_size=img_size,
    dataset_path=dataset_path,
    embeddingPath=embeddingPath,
    namePath=embeddingNamePath,
    train_folder=train_folder,
    val_folder=val_folder,
    batch_size=batch_size,
    num_workers=num_workers,
)

In [6]:
# Run through the training loader once, keeping each image batch and its embedding batch.
# labsList stays empty: this loader yields (images, embeddings), not labels.
imgsList, labsList, embList = [], [], []
for img_batch, emb_batch in train_dataloader:
    imgsList.append(img_batch)
    embList.append(emb_batch)




In [15]:
# Read the file names that accompany the embeddings (comma-delimited text, kept as strings)
# and display them.
names = np.loadtxt(embeddingNamePath, delimiter=',', dtype=str)
names

array(['bassoon00_126300.jpg', 'bassoon00_122600.jpg',
       'bassoon00_61200.jpg', ..., 'violin08_84800.jpg',
       'violin01_8900.jpg', 'violin09_102600.jpg'], dtype='<U24')

In [20]:
# Remove every run of non-letter characters from each file name and print what is left
# (e.g. digits, "_" and "." are dropped, so the "jpg" extension stays attached).
for raw_name in names:
    print(sub('[^a-zA-Z]+', '', raw_name))

bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg
bassoonjpg

### Next piece of the puzzle 

The next relevant piece of the source code is the `one_epoch` method in `ddpm_conditional`, specifically line 118.

In [None]:
    # This uses a dataloader that is predefined in the prepare function
def one_epoch(self, train=True, use_wandb=False):
    """Run one pass over ``self.train_dataloader`` and return the mean batch loss.

    Each batch is unpacked as ``(images, embeddings)``. The embeddings are handed to the
    model as conditioning and are replaced by ``None`` for roughly 10% of the batches, so
    the model also sees unconditional inputs.

    Args:
        train: when True the model is put in train mode, gradients are enabled and
            ``self.train_step(loss)`` runs for every batch; otherwise the model is put in
            eval mode and each batch runs under ``torch.inference_mode()``.
        use_wandb: when True (and ``train`` is True) the batch loss and the scheduler's
            current learning rate are logged with ``wandb.log``.

    Returns:
        float: mean of the per-batch losses, or 0.0 if the dataloader yields no batch.
    """
    # NOTE(review): the training dataloader is iterated even when train=False - confirm
    # whether evaluation is meant to use a separate validation loader.
    if train:
        self.model.train()
    else:
        self.model.eval()

    total_loss = 0.0
    num_batches = 0
    pbar = progress_bar(self.train_dataloader, leave=False)
    for images, embeddings in pbar:
        grad_mode = torch.enable_grad() if train else torch.inference_mode()
        # Both managers must be listed: `with A and B:` evaluates to B alone, so the
        # autocast context was previously never entered.
        with torch.autocast("cuda"), grad_mode:
            images = images.to(self.device)
            embeddings = embeddings.to(self.device)
            t = self.sample_timesteps(images.shape[0]).to(self.device)
            x_t, noise = self.noise_images(images, t)
            if np.random.random() < 0.1:
                embeddings = None  # drop the conditioning for this batch
            predicted_noise = self.model(x_t, t, embeddings)
            loss = self.mse(noise, predicted_noise)

        # Accumulate a plain float so the running total does not hold on to each
        # batch's autograd graph.
        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1

        if train:
            self.train_step(loss)
            if use_wandb:
                wandb.log({"train_mse": batch_loss,
                           "learning_rate": self.scheduler.get_last_lr()[0]})
        pbar.comment = f"MSE={batch_loss:2.3f}"

    # The previous `avg_loss.mean()` was the mean of a single scalar, i.e. the sum over
    # batches, and failed on an empty loader because the accumulator was still a float.
    return total_loss / num_batches if num_batches else 0.0