# Main

This model takes a valence-arousal (VA) vector and a Bouba-Kiki value as input. It outputs an encoded earcon representation, which is passed to the MusicGen Decoder to generate the final earcon.

The output of the MusicGen Decoder is then encoded by EncodecFeatureExtractor, and the resulting vectors are compared against the target earcon's Encodec vector to calculate the loss.

## Dataset

Each row in the dataset will consist of:
- An Earcon represented by an Encodec vector
- An image represented as a Valence-Arousal (VA) vector
- A Bouba-Kiki Value derived from the image
- A Pseudoword

Each image is paired with the earcon whose Encodec vector has the highest cosine similarity to the image's VA vector. The Bouba-Kiki value and pseudoword are generated after the images have been paired with the audio.

In [2]:
# import relevant libraries
import pandas as pd
import numpy as np
import ast
import torch
import random


random.seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"
pd.set_option('display.max_columns', None)

### Earcons

In [None]:
# load in earcons
# Forward slashes resolve on every OS. The previous backslash-separated
# literal contained "\e", an invalid escape sequence that newer Python
# versions warn about, and it only resolved as a path on Windows.
earcons = pd.read_csv('dataset/earcon_dataset/earcon_dataset.csv')

# "query" holds a Python literal as text; parse it and keep its first element
earcons["query"] = earcons["query"].apply(lambda raw: ast.literal_eval(raw)[0])

# .copy() gives an independent frame, so the column added below is not
# written to a slice of the original (avoids SettingWithCopyWarning)
earcons = earcons[["query", "name"]].copy()

earcons["filepaths"] = earcons["name"].apply(lambda x: f"dataset/earcon_dataset/earcons/{x}")

earcons

In [None]:
# prep encodec model
from encodec import EncodecModel


encodec_model = EncodecModel.encodec_model_24khz().to(device)

In [None]:
import torchaudio


def extract_earcon_features(filepaths, encodec_model, target_sample_rate=24000, target_length=512):
    """Encode audio files with EnCodec and return one fixed-length tensor per file.

    Every file is loaded, down-mixed to mono, resampled to
    ``target_sample_rate`` when its native rate differs, encoded, and then
    truncated or zero-padded along the last axis to ``target_length``.

    Args:
        filepaths: Iterable of audio file paths readable by ``torchaudio.load``.
        encodec_model: Model exposing ``encode(waveform)``. Inputs are moved to
            the module-level ``device``, so the model is expected to live
            there as well.
        target_sample_rate: Sample rate the audio is resampled to.
        target_length: Size of the last axis of every returned tensor.

    Returns:
        A list with one float32 CPU tensor per path, with the leading batch
        dimension squeezed out.
    """
    earcon_features = []
    # One resampler per distinct source rate instead of rebuilding it per file.
    resamplers = {}

    for path in filepaths:
        # load in the audio file
        waveform, sample_rate = torchaudio.load(path)
        waveform = waveform.to(device)

        # if stereo, convert to mono
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # resample if necessary
        if sample_rate != target_sample_rate:
            if sample_rate not in resamplers:
                resamplers[sample_rate] = torchaudio.transforms.Resample(
                    sample_rate, target_sample_rate
                ).to(device)
            waveform = resamplers[sample_rate](waveform)

        # add batch dimension so that the shape is
        # [1, 1, num_samples] because encodec
        # expects that format
        waveform = waveform.unsqueeze(0)

        # encode the waveform
        with torch.no_grad():
            encoded_frames = encodec_model.encode(waveform)

        # [0][0] is the first element of the first returned frame.
        # NOTE(review): per the EnCodec API this is the frame's full code
        # tensor (all codebooks), not a single codebook - confirm.
        # Cast to float32 up front: previously only the padded branch became
        # float (through concatenation with float zeros) while truncated
        # clips kept the encoder's dtype, so rows could disagree on dtype.
        compressed_features = encoded_frames[0][0].float()

        # truncate and pad along the last axis
        length = compressed_features.shape[2]
        if length > target_length:
            compressed_features = compressed_features[:, :, :target_length]
        else:
            compressed_features = torch.nn.functional.pad(
                compressed_features, (0, target_length - length)
            )

        # remove the first (batch) dimension and park the result on the CPU
        earcon_features.append(compressed_features.squeeze(0).to("cpu"))

    return earcon_features


# Apply the function to all rows in the earcons dataframe
earcons["earcon_features"] = extract_earcon_features(earcons["filepaths"], encodec_model)
earcons

In [None]:
earcons["earcon_features"][0].shape

In [None]:
# find the largest and smallest values embedded in the earcon features

# Take the extremes from the data itself. The previous fixed starting values
# (0 and 1000000) reported wrong results whenever every value fell outside
# that range, and label-based indexing with range(len(...)) only worked for a
# default integer index. Iterating the column avoids both problems.
feature_maxima = [int(features.max()) for features in earcons["earcon_features"]]
feature_minima = [int(features.min()) for features in earcons["earcon_features"]]

# default=None keeps the cell runnable (printing None) when there are no rows
largest = max(feature_maxima, default=None)
smallest = min(feature_minima, default=None)

print(largest)
print(smallest)

### Images

In [None]:
# load in images
# Forward slashes resolve on every OS; the previous backslash-separated
# literal relied on invalid escape sequences ("\l", "\c", "\i") being kept
# verbatim and only resolved as a path on Windows.
images = pd.read_csv('dataset/landscape1/csvs/image_classification.csv')

# extract top tag and similarity score: both columns hold a Python literal as
# text, of which only the first element is kept
images["top_tags"] = images["top_tags"].apply(lambda raw: ast.literal_eval(raw)[0])
images["similarity_scores"] = images["similarity_scores"].apply(lambda raw: ast.literal_eval(raw)[0])

# Drop leading "./" or "../" path segments only. str.lstrip("../") treats its
# argument as a set of characters, so it would also eat the leading dot of a
# name such as ".cache/img.jpg".
images["image_path"] = images["image_path"].str.replace(r"^(?:\.{1,2}/)+", "", regex=True)

In [None]:
# load CLIP model
from transformers import CLIPProcessor, CLIPModel
from PIL import Image


# Function to calculate image vectors
def calculate_image_vectors(image_paths, clip_model, clip_processor):
    """Compute one CLIP image-feature tensor per image path.

    Args:
        image_paths: Iterable of image file paths that ``Image.open`` can read.
        clip_model: Model exposing ``get_image_features``. Inputs are moved to
            the module-level ``device``, so the model is expected to live
            there as well.
        clip_processor: Processor that turns an image into ``pixel_values``.

    Returns:
        A list of CPU tensors, one per path, each with the leading batch
        dimension squeezed out.
    """
    image_features = []
    for count, image_path in enumerate(image_paths, start=1):
        # The context manager closes the file handle as soon as the processor
        # has read the pixels, so handles do not pile up over long runs.
        with Image.open(image_path) as img:
            pixel_values = clip_processor(images=img, return_tensors="pt").pixel_values.to(device)

        with torch.no_grad():
            features = clip_model.get_image_features(pixel_values)

        # (the per-image debug print of the raw tensor was removed; the
        # progress message below is the only output)
        image_features.append(features.squeeze(0).to("cpu"))

        if count % 1000 == 0:
            print(f"Processed {count} images")
    return image_features


# Apply the function to the images dataframe
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

images["image_features"] = calculate_image_vectors(images["image_path"].tolist(), clip_model, clip_processor)
images

In [None]:
images["image_features"][0].shape

### Calculate Cosine Similarity

This will be used to build the dataset for our model

In [None]:
# calculate the cosine similarity & store in a new df
from sklearn.metrics.pairwise import cosine_similarity


def compute_torch_cosine_similarity(image_vectors, earcon_vectors):
    """Return the image-by-earcon cosine similarity matrix as a NumPy array.

    Args:
        image_vectors: Sequence of equally sized 1-D vectors, one per image.
        earcon_vectors: Sequence of equally shaped 2-D arrays, one per earcon.
            Each is averaged over its first axis, so its second axis must have
            the same length as an image vector.

    Returns:
        A float32 array of shape ``(len(image_vectors), len(earcon_vectors))``.
        Entry ``[i, j]`` is the cosine similarity between image ``i`` and the
        averaged earcon ``j``. An all-zero vector yields similarity 0.
    """
    # Cast both sides to float32 so torch.mm never sees mismatched dtypes
    # (e.g. float64 image vectors against float32 earcon means).
    image_tensor = torch.as_tensor(np.stack(image_vectors)).float()
    earcon_tensor = torch.as_tensor(np.mean(np.stack(earcon_vectors), axis=1)).float()

    # F.normalize clamps the norm by a small epsilon. Plain division turned an
    # all-zero vector (e.g. a silent clip) into NaN, and np.argmax then picks
    # that NaN column as the "best" match.
    image_tensor_norm = torch.nn.functional.normalize(image_tensor, dim=1)
    earcon_tensor_norm = torch.nn.functional.normalize(earcon_tensor, dim=1)

    # Dot products of unit vectors are cosine similarities
    similarity_matrix = torch.mm(image_tensor_norm, earcon_tensor_norm.t())

    return similarity_matrix.numpy()


def process_similarities(images, earcons):
    """Pair every image with its most similar earcon.

    Args:
        images: DataFrame providing the ``split``, ``filename``,
            ``image_path``, ``image_features``, ``top_tags`` and
            ``similarity_scores`` columns.
        earcons: DataFrame providing the ``name``, ``filepaths`` and
            ``earcon_features`` columns.

    Returns:
        A DataFrame with one row per image, holding the image columns, the
        columns of the earcon with the highest cosine similarity, and that
        similarity score.
    """
    # image-by-earcon similarity scores
    scores = compute_torch_cosine_similarity(
        images['image_features'].tolist(),
        earcons['earcon_features'].tolist(),
    )

    # column index of the best-scoring earcon for every image row
    best_match = np.argmax(scores, axis=1)
    best_scores = scores[np.arange(len(best_match)), best_match]

    # earcon rows reordered so that row i belongs to image i
    matched_earcons = earcons.iloc[best_match]

    paired_columns = {
        'split': images['split'],
        'earcon_filename': matched_earcons['name'].values,
        'earcon_filepath': matched_earcons['filepaths'].values,
        'earcon_features': matched_earcons['earcon_features'].values,
        'image_filename': images['filename'],
        'image_filepath': images['image_path'],
        'image_features': images['image_features'],
        'image_tag': images['top_tags'],
        'image_tag_similarity': images['similarity_scores'],
        'similarity_score': best_scores,
    }
    return pd.DataFrame(paired_columns)

earcon_image_dataset = process_similarities(images, earcons)
earcon_image_dataset

In [None]:
earcon_image_dataset.earcon_filename.value_counts()

### Build Pseudowords and Bouba-Kiki value

In [None]:
import random
from joblib import Parallel, delayed
import utils.psword_gen as psword_gen
import utils.psword_utils as psword_utils


def generate_pseudoword_and_bouba_kiki(image_path, sound_dict):
    """Derive the roundness score, pseudoword and weighted angles for one image.

    Args:
        image_path: Path of the image handed to ``psword_utils.process_image``.
        sound_dict: Sound mappings forwarded to the pseudoword generator.

    Returns:
        A tuple ``(roundness, pseudoword, weighted_angles)`` where
        ``roundness`` is a float tensor.
    """
    # NOTE(review): 50 and 150 are presumably edge-detection thresholds -
    # confirm against psword_utils.process_image.
    xs, ys = psword_utils.process_image(image_path, 50, 150)
    angles, roundness_score = psword_utils.calculate_weighted_angles_by_edge_length(xs, ys)

    # The global RNG is re-seeded on every call, so each image starts the
    # generator from the same random state.
    random.seed(42)

    point_count = len(xs)
    word = psword_gen.pseudoword_generator(roundness_score, point_count, sound_dict=sound_dict)

    return torch.tensor(roundness_score).float(), word, angles

# Per-row worker used by the joblib fan-out
def process_row(row):
    """Run the pseudoword / Bouba-Kiki computation for a single dataset row.

    ``row`` only needs to support ``row['image_filepath']``; the sound
    mappings are read from the module-level ``sound_dict``.
    """
    image_path = row['image_filepath']
    return generate_pseudoword_and_bouba_kiki(image_path, sound_dict)


# sound dict
sound_dict = psword_gen.load_sound_mappings('utils/sound_mappings.json')

# Parallelize using joblib.
# process_row only reads 'image_filepath', so each task is handed a one-key
# mapping rather than the full row. Shipping whole rows meant serialising the
# feature tensors of every row to the worker processes for no benefit.
results = Parallel(n_jobs=-1)(
    delayed(process_row)({'image_filepath': image_path})
    for image_path in earcon_image_dataset['image_filepath']
)

# Extract and assign results; naming the columns makes the mapping from the
# returned tuple positions to dataset columns explicit
result_columns = ['roundness', 'pseudoword', 'weighted_angles']
earcon_image_dataset[result_columns] = pd.DataFrame(
    results,
    columns=result_columns,
    index=earcon_image_dataset.index,
)

earcon_image_dataset

### Save to pickle

In [None]:
earcon_image_dataset.to_pickle('dataset/combined_dataset/earcon_image_dataset2.pkl')

## Training

### Load from pickle

In [1]:
# import relevant libraries
import pandas as pd
import torch
import random


random.seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"
pd.set_option('display.max_columns', None)

In [2]:
earcon_image_dataset = pd.read_pickle('dataset/combined_dataset/earcon_image_dataset2.pkl')
earcon_image_dataset

Unnamed: 0,split,earcon_filename,earcon_filepath,earcon_features,image_filename,image_filepath,image_features,image_tag,image_tag_similarity,similarity_score,roundness,pseudoword,weighted_angles
0,test,BS_Bend_20.wav,dataset/earcon_dataset/earcons/BS_Bend_20.wav,"[[tensor(930.), tensor(534.), tensor(530.), te...",Coast-Test (1).jpeg,dataset/landscape1/Testing Data/Coast\Coast-Te...,"[tensor(0.5427), tensor(-0.2114), tensor(-0.54...",a natural landscape,0.098083,0.059398,tensor(0.5484),juxuluji,98.713105
1,test,Failure_01.wav,dataset/earcon_dataset/earcons/Failure_01.wav,"[[tensor(913.), tensor(945.), tensor(530.), te...",Coast-Test (100).jpeg,dataset/landscape1/Testing Data/Coast\Coast-Te...,"[tensor(0.1623), tensor(-0.0117), tensor(-0.17...",a natural landscape,0.120972,0.057352,tensor(0.5156),juxulu,92.807444
2,test,BS_Bend_17.wav,dataset/earcon_dataset/earcons/BS_Bend_17.wav,"[[tensor(319.), tensor(698.), tensor(857.), te...",Coast-Test (101).jpeg,dataset/landscape1/Testing Data/Coast\Coast-Te...,"[tensor(0.5334), tensor(-0.1099), tensor(-0.36...",a calm landscape,0.133911,0.072991,tensor(0.5630),geleje,101.336326
3,test,clock.wav,dataset/earcon_dataset/earcons/clock.wav,"[[tensor(62.), tensor(62.), tensor(62.), tenso...",Coast-Test (102).jpeg,dataset/landscape1/Testing Data/Coast\Coast-Te...,"[tensor(0.8153), tensor(-0.2710), tensor(0.351...",a bright landscape,0.113892,0.056997,tensor(0.5332),juxulu,95.974916
4,test,Failure_01.wav,dataset/earcon_dataset/earcons/Failure_01.wav,"[[tensor(913.), tensor(945.), tensor(530.), te...",Coast-Test (103).jpeg,dataset/landscape1/Testing Data/Coast\Coast-Te...,"[tensor(0.3290), tensor(-0.2365), tensor(-0.14...",a bright landscape,0.127563,0.066242,tensor(0.5516),geleje,99.289937
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,validation,Testeregitar_-_F.wav,dataset/earcon_dataset/earcons/Testeregitar_-_...,"[[tensor(834.), tensor(604.), tensor(432.), te...",Mountain-Valid (95).jpeg,dataset/landscape1/Validation Data/Mountain\Mo...,"[tensor(-0.1323), tensor(0.4341), tensor(0.136...",a intense landscape,0.072266,0.040474,tensor(0.5445),juxuluji,98.015541
11996,validation,Anvil_-_Lokomo_A_100_kg_-_Hammer_on_back_1_tim...,dataset/earcon_dataset/earcons/Anvil_-_Lokomo_...,"[[tensor(170.), tensor(748.), tensor(748.), te...",Mountain-Valid (96).jpeg,dataset/landscape1/Validation Data/Mountain\Mo...,"[tensor(-0.3210), tensor(0.1672), tensor(0.141...",a complex landscape,0.069275,0.056088,tensor(0.5127),juxuluja,92.280799
11997,validation,Short_Cut-Off_Beep.wav,dataset/earcon_dataset/earcons/Short_Cut-Off_B...,"[[tensor(103.), tensor(56.), tensor(0.), tenso...",Mountain-Valid (97).jpeg,dataset/landscape1/Validation Data/Mountain\Mo...,"[tensor(0.3450), tensor(0.3041), tensor(0.0990...",a narrow landscape,0.117004,0.043641,tensor(0.5323),juxuluji,95.818241
11998,validation,BS_Bend_17.wav,dataset/earcon_dataset/earcons/BS_Bend_17.wav,"[[tensor(319.), tensor(698.), tensor(857.), te...",Mountain-Valid (98).jpeg,dataset/landscape1/Validation Data/Mountain\Mo...,"[tensor(0.1858), tensor(-0.0550), tensor(-0.14...",a sharp landscape,0.099854,0.048247,tensor(0.5541),gelejegi,99.729109


### Build Dataloaders

In [3]:
# Partition the dataset by its "split" label and drop the label column from
# each part, since it is constant within a partition
train_df, val_df, test_df = (
    earcon_image_dataset[earcon_image_dataset['split'] == split_name].drop(columns='split')
    for split_name in ('train', 'validation', 'test')
)

In [4]:
train_df = train_df.sample(frac=0.01, random_state=42)
val_df = val_df.sample(frac=0.05, random_state=42)
test_df = test_df.sample(frac=0.1, random_state=42)
len(train_df), len(val_df), len(test_df)

(100, 75, 50)

In [5]:
# Create dataloaders
from utils.musicgen_utils import create_earcon_dataloaders


batch_size = 5
train_loader, val_loader, test_loader = create_earcon_dataloaders(
    train_df,
    val_df,
    test_df,
    batch_size=batch_size,
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


### Model

The model pipeline is as follows:
- The Earcon Encodec Vector is the target
- The VA Vector and Bouba-Kiki Value will be inputs to the model
- The model will output a set of vectors which will be fed to the MusicGen Decoder along with the Pseudoword
- The output of MusicGen Decoder will be encoded by Encodec
- The output of Encodec will be considered the final output, and loss will be calculated based on the difference between this output and the target Encodec vector from the Earcon

In [None]:
# init model
from utils.musicgen_model import *
from utils.musicgen_utils import *
from transformers import MusicgenDecoderConfig


# model_options = {
#     "freeze_musicgen_text_encoder": False,
#     "freeze_musicgen_decoder": False,
#     "freeze_encodec": True,
#     "num_projection_layers": 2,
#     "fusion_hidden_dims": [256]
# }

model = MusicgenForImageLM(
    MusicgenDecoderConfig(
        num_codebooks=1,
        # hidden_size=2048
    ),
    freeze_encodec=True,
    freeze_musicgen=False
)

In [None]:
model

In [None]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.shape)

In [9]:
image_processor = MusicgenImageProcessor()

In [None]:
image_processor

In [None]:
for name, param in image_processor.named_parameters():
    if param.requires_grad:
        print(name, param.shape)

### Training the model

In [12]:
# optimizer
import torch.optim as optim
from torch.nn import CrossEntropyLoss

# hyper parameters
patience = 10
epochs = 100
lr = 1e-4
weight_decay = 1e-5

In [13]:
import os
# Debugging aid for the training run below: requests synchronous CUDA kernel
# launches so that a device-side error is reported at the call that caused it.
# NOTE(review): torch was already imported in earlier cells and may have
# initialised CUDA; this variable is presumably only read at CUDA
# initialisation - confirm it still takes effect here, or set it before
# importing torch.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
# train model
train_musicgen_model(
    model=model,
    image_processor=image_processor,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    test_dataloader=test_loader,
    epochs=epochs,
    model_learning_rate=lr,
    processor_learning_rate=lr,
    weight_decay=weight_decay,
    patience=patience,
)

## Saving the model

In [3]:
version = "9"

In [None]:
save_musicgen_image_model(model, filename=f"MusicGenModel_0{version}.pt")

In [None]:
save_musicgen_image_processor(image_processor, filename=f"MusicGenImageProcessor_0{version}.pt")

## Testing

### Loading the model

In [6]:
version = "9"

In [7]:
# import relevant libraries
import pandas as pd
import torch
import random


random.seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"
pd.set_option('display.max_columns', None)

In [8]:
# Loading model
from utils.musicgen_model import *

model = load_musicgen_image_model(filename=f"MusicGenModel_0{version}.pt")
image_processor = load_musicgen_image_processor(filename=f"MusicGenImageProcessor_0{version}.pt")

  WeightNorm.apply(module, name, dim)


Model loaded from outputs/MusicGenModel_09.pt
Processor loaded from outputs/MusicGenImageProcessor_09.pt


### Test Generation

In [9]:
# Take the first batch from the test loader (replaces the loop-and-break)
batch = next(iter(test_loader))

# Invoke the processor as a callable instead of calling .forward() directly,
# which is the standard way to run a torch module and also runs any
# registered hooks.
# NOTE(review): assumes image_processor is an nn.Module (it exposes
# named_parameters()) - confirm.
inputs = image_processor(batch["image_features"].to(device), batch["roundness"].to(device))

In [10]:
result = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
)
result

torch.Size([5, 128])
torch.Size([5, 128])


  attn_output = torch.nn.functional.scaled_dot_product_attention(


torch.Size([5, 129])
torch.Size([5, 128])


RuntimeError: The size of tensor a (128) must match the size of tensor b (129) at non-singleton dimension 1

### Choosing images

In [None]:
# draw 10 distinct row positions from the test set, in ascending order,
# to use as the image/earcon pairs for testing
elements = sorted(random.sample(range(len(test_df)), 10))
elements

### Generating Earcons 

In [None]:
# generate the audio for the selected pairs
from transformers import T5Tokenizer


tokenizer = T5Tokenizer.from_pretrained("facebook/musicgen-small")

# one generated earcon per selected test row, in the same order as `elements`
audio_list = [
    generate_earcon(
        model,
        test_df["image_features"].iloc[row_pos],
        test_df["roundness"].iloc[row_pos],
    )
    for row_pos in elements
]

### Display results

In [None]:
from IPython.display import Audio, display, Image


def display_image(image_path):
    """Render the image stored at ``image_path`` in the notebook output."""
    picture = Image(filename=image_path)
    display(picture)


def play_audio(filepath):
    """Embed an audio player for the file at ``filepath`` in the notebook output."""
    player = Audio(filepath)
    display(player)


def play_generated_audio(audio, sampling_rate=24000):
    """Embed an audio player for a generated waveform.

    Only ``audio[0]`` is played; it is moved to the CPU and converted to a
    NumPy array before being handed to the player at ``sampling_rate``.
    """
    samples = audio[0].cpu().numpy()
    player = Audio(samples, rate=sampling_rate)
    display(player)

In [None]:
# for every selected pair: show the image, play its paired earcon, then play
# the earcon generated for it
image_paths = test_df["image_filepath"]
earcon_paths = test_df["earcon_filepath"]

for i, row_pos in enumerate(elements):
    # display image
    print(f"Image {i+1}:")
    display_image(image_paths.iloc[row_pos])

    # play paired earcon
    print(f"Original Paired Earcon {i+1}:")
    play_audio(earcon_paths.iloc[row_pos])

    # play generated earcon
    print(f"Generated Earcon {i+1}:")
    play_generated_audio(audio_list[i])