In [257]:
import huggingface_hub

# paths to various models
model_path_configs = {
        "Humpback Whales":      ("Intelligent-Instruments-Lab/rave-models", "humpbacks_pondbrain_b2048_r48000_z20.ts"), 
        "Magnets":              ("Intelligent-Instruments-Lab/rave-models", "magnets_b2048_r48000_z8.ts"), 
        "BigEnsemble":          ("Intelligent-Instruments-Lab/rave-models", "crozzoli_bigensemblesmusic_18d.ts"),
        "BirdDawnChorus":       ("Intelligent-Instruments-Lab/rave-models", "birds_dawnchorus_b2048_r48000_z8.ts"), 
        "SpeakingAndSinging":   ("Intelligent-Instruments-Lab/rave-models", "voice-multi-b2048-r48000-z11.ts"), 
        "Resonator Piano":      ("Intelligent-Instruments-Lab/rave-models", "mrp_strengjavera_b2048_r44100_z16.ts"),
        "Multimbral Guitar":    ("Intelligent-Instruments-Lab/rave-models", "guitar_iil_b2048_r48000_z16.ts"),
        "Organ Archive":        ("Intelligent-Instruments-Lab/rave-models", "organ_archive_b2048_r48000_z16.ts"),
        "Water":                ("Intelligent-Instruments-Lab/rave-models", "water_pondbrain_b2048_r48000_z16.ts"),
        "Brass Sax":            ("shuoyang-zheng/jaspers-rave-models", "aam_brass_sax_b2048_r44100_z8_noncausal.ts"),
        "Speech":               ("shuoyang-zheng/jaspers-rave-models", "librispeech100_b2048_r44100_z8_noncausal.ts"),
        "String":               ("shuoyang-zheng/jaspers-rave-models" ,"aam_string_b2048_r44100_z16_noncausal.ts"),
        "Singer":               ("shuoyang-zheng/jaspers-rave-models","gtsinger_b2048_r44100_z16_noncausal.ts"),
        "Bass":                 ("shuoyang-zheng/jaspers-rave-models","aam_bass_b2048_r44100_z16_noncausal.ts"),
        "Drum":                 ("shuoyang-zheng/jaspers-rave-models","aam_drum_b2048_r44100_z16_noncausal.ts"),
        "Gtr Picking":          ("shuoyang-zheng/jaspers-rave-models","guitar_picking_dm_b2048_r44100_z8_causal.ts"),
        # "Percussion-LB":        ("lancelotblanchard/rave_percussion", "Percussion_LB': 'percussion.ts")
    }

model_path_config_keys = sorted(model_path_configs)
model_paths_cache = {}

def GetModelPath(model_path_name):
    model_path = ()

    if model_path_name in model_paths_cache.keys():
        model_path = model_paths_cache[model_path_name]
    else:
        repo_id, filename = model_path_configs[model_path_name]
        # print("repo_id", repo_id)
        # print("filename", filename)
        model_path = huggingface_hub.hf_hub_download(
        repo_id =repo_id,
        filename = filename,
        cache_dir="../huggingface_hub_cache",
        force_download=False,
        )
        # print(f"Generated Model Path for {filename}.")

        model_paths_cache[model_path_name] = model_path
        
    return model_path 

def saveAudio(file_path, audio):
    with open(file_path + '.wav', 'wb') as f:
        f.write(audio.data)
        



In [258]:
import torch
import copy
import librosa
from IPython.display import Audio, display
import pandas as pd
import ast
import matplotlib.pyplot as plt
import os
import h5py
import pandas as pd
import numpy as np
import copy

def AverageRaveModels(rave_a, rave_b, bias = 0):

    r1_ratio = .5
    r2_ratio = .5

    messages = {}
    # bias between -1 and 1
    if abs(bias) <= 1:
        if bias > 0:
            r1_ratio = .5 + bias/2
            r2_ratio = 1.0 - r1_ratio

            rave_temp = rave_a
        elif bias < 0:
            r2_ratio = .5 + abs(bias)/2
            r1_ratio = 1.0 - r2_ratio
    else:
        print(f"Unable to apply bias {bias} - bias must be between -1 and 1.")
    
    # Get state dictionaries of both models
    rave_a_params = rave_a.state_dict()
    rave_b_params = rave_b.state_dict()
    
    rave_avg = copy.deepcopy(rave_a)
    avg = rave_avg.state_dict()    

    keys_averaged={}
    keys_not_averaged={}
    for key in rave_a_params:
        if key in rave_b_params:
            try:
                avg[key] = ((rave_a_params[key] * r1_ratio) + (rave_b_params[key] * r2_ratio)) 
                # keys_averaged.append(key)
                keys_averaged[key]=(key, rave_a_params[key].shape, rave_b_params[key].shape, "")
            except Exception as e:
                print(f"Error averaging key {key}: {e}")
                # keys_not_averaged.append(key)
                keys_not_averaged[key]=(key, rave_a_params[key].shape, rave_b_params[key].shape, e)
        else:
            print(f"Key {key} not found in rave_b parameters, skipping.")
            # keys_not_averaged(key)
            keys_not_averaged[key]=(key, rave_a_params[key].shape, "Key not found in rave_b parameters.", "")
        
    messages["keys_averaged"] = keys_averaged
    messages["keys_not_averaged"] = keys_not_averaged

    # Commit the changes
    rave_avg.load_state_dict(avg) 
   
    # for key in rave_avg.state_dict():
    #     if key in keys_not_averaged:
    #         print(f'Key: {key}: Unable to average.')
    #     else:
    #         try:
    #             print(key)
    #             print(f'rave_a: {rave_a_params[key][...,0]}')
    #             print(f'rave_b: {rave_b_params[key][...,0]}')
    #             print(f'rave_avg: {rave_avg.state_dict()[key][...,0]}\n\n')
    #         except KeyError as e:
    #             print (f'Only one of the original models had a key for {key}')
    #         except IndexError as e:
    #             print(f'Key: {key} - ', e)
    
    return rave_avg, messages


In [259]:
available_audio_files=[
    "RJM1240-Gestures.wav",
    "SilverCaneAbby_ThreeVoices_v1r1.wav",
    "SingingBowl_Singing-Omni_sixInchesAbove_nm.wav"
]

def GenerateAudio(model_name_a, model_name_b, audio_file_name, bias=0): #audio_file_name="RJM1240-Gestures.wav"

    ###############################################
    # Choose models from filenames dictionary created in previous cell
    # Note: model_path_a is always used to initialize the averaged model.
    # Switching them gets different results if the parameters are not all matched.
    ###############################################
    # Examples - this matches only 21 params, but it sounds like maybe sosme of both are in the result.
    model_path_a = GetModelPath(model_name_a)
    model_path_b = GetModelPath(model_name_b)

    # Examples: This has 76 params averaged
    # model_path_a = model_paths['Water']
    # model_path_b = model_paths['Organ Archive']

    # Examples: All Params Match but high pitch for averaged version
    # model_path_a = model_paths['Organ Archive']
    # model_path_b = model_paths['Multimbral Guitar']
    #
    # model_path_a = model_paths['String']
    # model_path_b = model_paths['Singer']


    #####################################
    # Set biases between -1 and 1 to bias the result towards one of the models
    #   0 = standard average
    # > 0 = biased towards model_a
    # < 0 = biases towards  model_b
    #####################################
    # biases=[0, .2, .4, .5, .7, -.4, -.6, -.9]
    biases=[bias]

    ####################################
    # Choose Audio File to encode/decode
    #####################################
    # audio_file_name = "RJM1240-Gestures.wav"
    print("Audio File Name:", audio_file_name)
    audio_file = os.path.join('assets', audio_file_name)
    print("Audio File:", audio_file)


    ####################################
    # Generate Audio Files
    # Audio files are created in the assets folder
    generate_audio_files = False

    rave_a = torch.jit.load(model_path_a)
    rave_b = torch.jit.load(model_path_b)

    # Let's load a sample audio file
    y, sr = librosa.load(audio_file)

    # Convert audio to a PyTorch tensor and reshape it to the
    # required shape: (batch_size, n_channels, n_samples)
    audio = torch.from_numpy(y).float()
    audio = audio.reshape(1, 1, -1) 

    messages={}
    audio_outputs={}
    for bias in biases:

        # Average the rave models
        # rave_avg, numb_params_mod, numb_params_unable_to_mod = AverageRaveModels(rave_a, rave_b, bias=bias)
        rave_avg, new_msgs = AverageRaveModels(rave_a, rave_b)
        messages |= new_msgs 

        # no decode the results back to audio
        with torch.no_grad():
            # encode the audio with the new averaged models
            try:
                latent_a = rave_a.encode(audio)
                latent_b = rave_b.encode(audio)
                latent_avg = rave_avg.encode(audio)

                # decode individual and averaged models
                decoded_a = rave_a.decode(latent_a)
                decoded_b = rave_b.decode(latent_b)
                decoded_avg = rave_avg.decode(latent_avg)
                audio_outputs[bias] = decoded_avg[0]
                # print(bias)
                # print (audio)
            except:
                print(f'Bias {bias} generated an error. Removing it from list of biases.')
                biases.remove(bias)
                # print(biases)

        # print(decoded_a.shape) 
        # print(decoded_b.shape) 
        # print(decoded_avg.shape) 

        model_a_file=model_path_a.rsplit("/")[-1]
        model_b_file=model_path_b.rsplit("/")[-1]

        print("---------------------------------------\n")
        # Original Audio
        print("\n")
        # Let's listen to the decoded audio and compare
        # print("Original Audio")
        # original_audio = Audio(y, rate=sr)
        original_audio = (sr, y)
        # display(original_audio)
        print("Y:", y)
        print("---------------------------------------\n")

        # Decoded Audio
        print("Encoded and Decoded using original models")
        print ("Decoded A: ", decoded_a[0])
        model_a_audio =  decoded_a[0].detach().numpy()
        model_a_audio = (sr, model_a_audio.squeeze())
        print ("model_a_audio A: ", model_a_audio)
        # display(model_a_audio)
        # saveAudio('assets/' + model_a_file[: 7] + '_only.wav', a)

        model_b_audio = (sr, decoded_b[0].detach().numpy().squeeze())
        # model_b_audio = Audio(decoded_b[0].detach().numpy(), rate=sr)
        # display(model_b_audio)
        # # saveAudio('assets/' + model_b_file[: 7] + '_only.wav', a)

        print("Encoded and Decoded using Averaged Models")
        print("with Biases: ", biases)
        print("\nNumber of params able to average:", len(messages["keys_averaged"]))
        print("Number of params unable to average:", len(messages["keys_not_averaged"]))

        output_file_prefix = f'assets/{model_a_file[: 7]}-{model_b_file[: 7]}_'

        bias = biases[0]
        averaged_audio = (sr, audio_outputs[bias].detach().numpy().squeeze()) 
        #         averaged_audio[bias] = a       
        #         # display(a)
        #         # print(f"Average of Models, bias = {bias}")
        #         # if generate_audio_files:
        #         #     saveAudio(output_file_prefix + 'bias_' + str(bias), a)
        #     else:
        #         print(f"Average of Models with bias {bias} is unavailable.")

        # This version is for implementing multiple bias processing
        # averaged_audio = {}
        # for bias in biases:
        #     if bias in audio_outputs.keys(): 
        #         a = (sr, audio_outputs[bias].detach().numpy().squeeze()) 
        #         averaged_audio[bias] = a       
        #         # display(a)
        #         # print(f"Average of Models, bias = {bias}")
        #         # if generate_audio_files:
        #         #     saveAudio(output_file_prefix + 'bias_' + str(bias), a)

        #         print("---------------------------------------\n")
        #     else:
        #         print(f"Average of Models with bias {bias} is unavailable.")

        # mesages_df = pd.DataFrame.from_dict(messages)
        # messages_passed={"keys_averaged" :messages['keys_averaged'], "keys_unable_to_average": messages['keys_not_averaged']}
        
        return original_audio, model_a_audio, model_b_audio, averaged_audio, pd.DataFrame(messages['keys_averaged']).transpose(), pd.DataFrame(messages["keys_not_averaged"]).transpose()

In [260]:
# o, a, b, avgs = GenerateAudio('Humpback Whales', 'Humpback Whales', 'RJM1240-Gestures.wav', 0)

# print('***********************************')
# print('Original Audio:')
# display(o)
# print('Model A Encode/Decode:')
# display(a)
# print('Model B Encode/Decode:')
# display(b)

# for key in avgs.keys():
#     print(f'Averaged with Bias {key}:')
#     display(avgs[key])


In [261]:
import gradio as gr

AverageModels = gr.Interface(title="Process Audio Through Averaged Models.",
    fn=GenerateAudio,
    inputs=[
        # gr.Dropdown(model_path_config_keys),
        gr.Radio(model_path_config_keys, label="Select Model A", container=True),
        gr.Radio(model_path_config_keys, label="Select Model B", container=True),
        gr.Dropdown(available_audio_files, label="Available Audio Files to Encode/Decode")
        #gr.UploadButton(label="Upload Audio", file_types=["audio"], file_count=1)
        # "number"
        ],
    # if no way to pass dictionary, pass separate keys and values and zip them.
    outputs=[
        gr.Audio(label="Original Audio"),
        gr.Audio(label="Encoded/Decoded through Model A"),
        gr.Audio(label="Encoded/Decoded through Model B"),
        gr.Audio(label="Audio Encoded/Decoded through averaged model"),
        gr.Dataframe(label="Params Averagerd", show_copy_button="True", min_width=400),
        gr.Dataframe(label="Params Not Averaged", show_copy_button="True", min_width=400)]
        # gr.TextArea(label="Messages:")]

    #  upload_button.upload()
)

AverageModels.launch(max_file_size=50 * gr.FileSize.MB)

* Running on local URL:  http://127.0.0.1:7902
* To create a public link, set `share=True` in `launch()`.




Audio File Name: RJM1240-Gestures.wav
Audio File: assets/RJM1240-Gestures.wav
---------------------------------------



Y: [-1.6850501e-04 -1.7852047e-04 -1.0835980e-04 ...  2.4873929e-08
  3.4752581e-08  1.3153058e-08]
---------------------------------------

Encoded and Decoded using original models
Decoded A:  tensor([[ 1.3161e-06,  1.3001e-06,  1.7076e-06,  ..., -5.1978e-06,
         -5.5370e-06, -5.0753e-06]])
model_a_audio A:  (22050, array([ 1.3161197e-06,  1.3000845e-06,  1.7075702e-06, ...,
       -5.1978500e-06, -5.5370192e-06, -5.0753383e-06],
      shape=(1474560,), dtype=float32))
Encoded and Decoded using Averaged Models
with Biases:  [0]

Number of params able to average: 286
Number of params unable to average: 0


  data = data.astype(np.int16)
