In [6]:
from IPython.display import Audio, display
import librosa as li
import torch
# Turn off autograd globally: the notebook only runs the model forward
# (encode/decode), so no gradients are ever needed.
torch.set_grad_enabled(False)

import numpy as np
from sklearn.neighbors import NearestNeighbors
from mix_utils import normalize
import pandas as pd

# CSV read (without a header row) by the retrieval cell below, which takes the
# names from column 0 and the arousal/valence values from columns 3 and 4.
FEATURES_FILE_PATH = 'data/features2.csv'

In [7]:
# Query location in the arousal/valence plane and how many tracks to retrieve.
# (An earlier script form of this cell, `mixing.py`, took these three values
# as positional command-line arguments in the same order.)
arousal_coordinate = 0.5
valence_coordinate = 0.6
neighbors = 5

# The features CSV has no header row, so its columns are addressed by position.
df = pd.read_csv(FEATURES_FILE_PATH, header=None)

# Column 0 -> names (audio file paths), column 3 -> arousal, column 4 -> valence.
# The two emotion columns are normalized together by the project helper.
names = df.iloc[:, 0].to_numpy()
arousal, valence = normalize(df.iloc[:, 3].to_numpy(), df.iloc[:, 4].to_numpy())

# One row per entry: [arousal, valence].
points = np.column_stack([arousal, valence])

# Euclidean k-NN index over those points; fit() returns the estimator itself.
nn = NearestNeighbors(metric='euclidean', n_neighbors=neighbors).fit(points)

# kneighbors expects a 2-D query: a single row with the requested coordinates.
# NOTE(review): the query coordinates are used as-is and are not passed through
# normalize() -- confirm they are already on the normalized scale.
point = np.array([[arousal_coordinate, valence_coordinate]])

# kneighbors returns (distances, indices); only the indices are used.
_, indices = nn.kneighbors(point)

# Indexing with the (1, k) index array gives a (1, k) array of names, which is
# why later cells read the paths from nearest_neighbors[0].
nearest_neighbors = names[indices]

print(nearest_neighbors)

[['audio2/archive/musicnet/musicnet/train_data/2365.wav'
  'audio2/archive/musicnet/musicnet/test_data/2106.wav'
  'audio2/archive/musicnet/musicnet/train_data/1923.wav'
  'audio2/archive/musicnet/musicnet/train_data/2390.wav'
  'audio2/archive/musicnet/musicnet/train_data/2348.wav']]


In [9]:
# Load the TorchScript model from disk and put it in evaluation mode.
model = torch.jit.load("models/musicnet.ts").eval()

# Decode every retrieved file at 22050 Hz. li.load returns (samples, rate);
# only the samples are kept.
audio_data = [li.load(path, sr=22050)[0] for path in nearest_neighbors[0]]


In [10]:
# Keep at most the first 10 seconds of each clip (22050 samples per second).
# Slicing never pads, so a shorter clip stays at its original length.
cut_audio_data = [audio[:22050 * 10] for audio in audio_data]

# Render an inline player for each trimmed clip, in retrieval order.
for clip in cut_audio_data:
    display(Audio(clip, rate=22050))


In [11]:
# Encode each trimmed clip with the model. The waveform is reshaped to
# (1, 1, n_samples) before being handed to the encoder.
latent_data = [
    model.encode(torch.from_numpy(cut_audio).reshape(1, 1, -1))
    for cut_audio in cut_audio_data
]

if not latent_data:
    raise ValueError("no clips to mix: cut_audio_data is empty")

# Average over the latents that were actually encoded. Dividing by the
# `neighbors` setting instead would silently mis-scale the mix whenever that
# value is changed without re-running the loading cells. torch.stack also
# raises if the latents differ in shape (e.g. clips of unequal length), so a
# mismatch cannot be averaged by accident.
merged_latent = torch.stack(latent_data).mean(dim=0)

# Decode the averaged latent and flatten the result to a 1-D waveform.
merged_audio = model.decode(merged_latent).numpy().reshape(-1)

# Optional export, left disabled: `sf` and `sampling_rate` are not defined in
# the cells shown here.
#sf.write("output.wav", merged_audio, sampling_rate)
display(Audio(merged_audio, rate=22050))