In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.model_selection import train_test_split
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import IPython.display as ipd
import torchaudio
from tqdm import tqdm
import gc
from enum import Enum

# Model Classes

In [2]:
class logmel_rnn(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(logmel_rnn, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        _, h_n = self.gru(x)
        output = self.fc(h_n[-1])  # Use the hidden state from the last layer
        return output
    

class spec_rnn(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=5):
        super(spec_rnn, self).__init__()
        # Initialize LSTM; note the additional argument compared to GRU
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        # Linear layer to map from hidden state space to output space
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Forward pass through LSTM layer
        # h_n is the final hidden state for each element in the batch
        # c_n is the final cell state for each element in the batch
        _, (h_n, c_n) = self.lstm(x)
        # We use the last hidden state to feed into the fully connected layer.
        # The LSTM output `h_n` has dimensions [num_layers, batch, hidden_size]
        # We take the last layer's hidden state
        output = self.fc(h_n[-1])
        return output
    

class chroma_rnn(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(chroma_rnn, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # x: [batch_size, sequence_length, input_size]
        _, h_n = self.gru(x)
        # h_n: [num_layers * num_directions, batch_size, hidden_size]
        output = self.fc(h_n.squeeze(0))  # Squeeze to remove the sequence dimension
        # output: [batch_size, output_size]
        return output
    

class wav_rnn(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=3):
        super(wav_rnn, self).__init__()
        # Initialize LSTM; note the additional argument compared to GRU
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        # Linear layer to map from hidden state space to output space
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Forward pass through LSTM layer
        # h_n is the final hidden state for each element in the batch
        # c_n is the final cell state for each element in the batch
        _, (h_n, c_n) = self.lstm(x)
        # We use the last hidden state to feed into the fully connected layer.
        # The LSTM output `h_n` has dimensions [num_layers, batch, hidden_size]
        # We take the last layer's hidden state
        output = self.fc(h_n[-1])
        return output

# Dataset Class

In [69]:
class MultiInputDatset(Dataset):
  def __init__(self, logmel_files, spec_files, chroma_files, wav_files, labels=None):
    self.logmel_files = logmel_files
    self.spec_files = spec_files
    self.chroma_files = chroma_files
    self.wav_files = wav_files
    self.files = []
    self.classes = os.listdir(self.logmel_files)
    self.len = 0
    if labels == None:
      for cls in self.classes:
        for file in os.listdir(os.path.join(self.logmel_files, cls)):
          self.len += 1
          wav_file = os.path.join(self.wav_files, cls, os.path.splitext(file)[0] + '.wav')
          self.files.append({
              'logmel': os.path.join(self.logmel_files, cls, file),
              'spectrogram': os.path.join(self.spec_files, cls, file),
              'chromagram': os.path.join(self.chroma_files, cls, file),
              'wav': wav_file,
              'label': cls
          })
    else:
      for file in os.listdir(os.path.join(self.logmel_files)):
        self.len += 1
        wav_file = os.path.join(self.wav_files, os.path.splitext(file)[0] + '.wav')
        self.files.append({
            'logmel': os.path.join(self.logmel_files, file),
            'spectrogram': os.path.join(self.spec_files, file),
            'chromagram': os.path.join(self.chroma_files, file),
            'wav': wav_file,
            'label': labels
        })

  def __len__(self):
    return self.len

  def __getitem__(self, idx):
    item = self.files[idx]
    logmel_file = np.load(item['logmel'])
    spec_file = np.load(item['spectrogram'])
    chroma_file = np.load(item['chromagram'])
    # wav_file = np.load(item['wav'])
    wav_file = torchaudio.load(item['wav'])[0]
    label = int(item['label'])

    return {
        'LOGMEL': torch.tensor(logmel_file, dtype=torch.float32),
        'SPEC': torch.tensor(spec_file, dtype=torch.float32),
        'CHROMA': torch.tensor(chroma_file, dtype=torch.float32),
        'WAV': torch.tensor(wav_file, dtype=torch.float32),
        'LABEL': label
    }

In [41]:
test_dataset = MultiInputDatset(logmel_files='../Dataset_2_split_logmel/test', spec_files='../Dataset_2_split_spec/test', chroma_files='../Dataset_2_split_chroma/test', wav_files='../Dataset_2_split/test')
data_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# Model Instantiations

In [4]:
logmel_input_size = 128
spec_input_size = 1025
chroma_input_size = 12
wav_input_size = 1
hidden_size = 128
output_size = 4  # Number of classes

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class ModelName(Enum):
    LOGMEL = 0
    SPEC = 1
    CHROMA = 2
    WAV = 3

class Genre(Enum):
    CLASSICAL = 0
    POP = 1
    RNB = 2
    ROCK = 3

In [5]:
def model_predict_data(data, model, model_name):
    tensor_data = data[model_name]
    tensor_data = tensor_data.permute(0, 2, 1)
    tensor_data = tensor_data.to(device)

    with torch.no_grad():
        model_prediction = model(tensor_data)

    probabilities = [F.softmax(pred, dim=0) for pred in model_prediction]

    prediction_index = np.argmax(probabilities)
    
    prediction = 0

    if prediction_index == 0:
        prediction = 1
    elif prediction_index == 1:
        prediction = 4
    elif prediction_index == 2:
        prediction = 7
    else: # prediction_index == 8
        prediction = 8

    return probabilities, prediction, prediction == data['LABEL'], prediction_index 

### Log Mel Spectrogram Model

In [41]:
model_logmel = logmel_rnn(logmel_input_size, hidden_size, output_size)
model_logmel.state_dict(torch.load('./logmel_rnn.pth', map_location=torch.device('cpu')))
model_logmel.to(device)
model_logmel.eval()
logmel_predictions = [0, 0, 0, 0]
logmel_correct = [0, 0, 0, 0]

for i, data in enumerate(data_loader):
    probabilities, prediction, correct, index = model_predict_data(data, model_logmel, 'LOGMEL')
    logmel_predictions[index] = logmel_predictions[index] + 1
    if correct:
        print(f"Label: {prediction}")
        logmel_correct[index] = logmel_correct[index] + 1

del model_logmel
gc.collect()

  'WAV': torch.tensor(wav_file, dtype=torch.float32),


Label: 8


472

### Spectrogram Model

In [8]:
model_spectrogram = spec_rnn(spec_input_size, hidden_size, output_size)
model_spectrogram.state_dict(torch.load('./spec_rnn.pth', map_location=torch.device('cpu')))
model_spectrogram.to(device)
model_spectrogram.eval()
spec_predictions = [0, 0, 0, 0]
spec_correct = [0, 0, 0, 0]

for i, data in enumerate(data_loader):
    probabilities, prediction, correct, index = model_predict_data(data, model_spectrogram, 'SPEC')
    spec_predictions[index] = spec_predictions[index] + 1
    if correct:
        print(f"Label: {prediction}")
        spec_correct[index] = spec_correct[index] + 1

del model_spectrogram
gc.collect()

  'WAV': torch.tensor(wav_file, dtype=torch.float32),


Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
L

968

### Chromagram Model

In [9]:
model_chromagram = chroma_rnn(chroma_input_size, hidden_size, output_size)
model_chromagram.state_dict(torch.load('./chroma_rnn.pth', map_location=torch.device('cpu')))
model_chromagram.to(device)
model_chromagram.eval()
chroma_predictions = [0, 0, 0, 0]
chroma_correct = [0, 0, 0, 0]

for i, data in enumerate(data_loader):
    probabilities, prediction, correct, index = model_predict_data(data, model_chromagram, 'CHROMA')
    chroma_predictions[index] = chroma_predictions[index] + 1
    if correct:
        print(f"Label: {prediction}")
        chroma_correct[index] = chroma_correct[index] + 1

del model_chromagram
gc.collect()

  'WAV': torch.tensor(wav_file, dtype=torch.float32),


Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 1
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 7
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
Label: 8
L

66

### WaveForm Model

In [10]:
model_wav = wav_rnn(wav_input_size, hidden_size, output_size)
model_wav.state_dict(torch.load('./wav_rnn.pth', map_location=torch.device('cpu')))
model_wav.to(device)
model_wav.eval()
wav_predictions = [0, 0, 0, 0]
wav_correct = [0, 0, 0, 0]

for i, data in enumerate(data_loader):
    probabilities, prediction, correct, index = model_predict_data(data, model_wav, 'WAV')
    wav_predictions[index] = wav_predictions[index] + 1
    if correct:
        print(f"Label: {prediction}")
        wav_correct[index] = wav_correct[index] + 1

del model_wav
gc.collect()

  'WAV': torch.tensor(wav_file, dtype=torch.float32),


Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
Label: 4
L

858

In [11]:
print(f"logmel_correct: {logmel_correct}")
print(f"spec_correct: {spec_correct}")
print(f"chroma_correct: {chroma_correct}")
print(f"wav_correct: {wav_correct}")

logmel_correct: [0, 0, 2, 204]
spec_correct: [0, 241, 0, 0]
chroma_correct: [20, 0, 22, 97]
wav_correct: [0, 241, 0, 0]


In [13]:
print(f"logmel_predictions: {logmel_predictions}")
print(f"spec_predictions: {spec_predictions}")
print(f"chroma_predictions: {chroma_predictions}")
print(f"wav_predictions: {wav_predictions}")

logmel_predictions: [0, 0, 16, 636]
spec_predictions: [0, 652, 0, 0]
chroma_predictions: [247, 0, 121, 284]
wav_predictions: [0, 652, 0, 0]


## Selective Activation

In [42]:
# Correct classifications for each model by genre
logmel_correct = [0, 0, 2, 204]
spec_correct = [0, 241, 0, 0]
chroma_correct = [20, 0, 22, 97]
wav_correct = [0, 241, 0, 0]

# Combining all models' correct classifications by genre
correct_by_genre = list(zip(logmel_correct, spec_correct, chroma_correct, wav_correct))

# Set the threshold for activating a model, e.g., 50% of the maximum correct predictions for each genre
thresholds = [max(genre_correct) * 0.5 for genre_correct in correct_by_genre]

# Determine which models are activated for each genre
activated_models = [[count >= threshold for count in genre_correct] for genre_correct, threshold in zip(correct_by_genre, thresholds)]

In [43]:
def use_activated_models(predictions):
    final_predictions = []

    for i, genre_activations in enumerate(activated_models):
        active_predictions = [predictions[model_index] for model_index, is_active in enumerate(genre_activations) if is_active]
        if active_predictions:
            # Majority vote among active models
            predicted_genre = max(set(active_predictions), key=active_predictions.count)
        else:
            predicted_genre = Genre.CLASSICAL  # Default if no model is activated (choose any default as needed)
        final_predictions.append(predicted_genre)

    # Convert enum to string for readability
    print(f"final_predictions: {final_predictions}")
    # final_predictions_str = [genre.name.value for genre in final_predictions]
    final_predictions_str = [Genre(genre_id).name for genre_id in final_predictions]
    return max(set(final_predictions_str), key=final_predictions_str.count)

In [44]:
def get_predictions(data):
    predictions = [0, 0, 0, 0]

    model_logmel = logmel_rnn(logmel_input_size, hidden_size, output_size)
    model_logmel.state_dict(torch.load('./logmel_rnn.pth', map_location=torch.device('cpu')))
    model_logmel.to(device)
    model_logmel.eval()
    probabilities, prediction, correct, index = model_predict_data(data, model_logmel, 'LOGMEL')
    predictions[ModelName.LOGMEL.value] = index
    del model_logmel
    gc.collect()

    model_spectrogram = spec_rnn(spec_input_size, hidden_size, output_size)
    model_spectrogram.state_dict(torch.load('./spec_rnn.pth', map_location=torch.device('cpu')))
    model_spectrogram.to(device)
    model_spectrogram.eval()
    probabilities, prediction, correct, index = model_predict_data(data, model_spectrogram, 'SPEC')
    predictions[ModelName.SPEC.value] = index
    del model_spectrogram
    gc.collect()

    model_chromagram = chroma_rnn(chroma_input_size, hidden_size, output_size)
    model_chromagram.state_dict(torch.load('./chroma_rnn.pth', map_location=torch.device('cpu')))
    model_chromagram.to(device)
    model_chromagram.eval()
    probabilities, prediction, correct, index = model_predict_data(data, model_chromagram, 'CHROMA')
    predictions[ModelName.CHROMA.value] = index
    del model_chromagram
    gc.collect()

    model_wav = wav_rnn(wav_input_size, hidden_size, output_size)
    model_wav.state_dict(torch.load('./wav_rnn.pth', map_location=torch.device('cpu')))
    model_wav.to(device)
    model_wav.eval()
    probabilities, prediction, correct, index = model_predict_data(data, model_wav, 'WAV')
    predictions[ModelName.WAV.value] = index
    del model_wav
    gc.collect()


    return use_activated_models(predictions)

## Single Song Input

In [45]:
song = '1/6434dd126cf84c84bffa7a6ba4c8140d_snippet_0'

logmel_file_path = f"../Dataset_2_split_logmel/test/{song}.npy"
spec_file_path = f"../Dataset_2_split_spec/test/{song}.npy"
chroma_file_path = f"../Dataset_2_split_chroma/test/{song}.npy"
waveform_file_path = f"../Dataset_2_split/test/{song}.wav"

logmel_file = np.load(logmel_file_path)
spec_file = np.load(spec_file_path)
chroma_file = np.load(chroma_file_path)
wav_file = torchaudio.load(waveform_file_path)[0]

data = {
    'LOGMEL': torch.tensor(logmel_file, dtype=torch.float32).unsqueeze(0),
    'SPEC': torch.tensor(spec_file, dtype=torch.float32).unsqueeze(0),
    'CHROMA': torch.tensor(chroma_file, dtype=torch.float32).unsqueeze(0),
    'WAV': torch.tensor(wav_file, dtype=torch.float32).unsqueeze(0),
    'LABEL': '1'
}


get_predictions(data)


  'WAV': torch.tensor(wav_file, dtype=torch.float32).unsqueeze(0),


final_predictions: [0, 1, 0, 1]


'POP'

# Experiments

## Experiment 1

In [67]:
experiment_1_opera = MultiInputDatset(logmel_files = '../experiment_1/1/LogMelSpectogram', spec_files='../experiment_1/1/spectogram', chroma_files='../experiment_1/1/chromagram', wav_files='../experiment_1/1/audio', labels=1)
data_loader = DataLoader(experiment_1_opera, batch_size=1, shuffle=False)

counter_correct = 0
counter_total = 0

for i, data in enumerate(data_loader):
    prediction = get_predictions(data)
    print(f"prediction {prediction}")
    counter_total = counter_total + 1
    if prediction ==  'OPERA':
        counter_correct = counter_correct + 1

print(f"Accuracy: {counter_correct / counter_total}")


file 0.npy
file 1.npy
file 2.npy


  'WAV': torch.tensor(wav_file, dtype=torch.float32),


final_predictions: [3, 0, 3, 2]
prediction ROCK
final_predictions: [3, 3, 3, 3]
prediction ROCK
final_predictions: [3, 0, 3, 1]
prediction ROCK
Accuracy: 0.0


In [47]:
experiment_1_pop = MultiInputDatset(logmel_files = '../experiment_1/4/LogMelSpectogram', spec_files='../experiment_1/4/spectogram', chroma_files='../experiment_1/4/chromagram', wav_files='../experiment_1/4/audio', labels=4)
data_loader = DataLoader(experiment_1_pop, batch_size=1, shuffle=False)

counter_correct = 0
counter_total = 0

for i, data in enumerate(data_loader):
    prediction = get_predictions(data)
    print(f"prediction {prediction}")
    counter_total = counter_total + 1
    if prediction ==  'POP':
        counter_correct = counter_correct + 1

print(f"Accuracy: {counter_correct / counter_total}")

logMel: ../experiment_1/4/LogMelSpectogram
spec: ../experiment_1/4/spectogram
labels 4


  'WAV': torch.tensor(wav_file, dtype=torch.float32),


final_predictions: [3, 3, 3, 3]
prediction ROCK
final_predictions: [2, 0, 2, 1]
prediction RNB
final_predictions: [1, 1, 1, 2]
prediction POP
final_predictions: [0, 0, 0, 3]
prediction CLASSICAL
final_predictions: [3, 1, 3, 2]
prediction ROCK
final_predictions: [0, 0, 0, 2]
prediction CLASSICAL
final_predictions: [0, 2, 0, 1]
prediction CLASSICAL
final_predictions: [1, 1, 1, 3]
prediction POP
final_predictions: [2, 2, 2, 3]
prediction RNB
final_predictions: [3, 3, 3, 1]
prediction ROCK
final_predictions: [2, 0, 2, 0]
prediction RNB
final_predictions: [1, 1, 1, 0]
prediction POP
Accuracy: 0.25


In [48]:
experiment_1_rnb = MultiInputDatset(logmel_files = '../experiment_1/7/LogMelSpectogram', spec_files='../experiment_1/7/spectogram', chroma_files='../experiment_1/7/chromagram', wav_files='../experiment_1/7/audio', labels=7)
data_loader = DataLoader(experiment_1_rnb, batch_size=1, shuffle=False)

counter_correct = 0
counter_total = 0

for i, data in enumerate(data_loader):
    prediction = get_predictions(data)
    print(f"prediction {prediction}")
    counter_total = counter_total + 1
    if prediction ==  'RNB':
        counter_correct = counter_correct + 1

print(f"Accuracy: {counter_correct / counter_total}")

logMel: ../experiment_1/7/LogMelSpectogram
spec: ../experiment_1/7/spectogram
labels 7


  'WAV': torch.tensor(wav_file, dtype=torch.float32),


final_predictions: [2, 2, 2, 3]
prediction RNB
final_predictions: [2, 1, 2, 0]
prediction RNB
final_predictions: [0, 1, 0, 2]
prediction CLASSICAL
final_predictions: [3, 1, 3, 0]
prediction ROCK
final_predictions: [3, 3, 3, 0]
prediction ROCK
final_predictions: [0, 0, 0, 0]
prediction CLASSICAL
final_predictions: [1, 3, 1, 2]
prediction POP
final_predictions: [0, 1, 0, 2]
prediction CLASSICAL
final_predictions: [2, 0, 2, 1]
prediction RNB
final_predictions: [0, 0, 0, 1]
prediction CLASSICAL
final_predictions: [1, 3, 1, 0]
prediction POP
final_predictions: [3, 1, 3, 3]
prediction ROCK
final_predictions: [3, 0, 3, 0]
prediction ROCK
final_predictions: [0, 0, 0, 3]
prediction CLASSICAL
final_predictions: [3, 0, 3, 0]
prediction ROCK
final_predictions: [1, 1, 1, 2]
prediction POP
final_predictions: [2, 1, 2, 1]
prediction POP
final_predictions: [2, 0, 2, 2]
prediction RNB
Accuracy: 0.2222222222222222


In [49]:
experiment_1_rock = MultiInputDatset(logmel_files = '../experiment_1/8/LogMelSpectogram', spec_files='../experiment_1/8/spectogram', chroma_files='../experiment_1/8/chromagram', wav_files='../experiment_1/8/audio', labels=8)
data_loader = DataLoader(experiment_1_rock, batch_size=1, shuffle=False)

counter_correct = 0
counter_total =0

for i, data in enumerate(data_loader):
    prediction = get_predictions(data)
    print(f"prediction {prediction}")
    counter_total = counter_total + 1
    if prediction ==  'ROCK':
        counter_correct = counter_correct + 1

print(f"Accuracy: {counter_correct / counter_total}")

logMel: ../experiment_1/8/LogMelSpectogram
spec: ../experiment_1/8/spectogram
labels 8


  'WAV': torch.tensor(wav_file, dtype=torch.float32),


final_predictions: [3, 1, 3, 2]
prediction ROCK
final_predictions: [2, 0, 2, 2]
prediction RNB
final_predictions: [2, 0, 2, 0]
prediction RNB
final_predictions: [0, 0, 0, 0]
prediction CLASSICAL
final_predictions: [2, 1, 2, 1]
prediction POP
final_predictions: [1, 1, 1, 2]
prediction POP
final_predictions: [3, 0, 3, 1]
prediction ROCK
final_predictions: [3, 0, 3, 2]
prediction ROCK
final_predictions: [1, 0, 1, 1]
prediction POP
final_predictions: [0, 0, 0, 0]
prediction CLASSICAL
final_predictions: [0, 0, 0, 1]
prediction CLASSICAL
final_predictions: [1, 0, 1, 3]
prediction POP
final_predictions: [1, 1, 1, 0]
prediction POP
final_predictions: [1, 0, 1, 3]
prediction POP
Accuracy: 0.21428571428571427


## Experiment 2

### Acapella

In [70]:
experiment_2_acapella_opera = MultiInputDatset(logmel_files = '../experiment_2/song_generic/Acapella/1/logMelSpectogram', spec_files='../experiment_2/song_generic/Acapella/1/spectogram', chroma_files='../experiment_2/song_generic/Acapella/1/chromogram', wav_files='../experiment_2/song_generic/Acapella/1/audio', labels=1)
data_loader = DataLoader(experiment_2_acapella_opera, batch_size=1, shuffle=False)

counter_correct = 0
counter_total = 0

for i, data in enumerate(data_loader):
    prediction = get_predictions(data)
    print(f"prediction {prediction}")
    counter_total = counter_total + 1
    if prediction ==  'OPERA':
        counter_correct = counter_correct + 1

print(f"Accuracy: {counter_correct / counter_total}")

  'WAV': torch.tensor(wav_file, dtype=torch.float32),


final_predictions: [1, 1, 1, 2]
prediction POP
final_predictions: [2, 3, 2, 2]
prediction RNB
final_predictions: [3, 0, 3, 1]
prediction ROCK
final_predictions: [3, 2, 3, 0]
prediction ROCK
Accuracy: 0.0


In [71]:
experiment_2_acapella_pop = MultiInputDatset(logmel_files = '../experiment_2/song_generic/Acapella/4/LogMelSpectogram', spec_files='../experiment_2/song_generic/Acapella/4/spectogram', chroma_files='../experiment_2/song_generic/Acapella/4/chromogram', wav_files='../experiment_2/song_generic/Acapella/4/audio', labels=4)
data_loader = DataLoader(experiment_2_acapella_pop, batch_size=1, shuffle=False)

counter_correct = 0
counter_total = 0

for i, data in enumerate(data_loader):
    prediction = get_predictions(data)
    print(f"prediction {prediction}")
    counter_total = counter_total + 1
    if prediction ==  'POP':
        counter_correct = counter_correct + 1

print(f"Accuracy: {counter_correct / counter_total}")

  'WAV': torch.tensor(wav_file, dtype=torch.float32),


final_predictions: [1, 2, 1, 1]
prediction POP
final_predictions: [2, 0, 2, 3]
prediction RNB
final_predictions: [3, 0, 3, 1]
prediction ROCK
final_predictions: [2, 0, 2, 3]
prediction RNB
final_predictions: [1, 0, 1, 1]
prediction POP
final_predictions: [3, 2, 3, 1]
prediction ROCK
Accuracy: 0.3333333333333333


In [72]:
experiment_2_acapella_rnb = MultiInputDatset(logmel_files = '../experiment_2/song_generic/Acapella/7/LogMelSpectogram', spec_files='../experiment_2/song_generic/Acapella/7/spectogram', chroma_files='../experiment_2/song_generic/Acapella/7/chromogram', wav_files='../experiment_2/song_generic/Acapella/7/audio', labels=7)
data_loader = DataLoader(experiment_2_acapella_rnb, batch_size=1, shuffle=False)

counter_correct = 0
counter_total = 0

for i, data in enumerate(data_loader):
    prediction = get_predictions(data)
    print(f"prediction {prediction}")
    counter_total = counter_total + 1
    if prediction ==  'RNB':
        counter_correct = counter_correct + 1

print(f"Accuracy: {counter_correct / counter_total}")

  'WAV': torch.tensor(wav_file, dtype=torch.float32),


final_predictions: [1, 0, 1, 3]
prediction POP
final_predictions: [2, 0, 2, 1]
prediction RNB
final_predictions: [0, 2, 0, 0]
prediction CLASSICAL
final_predictions: [1, 1, 1, 3]
prediction POP
final_predictions: [3, 0, 3, 3]
prediction ROCK
final_predictions: [3, 0, 3, 3]
prediction ROCK
final_predictions: [3, 1, 3, 3]
prediction ROCK
final_predictions: [0, 1, 0, 3]
prediction CLASSICAL
final_predictions: [1, 1, 1, 1]
prediction POP
Accuracy: 0.1111111111111111


In [74]:
experiment_2_acapella_rock = MultiInputDatset(logmel_files = '../experiment_2/song_generic/Acapella/8/rock/logMelSpectogram', spec_files='../experiment_2/song_generic/Acapella/8/spectogram', chroma_files='../experiment_2/song_generic/Acapella/8/chromagram', wav_files='../experiment_2/song_generic/Acapella/8/audio', labels=8)
data_loader = DataLoader(experiment_2_acapella_rock, batch_size=1, shuffle=False)

counter_correct = 0
counter_total = 0

for i, data in enumerate(data_loader):
    prediction = get_predictions(data)
    print(f"prediction {prediction}")
    counter_total = counter_total + 1
    if prediction ==  'ROCK':
        counter_correct = counter_correct + 1

print(f"Accuracy: {counter_correct / counter_total}")

  'WAV': torch.tensor(wav_file, dtype=torch.float32),


final_predictions: [3, 0, 3, 0]
prediction ROCK
final_predictions: [3, 1, 3, 0]
prediction ROCK
final_predictions: [3, 1, 3, 3]
prediction ROCK
final_predictions: [2, 1, 2, 1]
prediction POP
final_predictions: [0, 0, 0, 2]
prediction CLASSICAL
final_predictions: [0, 0, 0, 0]
prediction CLASSICAL
final_predictions: [1, 0, 1, 2]
prediction POP
final_predictions: [2, 0, 2, 3]
prediction RNB
final_predictions: [1, 0, 1, 2]
prediction POP
Accuracy: 0.3333333333333333


### Instrumental

In [87]:
experiment_2_instrumental_opera = MultiInputDatset(logmel_files = '../experiment_2/song_generic/Instrumental/1/logMelSpectogram', spec_files='../experiment_2/song_generic/Instrumental/1/spectogram', chroma_files='../experiment_2/song_generic/Instrumental/1/chromogram2', wav_files='../experiment_2/song_generic/Instrumental/1/audio', labels=1)
data_loader = DataLoader(experiment_2_instrumental_opera, batch_size=1, shuffle=False)

counter_correct = 0
counter_total = 0

for i, data in enumerate(data_loader):
    prediction = get_predictions(data)
    print(f"prediction {prediction}")
    counter_total = counter_total + 1
    if prediction ==  'OPERA':
        counter_correct = counter_correct + 1

print(f"Accuracy: {counter_correct / counter_total}")

  'WAV': torch.tensor(wav_file, dtype=torch.float32),


final_predictions: [2, 0, 2, 1]
prediction RNB
final_predictions: [3, 1, 3, 2]
prediction ROCK
final_predictions: [2, 1, 2, 3]
prediction RNB
Accuracy: 0.0


In [88]:
experiment_2_instrumental_pop = MultiInputDatset(logmel_files = '../experiment_2/song_generic/Instrumental/4/LogMelSpectogram', spec_files='../experiment_2/song_generic/Instrumental/4/spectogram', chroma_files='../experiment_2/song_generic/Instrumental/4/chromogram2', wav_files='../experiment_2/song_generic/Instrumental/4/audio', labels=4)
data_loader = DataLoader(experiment_2_instrumental_pop, batch_size=1, shuffle=False)

counter_correct = 0
counter_total = 0

for i, data in enumerate(data_loader):
    prediction = get_predictions(data)
    print(f"prediction {prediction}")
    counter_total = counter_total + 1
    if prediction ==  'POP':
        counter_correct = counter_correct + 1

print(f"Accuracy: {counter_correct / counter_total}")

  'WAV': torch.tensor(wav_file, dtype=torch.float32),


final_predictions: [0, 1, 0, 3]
prediction CLASSICAL
final_predictions: [3, 1, 3, 2]
prediction ROCK
final_predictions: [1, 0, 1, 2]
prediction POP
final_predictions: [3, 2, 3, 3]
prediction ROCK
final_predictions: [0, 0, 0, 2]
prediction CLASSICAL
final_predictions: [2, 3, 2, 1]
prediction RNB
Accuracy: 0.16666666666666666


In [91]:
experiment_2_instrumental_rnb = MultiInputDatset(logmel_files = '../experiment_2/song_generic/Instrumental/7/LogMelSpectogram', spec_files='../experiment_2/song_generic/Instrumental/7/spectogram', chroma_files='../experiment_2/song_generic/Instrumental/7/chromogram2__2', wav_files='../experiment_2/song_generic/Instrumental/7/audio', labels=7)
data_loader = DataLoader(experiment_2_instrumental_rnb, batch_size=1, shuffle=False)

counter_correct = 0
counter_total = 0

for i, data in enumerate(data_loader):
    prediction = get_predictions(data)
    print(f"prediction {prediction}")
    counter_total = counter_total + 1
    if prediction ==  'RNB':
        counter_correct = counter_correct + 1

print(f"Accuracy: {counter_correct / counter_total}")

  'WAV': torch.tensor(wav_file, dtype=torch.float32),


final_predictions: [0, 0, 0, 1]
prediction CLASSICAL
final_predictions: [3, 0, 3, 2]
prediction ROCK
final_predictions: [1, 1, 1, 2]
prediction POP
final_predictions: [0, 0, 0, 3]
prediction CLASSICAL
final_predictions: [0, 0, 0, 2]
prediction CLASSICAL
final_predictions: [2, 0, 2, 2]
prediction RNB
final_predictions: [2, 1, 2, 2]
prediction RNB
final_predictions: [2, 0, 2, 0]
prediction RNB
final_predictions: [2, 2, 2, 1]
prediction RNB
Accuracy: 0.4444444444444444


In [90]:
experiment_2_instrumental_rock = MultiInputDatset(logmel_files = '../experiment_2/song_generic/Instrumental/8/LogMelSpectogram', spec_files='../experiment_2/song_generic/Instrumental/8/spectogram', chroma_files='../experiment_2/song_generic/Instrumental/8/chromogram2__2', wav_files='../experiment_2/song_generic/Instrumental/8/audio', labels=8)
data_loader = DataLoader(experiment_2_instrumental_rock, batch_size=1, shuffle=False)

counter_correct = 0
counter_total = 0

for i, data in enumerate(data_loader):
    prediction = get_predictions(data)
    print(f"prediction {prediction}")
    counter_total = counter_total + 1
    if prediction ==  'ROCK':
        counter_correct = counter_correct + 1

print(f"Accuracy: {counter_correct / counter_total}")

  'WAV': torch.tensor(wav_file, dtype=torch.float32),


final_predictions: [1, 0, 1, 0]
prediction POP
final_predictions: [0, 3, 0, 2]
prediction CLASSICAL
final_predictions: [0, 0, 0, 1]
prediction CLASSICAL
final_predictions: [2, 2, 2, 0]
prediction RNB
final_predictions: [0, 1, 0, 2]
prediction CLASSICAL
final_predictions: [1, 1, 1, 0]
prediction POP
final_predictions: [2, 0, 2, 3]
prediction RNB
final_predictions: [2, 0, 2, 0]
prediction RNB
final_predictions: [3, 1, 3, 2]
prediction ROCK
Accuracy: 0.1111111111111111
