In [0]:
!wget https://homes.cs.washington.edu/~thickstn/media/musicnet.npz
!wget https://homes.cs.washington.edu/~thickstn/media/musicnet_metadata.csv
!sudo apt-get install sox libsox-dev libsox-fmt-all
!pip install git+git://github.com/pytorch/audio

In [0]:
import numpy as np
import cv2
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import os
import shutil
import pickle
from google.colab import drive, files
import pandas as pd
import torch
import librosa
from torch.utils.data import Dataset, DataLoader, random_split
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB
from torchvision.datasets import ImageFolder

In [0]:
# ---- Input files --------------------------------------------------------
metadata_path = 'musicnet_metadata.csv'
dataset_path = 'musicnet.npz'

# ---- Sampling ------------------------------------------------------------
fs = 44100                # sample rate passed to the mel transform
segment_duration = 20     # length of one audio segment, in seconds
n_samples = 300           # number of segments kept per composer
train_test_partition = 0.8
composers = [
    'Schubert',
    'Beethoven',
    'Brahms',
    'Mozart',
    'Bach',
]

# ---- Mel-spectrogram parameters -------------------------------------------
# Keys mirror the keyword arguments forwarded to MelSpectrogram.
transform_params = {
    'sample_rate': fs,
    'n_fft': 2048,
    'win_length': None,
    'hop_length': 1024,
    'f_min': 0.0,
    'f_max': None,
    'pad': 0,
    'n_mels': 512,
}

# ---- Output locations -------------------------------------------------------
figure_path = 'data/'
save_path = '/drive/My Drive/Academic/EE473/composerDataset.dill'
save_image_path = '/drive/My Drive/Academic/EE473/specDataset.dill'

In [0]:
class MusicnetComposers(Dataset):
    """In-memory dataset of mel-spectrogram segments labelled by composer.

    On construction every recording of each requested composer is cut into
    consecutive, non-overlapping segments of ``segment_duration`` seconds.
    Each segment goes through ``MelSpectrogram`` and ``AmplitudeToDB``; then
    ``n_samples`` segments per composer are drawn at random (without
    replacement) into one pre-allocated tensor.

    Attributes:
        dataset: float tensor of shape
            ``(len(composers) * n_samples, 1, n_mels, n_frames)``.
        labels: int64 tensor; entry ``k`` is the index into ``composers`` of
            the composer of ``dataset[k]``.
        composers, segment_duration, n_samples, fs: the constructor arguments.
        transform: the ``transform`` parameter dict passed to the constructor
            (the torchaudio module built from it is not kept on the instance).
    """

    def __init__(self, csv_path, dataset_path, composers, segment_duration, n_samples, fs, transform):
        """Load the audio, build all spectrogram segments and sample them.

        Args:
            csv_path: Path of the metadata CSV. The columns ``composer``,
                ``id`` and ``seconds`` are read.
            dataset_path: Path of the ``.npz`` archive. Each entry is looked
                up by ``str(id)`` and unpacked as ``(sound, _)``.
            composers: Composer names to include; list position is the label.
            segment_duration: Segment length in seconds.
            n_samples: Number of segments kept per composer.
            fs: Sample rate of the audio in Hz.
            transform: Dict with the keys ``n_fft``, ``win_length``,
                ``hop_length``, ``f_min``, ``f_max``, ``pad`` and ``n_mels``.

        Raises:
            ValueError: If a composer yields fewer than ``n_samples`` segments.
        """
        musicnet_metadata = pd.read_csv(csv_path)

        # Build both transforms once instead of once per segment.
        mel_transform = MelSpectrogram(sample_rate=fs,
                                       n_fft=transform['n_fft'],
                                       win_length=transform['win_length'],
                                       hop_length=transform['hop_length'],
                                       f_min=transform['f_min'],
                                       f_max=transform['f_max'],
                                       pad=transform['pad'],
                                       n_mels=transform['n_mels'])
        to_db = AmplitudeToDB('power', top_db=80)

        # Samples per segment, and the number of spectrogram frames it yields.
        segment_len = int(fs * segment_duration)
        n_frames = self._n_frames(segment_len + 2 * transform['pad'],
                                  transform['hop_length'])

        # Allocate the output arrays.
        total = len(composers) * n_samples
        self.dataset = torch.zeros((total, 1, transform['n_mels'], n_frames))
        self.labels = torch.zeros(total, dtype=torch.long)

        # NOTE(security): allow_pickle=True executes pickled objects stored in
        # the archive; only load files obtained from a trusted source.
        # The context manager closes the archive once all audio has been read.
        with np.load(dataset_path, encoding='latin1', allow_pickle=True) as musicnet_dataset:
            for composer_id, composer in enumerate(composers):
                # Metadata rows that belong to this composer.
                composer_rows = musicnet_metadata.loc[musicnet_metadata.composer == composer]
                composer_data = []  # spectrogram of every segment of this composer

                for row in composer_rows.itertuples():
                    sound, _ = musicnet_dataset[str(row.id)]

                    # Whole segments according to the metadata, but never past
                    # the end of the audio actually stored in the archive.
                    n_splits = min(int(np.floor(row.seconds / segment_duration)),
                                   len(sound) // segment_len)

                    for i in range(n_splits):
                        segment = sound[i * segment_len:(i + 1) * segment_len]
                        # The transform expects (channel, time).
                        segment = torch.Tensor(segment).reshape(1, -1)
                        segment = to_db(mel_transform(segment))
                        # NOTE(review): `segment` already holds dB values at
                        # this point, so this subtracts 10*log10 of a dB figure
                        # and turns the whole segment into NaN when its peak is
                        # <= 0 dB. Peak normalisation would be
                        # `segment - segment.max()`. Left unchanged because the
                        # stored values are consumed by later cells.
                        segment = segment - 10 * torch.log10(torch.max(segment))
                        composer_data.append(segment)

                if len(composer_data) < n_samples:
                    raise ValueError(
                        'Composer {} has only {} segments of {} s; {} requested.'.format(
                            composer, len(composer_data), segment_duration, n_samples))

                # Random selection of n_samples segments for this composer.
                chosen = np.random.choice(len(composer_data), size=n_samples, replace=False)

                # This composer's contiguous slot in the output arrays.
                lo = composer_id * n_samples
                hi = lo + n_samples
                self.dataset[lo:hi] = torch.stack([composer_data[i] for i in chosen])
                self.labels[lo:hi] = composer_id
                print('Composer {} is done.'.format(composer))

        # Save hyper-parameters.
        self.composers = composers
        self.segment_duration = segment_duration
        self.n_samples = n_samples
        self.fs = fs
        self.transform = transform

    @staticmethod
    def _n_frames(n_audio_samples, hop_length):
        """Return the frame count of a centred STFT over ``n_audio_samples``.

        A centred transform emits one frame per full hop plus the initial
        frame, i.e. ``n // hop + 1``. ``ceil(n / hop)`` is one short whenever
        ``n`` is an exact multiple of ``hop``.
        """
        return int(n_audio_samples) // int(hop_length) + 1

    def __getitem__(self, index):
        """Return ``(spectrogram, label)`` for the stored item at ``index``."""
        return self.dataset[index], self.labels[index]

    def __len__(self):
        """Return the number of stored segments."""
        return len(self.labels)

In [0]:
# Build the in-memory spectrogram dataset from the configuration cell's values.
composerDataset = MusicnetComposers(
    csv_path=metadata_path,
    dataset_path=dataset_path,
    composers=composers,
    segment_duration=segment_duration,
    n_samples=n_samples,
    fs=fs,
    transform=transform_params,
)


In [0]:
import librosa, librosa.display

# First stored item, with its singleton channel axis removed.
a , _ = composerDataset[0]
a = a.squeeze()

# NOTE(review): items of composerDataset have already been through
# AmplitudeToDB inside the dataset class, so this is a second dB conversion of
# the same data — confirm that is what this preview is meant to show.
b = AmplitudeToDB('power', top_db = 80)(a)
b = b - 10 * torch.log10(torch.max(a))

plt.figure(figsize=(10, 4))
# hop_length must match the transform; otherwise specshow assumes its own
# default hop and the time axis is mis-scaled.
librosa.display.specshow(b.numpy(), x_axis='time', y_axis='mel', sr=fs,
                         hop_length=transform_params['hop_length'])
plt.colorbar(format='%+2.0f dB')
plt.title('Mel-frequency spectrogram')
plt.tight_layout()
plt.show()


In [0]:
# Largest value in the previewed spectrogram, as a plain Python number.
a.max().item()

In [0]:
import librosa, librosa.display

# First stored item, with its singleton channel axis removed.
a , _ = composerDataset[0]
a = a.squeeze()

plt.figure(figsize=(10, 4))
# NOTE(review): items of composerDataset have already been through
# AmplitudeToDB inside the dataset class, so power_to_db converts dB values a
# second time — confirm that is what this preview is meant to show.
S_dB = librosa.power_to_db(a.numpy(), ref=np.max)
# hop_length must match the transform; otherwise specshow assumes its own
# default hop and the time axis is mis-scaled.
librosa.display.specshow(S_dB, x_axis='time', y_axis='mel', sr=fs,
                         hop_length=transform_params['hop_length'])
plt.colorbar(format='%+2.0f dB')
plt.title('Mel-frequency spectrogram')
plt.tight_layout()
plt.show()


In [0]:
from google.colab import drive

# Mount Google Drive at /drive (the save paths point below this directory),
# remounting if it is already mounted.
drive.mount(mountpoint='/drive', force_remount=True)

In [0]:
import dill

# Serialise the whole dataset object to the configured path.
with open(save_path, mode='wb') as out_file:
    dill.dump(composerDataset, out_file)

In [0]:
# Start from an empty output directory: remove any earlier export, then recreate it.
if os.path.isdir(figure_path):
    shutil.rmtree(figure_path)
os.mkdir(figure_path)

# One sub-directory per composer, named after the composer.
for composer in composers:
    composerFolder = figure_path+composer
    os.mkdir(composerFolder)

In [0]:
# Render every stored spectrogram to <figure_path>/<composer>/<index>.png.
for count, (spec, label) in enumerate(composerDataset):
    if count % 100 == 0:
        print(count)  # occasional progress line instead of one per image
    composer = composers[int(label)]

    # Each item is (1, n_mels, n_frames); pcolormesh needs a 2-D array, so the
    # channel axis is dropped. The values were already converted to dB when
    # the dataset was built, so they are plotted as stored — a second
    # power_to_db would treat dB values as power.
    S_dB = spec.squeeze(0).numpy()

    fig = plt.figure(figsize=(10,10))
    plt.pcolormesh(S_dB, cmap = 'magma')
    plt.axis('off')
    spec_path = figure_path + composer + '/{}.png'.format(count)
    fig.savefig(spec_path, transparent = True, format = 'png')
    plt.close(fig)  # release the figure explicitly; many are created in this loop

In [0]:
from torchvision.datasets import ImageFolder

# Index the exported PNGs; each sub-directory of figure_path becomes one class.
specFigureData = ImageFolder(root=figure_path)

In [0]:
import dill

# Destination of the serialised ImageFolder object.
save_image_path = '/drive/My Drive/Academic/EE473/specDataset.dill'

with open(save_image_path, mode='wb') as out_file:
    dill.dump(specFigureData, out_file)

In [0]:
model(torchvision.transforms.ToTensor()(a))

In [0]:
model(b)

In [0]:
import torchvision
import torch.nn as nn
model = torchvision.models.resnet18(pretrained=False, progress=True)
num_classes = 5
model.fc = nn.Linear(512, num_classes)

model.train()

In [0]:
a, _ =specFigureData[0]

In [0]:
b = torchvision.transforms.ToTensor()(a)

In [0]:
model.eval()
model(c.unsqueeze(0))

In [0]:
import cv2

c = torch.Tensor(cv2.resize(b.permute(1,2,0).numpy(), (224,224))).permute(2,0,1)

In [0]:

np.array(a).shape

In [0]:
np.array(a)

In [0]:
b.shape