In [0]:
import numpy as np
import cv2
import matplotlib.pyplot as plt
import os
import shutil
import pickle
from google.colab import drive, files
import pandas as pd
import librosa
import librosa.display
import IPython.display as ipd

import torch 
import torch.nn as nn
from torch import optim

In [0]:
# Audio sampling rate passed to every librosa / torchaudio / playback call below.
fs = 44100

# Length, in seconds, of each excerpt a recording is cut into.
split_duration = 20

# Composers to classify; the position in this list is used as the class order.
composers = [
    'Schubert',
    'Beethoven',
    'Brahms',
    'Mozart',
    'Bach',
]

# Root folder for exported spectrogram images (one sub-folder per composer).
save_path = 'spectogram/'

In [0]:
!wget https://homes.cs.washington.edu/~thickstn/media/musicnet.npz
!wget https://homes.cs.washington.edu/~thickstn/media/musicnet_metadata.csv
!sudo apt-get install sox libsox-dev libsox-fmt-all
!pip install git+git://github.com/pytorch/audio

In [0]:
# np.load opens the archive from the path and reads members on lookup, so the archive
# must stay open for later cells; the previous `with open(...)` handle was never used.
# NOTE(security): allow_pickle=True unpickles the archive contents, which can execute
# arbitrary code -- only load a file obtained from a trusted source.
musicnet_dataset = np.load('musicnet.npz', encoding='latin1', allow_pickle=True)

musicnet_metadata = pd.read_csv('musicnet_metadata.csv')

In [0]:
# Map each composer name to the metadata rows of that composer's recordings.
musicnet_metadata = pd.read_csv('musicnet_metadata.csv')
metadata = {}
for composer in composers:
  metadata[composer] = musicnet_metadata.loc[musicnet_metadata['composer'].eq(composer)]

In [0]:
# Reload the metadata table and show only the Schubert recordings.
musicnet_metadata = pd.read_csv('musicnet_metadata.csv')
musicnet_metadata[musicnet_metadata['composer'] == 'Schubert']

In [0]:
# Load recording 1788 from the archive (the name `dataset` was never defined; the loaded
# archive is `musicnet_dataset`) and play its first `split_duration` seconds.
sound, _ = musicnet_dataset['1788']
ipd.Audio(sound[0:split_duration*fs], rate=fs)


In [0]:
# Mel power spectrogram of the whole recording, using librosa's default mel settings.
spec = librosa.feature.melspectrogram(y=sound, sr=fs)

plt.figure(figsize=(10, 4))
S_dB = librosa.power_to_db(spec, ref=np.max)
# No fmax here: the spectrogram above was computed with the default upper frequency, so
# passing fmax=8000 to the display only would mislabel the frequency axis.
librosa.display.specshow(S_dB, x_axis='time', y_axis='mel', sr=fs)


In [0]:
# Render the dB-scaled mel spectrogram with a colour bar and a title.
plt.figure(figsize=(10, 4))
S_dB = librosa.power_to_db(spec, ref=np.max)
# fmax is left at its default so the axis matches how `spec` was computed.
librosa.display.specshow(S_dB, x_axis='time', y_axis='mel', sr=fs)
plt.colorbar(format='%+2.0f dB')
plt.title('Mel-frequency spectrogram')
plt.tight_layout()
# The former plt.axis('off') after show() acted on a new, empty figure and has been dropped.
plt.show()

In [0]:
# Save the spectrogram as a bare image (no axes, transparent background).
# savefig works with whatever backend is active, so the global backend is no longer
# switched to 'Agg' mid-session (that switch affects every later plotting cell).
fig, ax = plt.subplots(figsize=(10, 4))
librosa.display.specshow(S_dB, x_axis='time', y_axis='mel', sr=fs, ax=ax)
ax.axis('off')
fig.savefig('deneme1.png', transparent=True, format='png')

In [0]:
# Start from an empty output directory: remove any previous export, then recreate it.
if os.path.isdir(save_path):
  shutil.rmtree(save_path)
os.mkdir(save_path)

# One sub-folder per composer, directly under the output directory.
for composer in composers:
  composerFolder = '{}{}'.format(save_path, composer)
  os.mkdir(composerFolder)

In [0]:
# Export one mel-spectrogram image per `split_duration`-second excerpt, grouped by composer.
samples_per_split = fs * split_duration

for composer in composers:
  composerMetadata = metadata[composer]
  composerPath = save_path + composer + '/'
  count = 0  # running image index inside this composer's folder
  for row in composerMetadata.itertuples():
    print(row.Index)
    recording_id = str(row.id)  # archive keys are strings
    sound, _ = musicnet_dataset[recording_id]
    n_splits = int(row.seconds // split_duration)
    for i in range(n_splits):
      # Window offsets are in samples, so they must be scaled by the excerpt length.
      split = sound[i*samples_per_split:(i+1)*samples_per_split]
      if len(split) < samples_per_split:
        break  # audio is shorter than the metadata duration suggests
      spectogram = librosa.feature.melspectrogram(y=split, sr=fs)
      # Convert this excerpt's spectrogram (not the stale global `spec`) to dB.
      S_dB = librosa.power_to_db(spectogram, ref=np.max)

      fig = plt.figure(figsize=(10, 4))
      librosa.display.specshow(S_dB, x_axis='time', y_axis='mel', sr=fs)
      # Write into the composer's folder under a unique name.
      fig.savefig('{}{}.png'.format(composerPath, count), format='png', transparent=True)
      plt.close(fig)  # release the figure; thousands are created by this loop
      count += 1

In [0]:
# Display the metadata table read from musicnet_metadata.csv.
musicnet_metadata

In [0]:
# Load recording 1727; the archive entry unpacks into two items and only the first is kept.
# NOTE(review): presumably the discarded item holds the note labels -- confirm.
sound, _ = musicnet_dataset['1727']

In [0]:
from torchaudio.transforms import MelSpectrogram
# Build the transform with the sampling rate as a constructor argument, then apply it to
# the waveform. Previously `fs` was passed as if it were the input signal, and the numpy
# waveform was handed to a transform that operates on tensors.
spec = MelSpectrogram(
    sample_rate=fs,
    n_fft=400,
    win_length=None,
    hop_length=None,
    f_min=0.0,
    f_max=None,
    pad=0,
    n_mels=128,
)(torch.Tensor(sound))

In [0]:
# Same transform with default settings; the numpy waveform is wrapped in a tensor first,
# matching how the later cells call the transform.
a = MelSpectrogram(fs)(torch.Tensor(sound))

In [0]:
# Reload recording 1727 into `sound`, discarding the second item of the archive entry.
# NOTE(review): this repeats an earlier cell verbatim.
sound, _ = musicnet_dataset['1727']

In [0]:
from torch.utils.data import Dataset, DataLoader

class MusicnetComposers(Dataset):
    """Fixed-length raw-audio excerpts from the MusicNet archive, labelled by composer.

    Every composer contributes exactly ``n_samples`` excerpts of
    ``fs * split_duration`` samples each; the label of an excerpt is the index of
    its composer in ``composers``.
    """

    def __init__(self, csv_path, dataset_path, composers, split_duration, n_samples, fs):
        """Load the excerpts into memory.

        Args:
            csv_path: Path of the metadata CSV (columns ``id``, ``composer``, ``seconds`` are read).
            dataset_path: Path of the ``.npz`` archive, keyed by the recording id as a string.
            composers: Composer names; list position becomes the integer label.
            split_duration: Excerpt length in seconds.
            n_samples: Number of excerpts stored per composer.
            fs: Sampling rate used to convert seconds to samples.

        Raises:
            ValueError: If a composer yields no full-length excerpt.
        """
        musicnet_metadata = pd.read_csv(csv_path)
        samples_per_split = fs * split_duration
        self.dataset = np.zeros((len(composers) * n_samples, samples_per_split))
        self.labels = np.zeros(len(composers) * n_samples, dtype=int)

        # NOTE(security): allow_pickle=True unpickles archive members, which can run
        # arbitrary code -- only use with a trusted file.
        # The context manager closes the archive once every excerpt has been copied.
        with np.load(dataset_path, encoding='latin1', allow_pickle=True) as musicnet_dataset:
            for label, composer in enumerate(composers):
                composer_rows = musicnet_metadata.loc[musicnet_metadata.composer == composer]
                composer_data = []
                for row in composer_rows.itertuples():
                    sound, _ = musicnet_dataset[str(row.id)]
                    # Never cut more excerpts than the audio holds, so all have equal length.
                    n_splits = min(int(row.seconds // split_duration),
                                   len(sound) // samples_per_split)
                    for i in range(n_splits):
                        composer_data.append(sound[i * samples_per_split:(i + 1) * samples_per_split])
                    # Stop reading recordings once enough excerpts are available.
                    if len(composer_data) >= n_samples:
                        break
                if not composer_data:
                    raise ValueError('no full-length excerpts found for composer {!r}'.format(composer))

                # Draw without repetition when possible; repeat excerpts only if too few exist.
                chosen = np.random.choice(len(composer_data), size=n_samples,
                                          replace=len(composer_data) < n_samples)
                rows = slice(label * n_samples, (label + 1) * n_samples)
                self.dataset[rows] = np.stack([composer_data[i] for i in chosen])
                self.labels[rows] = label
                # No early exit here: the previous `break` at this point left every
                # composer after the first as all-zero rows labelled 0.

        self.composers = composers
        self.split_duration = split_duration
        self.n_samples = n_samples
        self.fs = fs

    def __getitem__(self, index):
        """Return ``(waveform, label)`` for one excerpt.

        The waveform is the stored row of ``self.dataset``. A mel spectrogram used to
        be computed here and then discarded; that dead computation has been removed.
        """
        return self.dataset[index], self.labels[index]

    def __len__(self):
        """Total number of excerpts (``len(composers) * n_samples``)."""
        return len(self.labels)

    
# Inputs and sampling parameters for the in-memory training set.
csv_path = 'musicnet_metadata.csv'
dataset_path = 'musicnet.npz'
fs = 44100
split_duration = 20
n_samples = 100
composers = ['Schubert', 'Beethoven', 'Brahms', 'Mozart', 'Bach']

# Build the dataset and report how many excerpts it holds.
train_set = MusicnetComposers(
    csv_path=csv_path,
    dataset_path=dataset_path,
    composers=composers,
    split_duration=split_duration,
    n_samples=n_samples,
    fs=fs,
)
print("Train set size: {}".format(len(train_set)))



In [0]:
import torch
from torchaudio.transforms import MelSpectrogram
# Fetch the first item to inspect what the dataset's __getitem__ returns.
train_set[0]

In [0]:
# Wrap the current `sound` array in a tensor and compute its mel spectrogram;
# the reshape gives the waveform a leading dimension of size 1.
t = torch.Tensor(sound)
spec = MelSpectrogram(
    sample_rate=fs,
    n_fft=2048,
    hop_length=512,
)(t.reshape(1, -1))

In [0]:
# Plot the natural log of the first slice of `spec` as a colour mesh.
# NOTE(review): np.log yields -inf wherever the spectrogram is exactly zero -- confirm
# whether a small offset is wanted.
fig = plt.figure(figsize=(10,4))
plt.pcolormesh(np.log(spec[0].numpy()))

In [0]:
      spectogram = librosa.feature.melspectrogram(y=sound, \
                                        sr=fs, \
                  )
plt.figure(figsize=(10, 4))
S_dB = librosa.power_to_db(spec[0].numpy(), ref=np.max)
librosa.display.specshow(S_dB, x_axis='time', y_axis='mel', sr=fs)
plt.colorbar(format='%+2.0f dB')
plt.title('Mel-frequency spectrogram')
plt.tight_layout()
plt.show()
plt.axis('off')

In [0]:

# Serve the training set in reshuffled batches of 128 excerpts.
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=128,
    shuffle=True,
)
#test_loader = torch.utils.data.DataLoader(test_set, batch_size = 128, shuffle = True)

In [0]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
class Net(nn.Module):
    """1-D convolutional classifier operating directly on raw waveforms.

    Four Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d(4) stages are followed by an
    average pool over 30 frames and a linear layer applied to every remaining frame.

    Args:
        num_classes: Size of the output layer (defaults to the original 10).
    """

    def __init__(self, num_classes=10):
        super(Net, self).__init__()
        self.conv1 = nn.Conv1d(1, 128, 80, 4)
        self.bn1 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(128, 128, 3)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(128, 256, 3)
        self.bn3 = nn.BatchNorm1d(256)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(256, 512, 3)
        self.bn4 = nn.BatchNorm1d(512)
        self.pool4 = nn.MaxPool1d(4)
        self.avgPool = nn.AvgPool1d(30)
        self.fc1 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Map a ``(batch, 1, samples)`` waveform to per-frame log-probabilities.

        Returns:
            Tensor of shape ``(batch, frames, num_classes)``, log-softmaxed over the
            last dimension; ``frames`` depends on the input length.
        """
        # `F` was never imported in this notebook; nn.functional is the same module
        # and is reachable through the existing `torch.nn as nn` import.
        relu = nn.functional.relu
        x = self.pool1(relu(self.bn1(self.conv1(x))))
        x = self.pool2(relu(self.bn2(self.conv2(x))))
        x = self.pool3(relu(self.bn3(self.conv3(x))))
        x = self.pool4(relu(self.bn4(self.conv4(x))))
        x = self.avgPool(x)
        x = x.permute(0, 2, 1)  # (batch, 512, frames) -> (batch, frames, 512)
        x = self.fc1(x)
        return nn.functional.log_softmax(x, dim=2)

# Create the network on the selected device and print its layer summary.
model = Net().to(device)
print(model)

# Adam with weight decay; the step scheduler multiplies the learning rate by 0.1
# every 20 scheduler steps.
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [0]:
def train(model, epoch, log_interval=20, loader=None, opt=None):
    """Run one training epoch with negative log-likelihood loss.

    Args:
        model: Network returning log-probabilities shaped ``(batch, frames, classes)``.
        epoch: Epoch number, used only in the progress message.
        log_interval: Print progress every this many batches (was an undefined global).
        loader: Batches of ``(data, target)``; defaults to the notebook's ``train_loader``.
        opt: Optimizer to step; defaults to the notebook's ``optimizer``.
    """
    loader = train_loader if loader is None else loader
    opt = optimizer if opt is None else opt
    # Follow the model's own device so inputs always land where the weights are.
    model_device = next(model.parameters()).device

    model.train()
    for batch_idx, (data, target) in enumerate(loader):
        opt.zero_grad()
        # Convolution weights are float32, so cast the inputs to match.
        data = data.to(device=model_device, dtype=torch.float32)
        if data.dim() == 2:
            data = data.unsqueeze(1)  # (batch, samples) -> (batch, 1, samples)
        target = target.to(model_device)
        output = model(data)
        # NOTE(review): only the first output frame is scored, as before -- confirm
        # that ignoring the remaining frames is intended for long inputs.
        loss = nn.functional.nll_loss(output[:, 0, :], target)
        loss.backward()
        opt.step()
        if batch_idx % log_interval == 0:  # print training stats
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(loader.dataset),
                100. * batch_idx / len(loader), loss.item()))

In [0]:
# Train for a single epoch, reported as epoch 1.
train(model, epoch=1)

In [0]:
!wget https://homes.cs.washington.edu/~thickstn/media/musicnet.tar.gz

In [0]:
# Unpack the gzip-compressed archive into the working directory, listing each file (v).
!	tar xvzf musicnet.tar.gz

In [0]:
# Scratch allocation shaped like a 5-composer x 300-excerpt dataset of 20 s clips at 44.1 kHz.
# NOTE(review): 1500 x 882000 values at numpy's default float64 is roughly 10.6 GB -- confirm
# the runtime can hold this before running.
a = np.zeros((5*300, 44100*20))
# Matching integer label vector, one entry per row of `a`.
b = np.zeros(5*300, dtype=int)

In [0]:
from torch.utils.data import Dataset, DataLoader

class MusicnetComposers2(Dataset):
    """Variant of the composer dataset that loads audio through ``torchaudio.load``.

    NOTE(review): this class looks unfinished. In the visible source ``file_`` and
    ``filepath`` are never assigned, and only ``torchaudio.transforms.MelSpectrogram``
    is imported (the name ``torchaudio`` itself is not), so ``__init__`` cannot run
    as written unless those names are defined elsewhere.
    """
    def __init__(self, csv_path, dataset_path, composers, split_duration, n_samples, fs):
        """Fill ``self.dataset`` / ``self.labels`` with ``n_samples`` excerpts per composer.

        ``dataset_path`` is accepted but not read anywhere in this method.
        """
        musicnet_metadata = pd.read_csv(csv_path)
        # One row per excerpt, fs*split_duration samples wide; labels are composer indices.
        self.dataset = np.zeros((len(composers)*n_samples, fs*split_duration))
        self.labels = np.zeros(len(composers)*n_samples, dtype=int)
        count = 0
        for composer in composers:
          composerMetadata = musicnet_metadata.loc[musicnet_metadata.composer == composer]
          composer_data = []
          for row in composerMetadata.itertuples():
            print(row.Index)
            id = str(row.id)
            duration = row.seconds
            # NOTE(review): bare, unassigned name -- presumably the start of a file-path
            # assignment that was never completed.
            file_
            # NOTE(review): `filepath` is not defined in the visible source.
            sound, _ = torchaudio.load(filepath, normalization=False)
            n_splits = np.floor(duration/split_duration).astype('int64')
            for i in range(n_splits):
              start = i*fs*split_duration
              end = (i+1)*fs*split_duration
              # NOTE(review): slices the first axis of `sound`; if the loader returns a
              # (channels, samples) tensor this cuts channels, not time -- confirm.
              split = sound[start:end]
              composer_data.append(split)
            # Stop reading recordings once enough excerpts have been collected.
            if len(composer_data) >= n_samples:
              break
          # Draw n_samples indices independently, so the same excerpt can be picked twice.
          index = np.random.randint(low=0, high=len(composer_data), size=n_samples)
          composer_data = [composer_data[i] for i in index]
          print(len(composer_data))
          print(np.array(composer_data).shape)
          print(self.dataset[count*n_samples:(count+1)*n_samples].shape)

          self.dataset[count*n_samples:(count+1)*n_samples] = np.array(composer_data)
          self.labels[count*n_samples:(count+1)*n_samples] = count
          count += 1
          # NOTE(review): composer_data was just resampled to exactly n_samples items, so
          # this condition always holds and the loop exits after the first composer.
          if len(composer_data) >= n_samples:
            break

        self.composers = composers
        self.split_duration = split_duration
        self.n_samples = n_samples
        self.fs = fs
        
    def __getitem__(self, index):
        """Return ``(self.dataset[index], self.labels[index])``."""

        sound = torch.Tensor(self.dataset[index]).reshape(1,-1)
        # NOTE(review): the spectrogram is computed but never returned or stored.
        spectogram = MelSpectrogram(sample_rate=self.fs, \
                                     n_fft=2048, \
                                     win_length=None, \
                                     hop_length=512, \
                                     f_min=0.0, \
                                     f_max=None, \
                                     pad=0, \
                                     n_mels=128, \
                                     \
                                     )(sound)
      
        return self.dataset[index], self.labels[index]
    
    def __len__(self):
        """Number of stored excerpts (length of ``self.labels``)."""
        return len(self.labels)

    
# Inputs and sampling parameters for the in-memory training set.
csv_path = 'musicnet_metadata.csv'
dataset_path = 'musicnet.npz'
fs = 44100
split_duration = 20
n_samples = 100
composers = ['Schubert', 'Beethoven', 'Brahms', 'Mozart', 'Bach']

# NOTE(review): this instantiates MusicnetComposers, not the MusicnetComposers2 class
# defined in the same cell -- confirm which one is intended.
train_set = MusicnetComposers(
    csv_path=csv_path,
    dataset_path=dataset_path,
    composers=composers,
    split_duration=split_duration,
    n_samples=n_samples,
    fs=fs,
)
print("Train set size: {}".format(len(train_set)))



In [0]:
!wget https://homes.cs.washington.edu/~thickstn/media/musicnet_metadata.csv
musicnet_metadata = pd.read_csv('musicnet_metadata.csv')

In [0]:
# Display the metadata table read from musicnet_metadata.csv.
musicnet_metadata

In [0]:
# Select and show the metadata row(s) whose id is 1727.
a = musicnet_metadata[musicnet_metadata['id'] == 1727]
a

In [0]:
# Scratch allocation: 1500 rows of 882000 values each, overwriting the previous `a`.
# NOTE(review): at numpy's default float64 this is roughly 10.6 GB -- confirm it is needed.
a = np.zeros((5*300, 44100*20))