In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset
import pandas as pd
import librosa
import numpy as np
import os

In [90]:
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
    return np.hstack([mfccs,chroma,mel,contrast,tonnetz])

In [91]:
class AudioDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.root_dir = root_dir
        self.label = pd.read_csv(csv_file) 
        self.transform = transform
        self.file_arr = np.asarray(self.label.iloc[:, 2])
        self.label_arr = np.asarray(self.label.iloc[:, 3])
        
    def __len__(self):
        return len(self.label)
    
    def __getitem__(self, idx):
        file_name = os.path.join(self.root_dir, str(self.file_arr[idx])+".wav")
        feature = extract_feature(file_name)
        label = self.label_arr[idx]
        if self.transform:
            feature = self.transform(feature)
        sample = {
            'feature': feature,
            'label': label,
            'path': file_name
        }
        return sample

In [4]:
train_set = AudioDataset(csv_file='./new_train.csv', root_dir="./Train/")
test_set = AudioDataset(csv_file='./new_test.csv', root_dir="./Train/")

In [87]:
trainloader = torch.utils.data.DataLoader(train_set, batch_size=1, shuffle=True)
testloader = torch.utils.data.DataLoader(test_set, batch_size=1, shuffle=True)

In [78]:
class RNN(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(RNN, self).__init__()
        
        self.hidden_dim=hidden_dim

        # define an RNN with specified parameters
        # batch_first means that the first dim of the input and output will be the batch_size
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)
        
        # last, fully-connected layer
        self.fc = nn.Linear(hidden_dim, output_size)
        
        self.ls = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        # x (batch_size, seq_length, input_size)
        # hidden (n_layers, batch_size, hidden_dim)
        # r_out (batch_size, time_step, hidden_size)
        batch_size = x.size(0)
        
        # get RNN outputs
        r_out, hidden = self.rnn(x, hidden)
        # shape output to be (batch_size*seq_length, hidden_dim)
        r_out = r_out.view(-1, self.hidden_dim)  
        
        # get final output 
        output = self.ls(self.fc(r_out))
        
        return output, hidden

In [79]:
input_size = 193 
output_size = 10
hidden_dim = 256
n_layers = 3
rnn = RNN(input_size, output_size, hidden_dim, n_layers)

In [80]:
rnn.double()

RNN(
  (rnn): RNN(193, 4, batch_first=True)
  (fc): Linear(in_features=4, out_features=10, bias=True)
  (ls): LogSoftmax()
)

In [81]:
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.01) 

In [82]:
train_on_gpu=torch.cuda.is_available()
if train_on_gpu:
    rnn.cuda()
rnn.train()

RNN(
  (rnn): RNN(193, 4, batch_first=True)
  (fc): Linear(in_features=4, out_features=10, bias=True)
  (ls): LogSoftmax()
)

In [88]:
# train the RNN
def train(rnn, n_steps, print_every):
    
    # initialize the hidden state
    hidden = None      
    
    for sample in trainloader:
        
        inputs = sample["feature"]
        labels = sample["label"]
        
        
        
#         # defining the training data 
#         time_steps = np.linspace(step * np.pi, (step+1)*np.pi, seq_length + 1)
#         data = np.sin(time_steps)
#         data.resize((seq_length + 1, 1)) # input_size=1

#         x = data[:-1]
#         y = data[1:]
        
        # convert data into Tensors
#         x_tensor = torch.Tensor(x).unsqueeze(0) # unsqueeze gives a 1, batch_size dimension
#         y_tensor = torch.Tensor(y)

        inputs = inputs.unsqueeze(0)
        
        if True:
            inputs, labels = inputs.cuda(), labels.cuda()
#         print(inputs)
#         print(inputs.shape)
#         print(labels.shape)

            
        
        # outputs from the rnn
        prediction, hidden = rnn(inputs, hidden)

        
#         print("yes")
        ## Representing Memory ##
        # make a new variable for hidden and detach the hidden state from its history
        # this way, we don't backpropagate through the entire history
        hidden = hidden.data

#         print(prediction)
#         print(labels.unsqueeze(0))
        # calculate the loss
        
        print(prediction)
        print(labels)
        
        loss = criterion(prediction, labels)
        
#         print("yes2")
        
        
        # zero gradients
        optimizer.zero_grad()
        # perform backprop and update weights
        loss.backward()
        
        
#         rnn.to("cpu")
        
        optimizer.step()

        # display loss and predictions
                
        print('Loss: ', loss.item())
#         plt.plot(time_steps[1:], x, 'r.') # input
#         plt.plot(time_steps[1:], prediction.data.numpy().flatten(), 'b.') # predictions
#         plt.show()
    
    return rnn


In [89]:
n_steps = 75
print_every = 15

trained_rnn = train(rnn, n_steps, print_every)

tensor([[-2.4455, -2.4791, -1.9386, -2.1575, -3.1354, -3.7482, -2.1738, -1.8238,
         -1.8637, -2.6216]], device='cuda:0', dtype=torch.float64,
       grad_fn=<LogSoftmaxBackward>)
tensor([5], device='cuda:0')
Loss:  3.7481856974270666
tensor([[-2.8789, -2.3444, -2.3515, -1.8955, -2.7131, -3.2260, -1.7360, -1.9242,
         -2.6989, -2.2360]], device='cuda:0', dtype=torch.float64,
       grad_fn=<LogSoftmaxBackward>)
tensor([3], device='cuda:0')
Loss:  1.8955472059963472
tensor([[-2.8438, -2.3680, -2.3291, -1.8610, -2.6801, -3.1971, -1.7688, -1.9404,
         -2.7074, -2.2547]], device='cuda:0', dtype=torch.float64,
       grad_fn=<LogSoftmaxBackward>)
tensor([8], device='cuda:0')
Loss:  2.707374301396764
tensor([[-2.8166, -2.3929, -2.3140, -1.8318, -2.6552, -3.1694, -1.8011, -1.9604,
         -2.6735, -2.2769]], device='cuda:0', dtype=torch.float64,
       grad_fn=<LogSoftmaxBackward>)
tensor([8], device='cuda:0')
Loss:  2.673520635669476
tensor([[-2.7948, -2.4173, -2.3035, -1.806

KeyboardInterrupt: 