In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset
import pandas as pd
import librosa
import numpy as np
import os

In [2]:
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
#     print ("Features :",len(X), "sampled at ", sample_rate, "hz")
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
    return np.hstack([mfccs,chroma,mel,contrast,tonnetz])

# Create custom dataloader

In [3]:
class AudioDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.root_dir = root_dir
        self.label = pd.read_csv(csv_file) 
        self.transform = transform
        self.file_arr = np.asarray(self.label.iloc[:, 2])
        self.label_arr = np.asarray(self.label.iloc[:, 3])
        
    def __len__(self):
        return len(self.label)
    
    def __getitem__(self, idx):
#         print(self.label.iloc[idx])
        file_name = os.path.join(self.root_dir, str(self.file_arr[idx])+".wav")
        feature = extract_feature(file_name)
#         feature = feature[0:192]
#         feature = feature.reshape(1,192)
        label = self.label_arr[idx]
        if self.transform:
            feature = self.transform(feature)
        
        
        sample = {
            'feature': feature,
            'label': label,
            'path': file_name
        }
        return sample

In [4]:
train_set = AudioDataset(csv_file='new_train.csv', root_dir="./Train/")
test_set = AudioDataset(csv_file='new_test.csv', root_dir="./Train/")

In [19]:
trainloader = torch.utils.data.DataLoader(train_set, batch_size=1, shuffle=True)
testloader = torch.utils.data.DataLoader(test_set, batch_size=1, shuffle=True)

In [192]:
for sample in trainloader:
    print(sample)
    print(sample["feature"].shape)
    break

{'feature': tensor([[[-2.7809e+02,  1.6555e+02,  3.2482e+01, -1.3510e+01, -2.8555e+01,
          -2.9463e+01, -1.9443e+01,  1.1790e+00, -1.3600e+01, -9.4426e+00,
          -6.8983e+00,  9.1908e+00,  1.4023e+01, -1.3031e+01, -3.7774e+00,
           1.1573e+01,  8.5581e+00,  1.4359e+01,  2.5850e+01,  1.5572e+01,
           5.7387e+00,  1.1703e+01,  4.1805e+00,  1.5850e+00,  1.0963e+01,
           9.6809e-01,  2.6062e+00,  7.4083e+00,  5.0846e+00,  4.8101e+00,
           5.5918e+00,  8.3724e+00,  2.0809e+00,  6.2174e+00,  6.0613e+00,
           5.5879e+00,  6.6431e+00,  5.1613e+00,  7.0689e+00,  2.2142e+00,
           3.5324e-01,  4.1810e-01,  4.6483e-01,  4.7792e-01,  5.4179e-01,
           6.5794e-01,  7.4060e-01,  7.1677e-01,  6.0711e-01,  5.4696e-01,
           4.4089e-01,  3.2264e-01,  3.1743e+01,  9.1086e+00,  1.6255e+00,
           4.4560e-01,  7.4427e-02,  2.5994e-02,  6.3624e-02,  8.9661e-02,
           1.4397e-01,  4.1701e-01,  1.2108e+00,  3.3454e+00,  1.0550e+01,
           5.

# Defining the model

In [6]:
class RNN(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(RNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
#         print(input_size)

        # embedding and LSTM layers
#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)

        # dropout layer
        self.dropout = nn.Dropout(0.3)

        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.logsoftmax = nn.LogSoftmax(dim=1)
        
    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        print(x.shape)
        batch_size = x.size(0)

#         x = x.view()
        # embeddings and lstm_out
#         embeds = self.embedding(x)

#         x = x.view(4, batch_size, 48)
        
        lstm_out, hidden = self.lstm(x, hidden)
#         print(lstm_out.shape)

        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)

        # sigmoid function
        sig_out = self.logsoftmax(out)

        # reshape to be batch_size first
#         sig_out = sig_out.view(batch_size, -1)
#         sig_out = sig_out[:, -1] # get last batch of labels

        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())

        return hidden

In [151]:
# class RNN(nn.Module):
#     def __init__(self, input_size, output_size, hidden_dim, n_layers):
#         super(RNN, self).__init__()
        
#         self.hidden_dim=hidden_dim
        
#         self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)
        
#         self.fc = nn.Linear(hidden_dim, output_size)
        
#     def forward(self, x, hidden):
        
#         batch_size = x.size(0)
        
#         r_out, hidden = self.rnn(x, hidden)
        
#         r_out = r_out.view(-1, self.hidden_dim)
        
#         output = self.fc(r_out)
        
#         return output, hidden

# Training the model

In [153]:
# For simple RNN
# input_size = 193
# output_size = 1
# hidden_dim = 256
# n_layers = 3

# model = RNN(input_size, output_size, hidden_dim, n_layers)
# model

RNN(
  (rnn): RNN(193, 256, num_layers=3, batch_first=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [21]:
# For LSTM
input_size = 193
output_size = 10
hidden_dim = 256
n_layers = 3

model = RNN(input_size, output_size, hidden_dim, n_layers)
model = model.double()
model

RNN(
  (lstm): LSTM(193, 256, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3)
  (fc): Linear(in_features=256, out_features=10, bias=True)
  (logsoftmax): LogSoftmax()
)

In [8]:
train_on_gpu=torch.cuda.is_available()
lr = 0.001
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [22]:
epochs = 1
counter = 0
print_every = 100
clip = 5
batch_size = 1

if train_on_gpu:
    model.cuda()
model.train()

for e in range(epochs):
    h = model.init_hidden(batch_size)
    
    for sample in trainloader:
        inputs = sample["feature"]
        labels = sample["label"]
        
        inputs = inputs.unsqueeze(0)

        
        if train_on_gpu:
            inputs, labels = inputs.cuda(), labels.cuda()
        
        h = tuple([each.data for each in h])
        
        model.zero_grad()
        
        print("yesd")
        
        output, h = model(inputs, h)
        
        print("yes2")
        
        
        print(output)
#         output = (output.unsqueeze(0))
#         labels = (labels.unsqueeze(0))
        
        loss = criterion(output, labels)
    
        print(loss)
    
        loss.backward()
        
        
        print("yes3")
        
        
        
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        if counter % print_every == 0:
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            
            for sample in testloader:
                inputs = sample["feature"]
                labels = sample["label"]
                
                val_h = tuple([each.data for each in val_h])
                
                inputs = inputs.unsqueeze(0)

                
                if train_on_gpu:
                    inputs, labels = inputs.cuda(), labels.cuda()
                    
                output, val_h = model(inputs, val_h)
                
                print(output)
                val_loss = criterion(output, labels)
                
                val_losses.append(val_loss.item())
            
            model.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))


yesd
torch.Size([1, 1, 193])
yes2
tensor([[-2.2408, -2.2962, -2.3131, -2.2686, -2.3215, -2.3580, -2.2872, -2.2588,
         -2.3285, -2.3606]], device='cuda:0', dtype=torch.float64,
       grad_fn=<LogSoftmaxBackward>)
tensor(2.2588, device='cuda:0', dtype=torch.float64, grad_fn=<NllLossBackward>)
yes3
torch.Size([1, 1, 193])
tensor([[-2.2379, -2.3028, -2.3144, -2.2757, -2.3253, -2.3450, -2.2738, -2.2547,
         -2.3337, -2.3705]], device='cuda:0', dtype=torch.float64,
       grad_fn=<LogSoftmaxBackward>)
torch.Size([1, 1, 193])
tensor([[-2.2279, -2.3013, -2.3173, -2.2772, -2.3296, -2.3454, -2.2716, -2.2565,
         -2.3309, -2.3774]], device='cuda:0', dtype=torch.float64,
       grad_fn=<LogSoftmaxBackward>)
torch.Size([1, 1, 193])
tensor([[-2.2208, -2.3010, -2.3144, -2.2814, -2.3348, -2.3483, -2.2664, -2.2605,
         -2.3274, -2.3809]], device='cuda:0', dtype=torch.float64,
       grad_fn=<LogSoftmaxBackward>)
torch.Size([1, 1, 193])
tensor([[-2.2148, -2.3006, -2.3116, -2.2857, 

KeyboardInterrupt: 