In [1]:
import pandas as pd
import numpy as np
import Config
import os, librosa
import torch, torchaudio
from torch.autograd import Variable 
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from torch.nn import init
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
class Preprocessing_df():
    def __init__(self, data_path):
        self.path = data_path
        self.df = self.__load_data()

    def __load_data(self):
        return pd.read_csv(self.path)

    def audio_path(self):
        self.df['audio_path'] = 'audio' + '\\fold' + self.df['fold'].astype(str) + '\\'  + self.df['slice_file_name'].astype(str)
        #self.df['audio_path'] = os.path.join(os.getcwd(), 'audio') + '\\fold' + self.df['fold'].astype(str) + '\\'  + self.df['slice_file_name'].astype(str)

    def create_dataset(self):
        self.dataset = self.df[[ 'audio_path', 'classID']]
        
    def get_dataset(self):
        self.audio_path()
        self.create_dataset()
        return self.dataset

class AudioPreprocessing():
    @staticmethod
    def __openaudio(filepath, sr):
        sig, rs = librosa.load(filepath, sr=sr)
        return sig, rs
    
    @staticmethod
    def print_sample_shape(filepath, sr):
        sig, sr = AudioPreprocessing.__openaudio(filepath, sr)
        print(sig.shape)

    @staticmethod
    def padding(filepath, sr):
        sig, rs = AudioPreprocessing.__openaudio(filepath, sr)
        padded_audio = librosa.util.fix_length(sig, size=4*rs)
        # if sig.shape[0] < 4*rs:
        #     padded_audio = librosa.util.fix_length(sig, size=4*rs)
        #     #padded_audio = np.pad(sig, int(np.ceil((4*rs-sig.shape[0])/2)), mode='reflect')
        # else:
        #     padded_audio = sig[:4*rs]
        return padded_audio, rs
    
    @staticmethod
    def spectro_gram(filepath, sr, n_fft=2048, hop_length=None, n_mels=32, fmin=20, fmax=8500, top_db=80):
        padded_audio, rs = AudioPreprocessing.padding(filepath, sr)
        spec = librosa.feature.melspectrogram(y=padded_audio, sr=rs, n_fft=n_fft,
                hop_length=hop_length, n_mels=n_mels)
        spec_db = librosa.power_to_db(spec, top_db=top_db)
        return spec_db
    
    @staticmethod
    def spec_to_image(filepath, eps=1e-6):
        spec_db, rs = AudioPreprocessing.spectro_gram(filepath)
        mean = spec_db.mean()
        std = spec_db.std()
        spec_norm = (spec_db - mean) / (std + eps)
        spec_min, spec_max = spec_norm.min(), spec_norm.max()
        spec_scaled = 255 * (spec_norm - spec_min) / (spec_max - spec_min)
        spec_scaled = spec_scaled.astype(np.uint8)
        return spec_scaled, rs


class AudioPreprocessing_data():
    def __init__(self):
        self.path = Config.csv_path
        self.df = self.__load_data()
        self.RoS = Config.sampling_rate
        self.bs = Config.batch_size

    def __load_data(self):
        data = Preprocessing_df(self.path)
        print("Done Loading data")
        return data.get_dataset()

    def __prepare_data(self):
        x = []
        spec_db = self.df['audio_path'].apply(AudioPreprocessing.spectro_gram, sr=self.RoS)
        for i in range(len(spec_db)):
            x.append(spec_db[i])
        self.x = np.array(x)
        self.y = self.df['classID'].astype(np.uint8).to_numpy()
        print("Done Audio Processing - Spectrogram, x, y")

    def __prepare_loaders(self):
        self.__prepare_data()
        X_train, X_test, y_train, y_test = train_test_split(self.x, self.y, test_size = 0.2, random_state = 42)
        X_train = torch.tensor(X_train, dtype=torch.float32)
        y_train = torch.tensor(y_train, dtype=torch.uint8)
        X_test = torch.tensor(X_test, dtype=torch.float32)
        y_test = torch.tensor(y_test, dtype=torch.uint8)
        train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
        test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
        self.train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=self.bs, shuffle=True)
        self.test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False)
        print("Done preparing loaders")

    def get_df(self):
        return self.df

    def get_x_y(self):
        self.__prepare_data()
        return self.x, self.y

    def get_data(self):
        self.__prepare_loaders()
        return self.train_loader, self.test_loader

In [3]:
data = AudioPreprocessing_data()
df = data.get_df()

Done Loading data


In [4]:
data = AudioPreprocessing_data()
X, y = data.get_x_y()

Done Loading data
Done Audio Processing - Spectrogram, x, y


In [5]:
X_expanded = np.expand_dims(X, axis=-1)
X_train, X_test, y_train, y_test = train_test_split(X_expanded, y, stratify=y, test_size = 0.2, random_state = 42)
X_train = X_train/255
X_test = X_test/255

In [6]:
print(X_train.shape, y_train.shape)

(6985, 32, 345, 1) (6985,)


In [8]:
import tensorflow as tf
from tensorflow.keras import models, layers

model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 345, 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))

model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=30, 
                    validation_data=(X_test, y_test),
                    batch_size=32)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [21]:
data = AudioPreprocessing_data()
train_loader, test_loader = data.get_data()

Done Loading data
Done Audio Processing - Spectrogram


In [87]:
from torch import flatten, nn
class AudioClassifier(nn.Module):
    def __init__(self, numChannels, classes):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=numChannels, out_channels=20, kernel_size=(5, 5))
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
		# initialize second set of CONV => RELU => POOL layers
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=(5, 5))
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
		# initialize first (and only) set of FC => RELU layers
        self.fc1 = nn.Linear(in_features=800, out_features=500)
        self.relu3 = nn.ReLU()
		# initialize our softmax classifier
        self.fc2 = nn.Linear(in_features=500, out_features=classes)
        self.logSoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        # Run the convolutional blocks
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)
        # pass the output from the previous layer through the second
        # set of CONV => RELU => POOL layers
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.maxpool2(x)
        # flatten the output from the previous layer and pass it
        # through our only set of FC => RELU layers
        x = flatten(x, 1)
        x = self.fc1(x)
        x = self.relu3(x)
        # pass the output to our softmax classifier to get our output
        # predictions
        x = self.fc2(x)
        output = self.logSoftmax(x)
        # return the output predictions
        return output
#RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x6640 and 400x120)
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(6640, 16)
        self.fc2 = nn.Linear(16, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Create the model and put it on the GPU if available
#myModel = AudioClassifier(numChannels=1, classes=10)
myModel = Net()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = myModel.to(device)
# Check that it is on Cuda
next(myModel.parameters()).device

device(type='cuda', index=0)

In [88]:
INIT_LR = 1e-3
BATCH_SIZE = 64
EPOCHS = 10
ip_shape = (32, 345)
trainSteps = len(train_loader.dataset) // BATCH_SIZE
testSteps = len(test_loader.dataset) // BATCH_SIZE

In [91]:
def training(model, train_dl, val_dl, num_epochs=EPOCHS):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')
    H = {
	"train_loss": [],
	"train_acc": [],
	"val_loss": [],
	"val_acc": []
    }
    print("[INFO] training the network...")
    for e in range(0, num_epochs):
        model.train()
        totalTrainLoss = 0
        trainCorrect = 0
        totalValLoss = 0
        valCorrect = 0
        for (x, y) in train_loader:
            x = x.to(device)
            y = y.to(device)
            x_mean, x_std = x.mean(), x.std()
            x = (x - x_mean) / x_std
            x = x.unsqueeze(1)

            y_pred = model(x)
            loss = criterion(y_pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            totalTrainLoss += loss
            trainCorrect += (y_pred.argmax(1) == y).type(torch.float).sum().item()
            with torch.no_grad():
                model.eval()
                for (x_val, y_val) in val_dl:
                    x_val = x_val.to(device)
                    y_val = y_val.to(device)
                    x_val_mean, x_val_std = x_val.mean(), x_val.std()
                    x_val = (x_val - x_val_mean) / x_val_std
                    x_val = x_val.unsqueeze(1)
                    pred = model(x)
                    totalValLoss += criterion(pred, y)
                    valCorrect += (pred.argmax(1) == y_val).type(torch.float).sum().item()
        avgTrainLoss = totalTrainLoss / trainSteps
        avgValLoss = totalValLoss / testSteps
        trainCorrect = trainCorrect / len(train_dl.dataset)
        valCorrect = valCorrect / len(val_dl.dataset)
        H["train_loss"].append(avgTrainLoss.cpu().detach().numpy())
        H["train_acc"].append(trainCorrect)
        H["val_loss"].append(avgValLoss.cpu().detach().numpy())
        H["val_acc"].append(valCorrect)
        print("[INFO] EPOCH: {}/{}".format(e + 1, EPOCHS))
        print("Train loss: {:.6f}, Train accuracy: {:.4f}".format(avgTrainLoss, trainCorrect))
        print("Val loss: {:.6f}, Val accuracy: {:.4f}\n".format(avgValLoss, valCorrect))

training(myModel, train_loader, test_loader, num_epochs=EPOCHS)


[INFO] training the network...
[INFO] EPOCH: 1/10
Train loss: 7.928432, Train accuracy: 0.2677
Val loss: 55523.046875, Val accuracy: 1924.0000



KeyboardInterrupt: 

In [34]:
def training(model, train_dl, num_epochs):
  # Loss Function, Optimizer and Scheduler
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')

  # Repeat for each epoch
  for epoch in range(num_epochs):
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    # Repeat for each batch in the training set
    for i, data in enumerate(train_dl):
        # Get the input features and target labels, and put them on the GPU
        inputs, labels = data[0].to(device), data[1].to(device)

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Keep stats for Loss and Accuracy
        running_loss += loss.item()

        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs,1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]

        #if i % 10 == 0:    # print every 10 mini-batches
        #    print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))
    
    # Print stats at the end of the epoch
    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    print(f'Epoch: {epoch}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')

  print('Finished Training')
  
num_epochs=2   # Just for demo, adjust this higher.
training(myModel, train_loader, num_epochs)

RuntimeError: Given groups=1, weight of size [8, 1, 5, 5], expected input[1, 16, 32, 345] to have 1 channels, but got 16 channels instead

In [60]:
def inference (model, val_dl):
  correct_prediction = 0
  total_prediction = 0

  # Disable gradient updates
  with torch.no_grad():
    for data in val_dl:
      # Get the input features and target labels, and put them on the GPU
      inputs, labels = data[0].to(device), data[1].to(device)

      # Normalize the inputs
      inputs_m, inputs_s = inputs.mean(), inputs.std()
      inputs = (inputs - inputs_m) / inputs_s

      # Get predictions
      outputs = model(inputs)

      # Get the predicted class with the highest score
      _, prediction = torch.max(outputs,1)
      # Count of predictions that matched the target label
      correct_prediction += (prediction == labels).sum().item()
      total_prediction += prediction.shape[0]
    
  acc = correct_prediction/total_prediction
  print(f'Accuracy: {acc:.2f}, Total items: {total_prediction}')

# Run inference on trained model with the validation set
inference(myModel, test_loader)

RuntimeError: Given groups=1, weight of size [8, 2, 5, 5], expected input[1, 1, 32, 345] to have 2 channels, but got 1 channels instead