# Convolution Neural Network with MFCC audio data


Assumes that preprocessing step is already done using the matlab functions (output is table which we will convert to pandas dataframe)
* `generate_data.m`
* `resampleRun.m`

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import librosa
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [2]:
numCoeffs = 12;
batchSize = 32;
filename  = "mfcc_data.csv"

In [3]:
def load_data(filename, numCoeffs):
    data_raw = pd.read_csv(filename)
    numSamples = data_raw.shape[0]
    numFrames  = data_raw.shape[1]//numCoeffs
    #labels = data_raw['label'].values
    labels = []
    mfcc_spectrograms = np.empty((numSamples, numFrames, numCoeffs), dtype=np.float32)  # Preallocate array
    for index, row in data_raw.iterrows():
        if row.iloc[0] == 'ad':
            labels.append(0)
        else:
            labels.append(1)
        mfcc = row[2:]
        mfcc = mfcc.values.reshape(numCoeffs, numFrames)
        mfcc = mfcc.T
        mfcc_spectrograms[index] = mfcc
    labels = np.array(labels)
    return labels, mfcc_spectrograms

In [4]:
# load up mfcc data
labels, mfcc_data = load_data(filename, numCoeffs)
print(mfcc_data.shape)

(967, 498, 12)


In [5]:
# Convert data to tensor compliant dataset
mfccs_train, mfccs_test, labels_train, labels_test = train_test_split(mfcc_data, labels, test_size=0.2, random_state=100)
mfccs_train_tensor  = torch.tensor(mfccs_train).unsqueeze(1)
mfccs_test_tensor   = torch.tensor(mfccs_test).unsqueeze(1)
labels_train_tensor = torch.tensor(labels_train)
labels_test_tensor  = torch.tensor(labels_test)
train_dataset = TensorDataset(mfccs_train_tensor, labels_train_tensor)
test_dataset = TensorDataset(mfccs_test_tensor, labels_test_tensor)
train_loader = DataLoader(train_dataset, batch_size = batchSize, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = batchSize, shuffle = False)

In [14]:
# Define our CNN
viewSize  = 128 * 6 * 6;
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        
        # Pooling layer
        self.pool = nn.MaxPool2d(2, 2)
        
        # Fully connected layers
        self.fc1 = nn.Linear(7936, 512)
        self.fc2 = nn.Linear(512, 2)

    def forward(self, x):
        x = x.view(-1, 1, 498, 12) #(batch size, channels, height, width)
        # Convolutional layers
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        
        # Flatten the output for fully connected layers
        print(x.shape)
        size = x.size()[1:]  #all dimensions except the batch dimension
        print(size)
        numFeatures = 1
        for s in size:
            numFeatures *= s
        print(numFeatures)
        x = x.view(-1, numFeatures)
        print(numFeatures)
        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [15]:
# The Model
model = CNN()

# Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [16]:
# Train
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

torch.Size([32, 128, 62, 1])
torch.Size([128, 62, 1])
7936
7936


RuntimeError: expected scalar type Long but found Int

In [None]:
# Evaluate
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")