### Importing Libraries

In [5]:
import os
import csv
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

### Importing Dataset

In [6]:
dataset_path = os.path.join(os.getcwd(),'dataset')
classes = os.listdir(dataset_path)

### Feature Extraction Function

In [7]:
def preprocess_audio_file(file_path):
    feature = []
    audio_data, sr = librosa.load(file_path)

    # Extract audio features and append to feature list
    rms = librosa.feature.rms(y=audio_data)
    rms_mean = np.mean(rms, axis=1)
    rms_median = np.median(rms, axis=1)
    feature.extend([rms_mean[0], rms_median[0]])

    spec_bw = librosa.feature.spectral_bandwidth(y=audio_data, sr=sr)
    spec_bw_mean = np.mean(spec_bw,axis=1)
    spec_bw_median = np.median(spec_bw,axis=1)
    feature.extend([spec_bw_mean[0],spec_bw_median[0]])

    S = np.abs(librosa.stft(y=audio_data))
    contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
    contrast_row_mean = np.mean(np.mean(contrast,axis=1))
    contrast_col_mean = np.mean(np.mean(contrast,axis=0))
    contrast_row_median = np.median(np.median(contrast,axis=1))
    contrast_col_median = np.median(np.median(contrast,axis=0))
    feature.extend([contrast_row_mean, contrast_col_mean, contrast_row_median, contrast_col_median])

    chroma_cens = librosa.feature.chroma_cens(y=audio_data, sr=sr)
    chroma_cq = librosa.feature.chroma_cqt(y=audio_data, sr=sr)
    chroma_cens_row_mean = np.mean(np.mean(chroma_cens,axis=1))
    chroma_cens_col_mean = np.mean(np.mean(chroma_cens,axis=0))
    chroma_cens_row_median = np.median(np.median(chroma_cq,axis=1))
    chroma_cens_col_median = np.median(np.median(chroma_cq,axis=0))
    chroma_cq_row_mean = np.mean(np.mean(chroma_cens,axis=1))
    chroma_cq_col_mean = np.mean(np.mean(chroma_cens,axis=0))
    chroma_cq_row_median = np.median(np.median(chroma_cq,axis=1))
    chroma_cq_col_median = np.median(np.median(chroma_cq,axis=0))
    feature.extend([chroma_cens_row_mean, chroma_cens_col_mean, chroma_cens_row_median, chroma_cens_col_median, chroma_cq_row_mean, chroma_cq_col_mean, chroma_cq_row_median, chroma_cq_col_median])

    cent = librosa.feature.spectral_centroid(y=audio_data, sr=sr)
    cent_mean = np.mean(cent,axis=1)
    cent_median = np.median(cent,axis=1)
    feature.extend([cent_mean[0],cent_median[0]])

    flatness = librosa.feature.spectral_flatness(y=audio_data)
    flatness_mean = np.mean(flatness,axis=1)
    flatness_median = np.median(flatness,axis=1)
    feature.extend([flatness_mean[0],flatness_median[0]])

    rolloff = librosa.feature.spectral_rolloff(y=audio_data, sr=sr)
    rolloff_mean = np.mean(rolloff,axis=1)
    rolloff_median = np.median(rolloff,axis=1)
    feature.extend([rolloff_mean[0],rolloff_median[0]])

    mfcc = librosa.feature.mfcc(y=audio_data,sr=sr)
    row_mean = np.mean(np.mean(mfcc,axis=1))
    col_mean = np.mean(np.mean(mfcc,axis=0))
    row_median = np.median(np.median(mfcc,axis=1))
    col_median = np.median(np.median(mfcc,axis=0))
    feature.extend([row_mean, col_mean, row_median, col_median])

    zcr = librosa.feature.zero_crossing_rate(y=audio_data)
    zcr_mean = np.mean(zcr,axis=1)
    zcr_median = np.median(zcr,axis=1)
    feature.extend([zcr_mean[0],zcr_median[0]])

    magnitude, phase = librosa.magphase(librosa.stft(y=audio_data))
    sro = librosa.feature.spectral_rolloff(S=magnitude,sr=sr)
    sro_mean = np.mean(sro,axis=1)
    sro_median = np.median(sro,axis=1)
    feature.extend([sro_mean[0],sro_median[0]])

    pitches,magnitudes = librosa.core.piptrack(y=audio_data,sr=sr)
    pitches_row_mean = np.mean(np.mean(pitches,axis=1))
    pitches_col_mean = np.mean(np.mean(pitches,axis=0))
    pitches_row_median = np.median(np.median(pitches,axis=1))
    pitches_col_median = np.median(np.median(pitches,axis=0))
    magnitudes_row_mean = np.mean(np.mean(magnitudes,axis=1))
    magnitudes_col_mean = np.mean(np.mean(magnitudes,axis=0))
    magnitudes_row_median = np.median(np.median(magnitudes,axis=1))
    magnitudes_col_median = np.median(np.median(magnitudes,axis=0))
    feature.extend([pitches_row_mean, pitches_col_mean, pitches_row_median, pitches_col_median, magnitudes_row_mean, magnitudes_col_mean, magnitudes_row_median, magnitudes_col_median])

    c_stft = librosa.feature.chroma_stft(y=audio_data,sr=sr)
    c_stft_row_mean = np.mean(np.mean(c_stft,axis=1))
    c_stft_col_mean = np.mean(np.mean(c_stft,axis=0))
    c_stft_row_median = np.median(np.median(c_stft,axis=1))
    c_stft_col_median = np.median(np.median(c_stft,axis=0))
    feature.extend([c_stft_row_mean,c_stft_col_mean,c_stft_row_median,c_stft_col_median])

    y = librosa.effects.harmonic(audio_data)
    tonnetz = librosa.feature.tonnetz(y=audio_data, sr=sr)
    tonnetz_row_mean = np.mean(np.mean(tonnetz,axis=1))
    tonnetz_col_mean = np.mean(np.mean(tonnetz,axis=0))
    tonnetz_row_median = np.median(np.median(tonnetz,axis=1))
    tonnetz_col_median = np.median(np.median(tonnetz,axis=0))
    feature.extend([tonnetz_row_mean,tonnetz_col_mean,tonnetz_row_median,tonnetz_col_median])

    return feature

### Traversing through Datset and Extracting Features

In [9]:
dataset_path = os.path.join(os.getcwd(),'dataset')
classes = os.listdir(dataset_path)

# Open the CSV file for writing
with open('features.csv','a',newline='') as file:
    writer = csv.writer(file)

    # Loop through each class
    for clas in classes:
        audio_files = os.path.join(dataset_path, clas)

        # Loop through each audio file in the class
        for file in os.listdir(audio_files):
            file_path = os.path.join(dataset_path, clas, file)

            # Call the preprocessing function
            feature = preprocess_audio_file(file_path)

            # Write the feature row to the CSV file
            feature.append(clas)
            writer.writerow(feature)

	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


### Importing Features.csv

In [43]:
df = pd.read_csv('./features.csv')

X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

### Label Encoding

In [44]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

### Spliting Dataset into Training and Testing Dataset 

In [45]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

### Feature Scaling

In [46]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Artificial Neural Network

### Converting Data to PyTorch Tensors

In [47]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

### Creating Datasets and Data Loaders

In [48]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

### Defining our Neural Network Architecture

In [49]:
class SimpleANN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleANN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

### Initialising Input Size, Hidden Size, Number Of Classes, Model, Criterion and Optimizer

In [50]:
input_size = X.shape[1]
hidden_size = 64
num_classes = len(np.unique(y))
model = SimpleANN(input_size=input_size, hidden_size=hidden_size, num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

### Training Loop, Validation Loop, Calculating and Printing Accuracy

In [57]:
num_epochs = 500
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_correct_train = 0
    total_samples_train = 0
    total_loss_train = 0.0

    for batch in train_loader:
        X_batch, y_batch = batch
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss_train = criterion(outputs, y_batch)
        loss_train.backward()
        optimizer.step()
        total_loss_train += loss_train.item()


        _, predicted = torch.max(outputs, 1)
        total_correct_train += (predicted == y_batch).sum().item()
        total_samples_train += y_batch.size(0)

    # Validation loop
    model.eval()
    total_correct = 0
    total_samples = 0
    total_correct_val = 0
    total_samples_val = 0
    total_loss_val = 0.0

    with torch.no_grad():
        for batch in test_loader:
            X_batch, y_batch = batch
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            outputs = model(X_batch)
            loss_val = criterion(outputs, y_batch)
            total_loss_val += loss_val.item()

            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == y_batch).sum().item()
            total_samples += y_batch.size(0)
            total_correct_val += (predicted == y_batch).sum().item()
            total_samples_val += y_batch.size(0)

    accuracy = total_correct / total_samples

    avg_loss_train = total_loss_train / len(train_loader)
    accuracy_train = total_correct_train / total_samples_train

    avg_loss_val = total_loss_val / len(test_loader)
    accuracy_val = total_correct_val / total_samples_val

    print(f'Epoch {epoch + 1}/{num_epochs} Training Loss: {avg_loss_train:.4f}, Training Accuracy: {accuracy_train * 100:.4f}%, Validation Loss: {avg_loss_val:.4f}, Validation Accuracy: {accuracy_val * 100:.4f}%')


Epoch 1/500 Training Loss: 0.9421, Training Accuracy: 71.9643%, Validation Loss: 3.2016, Validation Accuracy: 21.7857%
Epoch 2/500 Training Loss: 0.9360, Training Accuracy: 73.1250%, Validation Loss: 3.2142, Validation Accuracy: 22.5000%
Epoch 3/500 Training Loss: 0.9294, Training Accuracy: 73.9286%, Validation Loss: 3.2386, Validation Accuracy: 21.4286%
Epoch 4/500 Training Loss: 0.9253, Training Accuracy: 73.6607%, Validation Loss: 3.2085, Validation Accuracy: 22.1429%
Epoch 5/500 Training Loss: 0.9240, Training Accuracy: 73.3036%, Validation Loss: 3.2239, Validation Accuracy: 21.7857%
Epoch 6/500 Training Loss: 0.9182, Training Accuracy: 74.3750%, Validation Loss: 3.2311, Validation Accuracy: 21.7857%
Epoch 7/500 Training Loss: 0.9195, Training Accuracy: 74.1071%, Validation Loss: 3.2316, Validation Accuracy: 21.7857%
Epoch 8/500 Training Loss: 0.9173, Training Accuracy: 75.0000%, Validation Loss: 3.2413, Validation Accuracy: 21.7857%
Epoch 9/500 Training Loss: 0.9165, Training Accu

### Saving the trained model

In [52]:
torch.save(model.state_dict(), 'audio_classifier_model.pth')

### Testing Phase

### Importing New File

In [53]:
new_audio_path = 'Recording.wav'

### Extracting Features of the new file

In [54]:
new_audio_feature = preprocess_audio_file(new_audio_path)

	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


### Model Prediction

In [56]:
model.load_state_dict(torch.load('audio_classifier_model.pth'))
df = pd.read_csv('features.csv')

new_audio_tensor = torch.tensor(new_audio_feature, dtype=torch.float32)

model.eval()
with torch.no_grad():
    new_audio_tensor = new_audio_tensor.unsqueeze(0)
    predictions = model(new_audio_tensor)
    _, predicted_class = torch.max(predictions, 1)

# Convert the predicted class index back to the original label
predicted_y = le.inverse_transform([predicted_class.item()])

print(f"The model predicts that the audio belongs to class: {predicted_y}")

The model predicts that the audio belongs to class: ['ac on']
