### Importing Libraries

In [1]:
import os
import csv
import librosa
import numpy as np
import pandas as pd
from scipy.stats import f_oneway
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif

### Importing Dataset

In [2]:
dataset_path = os.path.join(os.getcwd(),'dataset')
classes = os.listdir(dataset_path)

### Feature Extraction

In [78]:
i = 0
for clas in classes:
    audio_files = os.path.join(dataset_path,clas)
    for file in os.listdir(audio_files):
        feature = []
        file_path = os.path.join(dataset_path,clas,file)
        print(file_path)
        audio_data,sr = librosa.load(file_path)

        rms = librosa.feature.rms(y=audio_data)
        rms_mean = np.mean(rms,axis=1)
        rms_median = np.median(rms,axis=1)
        feature.extend([rms_mean[0],rms_median[0]])

        spec_bw = librosa.feature.spectral_bandwidth(y=audio_data, sr=sr)
        spec_bw_mean = np.mean(spec_bw,axis=1)
        spec_bw_median = np.median(spec_bw,axis=1)
        feature.extend([spec_bw_mean[0],spec_bw_median[0]])

        S = np.abs(librosa.stft(y=audio_data))
        contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
        contrast_row_mean = np.mean(np.mean(contrast,axis=1))
        contrast_col_mean = np.mean(np.mean(contrast,axis=0))
        contrast_row_median = np.median(np.median(contrast,axis=1))
        contrast_col_median = np.median(np.median(contrast,axis=0))
        feature.extend([contrast_row_mean, contrast_col_mean, contrast_row_median, contrast_col_median])

        chroma_cens = librosa.feature.chroma_cens(y=audio_data, sr=sr)
        chroma_cq = librosa.feature.chroma_cqt(y=audio_data, sr=sr)
        chroma_cens_row_mean = np.mean(np.mean(chroma_cens,axis=1))
        chroma_cens_col_mean = np.mean(np.mean(chroma_cens,axis=0))
        chroma_cens_row_median = np.median(np.median(chroma_cq,axis=1))
        chroma_cens_col_median = np.median(np.median(chroma_cq,axis=0))
        chroma_cq_row_mean = np.mean(np.mean(chroma_cens,axis=1))
        chroma_cq_col_mean = np.mean(np.mean(chroma_cens,axis=0))
        chroma_cq_row_median = np.median(np.median(chroma_cq,axis=1))
        chroma_cq_col_median = np.median(np.median(chroma_cq,axis=0))
        feature.extend([chroma_cens_row_mean, chroma_cens_col_mean, chroma_cens_row_median, chroma_cens_col_median, chroma_cq_row_mean, chroma_cq_col_mean, chroma_cq_row_median, chroma_cq_col_median])

        cent = librosa.feature.spectral_centroid(y=audio_data, sr=sr)
        cent_mean = np.mean(cent,axis=1)
        cent_median = np.median(cent,axis=1)
        feature.extend([cent_mean[0],cent_median[0]])

        flatness = librosa.feature.spectral_flatness(y=audio_data)
        flatness_mean = np.mean(flatness,axis=1)
        flatness_median = np.median(flatness,axis=1)
        feature.extend([flatness_mean[0],flatness_median[0]])

        rolloff = librosa.feature.spectral_rolloff(y=audio_data, sr=sr)
        rolloff_mean = np.mean(rolloff,axis=1)
        rolloff_median = np.median(rolloff,axis=1)
        feature.extend([rolloff_mean[0],rolloff_median[0]])

        mfcc = librosa.feature.mfcc(y=audio_data,sr=sr)
        row_mean = np.mean(np.mean(mfcc,axis=1))
        col_mean = np.mean(np.mean(mfcc,axis=0))
        row_median = np.median(np.median(mfcc,axis=1))
        col_median = np.median(np.median(mfcc,axis=0))
        feature.extend([row_mean, col_mean, row_median, col_median])

        zcr = librosa.feature.zero_crossing_rate(y=audio_data)
        zcr_mean = np.mean(zcr,axis=1)
        zcr_median = np.median(zcr,axis=1)
        feature.extend([zcr_mean[0],zcr_median[0]])

        magnitude, phase = librosa.magphase(librosa.stft(y=audio_data))
        sro = librosa.feature.spectral_rolloff(S=magnitude,sr=sr)
        sro_mean = np.mean(sro,axis=1)
        sro_median = np.median(sro,axis=1)
        feature.extend([sro_mean[0],sro_median[0]])

        pitches,magnitudes = librosa.core.piptrack(y=audio_data,sr=sr)
        pitches_row_mean = np.mean(np.mean(pitches,axis=1))
        pitches_col_mean = np.mean(np.mean(pitches,axis=0))
        pitches_row_median = np.median(np.median(pitches,axis=1))
        pitches_col_median = np.median(np.median(pitches,axis=0))
        magnitudes_row_mean = np.mean(np.mean(magnitudes,axis=1))
        magnitudes_col_mean = np.mean(np.mean(magnitudes,axis=0))
        magnitudes_row_median = np.median(np.median(magnitudes,axis=1))
        magnitudes_col_median = np.median(np.median(magnitudes,axis=0))
        feature.extend([pitches_row_mean, pitches_col_mean, pitches_row_median, pitches_col_median, magnitudes_row_mean, magnitudes_col_mean, magnitudes_row_median, magnitudes_col_median])

        c_stft = librosa.feature.chroma_stft(y=audio_data,sr=sr)
        c_stft_row_mean = np.mean(np.mean(c_stft,axis=1))
        c_stft_col_mean = np.mean(np.mean(c_stft,axis=0))
        c_stft_row_median = np.median(np.median(c_stft,axis=1))
        c_stft_col_median = np.median(np.median(c_stft,axis=0))
        feature.extend([c_stft_row_mean,c_stft_col_mean,c_stft_row_median,c_stft_col_median])

        y = librosa.effects.harmonic(audio_data)
        tonnetz = librosa.feature.tonnetz(y=audio_data, sr=sr)
        tonnetz_row_mean = np.mean(np.mean(tonnetz,axis=1))
        tonnetz_col_mean = np.mean(np.mean(tonnetz,axis=0))
        tonnetz_row_median = np.median(np.median(tonnetz,axis=1))
        tonnetz_col_median = np.median(np.median(tonnetz,axis=0))
        feature.extend([tonnetz_row_mean,tonnetz_col_mean,tonnetz_row_median,tonnetz_col_median])

        feature.append(clas)
        
        with open('features.csv','a',newline='') as file:
            writer = csv.writer(file)
            writer.writerow(feature)  


c:\Users\shami\OneDrive\Desktop\v2\dataset\ac\2023_10_03_10_36_34.wav
c:\Users\shami\OneDrive\Desktop\v2\dataset\fan\2023_10_03_10_31_11.wav
c:\Users\shami\OneDrive\Desktop\v2\dataset\fridge\2023_10_03_10_58_26.wav
c:\Users\shami\OneDrive\Desktop\v2\dataset\light\2023_10_03_10_27_36.wav
c:\Users\shami\OneDrive\Desktop\v2\dataset\microwave\2023_10_03_10_57_19.wav
c:\Users\shami\OneDrive\Desktop\v2\dataset\tv\2023_10_03_10_38_54.wav
c:\Users\shami\OneDrive\Desktop\v2\dataset\washing machine\2023_10_03_10_56_38.wav


### Importing Features.csv

In [48]:
df = pd.read_csv('./features.csv',header=None)
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

In [32]:
df = pd.read_csv("features.csv",header=None)
x = df.iloc[:,:-1].values
x += 1e-8
y = df.iloc[:,-1].values
x.shape

(1401, 46)

### label Encoding

In [33]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

### Artificial Neural Network

In [89]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the features and labels from the CSV file
df = pd.read_csv('features.csv')

# Extract features and labels
features = df.iloc[:, :-1].values  # Assuming the last column is the label
labels = df.iloc[:, -1].values


# Convert labels to numerical values using LabelEncoder
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Split the data into training and validation sets
features_train, features_val, labels_train, labels_val = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Convert data to PyTorch tensors
features_train_tensor = torch.tensor(features_train, dtype=torch.float32)
labels_train_tensor = torch.tensor(labels_train, dtype=torch.long)
features_val_tensor = torch.tensor(features_val, dtype=torch.float32)
labels_val_tensor = torch.tensor(labels_val, dtype=torch.long)

# Create datasets and data loaders
train_dataset = TensorDataset(features_train_tensor, labels_train_tensor)
val_dataset = TensorDataset(features_val_tensor, labels_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define your neural network architecture
class SimpleANN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleANN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Instantiate the model, loss function, and optimizer
input_size = features.shape[1]  # Adjust this based on the number of features
hidden_size = 64  # You can adjust this as well
num_classes = len(np.unique(labels))
model = SimpleANN(input_size=input_size, hidden_size=hidden_size, num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 50
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_correct_train = 0
    total_samples_train = 0
    total_loss_train = 0.0

    for batch in train_loader:
        features_batch, labels_batch = batch
        features_batch, labels_batch = features_batch.to(device), labels_batch.to(device)

        optimizer.zero_grad()
        outputs = model(features_batch)
        loss_train = criterion(outputs, labels_batch)
        loss_train.backward()
        optimizer.step()
        total_loss_train += loss_train.item()


        _, predicted = torch.max(outputs, 1)
        total_correct_train += (predicted == labels_batch).sum().item()
        total_samples_train += labels_batch.size(0)

    # Validation loop
    model.eval()
    total_correct = 0
    total_samples = 0
    total_correct_val = 0
    total_samples_val = 0
    total_loss_val = 0.0

    with torch.no_grad():
        for batch in val_loader:
            features_batch, labels_batch = batch
            features_batch, labels_batch = features_batch.to(device), labels_batch.to(device)

            outputs = model(features_batch)
            loss_val = criterion(outputs, labels_batch)
            total_loss_val += loss_val.item()

            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == labels_batch).sum().item()
            total_samples += labels_batch.size(0)
            total_correct_val += (predicted == labels_batch).sum().item()
            total_samples_val += labels_batch.size(0)

    accuracy = total_correct / total_samples

    avg_loss_train = total_loss_train / len(train_loader)
    accuracy_train = total_correct_train / total_samples_train

    avg_loss_val = total_loss_val / len(val_loader)
    accuracy_val = total_correct_val / total_samples_val

    #print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item():.4f}, Validation Accuracy: {accuracy:.4f}')
    print(f'Epoch {epoch + 1}/{num_epochs} Training Loss: {avg_loss_train:.4f}, Training Accuracy: {accuracy_train:.4f} Validation Loss: {avg_loss_val:.4f}, Validation Accuracy: {accuracy_val:.4f}')

# Save the trained model if needed
torch.save(model.state_dict(), 'audio_classifier_model.pth')


Epoch 1/50 Training Loss: 83.6126, Training Accuracy: 0.1562 Validation Loss: 32.1546, Validation Accuracy: 0.1821
Epoch 2/50 Training Loss: 23.5563, Training Accuracy: 0.1866 Validation Loss: 15.1231, Validation Accuracy: 0.2036
Epoch 3/50 Training Loss: 12.9119, Training Accuracy: 0.2036 Validation Loss: 13.3556, Validation Accuracy: 0.1714
Epoch 4/50 Training Loss: 8.7414, Training Accuracy: 0.1920 Validation Loss: 6.5310, Validation Accuracy: 0.1821
Epoch 5/50 Training Loss: 7.0491, Training Accuracy: 0.1732 Validation Loss: 7.6429, Validation Accuracy: 0.1536
Epoch 6/50 Training Loss: 6.5652, Training Accuracy: 0.1893 Validation Loss: 6.0681, Validation Accuracy: 0.1571
Epoch 7/50 Training Loss: 6.8593, Training Accuracy: 0.1973 Validation Loss: 5.6262, Validation Accuracy: 0.1893
Epoch 8/50 Training Loss: 7.4869, Training Accuracy: 0.1795 Validation Loss: 8.6659, Validation Accuracy: 0.2071
Epoch 9/50 Training Loss: 7.5943, Training Accuracy: 0.1893 Validation Loss: 7.0307, Valid

### New File

In [97]:
model.load_state_dict(torch.load('audio_classifier_model.pth'))

new_audio_path = 'Recording.wav'

feature = []
audio_data,sr = librosa.load(new_audio_path)

rms = librosa.feature.rms(y=audio_data)
rms_mean = np.mean(rms,axis=1)
rms_median = np.median(rms,axis=1)
feature.extend([rms_mean[0],rms_median[0]])

spec_bw = librosa.feature.spectral_bandwidth(y=audio_data, sr=sr)
spec_bw_mean = np.mean(spec_bw,axis=1)
spec_bw_median = np.median(spec_bw,axis=1)
feature.extend([spec_bw_mean[0],spec_bw_median[0]])

S = np.abs(librosa.stft(y=audio_data))
contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
contrast_row_mean = np.mean(np.mean(contrast,axis=1))
contrast_col_mean = np.mean(np.mean(contrast,axis=0))
contrast_row_median = np.median(np.median(contrast,axis=1))
contrast_col_median = np.median(np.median(contrast,axis=0))
feature.extend([contrast_row_mean, contrast_col_mean, contrast_row_median, contrast_col_median])

chroma_cens = librosa.feature.chroma_cens(y=audio_data, sr=sr)
chroma_cq = librosa.feature.chroma_cqt(y=audio_data, sr=sr)
chroma_cens_row_mean = np.mean(np.mean(chroma_cens,axis=1))
chroma_cens_col_mean = np.mean(np.mean(chroma_cens,axis=0))
chroma_cens_row_median = np.median(np.median(chroma_cq,axis=1))
chroma_cens_col_median = np.median(np.median(chroma_cq,axis=0))
chroma_cq_row_mean = np.mean(np.mean(chroma_cens,axis=1))
chroma_cq_col_mean = np.mean(np.mean(chroma_cens,axis=0))
chroma_cq_row_median = np.median(np.median(chroma_cq,axis=1))
chroma_cq_col_median = np.median(np.median(chroma_cq,axis=0))
feature.extend([chroma_cens_row_mean, chroma_cens_col_mean, chroma_cens_row_median, chroma_cens_col_median, chroma_cq_row_mean, chroma_cq_col_mean, chroma_cq_row_median, chroma_cq_col_median])

cent = librosa.feature.spectral_centroid(y=audio_data, sr=sr)
cent_mean = np.mean(cent,axis=1)
cent_median = np.median(cent,axis=1)
feature.extend([cent_mean[0],cent_median[0]])

flatness = librosa.feature.spectral_flatness(y=audio_data)
flatness_mean = np.mean(flatness,axis=1)
flatness_median = np.median(flatness,axis=1)
feature.extend([flatness_mean[0],flatness_median[0]])

rolloff = librosa.feature.spectral_rolloff(y=audio_data, sr=sr)
rolloff_mean = np.mean(rolloff,axis=1)
rolloff_median = np.median(rolloff,axis=1)
feature.extend([rolloff_mean[0],rolloff_median[0]])

mfcc = librosa.feature.mfcc(y=audio_data,sr=sr)
row_mean = np.mean(np.mean(mfcc,axis=1))
col_mean = np.mean(np.mean(mfcc,axis=0))
row_median = np.median(np.median(mfcc,axis=1))
col_median = np.median(np.median(mfcc,axis=0))
feature.extend([row_mean, col_mean, row_median, col_median])

zcr = librosa.feature.zero_crossing_rate(y=audio_data)
zcr_mean = np.mean(zcr,axis=1)
zcr_median = np.median(zcr,axis=1)
feature.extend([zcr_mean[0],zcr_median[0]])

magnitude, phase = librosa.magphase(librosa.stft(y=audio_data))
sro = librosa.feature.spectral_rolloff(S=magnitude,sr=sr)
sro_mean = np.mean(sro,axis=1)
sro_median = np.median(sro,axis=1)
feature.extend([sro_mean[0],sro_median[0]])

pitches,magnitudes = librosa.core.piptrack(y=audio_data,sr=sr)
pitches_row_mean = np.mean(np.mean(pitches,axis=1))
pitches_col_mean = np.mean(np.mean(pitches,axis=0))
pitches_row_median = np.median(np.median(pitches,axis=1))
pitches_col_median = np.median(np.median(pitches,axis=0))
magnitudes_row_mean = np.mean(np.mean(magnitudes,axis=1))
magnitudes_col_mean = np.mean(np.mean(magnitudes,axis=0))
magnitudes_row_median = np.median(np.median(magnitudes,axis=1))
magnitudes_col_median = np.median(np.median(magnitudes,axis=0))
feature.extend([pitches_row_mean, pitches_col_mean, pitches_row_median, pitches_col_median, magnitudes_row_mean, magnitudes_col_mean, magnitudes_row_median, magnitudes_col_median])

c_stft = librosa.feature.chroma_stft(y=audio_data,sr=sr)
c_stft_row_mean = np.mean(np.mean(c_stft,axis=1))
c_stft_col_mean = np.mean(np.mean(c_stft,axis=0))
c_stft_row_median = np.median(np.median(c_stft,axis=1))
c_stft_col_median = np.median(np.median(c_stft,axis=0))
feature.extend([c_stft_row_mean,c_stft_col_mean,c_stft_row_median,c_stft_col_median])

y = librosa.effects.harmonic(audio_data)
tonnetz = librosa.feature.tonnetz(y=audio_data, sr=sr)
tonnetz_row_mean = np.mean(np.mean(tonnetz,axis=1))
tonnetz_col_mean = np.mean(np.mean(tonnetz,axis=0))
tonnetz_row_median = np.median(np.median(tonnetz,axis=1))
tonnetz_col_median = np.median(np.median(tonnetz,axis=0))
feature.extend([tonnetz_row_mean,tonnetz_col_mean,tonnetz_row_median,tonnetz_col_median])

df = pd.read_csv('features.csv')

# Extract features and labels
labels = df.iloc[:, -1].values
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

new_audio_features = feature

new_audio_tensor = torch.tensor(new_audio_features, dtype=torch.float32)

model.eval()
with torch.no_grad():
    new_audio_tensor = new_audio_tensor.unsqueeze(0)  # Add batch dimension
    predictions = model(new_audio_tensor)
    _, predicted_class = torch.max(predictions, 1)

# Convert the predicted class index back to the original label
predicted_label = label_encoder.inverse_transform([predicted_class.item()])  # Pass a list or 1D array

print(f"The model predicts that the audio belongs to class: {predicted_label}")


	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


The model predicts that the audio belongs to class: ['washing machine']


### Extracting Best Features

In [59]:
k_best = 20 # Choose the number of top features you want to select
selector = SelectKBest(score_func=f_classif, k=k_best)
X_new = selector.fit_transform(x, y)
X_new.shape
selected_feature_indices = selector.get_support(indices=True)
print(selected_feature_indices)

[ 0  1  2  4  5  6  7  8 10 11 12 13 14 15 16 20 25 28 34 35]


### Spliting Dataset into Training and Tesing Dataset

In [60]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_new,y,test_size=0.2,random_state=42)

### Feature Scaling

In [61]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### XGBoost Model

In [62]:
# from xgboost import XGBClassifier
# xgb = XGBClassifier(objective="multi:softmax", num_class=len(set(y)), random_state=42)
# xgb.fit(X_train, y_train)

# from sklearn.linear_model import LogisticRegression
# xgb = LogisticRegression()
# xgb.fit(X_train,y_train)

# from sklearn.ensemble import RandomForestClassifier
# xgb = RandomForestClassifier()
# xgb.fit(X_train,y_train)

# from sklearn.svm import SVC
# xgb = SVC(kernel='rbf')
# xgb.fit(X_train,y_train)

# from sklearn.tree import DecisionTreeClassifier
# xgb = DecisionTreeClassifier()
# xgb.fit(X_train,y_train)

# from sklearn.naive_bayes import GaussianNB
# xgb = GaussianNB()
# xgb.fit(X_train,y_train)

from sklearn.naive_bayes import BernoulliNB
xgb = BernoulliNB()
xgb.fit(X_train,y_train)

### Predictions

In [63]:
y_pred_train = xgb.predict(X_train)
y_pred_test = xgb.predict(X_test)

### Accuracy

In [64]:
from sklearn.metrics import accuracy_score
acc_train = accuracy_score(y_train,y_pred_train)
acc_test = accuracy_score(y_test,y_pred_test)
print(round(acc_train*100,2))
print(round(acc_test*100,2))

22.05
18.86
