In [1]:
import pandas as pd
import numpy as np
import os
import librosa
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
import sounddevice as sd

# Step 1: Data Collection and Loading
file_path_csv = 'ESC-50-master/ESC-50-master/meta/esc50.csv'
data = pd.read_csv(file_path_csv)

# Filter audio files based on specific categories (only non-speech human sounds)
selected_categories = ['breathing', 'brushing_teeth', 'clapping', 'coughing', 'crying_baby', 
                       'drinking_sipping', 'footsteps', 
                       'laughing', 'sneezing', 'snoring']
filtered_data = data[data['category'].isin(selected_categories)]

# Directory containing audio files
folder_path = 'ESC-50-master/ESC-50-master/audio' 

features = []
labels = []

for index, row in filtered_data.iterrows():
    file_name = row['filename']
    file_path = os.path.join(folder_path, file_name)
    
    if os.path.isfile(file_path):  # Check if file exists
        try:
            audio_data, sr = librosa.load(file_path, sr=None)
            
            # Extracting MFCCs (feature 1)
            mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=20)
            mfccs_flattened = mfccs.flatten()
            
            # Extracting Spectral Centroid (feature 2)
            centroid = librosa.feature.spectral_centroid(y=audio_data, sr=sr)
            
            # Extracting Spectral Bandwidth (feature 3)
            bandwidth = librosa.feature.spectral_bandwidth(y=audio_data, sr=sr)
            
            # Extracting Zero-Crossing Rate (feature 4)
            zcr = librosa.feature.zero_crossing_rate(audio_data)
            
            # Combining features
            combined_features = np.concatenate((mfccs_flattened, centroid.flatten(), 
                                                bandwidth.flatten(), zcr.flatten()))
            
            # Append features and label to the lists
            features.append(combined_features)
            labels.append(row['category'])
            
        except Exception as e:
            print(f"Error processing {file_name}: {e}")
    else:
        print(f"File not found: {file_name}")

# Convert features and labels to NumPy arrays
X = np.array(features)
y = np.array(labels)

# Min-Max scaling on extracted features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split scaled data into train and test sets
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=42)

In [2]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [3]:
from sklearn.metrics import accuracy_score

# Define the range of hyperparameters to search
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto']  
}

# Initialize the SVM classifier
svm_model = SVC(random_state=42)

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters found by GridSearchCV
best_hyperparameters = grid_search.best_params_
print(f"Best hyperparameters: {best_hyperparameters}")

# Use the best hyperparameters to build the model
best_svm_model = SVC(**best_hyperparameters, random_state=42)
best_svm_model.fit(X_train_scaled, y_train)


Best hyperparameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


SVC(C=10, random_state=42)

In [4]:
# Assuming X_test_scaled and y_test are defined similarly
# X_test_scaled = scaler.transform(X_test)
predictions = best_svm_model.predict(X_test_scaled)

# Calculate accuracy using the test set
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy with best hyperparameters: {accuracy}")

Accuracy with best hyperparameters: 0.725


In [5]:
# Create a DataFrame to compare actual and predicted classes
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})

# Display the comparison
print(comparison_df)

              Actual         Predicted
0        crying_baby       crying_baby
1            snoring           snoring
2           laughing          laughing
3        crying_baby    brushing_teeth
4           sneezing          sneezing
5           coughing         footsteps
6        crying_baby       crying_baby
7           sneezing          sneezing
8           coughing          coughing
9           laughing          laughing
10         footsteps         footsteps
11          sneezing          sneezing
12          coughing  drinking_sipping
13    brushing_teeth         breathing
14          clapping          clapping
15          laughing          laughing
16           snoring           snoring
17          laughing       crying_baby
18          clapping          clapping
19          clapping          clapping
20  drinking_sipping           snoring
21  drinking_sipping  drinking_sipping
22         breathing  drinking_sipping
23    brushing_teeth    brushing_teeth
24          clapping     

In [6]:
# Function to extract features from recorded audio
def extract_features(audio_data, sr):
    # Extract MFCCs for recorded audio
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=20)
    mfccs_flattened = mfccs.flatten()
    
    # Extract Spectral Centroid for recorded audio
    centroid = librosa.feature.spectral_centroid(y=audio_data, sr=sr)
    
    # Extract Spectral Bandwidth for recorded audio
    bandwidth = librosa.feature.spectral_bandwidth(y=audio_data, sr=sr)
    
    # Extract Zero-Crossing Rate for recorded audio
    zcr = librosa.feature.zero_crossing_rate(audio_data)
    
    # Combine features for recorded audio
    combined_features = np.concatenate((mfccs_flattened, centroid.flatten(), 
                                        bandwidth.flatten(), zcr.flatten()))
    
    return combined_features

In [7]:
# Function to manually record audio
def manual_record(seconds=5, samplerate=44100):
    print(f"Recording for {seconds} seconds. Press Enter to start recording...")
    input("Press Enter to start recording...")
    
    audio_data = sd.rec(int(seconds * samplerate), samplerate=samplerate, channels=1, dtype='float32')
    sd.wait()
    
    print("Recording completed.")
    return audio_data, samplerate

In [8]:
# Manual recording audio
recorded_audio, sr = manual_record()

Recording for 5 seconds. Press Enter to start recording...
Press Enter to start recording...
Recording completed.


In [9]:
# Extract features from recorded audio
extracted_features = extract_features(np.squeeze(recorded_audio), sr)

In [10]:
# Ensure the number of extracted features matches the expected number for scaling
if len(extracted_features) != len(scaler.data_max_):
    print("Number of features extracted doesn't match the expected number.")
else:
    # Scaling extracted features
    scaled_features = scaler.transform(extracted_features.reshape(1, -1))

    # Perform classification using the scaled features and the trained SVM model
    prediction = best_svm_model.predict(scaled_features)
    print(f"Predicted class with SVM: {prediction}")

Predicted class with SVM: ['crying_baby']
