In [1]:
import pandas as pd
import numpy as np
import os
import librosa
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Step 1: Data Collection and Loading
file_path_csv = 'ESC-50-master/ESC-50-master/meta/esc50.csv'
data = pd.read_csv(file_path_csv)

In [3]:
# Filter audio files based on specific categories (only non-speech human sounds)
selected_categories = ['breathing', 'brushing_teeth', 'clapping', 'coughing', 'crying_baby', 
                       'drinking_sipping', 'footsteps', 
                       'laughing', 'sneezing', 'snoring']
filtered_data = data[data['category'].isin(selected_categories)]

In [4]:
# Directory containing audio files
folder_path = 'ESC-50-master/ESC-50-master/audio'

In [5]:
features = []
labels = []

In [6]:
for index, row in filtered_data.iterrows():
    file_name = row['filename']
    file_path = os.path.join(folder_path, file_name)
    
    if os.path.isfile(file_path):  # Check if file exists
        try:
            audio_data, sr = librosa.load(file_path, sr=None)
            
            # Extracting MFCCs (feature 1)
            mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=20)
            mfccs_flattened = mfccs.flatten()
            
            # Extracting Spectral Centroid (feature 2)
            centroid = librosa.feature.spectral_centroid(y=audio_data, sr=sr)
            
            # Extracting Spectral Bandwidth (feature 3)
            bandwidth = librosa.feature.spectral_bandwidth(y=audio_data, sr=sr)
            
            # Extracting Zero-Crossing Rate (feature 4)
            zcr = librosa.feature.zero_crossing_rate(audio_data)
            
            # Combining features
            combined_features = np.concatenate((mfccs_flattened, centroid.flatten(), 
                                                bandwidth.flatten(), zcr.flatten()))
            
            # Append features and label to the lists
            features.append(combined_features)
            labels.append(row['category'])
            
        except Exception as e:
            print(f"Error processing {file_name}: {e}")
    else:
        print(f"File not found: {file_name}")

In [7]:
# Convert features and labels to NumPy arrays
X = np.array(features)
y = np.array(labels)

In [8]:
# Min-Max scaling on extracted features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# Split scaled data into train and test sets
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)

In [10]:
# List of possible values for n_neighbors
possible_neighbors = [1, 3, 5, 7, 9]

In [11]:
# Dictionary to store accuracy for each n_neighbors (used  to determine the best number of neighbors (k))
accuracy_dict = {}

In [12]:
for neighbors in possible_neighbors:
    # Initialize and train the KNN model with a specific number of neighbors
    knn_model = KNeighborsClassifier(n_neighbors=neighbors)
    knn_model.fit(X_train_scaled, y_train)

    # Predict using the trained model
    predictions = knn_model.predict(X_test_scaled)

    # Evaluate model performance
    accuracy = accuracy_score(y_test, predictions)
    accuracy_dict[neighbors] = accuracy
    print(f"Accuracy with {neighbors} neighbors: {accuracy}")

Accuracy with 1 neighbors: 0.51
Accuracy with 3 neighbors: 0.52
Accuracy with 5 neighbors: 0.46
Accuracy with 7 neighbors: 0.47
Accuracy with 9 neighbors: 0.45


In [23]:
# Choose the number of neighbors with the best accuracy
best_neighbors = max(accuracy_dict, key=accuracy_dict.get)
print(f"Best number of neighbors: {best_neighbors}")
best_accuracy = accuracy_dict[best_neighbors]
print(f"Accuracy with {best_neighbors} neighbors: {best_accuracy}")

# Fix the number of neighbors with the best accuracy in the model for future use
knn_model = KNeighborsClassifier(n_neighbors=best_neighbors)
knn_model.fit(X_train_scaled, y_train)

Best number of neighbors: 3
Accuracy with 3 neighbors: 0.52


KNeighborsClassifier(n_neighbors=3)

In [21]:
predictions = knn_model.predict(X_test_scaled)

# Create a DataFrame to compare actual and predicted classes
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})

print(comparison_df)

              Actual         Predicted
0        crying_baby          clapping
1            snoring           snoring
2           laughing          coughing
3        crying_baby         footsteps
4           sneezing          sneezing
..               ...               ...
95  drinking_sipping         footsteps
96           snoring           snoring
97  drinking_sipping          sneezing
98         footsteps         footsteps
99  drinking_sipping  drinking_sipping

[100 rows x 2 columns]


In [14]:
import sounddevice as sd

In [15]:
# Function to extract features from recorded audio
def extract_features(audio_data, sr):
    # Extract MFCCs for recorded audio
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=20)
    mfccs_flattened = mfccs.flatten()
    
    # Extract Spectral Centroid for recorded audio
    centroid = librosa.feature.spectral_centroid(y=audio_data, sr=sr)
    
    # Extract Spectral Bandwidth for recorded audio
    bandwidth = librosa.feature.spectral_bandwidth(y=audio_data, sr=sr)
    
    # Extract Zero-Crossing Rate for recorded audio
    zcr = librosa.feature.zero_crossing_rate(audio_data)
    
    # Combine features for recorded audio
    combined_features = np.concatenate((mfccs_flattened, centroid.flatten(), 
                                        bandwidth.flatten(), zcr.flatten()))
    
    return combined_features

In [16]:
# Function to manually record audio
def manual_record(seconds=5, samplerate=44100):
    print(f"Recording for {seconds} seconds. Press Enter to start recording...")
    input("Press Enter to start recording...")
    
    audio_data = sd.rec(int(seconds * samplerate), samplerate=samplerate, channels=1, dtype='float32')
    sd.wait()
    
    print("Recording completed.")
    return audio_data, samplerate

In [18]:
# Manual recording audio
recorded_audio, sr = manual_record()

Recording for 5 seconds. Press Enter to start recording...
Press Enter to start recording...
Recording completed.


In [19]:
# Extract features from recorded audio
extracted_features = extract_features(np.squeeze(recorded_audio), sr)

In [20]:
# Ensure the number of extracted features matches the expected number for scaling
if len(extracted_features) != len(scaler.data_max_):
    print("Number of features extracted doesn't match the expected number.")
else:
    # Scaling extracted features
    scaled_features = scaler.transform(extracted_features.reshape(1, -1))

    # Perform classification using the scaled features and the trained model
    prediction = knn_model.predict(scaled_features)
    print(f"Predicted class: {prediction}")

Predicted class: ['clapping']
