In [1]:
import pandas as pd
import numpy as np
import os
import librosa
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
import sounddevice as sd

In [2]:
# Step 1: Data Collection and Loading
file_path_csv = 'ESC-50-master/ESC-50-master/meta/esc50.csv'
data = pd.read_csv(file_path_csv)

In [3]:
# Filter audio files based on specific categories (only non-speech human sounds)
selected_categories = ['breathing', 'brushing_teeth', 'clapping', 'coughing', 'crying_baby', 
                       'drinking_sipping', 'footsteps', 
                       'laughing', 'sneezing', 'snoring']
filtered_data = data[data['category'].isin(selected_categories)]

In [4]:
# Directory containing audio files
folder_path = 'ESC-50-master/ESC-50-master/audio' 

In [5]:
features = []
labels = []

In [6]:
for index, row in filtered_data.iterrows():
    file_name = row['filename']
    file_path = os.path.join(folder_path, file_name)
    
    if os.path.isfile(file_path):  # Check if file exists
        try:
            audio_data, sr = librosa.load(file_path, sr=None)
            
            # Extracting MFCCs (feature 1)
            mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=20)
            mfccs_flattened = mfccs.flatten()
            
            # Extracting Spectral Centroid (feature 2)
            centroid = librosa.feature.spectral_centroid(y=audio_data, sr=sr)
            
            # Extracting Spectral Bandwidth (feature 3)
            bandwidth = librosa.feature.spectral_bandwidth(y=audio_data, sr=sr)
            
            # Extracting Zero-Crossing Rate (feature 4)
            zcr = librosa.feature.zero_crossing_rate(audio_data)
            
            # Combining features
            combined_features = np.concatenate((mfccs_flattened, centroid.flatten(), 
                                                bandwidth.flatten(), zcr.flatten()))
            
            # Append features and label to the lists
            features.append(combined_features)
            labels.append(row['category'])
            
        except Exception as e:
            print(f"Error processing {file_name}: {e}")
    else:
        print(f"File not found: {file_name}")

In [7]:
# Convert features and labels to NumPy arrays
X = np.array(features)
y = np.array(labels)

In [8]:
# Min-Max scaling on extracted features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# Split scaled data into train and test sets
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=42)

In [18]:
# List of possible values for n_estimators (number of trees)
possible_estimators = [50, 100, 150, 200, 250, 300, 350]

# Dictionary to store accuracy for each n_estimators (used  to determine the best number of estimators (trees))
rf_accuracy_dict = {}

for estimator in possible_estimators:
    # Initialize and train the Random Forest Classifier with a specific number of estimators
    rf_model = RandomForestClassifier(n_estimators=estimator)
    rf_model.fit(X_train_scaled, y_train)

    # Predict using the trained model
    rf_predictions = rf_model.predict(X_test_scaled)

    # Evaluate model performance
    rf_accuracy = accuracy_score(y_test, rf_predictions)
    rf_accuracy_dict[estimator] = rf_accuracy
    print(f"Accuracy with {estimator} estimators: {rf_accuracy}")

Accuracy with 50 estimators: 0.625
Accuracy with 100 estimators: 0.65
Accuracy with 150 estimators: 0.725
Accuracy with 200 estimators: 0.675
Accuracy with 250 estimators: 0.625
Accuracy with 300 estimators: 0.65
Accuracy with 350 estimators: 0.675


In [19]:
# Choose the number of estimators with the best accuracy
best_estimators = max(rf_accuracy_dict, key=rf_accuracy_dict.get)  # Choose the one with the highest accuracy
print(f"Best number of estimators: {best_estimators}")
best_accuracy = rf_accuracy_dict[best_estimators]
print(f"Accuracy with {best_estimators} estimators: {best_accuracy}")

# Fix the number of estimators with the best accuracy in the model for future use
rf_model = RandomForestClassifier(n_estimators=best_estimators)
rf_model.fit(X_train_scaled, y_train)

Best number of estimators: 150
Accuracy with 150 estimators: 0.725


RandomForestClassifier(n_estimators=150)

In [20]:
rf_predictions = rf_model.predict(X_test_scaled)

In [21]:
# Create a DataFrame to compare actual and predicted classes
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': rf_predictions})

print(comparison_df)

              Actual         Predicted
0        crying_baby       crying_baby
1            snoring           snoring
2           laughing          laughing
3        crying_baby    brushing_teeth
4           sneezing          sneezing
5           coughing         footsteps
6        crying_baby       crying_baby
7           sneezing          sneezing
8           coughing          coughing
9           laughing          laughing
10         footsteps         footsteps
11          sneezing          sneezing
12          coughing         footsteps
13    brushing_teeth          sneezing
14          clapping          clapping
15          laughing          laughing
16           snoring           snoring
17          laughing          laughing
18          clapping          clapping
19          clapping          clapping
20  drinking_sipping           snoring
21  drinking_sipping         footsteps
22         breathing           snoring
23    brushing_teeth    brushing_teeth
24          clapping     

In [13]:
# Function to extract features from recorded audio
def extract_features(audio_data, sr):
    # Extract MFCCs for recorded audio
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=20)
    mfccs_flattened = mfccs.flatten()
    
    # Extract Spectral Centroid for recorded audio
    centroid = librosa.feature.spectral_centroid(y=audio_data, sr=sr)
    
    # Extract Spectral Bandwidth for recorded audio
    bandwidth = librosa.feature.spectral_bandwidth(y=audio_data, sr=sr)
    
    # Extract Zero-Crossing Rate for recorded audio
    zcr = librosa.feature.zero_crossing_rate(audio_data)
    
    # Combine features for recorded audio
    combined_features = np.concatenate((mfccs_flattened, centroid.flatten(), 
                                        bandwidth.flatten(), zcr.flatten()))
    
    return combined_features

In [14]:
# Function to manually record audio
def manual_record(seconds=5, samplerate=44100):
    print(f"Recording for {seconds} seconds. Press Enter to start recording...")
    input("Press Enter to start recording...")
    
    audio_data = sd.rec(int(seconds * samplerate), samplerate=samplerate, channels=1, dtype='float32')
    sd.wait()
    
    print("Recording completed.")
    return audio_data, samplerate

In [15]:
# Manual recording audio
recorded_audio, sr = manual_record()

Recording for 5 seconds. Press Enter to start recording...
Press Enter to start recording...
Recording completed.


In [23]:
# Extract features from recorded audio
extracted_features = extract_features(np.squeeze(recorded_audio), sr)

In [24]:
# Ensure the number of extracted features matches the expected number for scaling
if len(extracted_features) != len(scaler.data_max_):
    print("Number of features extracted doesn't match the expected number.")
else:
    # Scaling extracted features
    scaled_features = scaler.transform(extracted_features.reshape(1, -1))

    # Perform classification using the scaled features and the trained Random Forest model
    rf_prediction = rf_model.predict(scaled_features)
    print(f"Predicted class with Random Forest: {rf_prediction}")

Predicted class with Random Forest: ['drinking_sipping']
