In [6]:
import os
import librosa
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

# Extract features from audio
def extract_features(audio_path, n_mfcc=13):
    """Build a fixed-length feature vector for one audio file.

    The vector is the time-averaged MFCCs, followed by the mean signal
    energy and a mean pitch estimate taken from the stronger half of the
    ``librosa.piptrack`` bins.

    Args:
        audio_path: Path of the audio file, loaded at its native sample rate.
        n_mfcc: Number of MFCC coefficients to compute (default 13).

    Returns:
        1-D ``np.ndarray`` of length ``n_mfcc + 2``:
        ``[mfcc_0 .. mfcc_{n_mfcc-1}, energy, pitch]``.

    Raises:
        ValueError: If the file contains no audio samples.
    """
    audio, sr = librosa.load(audio_path, sr=None)
    if len(audio) == 0:
        # Without this guard the energy term below divides by zero.
        raise ValueError(f"No audio samples in {audio_path!r}")

    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc).mean(axis=1)
    energy = np.sum(audio ** 2) / len(audio)

    pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
    # Keep only pitch candidates whose magnitude is above the median.
    strong_pitches = pitches[magnitudes > np.median(magnitudes)]
    # Guard on the selection itself: it can be empty even when some
    # magnitudes are non-zero (e.g. all equal), and the mean of an empty
    # array is NaN, which would poison the feature vector.
    pitch = strong_pitches.mean() if strong_pitches.size else 0.0

    return np.hstack([mfccs, energy, pitch])

# Prepare the dataset
def prepare_dataset(dataset_path):
    """Load every WAV file under ``dataset_path`` into feature/label arrays.

    Each immediate sub-directory of ``dataset_path`` is treated as one class;
    its name becomes the label of every ``.wav`` file inside it. Files that
    fail feature extraction are reported and skipped.

    Args:
        dataset_path: Directory containing one sub-directory per class.

    Returns:
        Tuple ``(features, labels)`` of ``np.ndarray``: one feature row per
        successfully processed file and the matching class names.
    """
    features = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and any seeded split done on it later) reproducible.
    for label in sorted(os.listdir(dataset_path)):
        class_path = os.path.join(dataset_path, label)
        if not os.path.isdir(class_path):
            continue

        print(f"Processing {label}...")
        for file in tqdm(sorted(os.listdir(class_path))):
            # Case-insensitive so that e.g. "clip.WAV" is not silently dropped.
            if not file.lower().endswith('.wav'):
                continue

            file_path = os.path.join(class_path, file)
            try:
                feature = extract_features(file_path)
            except Exception as e:
                # Best effort: one unreadable file must not abort the run.
                print(f"Error processing {file}: {e}")
                continue

            features.append(feature)
            labels.append(label)

    return np.array(features), np.array(labels)

# Example Usage
# Root folder holding one sub-directory per class label.
# Update this path to point at your own copy of the dataset.
dataset_path = r"F:\Cap_final\InterviewLens\backend\dataset"
X, y = prepare_dataset(dataset_path)

print("Dataset loaded.")
print(f"Features shape: {X.shape}")
print(f"Labels shape: {y.shape}")


Processing nervous...


100%|██████████| 5/5 [00:00<00:00, 13.31it/s]


Processing normal...


100%|██████████| 5/5 [00:00<00:00, 35.09it/s]

Dataset loaded.
Features shape: (8, 15)
Labels shape: (8,)





In [7]:
from sklearn.preprocessing import LabelEncoder

# Map each string class name to an integer id; the fitted encoder is kept
# so predictions can be decoded back to names later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit(y).transform(y)

print("Encoded Labels:", y_encoded)


Encoded Labels: [0 0 0 0 1 1 1 1]


In [8]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets.
# stratify keeps the class proportions the same in both splits; without it a
# very small test set can end up containing a single class, which makes the
# per-class evaluation meaningless (or fail outright).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


Training data shape: (6, 15)
Testing data shape: (2, 15)


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create and train a RandomForest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = model.predict(X_test)

# LabelEncoder assigns ids 0..n_classes-1 in the order of classes_, so this
# lists every known class even if the test split happens to miss one.
class_ids = list(range(len(label_encoder.classes_)))

# Print accuracy and classification report
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,  # avoids a ValueError when a class is absent from y_test/y_pred
        target_names=label_encoder.classes_,
        zero_division=0,  # report 0 instead of warning for classes with no samples
    )
)


Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

     nervous       1.00      1.00      1.00         1
      normal       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [12]:
import joblib

# Persist the trained classifier first, then the label encoder that is
# needed to decode its integer predictions back into class names.
for artifact, target_file in (
    (model, "speech_fumble_detector.pkl"),
    (label_encoder, "label_encoder.pkl"),
):
    joblib.dump(artifact, target_file)

print("Model and label encoder saved.")


Model and label encoder saved.


In [13]:
import joblib

# Restore the persisted classifier and its label encoder from disk.
# joblib.load unpickles the file, so only load artifacts you produced yourself.
model, label_encoder = (
    joblib.load(artifact_file)
    for artifact_file in ("speech_fumble_detector.pkl", "label_encoder.pkl")
)

def detect_fumble(filename="real_time_audio.wav"):
    """Record one audio clip, classify it, and print the predicted class.

    Uses the module-level ``model`` and ``label_encoder`` for prediction and
    decoding.

    Args:
        filename: Path the recording is written to and then read back from
            for feature extraction. Defaults to ``"real_time_audio.wav"``.

    Returns:
        None. The predicted class name is printed. A ``KeyboardInterrupt``
        during recording or prediction is caught and reported instead of
        propagating.
    """
    try:
        # NOTE(review): record_with_improved_vad is not defined in the visible
        # cells; the original comment said it records with a 4 second silence
        # timeout — confirm against its definition.
        record_with_improved_vad(filename)
        # The model expects a 2-D array: one row per sample.
        features = extract_features(filename).reshape(1, -1)
        prediction = model.predict(features)
        # Decode the integer class id back into its class name.
        predicted_label = label_encoder.inverse_transform(prediction)
        print(f"Predicted Class: {predicted_label[0]}")
    except KeyboardInterrupt:
        print("Real-time detection stopped.")
