In [1]:
import os
import json
import librosa
import numpy as np

# Paths to JSON and audio files
json_folder = 'JSON'
audio_folder = 'cleaned_wav_files'

# Match audio and JSON files
json_files = {os.path.splitext(f)[0]: os.path.join(json_folder, f) for f in os.listdir(json_folder) if f.endswith('.json')}
audio_files = {os.path.splitext(f)[0]: os.path.join(audio_folder, f) for f in os.listdir(audio_folder) if f.endswith(('.wav', '.m4a', '.mp3'))}

matched_files = {name: (json_files[name], audio_files[name]) for name in json_files if name in audio_files}

# Function to load annotations from a JSON file
def load_annotations(json_file):
    with open(json_file, 'r') as f:
        data = json.load(f)
    annotations = []
    for item in data[0]['annotations'][0]['result']:
        if item['type'] == 'labels':
            start = item['value']['start']
            end = item['value']['end']
            label = item['value']['labels'][0]
            annotations.append((start, end, label))
    return annotations

# Function to extract audio segments based on annotations
def extract_audio_segments(audio_file, annotations, sr=16000):
    y, _ = librosa.load(audio_file, sr=sr)
    segments = []
    for start, end, label in annotations:
        segment = y[int(start * sr):int(end * sr)]
        segments.append((segment, label))
    return segments

dataset = []
files_loaded = []

for name, (json_path, audio_path) in matched_files.items():
    if len(load_annotations(json_path)) != 0:
        files_loaded.append(json_path)
        annotations = load_annotations(json_path)
        audio_segments = extract_audio_segments(audio_path, annotations)
        dataset.extend(audio_segments)

# Function to extract features (MFCC) from audio segments
def extract_features(segments, n_mfcc=40, max_length=300):
    features, labels = [], []
    for i, (segment, label) in enumerate(segments):
        try:
            # Check if the segment is too short for FFT processing
            if len(segment) < 2048:  # Minimum required length for n_fft=2048
                print(f"Skipping segment {i} due to insufficient length: {len(segment)} samples")
                continue

            # Dynamically adjust n_fft based on signal length
            n_fft = min(2048, len(segment))  # Use the smaller of 2048 or segment length

            # Extract MFCC features with adjusted n_fft
            mfcc = librosa.feature.mfcc(y=segment, sr=16000, n_mfcc=n_mfcc, n_fft=n_fft)

            # Handle variable lengths (pad if short, truncate if long)
            if mfcc.shape[1] < max_length:
                padded_mfcc = np.pad(mfcc, ((0, 0), (0, max_length - mfcc.shape[1])), mode='constant')
            else:
                padded_mfcc = mfcc[:, :max_length]

            # Append features and labels
            features.append(padded_mfcc.T)
            labels.append(0 if label == 'Field pause' else 1)

        except Exception as e:
            print(f"Error processing segment {i}: {e}")

    return np.array(features), np.array(labels)

# Extract features and labels from the dataset
X, y = extract_features(dataset)

Skipping segment 166 due to insufficient length: 0 samples
Skipping segment 167 due to insufficient length: 0 samples
Skipping segment 298 due to insufficient length: 677 samples
Skipping segment 304 due to insufficient length: 0 samples
Skipping segment 401 due to insufficient length: 0 samples
Skipping segment 466 due to insufficient length: 0 samples
Skipping segment 467 due to insufficient length: 0 samples
Skipping segment 468 due to insufficient length: 0 samples
Skipping segment 469 due to insufficient length: 0 samples
Skipping segment 470 due to insufficient length: 0 samples
Skipping segment 471 due to insufficient length: 0 samples
Skipping segment 472 due to insufficient length: 0 samples
Skipping segment 473 due to insufficient length: 0 samples
Skipping segment 474 due to insufficient length: 0 samples
Skipping segment 475 due to insufficient length: 0 samples
Skipping segment 476 due to insufficient length: 0 samples
Skipping segment 477 due to insufficient length: 0 sam

In [2]:
import numpy as np
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler

def balance_dataset(X, y, balance_strategy='smoteenn', random_state=42):
    """
    Balance the dataset using various strategies.
    
    Parameters:
    -----------
    X : numpy array
        Feature array of shape (n_samples, sequence_length, n_features)
    y : numpy array
        Labels array of shape (n_samples,)
    balance_strategy : str
        Strategy to use for balancing. Options: 'smoteenn', 'smote', 'random'
    random_state : int
        Random seed for reproducibility
    
    Returns:
    --------
    X_balanced : numpy array
        Balanced feature array
    y_balanced : numpy array
        Balanced labels array
    """
    # Reshape the 3D array to 2D for SMOTE
    original_shape = X.shape
    X_reshaped = X.reshape(X.shape[0], -1)
    
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_reshaped)
    
    # Apply balancing strategy
    if balance_strategy == 'smoteenn':
        # SMOTEENN combines SMOTE with Edited Nearest Neighbors
        balancer = SMOTEENN(random_state=random_state)
        X_balanced, y_balanced = balancer.fit_resample(X_scaled, y)
    
    elif balance_strategy == 'smote':
        # Only SMOTE for oversampling
        balancer = SMOTE(random_state=random_state)
        X_balanced, y_balanced = balancer.fit_resample(X_scaled, y)
    
    elif balance_strategy == 'random':
        # Random under-sampling of majority class and over-sampling of minority class
        over_sampler = RandomUnderSampler(random_state=random_state)
        X_balanced, y_balanced = over_sampler.fit_resample(X_scaled, y)
    
    else:
        raise ValueError("Invalid balance_strategy. Choose 'smoteenn', 'smote', or 'random'")
    
    # Reshape back to 3D
    X_balanced = X_balanced.reshape(-1, original_shape[1], original_shape[2])
    
    # Inverse transform to get back to original scale
    X_balanced_reshaped = X_balanced.reshape(X_balanced.shape[0], -1)
    X_balanced_scaled = scaler.inverse_transform(X_balanced_reshaped)
    X_balanced = X_balanced_scaled.reshape(X_balanced.shape)
    
    return X_balanced, y_balanced

# Function to display class distribution
def display_class_distribution(y):
    """Display the distribution of classes in the dataset."""
    unique, counts = np.unique(y, return_counts=True)
    for label, count in zip(unique, counts):
        print(f"Class {label}: {count} samples")

# Balance the dataset
print("Original class distribution:")
display_class_distribution(y)

# Try different balancing strategies
strategies = ['smoteenn', 'smote', 'random']
for strategy in strategies:
    print(f"\nBalancing with {strategy}:")
    X_balanced, y_balanced = balance_dataset(X, y, balance_strategy=strategy)
    display_class_distribution(y_balanced)
    print(f"Balanced feature shape: {X_balanced.shape}")

Original class distribution:
Class 0: 21 samples
Class 1: 1292 samples

Balancing with smoteenn:
Class 0: 1292 samples
Class 1: 379 samples
Balanced feature shape: (1671, 300, 40)

Balancing with smote:
Class 0: 1292 samples
Class 1: 1292 samples
Balanced feature shape: (2584, 300, 40)

Balancing with random:
Class 0: 21 samples
Class 1: 21 samples
Balanced feature shape: (42, 300, 40)


In [3]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import tensorflow as tf

def create_baseline_model(input_shape):
    """Create a simple LSTM model for evaluation."""
    model = Sequential([
        LSTM(64, input_shape=input_shape, return_sequences=True),
        LSTM(32),
        Dense(16, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

def evaluate_dataset(X, y, n_splits=5):
    """
    Evaluate dataset quality using cross-validation.
    
    Parameters:
    -----------
    X : numpy array
        Feature array of shape (n_samples, sequence_length, n_features)
    y : numpy array
        Labels array
    n_splits : int
        Number of cross-validation splits
    
    Returns:
    --------
    dict
        Dictionary containing evaluation metrics
    """
    # Initialize metrics storage
    metrics = {
        'accuracy': [],
        'precision': [],
        'recall': [],
        'f1': [],
        'auc_roc': [],
        'class_distribution': []
    }
    
    # Create cross-validation splits
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\nEvaluating fold {fold + 1}/{n_splits}")
        
        # Split data
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        # Create and train model
        model = create_baseline_model((X.shape[1], X.shape[2]))
        
        # Early stopping to prevent overfitting
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )
        
        # Train model
        model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=30,
            batch_size=32,
            callbacks=[early_stopping],
            verbose=0
        )
        
        # Get predictions
        y_pred = (model.predict(X_val) > 0.5).astype(int)
        y_pred_prob = model.predict(X_val)
        
        # Calculate metrics
        metrics['accuracy'].append(accuracy_score(y_val, y_pred))
        metrics['precision'].append(precision_score(y_val, y_pred))
        metrics['recall'].append(recall_score(y_val, y_pred))
        metrics['f1'].append(f1_score(y_val, y_pred))
        metrics['auc_roc'].append(roc_auc_score(y_val, y_pred_prob))
        
        # Calculate class distribution
        unique, counts = np.unique(y_train, return_counts=True)
        metrics['class_distribution'].append(dict(zip(unique, counts)))
    
    return metrics

def compare_datasets(datasets_dict):
    """
    Compare multiple balanced datasets.
    
    Parameters:
    -----------
    datasets_dict : dict
        Dictionary containing datasets with their balancing strategies
        Format: {'strategy_name': (X_balanced, y_balanced)}
    
    Returns:
    --------
    dict
        Comparison results for each strategy
    """
    results = {}
    
    for strategy_name, (X_balanced, y_balanced) in datasets_dict.items():
        print(f"\nEvaluating {strategy_name} strategy:")
        results[strategy_name] = evaluate_dataset(X_balanced, y_balanced)
    
    return results

def print_comparison_results(results):
    """Print formatted comparison results."""
    print("\nDataset Comparison Results:")
    print("-" * 50)
    
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'auc_roc']
    
    for strategy, metrics_dict in results.items():
        print(f"\n{strategy} Strategy:")
        for metric in metrics:
            values = metrics_dict[metric]
            mean_value = np.mean(values)
            std_value = np.std(values)
            print(f"{metric.upper():10}: {mean_value:.4f} (±{std_value:.4f})")

# Create dictionary of balanced datasets
balanced_datasets = {
    'SMOTEENN': balance_dataset(X, y, 'smoteenn'),
    'SMOTE': balance_dataset(X, y, 'smote'),
    'Random': balance_dataset(X, y, 'random')
}

# Compare datasets
results = compare_datasets(balanced_datasets)

# Print results
print_comparison_results(results)


Evaluating SMOTEENN strategy:

Evaluating fold 1/5

Evaluating fold 2/5

Evaluating fold 3/5

Evaluating fold 4/5

Evaluating fold 5/5

Evaluating SMOTE strategy:

Evaluating fold 1/5

Evaluating fold 2/5

Evaluating fold 3/5

Evaluating fold 4/5

Evaluating fold 5/5

Evaluating Random strategy:

Evaluating fold 1/5

Evaluating fold 2/5

Evaluating fold 3/5

Evaluating fold 4/5

Evaluating fold 5/5

Dataset Comparison Results:
--------------------------------------------------

SMOTEENN Strategy:
ACCURACY  : 0.8509 (±0.0508)
PRECISION : 0.9938 (±0.0125)
RECALL    : 0.3480 (±0.2342)
F1        : 0.4773 (±0.2047)
AUC_ROC   : 0.7377 (±0.1274)

SMOTE Strategy:
ACCURACY  : 0.8498 (±0.1669)
PRECISION : 0.9947 (±0.0107)
RECALL    : 0.7052 (±0.3379)
F1        : 0.7720 (±0.2628)
AUC_ROC   : 0.8758 (±0.1493)

Random Strategy:
ACCURACY  : 0.5000 (±0.0865)
PRECISION : 0.4222 (±0.3875)
RECALL    : 0.3300 (±0.3682)
F1        : 0.3031 (±0.2567)
AUC_ROC   : 0.4363 (±0.1245)


  _warn_prf(average, modifier, msg_start, len(result))


### Decision
As the problem requires a better Recall and F1 score thus the more balanced result from SMOTE will be selected. Moreover, SMOTE provides the best AUC_ROC score, indicating the generalisation of the model.

In [4]:
print(f"\nBalancing with smote:")
X_balanced, y_balanced = balance_dataset(X, y, balance_strategy="smote")
display_class_distribution(y_balanced)
print(f"Balanced feature shape: {X_balanced.shape}")


Balancing with smote:
Class 0: 1292 samples
Class 1: 1292 samples
Balanced feature shape: (2584, 300, 40)


### Data Splitting

In [5]:
from sklearn.model_selection import train_test_split

# First, split the data into training and temp (test + validation)
X_train, X_temp, y_train, y_temp = train_test_split(
    X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced
)

# Now, split the temp set equally into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Display the shape of the datasets
print(f"Training set shape: {X_train.shape}, {y_train.shape}")
print(f"Validation set shape: {X_val.shape}, {y_val.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")

# Save to CSV
train_data.to_csv("train.csv", index=False)
val_data.to_csv("val.csv", index=False)
test_data.to_csv("test.csv", index=False)

print("Datasets saved as CSV files successfully!")

Training set shape: (1808, 300, 40), (1808,)
Validation set shape: (388, 300, 40), (388,)
Test set shape: (388, 300, 40), (388,)


## Model Selection:

- Statistical Methods
  - Gaussian Mixture Models (GMM)
  - Hidden Markov Models (HMM)
- Machine Learning Models
  - k-Nearest Neighbors (k-NN)
  - Random Forest (RF) with Recursive Feature Elimination (RFE)
  - Support Vector Machines (SVM)
- Deep Learning Models
  - Multilayer Perceptron (MLP)
  - Artificial Neural Networks (ANN)
  - Convolutional Neural Networks (CNN)
  - Convolutional Neural Networks (CNN) + XGBoost
  - Recurrent Neural Networks (RNNs)
  - LSTM

## Statistical Models Training

### Gaussian Mixture Models (GMM)

In [6]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import classification_report

# Flatten for GMM
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_val_flat = X_val.reshape(X_val.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)

gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
gmm.fit(X_train_flat)

y_pred = gmm.predict(X_test_flat)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.72      0.67       194
           1       0.67      0.58      0.62       194

    accuracy                           0.65       388
   macro avg       0.65      0.65      0.65       388
weighted avg       0.65      0.65      0.65       388



## Machine Learning Models

### k-Nearest Neighbors (k-NN)

In [7]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_flat, y_train)

y_pred = knn.predict(X_test_flat)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      1.00      0.90       194
           1       1.00      0.79      0.88       194

    accuracy                           0.89       388
   macro avg       0.91      0.89      0.89       388
weighted avg       0.91      0.89      0.89       388



### Random Forest with Recursive Feature Elimination (RFE)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from tqdm import tqdm

class RFETqdm(RFE):
    def fit(self, X, y):
        """Wrap RFE’s fit method with a tqdm progress bar."""
        n_features = X.shape[1]
        pbar = tqdm(total=n_features, desc="Feature Selection Progress", unit=" features")

        def callback(estimator, features_to_select):
            pbar.update(n_features - features_to_select)

        self.step = 500  # Speed up by removing 500 features at a time
        self.verbose = 1
        super().fit(X, y)
        pbar.close()

rf = RandomForestClassifier(n_estimators=10, random_state=42)  # Reduce trees for faster RFE
rfe = RFETqdm(estimator=rf, n_features_to_select=20, step=500)
rfe.fit(X_train_flat, y_train)

y_pred = rfe.predict(X_test_flat)
print(classification_report(y_test, y_pred))

Feature Selection Progress:   0%|              | 0/12000 [00:00<?, ? features/s]

Fitting estimator with 12000 features.
Fitting estimator with 11500 features.
Fitting estimator with 11000 features.
Fitting estimator with 10500 features.
Fitting estimator with 10000 features.
Fitting estimator with 9500 features.
Fitting estimator with 9000 features.
Fitting estimator with 8500 features.
Fitting estimator with 8000 features.
Fitting estimator with 7500 features.
Fitting estimator with 7000 features.
Fitting estimator with 6500 features.
Fitting estimator with 6000 features.
Fitting estimator with 5500 features.
Fitting estimator with 5000 features.
Fitting estimator with 4500 features.
Fitting estimator with 4000 features.
Fitting estimator with 3500 features.
Fitting estimator with 3000 features.
Fitting estimator with 2500 features.
Fitting estimator with 2000 features.
Fitting estimator with 1500 features.
Fitting estimator with 1000 features.
Fitting estimator with 500 features.


Feature Selection Progress:   0%|              | 0/12000 [00:11<?, ? features/s]

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       194
           1       1.00      0.98      0.99       194

    accuracy                           0.99       388
   macro avg       0.99      0.99      0.99       388
weighted avg       0.99      0.99      0.99       388






### Support Vector Machines (SVM)

In [9]:
from sklearn.svm import SVC

svm = SVC(kernel='rbf', C=1, gamma='scale')
svm.fit(X_train_flat, y_train)

y_pred = svm.predict(X_test_flat)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       194
           1       1.00      0.95      0.97       194

    accuracy                           0.97       388
   macro avg       0.98      0.97      0.97       388
weighted avg       0.98      0.97      0.97       388



## Deep Learning Models

### Multilayer Perceptron (MLP)

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout

mlp_model = Sequential([
    Flatten(input_shape=(300, 40)),
    
    Dense(64, activation='relu'),
    Dropout(0.3),  # Reduced dropout rate
    
    Dense(1, activation='sigmoid')
])

mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
mlp_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x34aca9e90>

### Artificial Neural Network (ANN)

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout

ann_model = Sequential([
    Flatten(input_shape=(300, 40)),
    
    Dense(128, activation='relu'),  # Reduced from 256 to 128 neurons
    Dropout(0.3),                   # Reduced dropout rate for lighter regularization
    
    Dense(64, activation='relu'),   # Retained this layer but removed one hidden layer
    
    Dense(1, activation='sigmoid')  # Output layer
])

ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
ann_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1575d9590>

### Convolutional Neural Network (CNN)

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

cnn_model = Sequential([
    Conv1D(32, kernel_size=3, activation='relu', input_shape=(300, 40)),  # Reduced filters from 64 to 32
    MaxPooling1D(pool_size=2),
    Dropout(0.2),  # Reduced dropout rate
    
    Flatten(),
    
    Dense(32, activation='relu'),  # Reduced neurons from 64 to 32
    Dropout(0.3),  # Reduced dropout rate
    
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x15774a210>

### CNN + XGBoost

In [13]:
from xgboost import XGBClassifier

cnn_feature_extractor = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(300, 40)),
    MaxPooling1D(pool_size=2),
    Flatten()
])

# Extract CNN features
X_train_features = cnn_feature_extractor.predict(X_train)
X_val_features = cnn_feature_extractor.predict(X_val)
X_test_features = cnn_feature_extractor.predict(X_test)

# Train XGBoost model
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1)
xgb_model.fit(X_train_features, y_train)

y_pred = xgb_model.predict(X_test_features)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       194
           1       1.00      1.00      1.00       194

    accuracy                           1.00       388
   macro avg       1.00      1.00      1.00       388
weighted avg       1.00      1.00      1.00       388



### Recurrent Neural Networks (RNN)

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization

# Improved RNN Model using LSTM
rnn_model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(300, 40)),
    Dropout(0.3),
    BatchNormalization(),
    
    LSTM(64),
    Dropout(0.3),
    BatchNormalization(),
    
    Dense(32, activation='relu'),
    Dropout(0.3),
    
    Dense(1, activation='sigmoid')
])

# Compile with a lower learning rate for better convergence
from tensorflow.keras.optimizers import Adam
optimizer = Adam(learning_rate=0.0005)

rnn_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Fit model with increased epochs for better learning
rnn_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x157de0c90>

### Long Short-Term Memory (LSTM)

In [15]:
from tensorflow.keras.layers import LSTM

lstm_model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(300, 40)),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x34caf9590>

In [16]:
import os

# Create a results directory if it doesn't exist
results_dir = "results"
os.makedirs(results_dir, exist_ok=True)

In [17]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(model_name, y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Non-Pause", "Pause"], yticklabels=["Non-Pause", "Pause"])
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"Confusion Matrix for {model_name}")
    plt.savefig(f"{results_dir}/{model_name}_confusion_matrix.png", dpi=300)
    plt.close()

In [None]:
import os
import numpy as np
import shap
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Create results directory
os.makedirs("results", exist_ok=True)

# Select a model for SHAP and LIME explanations (change based on preference)
selected_model = knn  # Example: Using k-NN
selected_model_name = "knn"

# Choose appropriate input data format for the selected model
if selected_model_name in ["knn", "rfe", "svm", "gmm"]:
    X_test_input = X_test.reshape(X_test.shape[0], -1)  # Flatten data for these models
elif selected_model_name in ["mlp_model", "ann_model", "cnn_model", "rnn_model", "lstm_model"]:
    X_test_input = X_test  # Keep original shape for neural networks
elif selected_model_name == "xgb_model":
    X_test_input = cnn_feature_extractor.predict(X_test)  # Use CNN extracted features for XGBoost

# Generate SHAP values with a safe approach
try:
    explainer = shap.Explainer(selected_model.predict, X_test_input)
    num_features = X_test_input.shape[1]
    max_evals = max(1000, 2 * num_features + 1)  # Ensure max_evals is large enough
    shap_values = explainer(X_test_input[:50], max_evals=max_evals)  # Adjust max_evals dynamically
    plt.figure()
    shap.summary_plot(shap_values, X_test_input[:50], show=False)
    plt.savefig(f"results/shap_summary_{selected_model_name}.png")
    plt.close()
except Exception as e:
    print(f"SHAP computation failed: {e}")

# Generate LIME explanations with error handling
try:
    explainer = lime.lime_tabular.LimeTabularExplainer(X_test_input, mode='classification', feature_names=[f'feature_{i}' for i in range(X_test_input.shape[1])])
    idx = np.random.randint(0, len(X_test_input))
    exp = explainer.explain_instance(X_test_input[idx], selected_model.predict_proba)
    
    # Save LIME explanation as PNG
    fig = exp.as_pyplot_figure()
    fig.savefig(f"results/lime_explanation_{selected_model_name}.png")
    plt.close(fig)
except Exception as e:
    print(f"LIME computation failed: {e}")

# Generate t-SNE visualization safely
try:
    tsne = TSNE(n_components=2, random_state=42, perplexity=30, n_iter=500)
    X_test_tsne = tsne.fit_transform(X_test_input)
    plt.figure(figsize=(8, 6))
    plt.scatter(X_test_tsne[:, 0], X_test_tsne[:, 1], c=y_test, cmap='coolwarm', alpha=0.7)
    plt.colorbar(label="Class Label")
    plt.title("t-SNE Visualization of Test Data")
    plt.savefig("results/tsne_visualization.png")
    plt.close()
except Exception as e:
    print(f"t-SNE computation failed: {e}")

print("SHAP, LIME, and t-SNE visualizations saved in 'results' folder.")

In [20]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(model_name, y_true, y_pred, y_prob=None):
    """Compute and return evaluation metrics for a model."""
    results = {
        "Model": model_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1-Score": f1_score(y_true, y_pred),
        "AUC": roc_auc_score(y_true, y_prob) if y_prob is not None else "N/A"
    }
    return results

# Group Models by Type
statistical_models = {
    "Gaussian Mixture Models": gmm
}

machine_learning_models = {
    "Support Vector Machine": svm,
    "Random Forest (RFE)": rfe,
    "K-Nearest Neighbour": knn,
}

deep_learning_models = {
    "Multilayer Perceptron": mlp_model,
    "Artificial Neural Network": ann_model,
    "Convolutional Neural Network": cnn_model,
    "CNN + XGBoost": xgb_model,
    "Recurrent Neural Network": rnn_model,
    "Long Short-Term Memory": lstm_model
}

# Store results separately for each model type
metrics_table = []

# Evaluate Statistical Models
for model_name, model in statistical_models.items():
    y_pred = model.predict(X_test_flat)
    y_prob = None  # Most statistical models don't provide probability outputs
    metrics_table.append(evaluate_model(f"Statistical | {model_name}", y_test, y_pred, y_prob))

# Evaluate Machine Learning Models
for model_name, model in machine_learning_models.items():
    y_pred = model.predict(X_test_flat)
    y_prob = model.predict_proba(X_test_flat)[:, 1] if hasattr(model, "predict_proba") else None
    metrics_table.append(evaluate_model(f"ML | {model_name}", y_test, y_pred, y_prob))

# Evaluate Deep Learning Models
# Deep Learning Models - Ensure correct input for CNN + XGBoost
for model_name, model in deep_learning_models.items():
    if model_name == "CNN + XGBoost":
        # Extract CNN Features before using XGBoost
        X_test_features = cnn_feature_extractor.predict(X_test)  # Extract CNN features
        y_pred = model.predict(X_test_features)  # Predict using XGBoost
        y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary
        y_prob = y_pred.flatten()
    else:
        # Other DL models work with 3D input
        y_pred = model.predict(X_test)
        y_pred = (y_pred > 0.5).astype(int)
        y_prob = y_pred.flatten()

    metrics_table.append(evaluate_model(f"DL | {model_name}", y_test, y_prob, y_prob))


# Convert to DataFrame and Save
df_metrics = pd.DataFrame(metrics_table)
df_metrics.to_csv(f"{results_dir}/model_performance_metrics.csv", index=False)

# Display metrics table
import ace_tools as tools
tools.display_dataframe_to_user(name="Model Performance Metrics", dataframe=df_metrics)



ModuleNotFoundError: No module named 'ace_tools'

## For KNN, RFE, SVM, GMM (Flattened MFCC Features)

## Traditional Machine Learning Models (SVM, Decision Trees, Random Forest)

In [2]:
import shap

def trad_shap(model, X_test):
    # Assuming you have a trained model and test data
    explainer = shap.Explainer(model, X_test)  
    shap_values = explainer(X_test)
    
    # Visualize global feature importance
    shap.summary_plot(shap_values, X_test, feature_names=[f'MFCC_{i}' for i in range(X_test.shape[1])])

In [3]:
from sklearn.inspection import permutation_importance
# Plot
import matplotlib.pyplot as plt

def trad_permu_imp(model, X_test, y_test):
    result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
    importances = result.importances_mean
    
    plt.barh(range(len(importances)), importances)
    plt.yticks(range(len(importances)), [f'MFCC_{i}' for i in range(X_test.shape[1])])
    plt.xlabel("Feature Importance")
    plt.ylabel("MFCC Features")
    plt.title("Permutation Feature Importance")
    plt.show()

## Deep Learning Models (CNN, ANN, RNN)

In [4]:
import tensorflow as tf
import shap
import numpy as np
import matplotlib.pyplot as plt

def grad_cam(model, X_test):
    # Assuming TensorFlow model
    explainer = shap.Explainer(model, X_test)
    shap_values = explainer(X_test)
    
    # Convert attributions to NumPy for visualization
    attr = np.mean(shap_values.values, axis=0)
    
    # Plot
    plt.barh(range(len(attr)), attr)
    plt.yticks(range(len(attr)), [f'MFCC_{i}' for i in range(len(attr))])
    plt.xlabel("Feature Importance")
    plt.ylabel("MFCC Features")
    plt.title("SHAP Feature Importance for MFCC Features")
    plt.show()


In [None]:
import os
import json
import librosa
import numpy as np
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, LSTM
from tensorflow.keras.utils import to_categorical

# Load JSON and audio data
json_folder = 'JSON'
audio_folder = 'cleaned_wav_files'

def load_annotations(json_file):
    with open(json_file, 'r') as f:
        data = json.load(f)
    annotations = [(item['value']['start'], item['value']['end'], item['value']['labels'][0])
                   for item in data[0]['annotations'][0]['result'] if item['type'] == 'labels']
    return annotations

# Feature Extraction (MFCC)
def extract_features(audio_path):
    y, sr = librosa.load(audio_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    return np.mean(mfcc, axis=1)

# Data Preprocessing
audio_files = {os.path.splitext(f)[0]: os.path.join(audio_folder, f) for f in os.listdir(audio_folder) if f.endswith(('.wav', '.m4a', '.mp3'))}
data, labels = [], []
for name, path in audio_files.items():
    features = extract_features(path)
    label = name.split('_')[0]  # Assuming label is in filename
    data.append(features)
    labels.append(label)

data = np.array(data)
labels = LabelEncoder().fit_transform(labels)
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model Selection
models = {
    "knn": KNeighborsClassifier(n_neighbors=5),
    "svm": SVC(probability=True),
    "rf": RandomForestClassifier(n_estimators=100),
    "xgb": XGBClassifier(),
    "cnn": Sequential([Conv1D(32, 3, activation='relu', input_shape=(13, 1)), Flatten(), Dense(10, activation='softmax')]),
    "lstm": Sequential([LSTM(50, return_sequences=True, input_shape=(13, 1)), Flatten(), Dense(10, activation='softmax')])
}

for name, model in models.items():
    print(f"Training {name}...")
    if name in ["cnn", "lstm"]:
        X_train_nn = X_train.reshape(-1, 13, 1)
        X_test_nn = X_test.reshape(-1, 13, 1)
        model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.fit(X_train_nn, y_train, epochs=10, batch_size=16, verbose=0)
    else:
        model.fit(X_train, y_train)

# Explainability with SHAP
explainer = shap.Explainer(models["rf"], X_train)
shap_values = explainer(X_test[:10])
shap.summary_plot(shap_values, X_test[:10])

plt.show()
