<a href="https://colab.research.google.com/github/rjshrd/dementia_audio_classifier/blob/main/binary_audio_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install audiomentations

Collecting audiomentations
  Downloading audiomentations-0.37.0-py3-none-any.whl.metadata (11 kB)
Collecting numpy-minmax<1,>=0.3.0 (from audiomentations)
  Downloading numpy_minmax-0.3.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting numpy-rms<1,>=0.4.2 (from audiomentations)
  Downloading numpy_rms-0.4.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Collecting scipy<1.13,>=1.4 (from audiomentations)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading audiomentations-0.37.0-py3-none-any.whl (80 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.5/80.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy_minmax-0.3.1-cp310-c

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import librosa
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import warnings
import os

warnings.filterwarnings('ignore')

def extract_features(file_path, max_pad_len=128):
    try:
        # Check if file exists
        if not os.path.exists(file_path):
            print(f"File does not exist: {file_path}")
            return None

        # Load audio file with librosa
        audio, sample_rate = librosa.load(file_path, sr=None, mono=True, res_type='kaiser_fast', duration=5)

        # Verify audio has been loaded
        if audio is None or len(audio) == 0:
            print(f"Unable to load audio from file: {file_path}")
            return None

        # Extract mel-spectrogram
        spectrogram = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=128, fmax=sample_rate/2)
        spectrogram = librosa.power_to_db(spectrogram, ref=np.max)

        # Pad or truncate spectrogram to max_pad_len
        if spectrogram.shape[1] < max_pad_len:
            pad_width = max_pad_len - spectrogram.shape[1]
            spectrogram = np.pad(spectrogram, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            spectrogram = spectrogram[:, :max_pad_len]

        # Expand dimensions to match expected input format for CNN (num_mels, time_steps, 1)
        spectrogram = np.expand_dims(spectrogram, axis=-1)

        return spectrogram

    except Exception as e:
        print(f"Error encountered while parsing file: {file_path}")
        print(f"Error details: {str(e)}")
        return None

def load_data(csv_path):
    df = pd.read_csv(csv_path, sep='\t')
    print(f"Loaded dataframe shape: {df.shape}")
    print(df.columns)  # Print column names
    print(df.head())   # Print first few rows
    return df

def preprocess_data(df):
    features = []
    for index, row in df.iterrows():
        file_path = row['path']
        class_label = row['label']
        data = extract_features(file_path)
        if data is not None:
            features.append([data, class_label])
        else:
            print(f"Skipping file due to extraction error: {file_path}")

    print(f"Processed {len(features)} files successfully out of {len(df)} total files")
    return features

def prepare_dataset(features):
    if not features:
        raise ValueError("No features were successfully extracted from the dataset.")

    X = np.array([feature[0] for feature in features])
    y = np.array([feature[1] for feature in features])

    le = LabelEncoder()
    y = le.fit_transform(y)

    print(f"Prepared dataset shapes: X: {X.shape}, y: {y.shape}")
    return X, y


def build_model(input_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

def train_model(model, X_train, y_train, X_val, y_val, epochs=10, batch_size=16):
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                        validation_data=(X_val, y_val))
    return history

def evaluate_model(model, X_test, y_test):
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
    print(f'\nTest accuracy: {test_acc}')
    return test_acc

def main():
    # Load data
    train_df = load_data('/content/drive/MyDrive/type3/data/train_dm.csv')
    val_df = load_data('/content/drive/MyDrive/type3/data/valid_dm.csv')

    # Preprocess data
    train_features = preprocess_data(train_df)
    val_features = preprocess_data(val_df)

    # Prepare datasets
    try:
        X_train, y_train = prepare_dataset(train_features)
        X_val, y_val = prepare_dataset(val_features)
    except ValueError as e:
        print(f"Error preparing dataset: {str(e)}")
        return

    # Build and train the model
    input_shape = (X_train.shape[1], X_train.shape[2], 1)  # (num_mels, time_steps, 1)
    print(f"Input shape: {input_shape}")

    model = build_model(input_shape)
    history = train_model(model, X_train, y_train, X_val, y_val)

    # Evaluate the model
    val_accuracy = evaluate_model(model, X_val, y_val)

    # Save the model
    model.save('audio_classifier_model.h5')

    print(f'Model training completed. Validation accuracy: {val_accuracy}')

if __name__ == "__main__":
    main()

Loaded dataframe shape: (186, 3)
Index(['file', 'label', 'path'], dtype='object')
              file     label  \
0  TrevorPeacock_5  dementia   
1  RonaldReagan_10  dementia   
2   RonaldReagan_5  dementia   
3     AbeBurrows_5  dementia   
4      PeterMax_15  dementia   

                                                path  
0  /content/drive/My Drive/type3/data/dementia/Tr...  
1  /content/drive/My Drive/type3/data/dementia/Ro...  
2  /content/drive/My Drive/type3/data/dementia/Ro...  
3  /content/drive/My Drive/type3/data/dementia/Ab...  
4  /content/drive/My Drive/type3/data/dementia/Pe...  
Loaded dataframe shape: (38, 3)
Index(['file', 'label', 'path'], dtype='object')
               file     label  \
0   EstelleGetty_15  dementia   
1      CaseyKasem_5  dementia   
2     CaseyKasem_15  dementia   
3  JimmyFratianno_0  dementia   
4         bbking_10  dementia   

                                                path  
0  /content/drive/My Drive/type3/data/dementia/Es...  
1  /c




Test accuracy: 0.5263158082962036
Model training completed. Validation accuracy: 0.5263158082962036


In [None]:
import pandas as pd
import numpy as np
import librosa
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import warnings
import os
from audiomentations import Compose, AddBackgroundNoise, Shift, PitchShift, TimeStretch
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import torch

warnings.filterwarnings('ignore')

BACKGROUND_NOISE_PATH = '/content/drive/MyDrive/type3/data/'  # for audio augmentation

# Data augmentation setup
augment = Compose([
    AddBackgroundNoise(sounds_path=BACKGROUND_NOISE_PATH, p=0.5),
    Shift(p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.2, p=0.5)
])

def extract_features(file_path, max_pad_len=128):
    try:
        # Check if file exists
        if not os.path.exists(file_path):
            print(f"File does not exist: {file_path}")
            return None

        # Load audio file with librosa
        audio, sample_rate = librosa.load(file_path, sr=None, mono=True, res_type='kaiser_fast', duration=5)

        # Verify audio has been loaded
        if audio is None or len(audio) == 0:
            print(f"Unable to load audio from file: {file_path}")
            return None

        # Extract mel-spectrogram
        spectrogram = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=128, fmax=sample_rate/2)
        spectrogram = librosa.power_to_db(spectrogram, ref=np.max)

        # Pad or truncate spectrogram to max_pad_len
        if spectrogram.shape[1] < max_pad_len:
            pad_width = max_pad_len - spectrogram.shape[1]
            spectrogram = np.pad(spectrogram, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            spectrogram = spectrogram[:, :max_pad_len]

        # Expand dimensions to match expected input format for CNN (num_mels, time_steps, 1)
        spectrogram = np.expand_dims(spectrogram, axis=-1)

        return spectrogram

    except Exception as e:
        print(f"Error encountered while parsing file: {file_path}")
        print(f"Error details: {str(e)}")
        return None


def load_data(csv_path):
    df = pd.read_csv(csv_path, sep='\t')
    print(f"Loaded dataframe shape: {df.shape}")
    print(df.columns)  # Print column names
    print(df.head())   # Print first few rows
    return df

def preprocess_data(df):
    features = []
    for index, row in df.iterrows():
        file_path = row['path']
        class_label = row['label']
        data = extract_features(file_path)
        if data is not None:
            features.append([data, class_label])
        else:
            print(f"Skipping file due to extraction error: {file_path}")

    print(f"Processed {len(features)} files successfully out of {len(df)} total files")
    return features

def prepare_dataset(features):
    if not features:
        raise ValueError("No features were successfully extracted from the dataset.")

    X = np.array([feature[0] for feature in features])
    y = np.array([feature[1] for feature in features])

    le = LabelEncoder()
    y = le.fit_transform(y)

    print(f"Prepared dataset shapes: X: {X.shape}, y: {y.shape}")
    return X, y

def build_model(input_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Dropout(0.3),

        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Dropout(0.3),

        tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Dropout(0.3),

        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

def train_model(model, X_train, y_train, X_val, y_val, epochs=20, batch_size=8):
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                        validation_data=(X_val, y_val))
    return history

def evaluate_model(model, X_test, y_test):
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
    print(f'\nTest accuracy: {test_acc}')
    return test_acc

def main():
    # Load data
    train_df = load_data('/content/drive/MyDrive/type3/data/train_dm.csv')
    val_df = load_data('/content/drive/MyDrive/type3/data/valid_dm.csv')

    # Preprocess data
    train_features = preprocess_data(train_df)
    val_features = preprocess_data(val_df)

    # Prepare datasets
    try:
        X_train, y_train = prepare_dataset(train_features)
        X_val, y_val = prepare_dataset(val_features)
    except ValueError as e:
        print(f"Error preparing dataset: {str(e)}")
        return

    # Build and train the model
    input_shape = (X_train.shape[1], X_train.shape[2], 1)  # (num_mels, time_steps, 1)
    print(f"Input shape: {input_shape}")

    model = build_model(input_shape)
    history = train_model(model, X_train, y_train, X_val, y_val)

    # Evaluate the model
    val_accuracy = evaluate_model(model, X_val, y_val)

    # Save the model
    model.save('audio_classifier_model.h5')

    print(f'Model training completed. Validation accuracy: {val_accuracy}')

if __name__ == "__main__":
    main()

Loaded dataframe shape: (186, 3)
Index(['file', 'label', 'path'], dtype='object')
              file     label  \
0  TrevorPeacock_5  dementia   
1  RonaldReagan_10  dementia   
2   RonaldReagan_5  dementia   
3     AbeBurrows_5  dementia   
4      PeterMax_15  dementia   

                                                path  
0  /content/drive/My Drive/type3/data/dementia/Tr...  
1  /content/drive/My Drive/type3/data/dementia/Ro...  
2  /content/drive/My Drive/type3/data/dementia/Ro...  
3  /content/drive/My Drive/type3/data/dementia/Ab...  
4  /content/drive/My Drive/type3/data/dementia/Pe...  
Loaded dataframe shape: (38, 3)
Index(['file', 'label', 'path'], dtype='object')
               file     label  \
0   EstelleGetty_15  dementia   
1      CaseyKasem_5  dementia   
2     CaseyKasem_15  dementia   
3  JimmyFratianno_0  dementia   
4         bbking_10  dementia   

                                                path  
0  /content/drive/My Drive/type3/data/dementia/Es...  
1  /c




Test accuracy: 0.5
Model training completed. Validation accuracy: 0.5


In [4]:
import pandas as pd
import numpy as np
import librosa
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import warnings
import os
from audiomentations import Compose, AddBackgroundNoise, Shift, PitchShift, TimeStretch
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import torch

warnings.filterwarnings('ignore')

BACKGROUND_NOISE_PATH = '/content/drive/MyDrive/type3/data/'  # Added for audio augmentation

# Data augmentation setup
augment = Compose([
    AddBackgroundNoise(sounds_path=BACKGROUND_NOISE_PATH, p=0.5),
    Shift(p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.2, p=0.5)
])

# Load Wav2Vec2 model and processor
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)


def extract_features(file_path, max_pad_len=128):
    try:
        if not os.path.exists(file_path):
            print(f"File does not exist: {file_path}")
            return None

        audio, sample_rate = librosa.load(file_path, sr=None, mono=True, res_type='kaiser_fast', duration=5)

        if audio is None or len(audio) == 0:
            print(f"Unable to load audio from file: {file_path}")
            return None

        spectrogram = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=128, fmax=sample_rate/2)
        spectrogram = librosa.power_to_db(spectrogram, ref=np.max)

        # Padding or trimming
        if spectrogram.shape[1] < max_pad_len:
            pad_width = max_pad_len - spectrogram.shape[1]
            spectrogram = np.pad(spectrogram, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            spectrogram = spectrogram[:, :max_pad_len]

        # Flatten the spectrogram for the model
        spectrogram = spectrogram.flatten()  # Change this line

        # Return as a dictionary
        return {'input_values': torch.tensor(spectrogram, dtype=torch.float32)}

    except Exception as e:
        print(f"Error encountered while parsing file: {file_path}")
        print(f"Error details: {str(e)}")
        return None

def load_data(csv_path):
    df = pd.read_csv(csv_path, sep='\t')
    print(f"Loaded dataframe shape: {df.shape}")
    print(df.columns)  # Print column names
    print(df.head())   # Print first few rows
    return df


def preprocess_data(df):
    features = []
    for index, row in df.iterrows():
        file_path = row['path']
        class_label = row['label']
        data = extract_features(file_path)
        if data is not None:
            features.append([data, class_label])
        else:
            print(f"Skipping file due to extraction error: {file_path}")

    print(f"Processed {len(features)} files successfully out of {len(df)} total files")
    return features


def prepare_dataset(features):
    if not features:
        raise ValueError("No features were successfully extracted from the dataset.")

    X = np.array([feature[0] for feature in features])
    y = np.array([feature[1] for feature in features])

    le = LabelEncoder()
    y = le.fit_transform(y)

    print(f"Prepared dataset shapes: X: {X.shape}, y: {y.shape}")
    return X, y


def train_model(model, X_train, y_train, epochs=10, batch_size=32):
    # Concatenate the input tensors
    train_inputs = torch.cat([item['input_values'].unsqueeze(0) for item in X_train])
    train_labels = torch.tensor(y_train)

    # Create a DataLoader
    train_dataset = torch.utils.data.TensorDataset(train_inputs, train_labels)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Train the model
    for epoch in range(epochs):
        for batch in train_loader:
            inputs, labels = batch
            # Ensure the input is of shape [batch_size, sequence_length]
            inputs = inputs.squeeze(1)
            outputs = model(inputs).logits
            loss = torch.nn.functional.cross_entropy(outputs, labels)
            loss.backward()

    return model


def evaluate_model(model, X_test, y_test):
    # Ensure X_test is concatenated correctly from the 'input_values' in the same way as X_train
    test_inputs = torch.cat([item['input_values'].unsqueeze(0) for item in X_test])  # Concatenate input tensors
    test_labels = torch.tensor(y_test)  # Convert labels to tensor

    # Ensure the input dimensions are correct (shape: [batch_size, sequence_length])
    test_inputs = test_inputs.squeeze(1)

    # Create a DataLoader for testing
    test_dataset = torch.utils.data.TensorDataset(test_inputs, test_labels)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

    # Evaluate the model
    model.eval()
    total_correct = 0

    with torch.no_grad():
        for batch in test_loader:
            inputs, labels = batch
            outputs = model(inputs).logits
            predictions = torch.argmax(outputs, dim=1)
            total_correct += (predictions == labels).sum().item()

    test_accuracy = total_correct / len(X_test)
    print(f'Test accuracy: {test_accuracy}')
    return test_accuracy

def main():
    # Load data
    train_df = load_data('/content/drive/MyDrive/type3/data/train_dm.csv')
    val_df = load_data('/content/drive/MyDrive/type3/data/valid_dm.csv')

    # Preprocess data
    train_features = preprocess_data(train_df)
    val_features = preprocess_data(val_df)

    # Prepare datasets
    try:
        X_train, y_train = prepare_dataset(train_features)
        X_val, y_val = prepare_dataset(val_features)
    except ValueError as e:
        print(f"Error preparing dataset: {str(e)}")
        return

    model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)

    # Train the model
    model = train_model(model, X_train, y_train)

    # Evaluate the model
    val_accuracy = evaluate_model(model, X_val, y_val)

    # Save the model
    model.save_pretrained('wav2vec2_audio_classifier_model')

    print(f'Model training completed. Validation accuracy: {val_accuracy}')

if __name__ == "__main__":
    main()

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded dataframe shape: (268, 3)
Index(['file', 'label', 'path'], dtype='object')
              file     label  \
0  JeanneLittle_10  dementia   
1  TrevorPeacock_5  dementia   
2  RonaldReagan_10  dementia   
3   RonaldReagan_5  dementia   
4  woodydurham_0_2  dementia   

                                                path  
0  /content/drive/My Drive/type3/data/dementia/Je...  
1  /content/drive/My Drive/type3/data/dementia/Tr...  
2  /content/drive/My Drive/type3/data/dementia/Ro...  
3  /content/drive/My Drive/type3/data/dementia/Ro...  
4  /content/drive/My Drive/type3/data/dementia/Wo...  
Loaded dataframe shape: (56, 3)
Index(['file', 'label', 'path'], dtype='object')
               file     label  \
0   EstelleGetty_15  dementia   
1      CaseyKasem_5  dementia   
2     CaseyKasem_15  dementia   
3  JimmyFratianno_0  dementia   
4       JoeConley_0  dementia   

                                                path  
0  /content/drive/My Drive/type3/data/dementia/Es...  
1  /c

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test accuracy: 0.6071428571428571
Model training completed. Validation accuracy: 0.6071428571428571
