In [1]:
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from torch.utils.data import Dataset, DataLoader
import os
import librosa
import numpy as np
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

In [2]:
def load_audio_files(directory, sr=16000):
    audio_data = []
    labels = []
    class_labels = {label: i for i, label in enumerate(os.listdir(directory))}  # Assign numerical labels

    for label in os.listdir(directory):
        class_dir = os.path.join(directory, label)
        if os.path.isdir(class_dir):
            for file in os.listdir(class_dir):
                file_path = os.path.join(class_dir, file)
                try:
                    audio, _ = librosa.load(file_path, sr=sr)  # Load audio
                    audio_data.append(audio)
                    labels.append(class_labels[label])  # Convert label to integer
                except Exception as e:
                    print(f"Error loading {file_path}: {e}")
    
    return np.array(audio_data, dtype=object), np.array(labels)

In [3]:
train_dir = "/Users/shanoonissaka/Documents/school/thesis-project/datasets/audio/training"
test_dir = "/Users/shanoonissaka/Documents/school/thesis-project/datasets/audio/testing"

In [4]:
train_features, train_labes = load_audio_files(train_dir)

In [5]:
test_features , test_labels = load_audio_files(test_dir)

In [6]:
train_features.shape

(2191,)

In [7]:
# Load pretrained model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Ensure the model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [9]:
def extract_wav2vec2_features(audio_data):
    features = []
    
    for audio in audio_data:
        inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = {key: val.to(device) for key, val in inputs.items()}  # Move to GPU if available
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        features.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())  # Take mean across time
        
    return np.array(features)

In [10]:
wav_vec_train_features = extract_wav2vec2_features(train_features)

In [11]:
class AudioDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(np.vstack(features), dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [12]:
train_dataset = AudioDataset(wav_vec_train_features, train_labes)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [13]:
# Extract features and labels from the DataLoader
def dataloader_to_numpy(dataloader):
    feature_list, label_list = [], []
    for batch in dataloader:
        inputs, labels = batch
        feature_list.append(inputs.cpu().numpy())  # Convert to NumPy
        label_list.append(labels.cpu().numpy())

    flat_features = np.vstack(feature_list)  # Stack into single NumPy array
    flat_labels = np.hstack(label_list)  # Stack labels

    return flat_features, flat_labels

In [14]:
flatten_features, flatten_labels = dataloader_to_numpy(train_loader)

In [15]:
# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(flatten_features, flatten_labels, test_size=0.2, random_state=42)

# Initialize and fit LazyPredict classifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display results
print(models)

 87%|██████████████████████████████████████████████████████████████████████████▉           | 27/31 [00:20<00:02,  1.98it/s]

[LightGBM] [Info] Number of positive: 934, number of negative: 818
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 1752, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.533105 -> initscore=0.132614
[LightGBM] [Info] Start training from score 0.132614


100%|██████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:22<00:00,  1.38it/s]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LinearSVC                          1.00               1.00     1.00      1.00   
CalibratedClassifierCV             1.00               1.00     1.00      1.00   
RidgeClassifierCV                  1.00               1.00     1.00      1.00   
RidgeClassifier                    1.00               1.00     1.00      1.00   
PassiveAggressiveClassifier        1.00               1.00     1.00      1.00   
LogisticRegression                 1.00               1.00     1.00      1.00   
KNeighborsClassifier               1.00               0.99     0.99      1.00   
SVC                                1.00               0.99     0.99      1.00   
LinearDiscriminantAnalysis         1.00               0.99     0.99      1.00   
LGBMClassifier                     0.99               0.99     0.99      0.99   
Perceptron                  




In [21]:
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
def extract_openl3_features(audio_data, sr=16000, embedding_size=512, content_type="env"):
    features = []
    
    for audio in audio_data:
        # Ensure audio is float32
        audio = np.asarray(audio, dtype=np.float32)
        # Ensure audio is in the correct format
        if len(audio.shape) == 1:  # Convert mono to stereo
            audio = np.stack([audio, audio], axis=0)

        # Extract OpenL3 embeddings
        embedding, _ = openl3.get_audio_embedding(audio, sr, embedding_size=embedding_size, content_type=content_type)
        
        # Take the mean across time dimension
        features.append(np.mean(embedding, axis=0))

    return np.array(features)

In [22]:
openl3_train_features = extract_openl3_features(train_features)

RuntimeError: Exception encountered when calling STFT.call().

[1mCould not automatically infer the output shape / dtype of 'stft_1' (of type STFT). Either the `STFT.call()` method is incorrect, or you need to implement the `STFT.compute_output_spec() / compute_output_shape()` method. Error encountered:

Invalid dtype: complex64[0m

Arguments received by STFT.call():
  • args=('<KerasTensor shape=(None, 1, 48000), dtype=float32, sparse=None, name=keras_tensor_3>',)
  • kwargs=<class 'inspect._empty'>

In [24]:
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

In [25]:
cnn_train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
cnn_val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [33]:
import torch.nn as nn
import torch.optim as optim

class AudioCNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(32 * (input_size // 2), 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = x.unsqueeze(1) 
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(x.shape[0], -1)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [34]:
# Initialize Model
input_size = wav_vec_train_features.shape[1]  # Feature size from Wav2Vec2
num_classes = len(set(train_labes))  # Number of unique labels
model_cnn = AudioCNN(input_size, num_classes).to(device)

In [35]:
# Define Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_cnn.parameters(), lr=0.001)

In [36]:
# Training the CNN Model
num_epochs = 25

for epoch in range(num_epochs):
    model_cnn.train()
    running_loss = 0.0

    for features, labels in cnn_train_loader:
        features, labels = features.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model_cnn(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")


RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x6144 and 0x128)

In [37]:
print("hello")

hello


In [38]:
def extract_mfcc_features(audio_data, sr=16000, n_mfcc=13):
    features = []
    
    for audio in audio_data:
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
        mfcc_mean = np.mean(mfcc, axis=1)  # Take mean over time
        features.append(mfcc_mean)

    return np.array(features)

In [39]:
mfcc_train_features = extract_mfcc_features(train_features)

In [41]:
# Create DataLoader
mfcc_train_dataset = AudioDataset(mfcc_train_features, train_labes)
mfcc_train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [42]:
mfcc_flatten_features, mfcc_flatten_labels = dataloader_to_numpy(mfcc_train_loader)

In [43]:
mfcc_X_train, mfcc_X_test, mfcc_y_train, mfcc_y_test = train_test_split(mfcc_flatten_features, mfcc_flatten_labels, test_size=0.2, random_state=42)

In [44]:
# Initialize and fit LazyPredict classifier
mfcc_models, mfcc_predictions = clf.fit(mfcc_X_train, mfcc_X_test, mfcc_y_train, mfcc_y_test)

# Display results
print(mfcc_models)

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


 87%|██████████████████████████████████████████████████████████████████████████▉           | 27/31 [00:15<00:01,  2.28it/s]

[LightGBM] [Info] Number of positive: 761, number of negative: 640
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010887 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 1401, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.543183 -> initscore=0.173165
[LightGBM] [Info] Start training from score 0.173165


100%|██████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:17<00:00,  1.77it/s]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
RidgeClassifier                    1.00               1.00     1.00      1.00   
LinearSVC                          0.99               0.99     0.99      0.99   
CalibratedClassifierCV             0.99               0.99     0.99      0.99   
RidgeClassifierCV                  0.99               0.99     0.99      0.99   
LogisticRegression                 0.99               0.99     0.99      0.99   
SGDClassifier                      0.99               0.99     0.99      0.99   
Perceptron                         0.99               0.99     0.99      0.99   
PassiveAggressiveClassifier        0.99               0.99     0.99      0.99   
SVC                                0.99               0.99     0.99      0.99   
ExtraTreesClassifier               0.99               0.99     0.99      0.99   
KNeighborsClassifier        


