In [None]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join('..', 'vap_sound/src')))


from utils.save_load_model import save_model_pickle, load_model_pickle

from config import (
    NUM_CLASSES,
    BATCH_SIZE,
    NUM_EPOCHS,
    LEARNING_RATE
    )

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
from glob import glob
import pickle

# Define the VAP Model with LSTM
class VAPModel(nn.Module):
    def __init__(self, input_dim=512, hidden_dim=128, lstm_hidden_dim=256, num_heads=4, num_layers=2, output_dim=2):
        super(VAPModel, self).__init__()
        
        # Transformer layer
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=input_dim, nhead=num_heads),
            num_layers=num_layers
        )
        
        # LSTM Layer
        self.lstm = nn.LSTM(input_dim, lstm_hidden_dim, batch_first=True)
        
        # Fully connected layer
        self.fc = nn.Linear(lstm_hidden_dim, output_dim)
    
    def forward(self, x):
        # Apply transformer for better feature encoding
        x = self.transformer(x)
        
        # Apply LSTM for sequence modeling
        x, _ = self.lstm(x)
        
        # Apply fully connected layer
        x = self.fc(x)
        
        return torch.sigmoid(x)

# Function to load pre-trained Wave2Vec model
def load_wave2vec_model():
    # Load the pre-trained wave2vec model
    model, _ = torchaudio.pipelines.WAV2VEC2_LARGE_LV60
    model.eval()  # Set model to evaluation mode
    return model

# Function to extract features using Wave2Vec
def extract_wave2vec_features(audio_path, model):
    waveform, sample_rate = torchaudio.load(audio_path)
    with torch.no_grad():
        features, _ = model.encode(waveform)  # Extract embeddings from the model
    return features.squeeze(0).T  # Transpose to have time as the first dimension

# Custom dataset class for loading and processing audio files
class VAPDataset(Dataset):
    def __init__(self, csv_files, model, seq_length=100):
        if isinstance(csv_files, str):
            csv_files = [csv_files]
        
        self.seq_length = seq_length
        self.data = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)
        self.audio_paths = self.data['audio_path'].tolist()
        self.labels = [torch.tensor(eval(l)) for l in self.data['labels']]
        self.model = model  # Use wave2vec model for feature extraction
    
    def __len__(self):
        return len(self.audio_paths)
    
    def __getitem__(self, idx):
        feature = extract_wave2vec_features(self.audio_paths[idx], self.model)
        label = self.labels[idx]
        
        if feature.shape[0] > self.seq_length:
            feature = feature[-self.seq_length:]
            label = label[-self.seq_length:]
        else:
            pad_length = self.seq_length - feature.shape[0]
            feature = torch.cat([torch.zeros(pad_length, feature.shape[1]), feature], dim=0)
            label = torch.cat([torch.zeros(pad_length, label.shape[1]), label], dim=0)
        
        return feature, label

# Function to save model as pickle file
def save_model_pickle(model, path="vap_model.pkl"):
    with open(path, "wb") as f:
        pickle.dump(model, f)
    print("Model saved successfully as pickle file!")

# Function to load model from pickle file
def load_model_pickle(path="vap_model.pkl"):
    with open(path, "rb") as f:
        model = pickle.load(f)
    model.eval()
    print("Model loaded successfully from pickle file!")
    return model

# Collate function to pad variable-length sequences
def collate_fn(batch):
    features, labels = zip(*batch)
    features_padded = pad_sequence(features, batch_first=True, padding_value=0)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=0)
    return features_padded, labels_padded

# Function to evaluate the model and save predictions
def evaluate_model(model, test_loader, output_file="evaluation_predictions.txt"):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0
    criterion = nn.BCELoss()
    
    with torch.no_grad(), open(output_file, "w") as f:
        for features, labels in test_loader:
            outputs = model(features)
            
            min_length = min(outputs.shape[1], labels.shape[1])
            outputs = outputs[:, :min_length, :]
            labels = labels[:, :min_length, :]
            
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            preds = (outputs > 0.5).float()
            all_preds.extend(preds.cpu().numpy().tolist())
            all_labels.extend(labels.cpu().numpy().tolist())
            
            for p, l in zip(preds, labels):
                f.write(f"Predicted: {p.tolist()}\nActual: {l.tolist()}\n\n")
    
    acc = accuracy_score(np.array(all_labels).flatten(), np.array(all_preds).flatten())
    conf_matrix = confusion_matrix(np.array(all_labels).flatten(), np.array(all_preds).flatten())
    
    print(f"Test Accuracy: {acc:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    
    plt.figure(figsize=(6, 5))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

# Train the model in batches
def train_model(model, train_files, batch_size=10, epochs=10, seq_length=100):
    train_dataset = VAPDataset(train_files, model, seq_length=seq_length)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
    
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCELoss()
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        
        for features, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(features)
            
            min_length = min(outputs.shape[1], labels.shape[1])
            outputs = outputs[:, :min_length, :]
            labels = labels[:, :min_length, :]
            
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {total_loss/len(train_loader):.4f}")
    
    save_model_pickle(model, "vap_model.pkl")
    
    test_dataset = VAPDataset(test_files, model, seq_length=seq_length)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)
    evaluate_model(model, test_loader)

# Load wave2vec model
wave2vec_model = load_wave2vec_model()

# Example file paths
data_dir = "/Users/shanujha/Desktop/voice_activity_prediction/voice_data_mozilla/en/clips/"
audio_files = glob(os.path.join(data_dir, "*.mp3"))
csv_files = glob("processed_data/*.csv")

# Split dataset into training and testing
train_files, test_files = train_test_split(csv_files, test_size=0.2, random_state=42)

# Train the model
train_model(VAPModel(), train_files, batch_size=10, epochs=10, seq_length=100)
print("Training complete!")


AttributeError: module 'torchaudio.pipelines' has no attribute 'WAV2VEC2_LARGE_LV60'

In [2]:
# Function to load pre-trained Wave2Vec model
def load_wave2vec_model():
    # Load the pre-trained wave2vec model
    model, _ = torchaudio.pipelines.WAV2VEC2_LARGE_LV60
    model.eval()  # Set model to evaluation mode
    return model


In [6]:
# a = load_wave2vec_model()

model, _ = torchaudio.pipelines.WAV2VEC2_ASR_LARGE_LV60K

AttributeError: module 'torchaudio.pipelines' has no attribute 'WAV2VEC2_ASR_LARGE_LV60K'

In [18]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio

# Load the Wav2Vec 2.0 Tiny model and processor
model_name = "facebook/wav2vec2-xls-r-300m"  # Tiny model
model = Wav2Vec2ForCTC.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Set the model to evaluation mode
model.eval()

# # Example: Load an audio file
# waveform, sample_rate = torchaudio.load("/Users/shanujha/Desktop/voice_activity_prediction/voice_data_mozilla/en/clips/common_voice_en_41247165.mp3")  # Replace with your audio file path

# # Preprocess the audio
# inputs = processor(
#     waveform.squeeze().numpy(),  # Remove batch dimension and convert to numpy array
#     sampling_rate=sample_rate,  # Pass the sample rate
#     return_tensors="pt"  # Return PyTorch tensors
# )

# # Perform inference
# with torch.no_grad():
#     logits = model(inputs.input_values).logits

# # Decode the output
# predicted_ids = torch.argmax(logits, dim=-1)

Error while downloading from https://cdn-lfs.hf.co/facebook/wav2vec2-xls-r-300m/d5e490574712ad0a6736923b9ed11d4cd51c78609c36205f704fc4e87b11d2e0?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1742551401&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MjU1MTQwMX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9mYWNlYm9vay93YXYydmVjMi14bHMtci0zMDBtL2Q1ZTQ5MDU3NDcxMmFkMGE2NzM2OTIzYjllZDExZDRjZDUxYzc4NjA5YzM2MjA1ZjcwNGZjNGU4N2IxMWQyZTA%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=JU5WIMnX-s2wtFK2ztarzW%7EOTOuASqX9qnT4E0J3LDy1fQSI12PR83svjN%7Epdabp3pMdDFlN1DADrwPYncyQLX%7EAbi4w-4gEAIC2iqUZxQ31dYqKEKGG0a5ffI%7E72zmF4f1iKtt64A6uu-7sbUnPf2oCPD5NEIgxcSLFnp8RRL1lQl0p6nHkZqNtoaoldcIHMDqCXV2sSDTtLDKoub9xuVO7tTCLRy9YMA2ZWWI59pyEtMEHJdiNsI8lmlBEP4w-kDQJP7oIYTQGELt%7Evnu-HaJ1wdsdMim2

ConnectTimeout: (MaxRetryError("HTTPSConnectionPool(host='cdn-lfs.hf.co', port=443): Max retries exceeded with url: /facebook/wav2vec2-xls-r-300m/d5e490574712ad0a6736923b9ed11d4cd51c78609c36205f704fc4e87b11d2e0?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1742551401&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MjU1MTQwMX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9mYWNlYm9vay93YXYydmVjMi14bHMtci0zMDBtL2Q1ZTQ5MDU3NDcxMmFkMGE2NzM2OTIzYjllZDExZDRjZDUxYzc4NjA5YzM2MjA1ZjcwNGZjNGU4N2IxMWQyZTA~cmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=JU5WIMnX-s2wtFK2ztarzW~OTOuASqX9qnT4E0J3LDy1fQSI12PR83svjN~pdabp3pMdDFlN1DADrwPYncyQLX~Abi4w-4gEAIC2iqUZxQ31dYqKEKGG0a5ffI~72zmF4f1iKtt64A6uu-7sbUnPf2oCPD5NEIgxcSLFnp8RRL1lQl0p6nHkZqNtoaoldcIHMDqCXV2sSDTtLDKoub9xuVO7tTCLRy9YMA2ZWWI59pyEtMEHJdiNsI8lmlBEP4w-kDQJP7oIYTQGELt~vnu-HaJ1wdsdMim2GyZg6W3dwyDzbcQsCxQZ47thoyoQpt966ky2srFjFoDfGcop9LORlg__&Key-Pair-Id=K3RPWS32NSSJCE (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x1763f5a30>, 'Connection to cdn-lfs.hf.co timed out. (connect timeout=10)'))"), '(Request ID: ceb4e611-edc6-4daa-9cfa-4ce528db111f)')

In [9]:
! pip3 install transformers

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl (284 kB)
Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl (418 kB)
Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m8.7 MB/s[0m et

In [19]:
import pandas as pd
import glob

# Path to CSV files
# csv_files = glob.glob("processed_data/*.csv")
csv_file = "/Users/shanujha/Desktop/voice_activity_prediction/mfcc_extract_csv/dataset_3.csv"

# Initialize counters
class_counts = {0: 0, 1: 0}

# Read all CSV files and count occurrences of each label
# for csv_file in csv_files:
df = pd.read_csv(csv_file)

# Assuming labels are stored as lists of 0s and 1s in a column named 'labels'
all_labels = df['labels'].apply(eval).explode()  # Flatten list into separate rows

# Count occurrences
class_counts[0] += (all_labels == 0).sum()
class_counts[1] += (all_labels == 1).sum()

# Print class distribution
print(f"Class 0 (No Voice Activity): {class_counts[0]}")
print(f"Class 1 (Voice Activity): {class_counts[1]}")

# Calculate imbalance ratio
total_samples = class_counts[0] + class_counts[1]
imbalance_ratio = class_counts[1] / total_samples
print(f"Voice Activity Ratio: {imbalance_ratio:.4f}")

# Check if there is a significant imbalance
if imbalance_ratio < 0.3 or imbalance_ratio > 0.7:
    print("Warning: Your dataset is imbalanced!")


Class 0 (No Voice Activity): 0
Class 1 (Voice Activity): 0
Voice Activity Ratio: nan


  imbalance_ratio = class_counts[1] / total_samples


In [43]:
import ast

a = ast.literal_eval(df['features'][0])

len(a[0])

40

In [42]:
b = ast.literal_eval(df['labels'][0])

len(b[0])

2

In [34]:
a

array('[[-299.829833984375, 1.3464259609463625e-05, -1.516477368568303e-05, 1.960006011358928e-05, -1.4089594515098725e-05, 3.0129422157187946e-06, -8.76343074196484e-06, -1.7553734323882964e-06, 1.3819465038977796e-06, -1.908028934849426e-05, 9.123193740379065e-05, -5.2286835853010416e-05, -1.633862848393619e-05, 4.1740604501683265e-05, -2.3741495169815607e-05, 2.832035897881724e-05, 2.4349807063117623e-05, 4.064072709297761e-05, -5.932934436714277e-05, 0.00011120547424070537, -7.735864346614107e-05, 0.0001461734063923359, -5.5970951507333666e-05, -3.2203224691329524e-05, 8.304687071358785e-05, -6.032946203049505e-06, -4.5895420043962076e-05, -5.219458762439899e-05, -2.175490044464823e-05, -7.407990779029205e-05, 4.812349288840778e-05, -0.00020181799482088536, 1.2499200238380581e-05, 0.00012033882376272231, 6.677576311631128e-05, -6.613741425098851e-05, -3.811121814578655e-06, 5.4606018238700926e-05, 1.948940916918218e-05, 0.00013233287609182298], [-299.829833984375, 1.346425960946362

In [35]:
df['labels'][0]

'[[1.0, 1.0], [1.0, 1.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [0