In [73]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import torchaudio.transforms as T
import numpy as np
import os
import pickle
import pyaudio
import wave
import pandas as pd
import matplotlib.pyplot as plt
import time
from collections import deque
from glob import glob
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


from dotenv import load_dotenv
load_dotenv()

True

In [74]:
import sys
import os


sys.path.append(os.path.abspath(os.path.join('..', 'vap_sound/src')))


from utils.evaluate_model import evaluate_model
from utils.extract_mfcc_torchaudio import extract_mfcc
from utils.save_load_model import save_model_pickle, load_model_pickle

from config import (
    MFCC_SAMPLE_RATE, 
    N_MFCC_PER_FRAME,
    MFCC_DIM,
    SEQ_LENGTH,
    NUM_CLASSES,
    BATCH_SIZE,
    NUM_EPOCHS,
    LEARNING_RATE
    )

In [75]:
### Variables

MODEL_PATH = "../model/vapor_model.pkl"
AUDIO_FILES_DIR = "/Users/shanujha/Desktop/voice_activity_prediction/voice_data_mozilla/en/clips/"
CSV_FILES_DIR = "/Users/shanujha/Desktop/voice_activity_prediction/mfcc_extract_csv/*.csv"
LOG_PROCESSED_FILES = "../logs/processed_files.log"
LOG_MODEL_TRAINING = "../logs/model_training.log"
LOG_MODEL_EVALUATION = "../logs/model_evaluation.log"
LOG_MODEL_PREDICTION = "../logs/model_prediction.log"


DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")   

In [None]:
# class VAPModel(nn.Module):
#     def __init__(self, input_dim=N_MFCC_PER_FRAME, hidden_dim=256, num_heads=8, num_layers=6, output_dim=2):
#         super(VAPModel, self).__init__()
#         self.transformer = nn.TransformerEncoder(
#             nn.TransformerEncoderLayer(d_model=input_dim, nhead=num_heads),
#             num_layers=num_layers
#         )
#         self.fc = nn.Linear(input_dim, output_dim)
    
#     def forward(self, x):
#         x = self.transformer(x)
#         x = self.fc(x)
#         return torch.sigmoid(x)
    

import torch
import torch.nn as nn

# Define the VAP Model with LSTM
class VAPModel(nn.Module):
    def __init__(self, input_dim=N_MFCC_PER_FRAME, hidden_dim=128, lstm_hidden_dim=256, num_heads=8, num_layers=4, output_dim=1):
        super(VAPModel, self).__init__()
        
        # Transformer layer
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=input_dim, nhead=num_heads),
            num_layers=num_layers
        )
        
        # LSTM Layer (make sure it's properly initialized)
        self.lstm = nn.LSTM(input_dim, lstm_hidden_dim, batch_first=True)
        
        # Fully connected layer to produce final output
        self.fc = nn.Linear(lstm_hidden_dim, output_dim)
    
    def forward(self, x):
        # Apply transformer for better feature encoding
        x = self.transformer(x)
        
        # Apply LSTM for sequence modeling
        x, _ = self.lstm(x)
        
        # Apply fully connected layer
        x = self.fc(x)
        
        return torch.sigmoid(x)


In [None]:
# Custom dataset class for extracting MFCC features and labels
class VAPDataset(Dataset):
    def __init__(self, csv_files, seq_length=SEQ_LENGTH):
        if isinstance(csv_files, str):
            csv_files = [csv_files]
        
        self.seq_length = seq_length
        self.data = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)
        self.features = [torch.tensor(eval(f)) for f in self.data['features']]
        self.labels = [torch.tensor(eval(l))[:,0] for l in self.data['labels']]
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        feature = self.features[idx]
        label = self.labels[idx]
        
        if feature.shape[0] > self.seq_length:
            feature = feature[-self.seq_length:]
            label = label[-self.seq_length:]
        else:
            pad_length = self.seq_length - feature.shape[0]
            feature = torch.cat([torch.zeros(pad_length, feature.shape[1]), feature], dim=0)
            label = torch.cat([torch.zeros(pad_length, label.shape[1]), label], dim=0)
        
        return feature, label

In [88]:
# Load dataset from a folder containing MP3 files and save to CSV
data_dir = AUDIO_FILES_DIR
audio_files = glob(os.path.join(data_dir, "*.mp3"))
csv_files = glob(CSV_FILES_DIR)

# Splitting dataset into training and testing
train_files, test_files = train_test_split(csv_files, test_size=0.2, random_state=42)

In [89]:
# Collate function to pad variable-length sequences
def collate_fn(batch):
    features, labels = zip(*batch)
    features_padded = pad_sequence(features, batch_first=True, padding_value=0)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=0)
    return features_padded, labels_padded

# Function to track processed files
def get_processed_files(log_file=LOG_PROCESSED_FILES):
    if os.path.exists(log_file):
        with open(log_file, "r") as f:
            return set(f.read().splitlines())
    return set()

def update_processed_files(files, log_file=LOG_PROCESSED_FILES):
    with open(log_file, "a") as f:
        for file in files:
            f.write(file + "\n")

In [96]:

# Training loop with batch-wise dataset loading
def train_model(model, csv_files, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, log_file=LOG_MODEL_TRAINING, processed_log=LOG_PROCESSED_FILES, seq_length=SEQ_LENGTH):
    processed_files = get_processed_files(processed_log)
    remaining_files = [f for f in csv_files if f not in processed_files]
    total_files = len(remaining_files)
    
    with open(log_file, "a") as log:
        for i in range(0, total_files, batch_size):
            batch_files = remaining_files[i:i+batch_size]
            train_dataset = VAPDataset_m(batch_files, seq_length=seq_length)
            # print(type(train_dataset), train_dataset)
            train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
            # print(type(train_loader), train_loader)

            eval_files = np.random.choice(test_files, 5)
            eval_dataset = VAPDataset_m(eval_files, seq_length=seq_length)
            eval_loader = DataLoader(eval_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)
            
            if os.path.exists(MODEL_PATH):
                model = load_model_pickle(path=MODEL_PATH)
            
            optimizer = optim.Adam(model.parameters(), lr=0.001)
            criterion = nn.BCELoss()
            
            for epoch in range(epochs):
                model.train()
                total_loss = 0
                val_loss = 0
                
                for features, labels in train_loader:
                    optimizer.zero_grad()
                    outputs = model(features)
                    
                    min_length = min(outputs.shape[1], labels.shape[1])
                    outputs = outputs[:, :min_length, :]
                    # labels = labels[:, :min_length, :]
                    labels = labels[:, :min_length].unsqueeze(-1)
                    
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()
                    total_loss += loss.item()
                
                model.eval()
                with torch.no_grad():
                    for features, labels in train_loader:
                        outputs = model(features)
                        outputs = outputs[:, :min_length, :]
                        labels = labels[:, :min_length, :]
                        loss = criterion(outputs, labels)
                        val_loss += loss.item()
                
                log.write(f"Batch {i//batch_size+1}/{total_files//batch_size+1}, Epoch {epoch+1}/{epochs}, Train Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(train_loader):.4f}\n")
                print(f"Batch {i//batch_size+1}/{total_files//batch_size+1}, Epoch {epoch+1}/{epochs}, Train Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(train_loader):.4f}")
            
            save_model_pickle(model, path=MODEL_PATH)
            update_processed_files(batch_files, processed_log)
            evaluate_model(model, eval_loader, output_file=LOG_MODEL_EVALUATION, metrics_file=LOG_MODEL_PREDICTION)

# Train the model in batches
train_model(VAPModel(), train_files, batch_size=10, epochs=10, seq_length=SEQ_LENGTH)
print("Training complete!")




IndexError: too many indices for tensor of dimension 2

In [105]:
class VAPDataset_m(Dataset):
    def __init__(self, csv_files, seq_length=SEQ_LENGTH):
        if isinstance(csv_files, str):
            csv_files = [csv_files]

        self.seq_length = seq_length
        self.data = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)
        self.features = [torch.tensor(eval(f)) for f in self.data['features']]
        
        # Extract only the first label
        lb = [eval(f)[0] for f in self.data['labels']]
        self.labels = torch.tensor(lb).float()  # Ensure it's a float tensor

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = self.features[idx]
        label = self.labels[idx]  # Scalar value

        # Trim or pad feature sequence
        if feature.shape[0] > self.seq_length:
            feature = feature[-self.seq_length:]
        else:
            pad_length = self.seq_length - feature.shape[0]
            feature = torch.cat([torch.zeros(pad_length, feature.shape[1]), feature], dim=0)

        return feature, label.unsqueeze(0)  # Ensure label is a tensor of shape (1,)


In [106]:
dataset = VAPDataset_m(["/Users/shanujha/Desktop/voice_activity_prediction/mfcc_extract_csv/dataset_1.csv"], seq_length=100)

In [107]:
dataset.data.labels

0     [[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0...
1     [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0...
2     [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [0.0, 0.0...
3     [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0...
4     [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0...
                            ...                        
92    [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0...
93    [[0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0...
94    [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0...
95    [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [0.0, 0.0...
96    [[0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0...
Name: labels, Length: 97, dtype: object

In [None]:
loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

In [None]:
loader.dataset.data

Unnamed: 0,features,labels
0,"[[-337.7912902832031, 1.5555504432995804e-05, ...","[[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0..."
1,"[[-359.0429992675781, -7.660827350264299e-08, ...","[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0..."
2,"[[-308.6006774902344, 1.2662911103689112e-05, ...","[[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [0.0, 0.0..."
3,"[[-318.2252197265625, 2.025314825004898e-05, -...","[[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0..."
4,"[[-296.0960693359375, 2.2307773178908974e-05, ...","[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0..."
...,...,...
92,"[[-327.2367248535156, 1.7432679669582285e-05, ...","[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0..."
93,"[[-328.3695068359375, 6.071793450246332e-06, -...","[[0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0..."
94,"[[-378.6844177246094, -1.5926906371532823e-06,...","[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0..."
95,"[[-292.9357604980469, 1.2051293651893502e-07, ...","[[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [0.0, 0.0..."


In [None]:
data = pd.read_csv("/Users/shanujha/Desktop/voice_activity_prediction/mfcc_extract_csv/dataset_1.csv")

In [None]:
b = eval(data['labels'][0])
a = []

for i in range(len(b)):
    print(b[i][0])
    a.append(b[i][0])



0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
1.0
1.0
1.0
0.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [None]:
torch.tensor(a)

tensor([0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
        0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
        1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 