In [73]:
# from google.colab import drive
# drive.mount('/content/drive')

In [74]:
# # Define dataset path (update with actual path on your Google Drive or Colab storage)
# dataset_path = '/content/drive/My Drive/kaggle'

In [75]:
# !pip install librosa soundfile torch torchvision torchaudio


# NOISY DATA GENERATOR

In [76]:
# import librosa
# import soundfile as sf
# import numpy as np
# import os

# # Add noise to audio
# def add_noise(audio, noise_level=0.01):
#     noise = np.random.normal(0, noise_level, len(audio))
#     return audio + noise

# # Create noisy dataset
# def create_noisy_dataset(input_folder, output_folder, noise_level=0.01):
#     if not os.path.exists(output_folder):
#         os.makedirs(output_folder)

#     for root, _, files in os.walk(input_folder):
#         for file in files:
#             if file.endswith(".wav"):
#                 file_path = os.path.join(root, file)
#                 y, sr = librosa.load(file_path, sr=16000)
#                 noisy_y = add_noise(y, noise_level)
#                 output_path = os.path.join(output_folder, os.path.relpath(file_path, input_folder))
#                 os.makedirs(os.path.dirname(output_path), exist_ok=True)
#                 sf.write(output_path, noisy_y, sr)

# # Example usage
# NOISY_DATASET_PATH = '/content/drive/My Drive/kaggle_noisy'
# create_noisy_dataset(dataset_path, NOISY_DATASET_PATH, noise_level=0.01)


# MAIN CODE

In [77]:
import os
import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from torch.nn.functional import pad


In [78]:
def extract_features(file_path, max_frames=32, sample_rate=16000, n_mfcc=40):
    """Load an audio file and return a fixed-size MFCC matrix as a float32 tensor.

    Args:
        file_path: Path of the audio file to load with ``librosa.load``.
        max_frames: Number of columns (frames) in the returned matrix. Shorter
            clips are zero-padded on the right, longer clips are truncated.
        sample_rate: Sampling rate the audio is resampled to while loading.
        n_mfcc: Number of MFCC coefficients (rows) to compute.

    Returns:
        ``torch.Tensor`` of shape ``(n_mfcc, max_frames)`` and dtype float32.
    """
    waveform, sr = librosa.load(file_path, sr=sample_rate)
    coefficients = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=n_mfcc)

    # Force float32 so the features always match the dtype of the default
    # (float32) model weights, whatever precision librosa produced.
    mfcc = torch.tensor(coefficients, dtype=torch.float32)

    n_frames = mfcc.shape[1]
    if n_frames < max_frames:
        # Zero-pad only the right-hand side of the last (frame) axis.
        return pad(mfcc, (0, max_frames - n_frames), mode='constant', value=0)
    # Slicing yields a view; make it contiguous so downstream ops see a plain tensor.
    return mfcc[:, :max_frames].contiguous()


In [79]:
class SpeechCommandDataset(Dataset):
    """Dataset of ``.wav`` files whose parent folder name is the class label.

    Every folder in ``data_folders`` is walked recursively. A file is kept only
    when its immediate parent directory name is a key of ``label_mapping``;
    everything else is silently skipped.

    Args:
        data_folders: One folder path, or an iterable of folder paths, to scan.
        max_frames: Number of MFCC frames passed on to ``extract_features``.
        label_mapping: Optional ``{folder_name: class_index}`` dict. Defaults
            to ``DEFAULT_LABEL_MAPPING``.

    Attributes:
        files: Paths of all accepted ``.wav`` files, in sorted traversal order.
        label_mapping: The ``{folder_name: class_index}`` mapping in use.
        max_frames: Frame count forwarded to ``extract_features``.
    """

    # Adjust (or pass ``label_mapping=``) based on your dataset.
    DEFAULT_LABEL_MAPPING = {"yes": 0, "no": 1, "up": 2, "down": 3}

    def __init__(self, data_folders, max_frames=32, label_mapping=None):
        # A bare string would otherwise be iterated character by character,
        # making os.walk scan paths such as "/" or "c".
        if isinstance(data_folders, (str, os.PathLike)):
            data_folders = [data_folders]

        self.max_frames = max_frames
        self.label_mapping = dict(
            self.DEFAULT_LABEL_MAPPING if label_mapping is None else label_mapping
        )
        self.files = []

        for folder in data_folders:
            for root, dirs, files in os.walk(folder):
                # os.walk order is filesystem-dependent; sorting in place makes
                # the index -> file mapping reproducible across runs/machines.
                dirs.sort()
                for name in sorted(files):
                    if not name.endswith(".wav"):
                        continue
                    file_path = os.path.join(root, name)
                    if self.get_label(file_path) != -1:  # Only include valid files
                        self.files.append(file_path)

    def __len__(self):
        """Return the number of accepted audio files."""
        return len(self.files)

    def __getitem__(self, idx):
        """Return ``(mfcc, label)`` for the file at position ``idx``."""
        file_path = self.files[idx]
        mfcc = extract_features(file_path, max_frames=self.max_frames)
        label = self.get_label(file_path)
        return mfcc, label

    def get_label(self, file_path):
        """Map a file path to its class index, or -1 if its folder is unknown."""
        label_name = os.path.basename(os.path.dirname(file_path))  # Folder name is the label
        return self.label_mapping.get(label_name, -1)


In [80]:
def collate_fn(batch):
    """Combine ``(features, label)`` samples into one batched pair of tensors.

    Feature tensors are merged with ``pad_sequence`` (batch first, zero fill),
    which pads along each tensor's first dimension; labels become a 1-D tensor.

    Returns:
        ``(features_padded, labels)``.
    """
    unzipped = zip(*batch)
    feature_seq, label_seq = unzipped
    stacked = pad_sequence(
        feature_seq,
        batch_first=True,
        padding_value=0,
    )
    return stacked, torch.tensor(label_seq)


In [81]:
class SpeechCommandModel(nn.Module):
    """Conv2d front-end followed by a bidirectional LSTM and a linear classifier.

    The forward pass takes a 3-D batch ``(batch, height, width)`` (in this
    notebook: ``(batch, n_mfcc, max_frames)`` from ``extract_features`` /
    ``collate_fn``) and returns per-class log-probabilities.

    Args:
        num_classes: Number of output classes.
        max_frames: Width (last dimension) of the input matrices. It fixes the
            LSTM input size, so it must match the features fed to ``forward``.
        conv_channels: Number of output channels of the convolution.
        hidden_size: Hidden size of each LSTM direction.

    The defaults reproduce the original architecture exactly, so existing
    ``state_dict`` checkpoints keep loading.
    """

    def __init__(self, num_classes, max_frames=32, conv_channels=32, hidden_size=64):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, conv_channels, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # Output shape: (batch, conv_channels, height // 2, width // 2)
        )
        # Per-step LSTM features: channels * (width after max pooling).
        self.rnn_input_size = conv_channels * (max_frames // 2)
        self.rnn = nn.LSTM(self.rnn_input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # 2x: forward + backward directions

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, num_classes)``."""
        # Conv2d expects (batch, channels, height, width); add the channel axis.
        x = self.cnn(x.unsqueeze(1))
        # (batch, C, H/2, W/2) -> (batch, H/2, C, W/2): the pooled *height* axis
        # becomes the LSTM sequence axis.
        # NOTE(review): with (n_mfcc, frames) inputs the LSTM therefore steps
        # over MFCC coefficients, not over time frames — confirm this is
        # intended before changing it (a change alters the LSTM input size and
        # invalidates saved checkpoints).
        x = x.permute(0, 2, 1, 3)
        x = x.reshape(x.size(0), x.size(1), -1)  # (batch, H/2, C * W/2)
        x, _ = self.rnn(x)
        # NOTE(review): at the last step the backward direction has seen only
        # one element; kept as-is for compatibility with trained weights.
        x = self.fc(x[:, -1, :])
        return F.log_softmax(x, dim=1)


In [82]:
# Sanity-check the CNN output shape with a throwaway, untrained instance.
# The trained `model` is only created in a later cell, so relying on it here
# fails on a fresh kernel (Restart & Run All).
shape_probe_model = SpeechCommandModel(num_classes=4)
dummy_input = torch.randn(1, 1, 40, 32)  # Example input: (batch_size, channels, height, width)
cnn_output = shape_probe_model.cnn(dummy_input)
print(cnn_output.shape)  # Check the shape before reshaping


torch.Size([1, 32, 20, 16])


In [83]:
def train_model(model, dataloader, epochs, device):
    """Train ``model`` with AdamW (lr=0.001) and negative log-likelihood loss.

    The model is moved to ``device`` and each batch follows it there. After
    every epoch the *sum* of the per-batch losses is printed. Returns nothing.
    """
    model = model.to(device)
    criterion = nn.NLLLoss()
    optimizer = AdamW(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        model.train()
        running_loss = 0

        for batch_features, batch_labels in dataloader:
            batch_features = batch_features.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()
            batch_loss = criterion(model(batch_features), batch_labels)
            batch_loss.backward()
            optimizer.step()

            # Accumulate as a Python float so no graph is kept alive.
            running_loss += batch_loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss:.4f}")


In [84]:
def evaluate_model(model, dataloader, device):
    """Print the classification accuracy of ``model`` over ``dataloader``.

    The model is moved to ``device`` and switched to eval mode; gradients are
    disabled while iterating. The predicted class of each sample is the index
    of the largest model output.

    Args:
        model: Module mapping a feature batch to per-class scores.
        dataloader: Iterable yielding ``(features, labels)`` batches.
        device: Device on which the model and the batches are evaluated.

    Returns:
        None. The accuracy is printed; an empty dataloader prints a notice
        instead of raising ``ZeroDivisionError``.
    """
    model = model.to(device)
    model.eval()
    total, correct = 0, 0

    with torch.no_grad():
        for features, labels in dataloader:
            features, labels = features.to(device), labels.to(device)
            predicted = model(features).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        # Nothing was evaluated, so an accuracy percentage is undefined.
        print("Accuracy: n/a (no samples to evaluate)")
        return

    print(f"Accuracy: {100 * correct / total:.2f}%")


In [85]:
# Build the dataset from two folder trees and train the model on all of it.
# NOTE(review): these are hard-coded Colab/Drive paths; the second folder is
# presumably the output of the (commented-out) noisy-data generator — confirm.
data_folders = ["/content/drive/My Drive/kaggle", "/content/drive/My Drive/kaggle_noisy"]  # Replace with actual paths
dataset = SpeechCommandDataset(data_folders=data_folders, max_frames=32)
# shuffle=True and the model's random initialisation are unseeded here, so
# the loss values differ from run to run.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Use the GPU when one is available; later cells reuse `device` and `model`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SpeechCommandModel(num_classes=4)  # Adjust num_classes based on your labels
# The printed per-epoch loss is the sum over batches, not the mean.
train_model(model, dataloader, epochs=10, device=device)



Epoch 1/10, Loss: 5.6506
Epoch 2/10, Loss: 0.0495
Epoch 3/10, Loss: 0.0239
Epoch 4/10, Loss: 0.0146
Epoch 5/10, Loss: 0.0100
Epoch 6/10, Loss: 0.0074
Epoch 7/10, Loss: 0.0057
Epoch 8/10, Loss: 0.0045
Epoch 9/10, Loss: 0.0037
Epoch 10/10, Loss: 0.0031


In [96]:
# NOTE(review): `dataloader` is the same loader the model was trained on, so
# the printed figure is training accuracy, not a held-out estimate.
evaluate_model(model, dataloader, device=device)


Accuracy: 96.00%


In [87]:
# Persist only the learned parameters (state_dict), not the whole module;
# the file is written relative to the current working directory.
torch.save(model.state_dict(), "speech_command_model.pth")


In [88]:
# Rebuild the architecture and restore the trained parameters.
model = SpeechCommandModel(num_classes=4)  # Ensure the architecture matches
# map_location lets a checkpoint saved on GPU load on a CPU-only machine;
# weights_only=True restricts unpickling to tensor data (and avoids the
# torch.load FutureWarning).
state_dict = torch.load("speech_command_model.pth", map_location=device, weights_only=True)
model.load_state_dict(state_dict)
# A freshly built model lives on the CPU; move it to the device that the
# prediction code sends its features to.
model.to(device)
model.eval()


  model.load_state_dict(torch.load("speech_command_model.pth"))


SpeechCommandModel(
  (cnn): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (rnn): LSTM(512, 64, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=128, out_features=4, bias=True)
)

In [89]:
import torch

def predict(file_path, model, device):
    """Return the predicted class index for a single audio file.

    Args:
        file_path: Path of the audio file, passed to ``extract_features``.
        model: Module mapping a ``(1, ...)`` feature batch to per-class scores.
        device: Device on which inference runs.

    Returns:
        int: Index of the highest-scoring class.

    Note:
        The model is moved to ``device`` and put in eval mode, so the result
        does not depend on the state the caller left the model in.
    """
    # Features are sent to `device`; the model must live there too, otherwise
    # the forward pass fails with a device mismatch.
    model = model.to(device)
    model.eval()

    # Extract features from the input audio file and add the batch dimension.
    mfcc = extract_features(file_path).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(mfcc)
    return output.argmax(dim=1).item()

# Index -> label name: the inverse of SpeechCommandDataset.label_mapping used
# during training. The two are maintained by hand and must be kept in sync.
label_mapping = {0: "yes", 1: "no", 2: "up", 3: "down"}


In [95]:
# Example audio file path
# NOTE(review): this clip sits in a "dog" folder, but "dog" is not one of the
# four labels in `label_mapping`, so with the code in this notebook the model
# can only answer yes/no/up/down for it. The saved output of this cell was
# presumably produced with a different label set — re-run to refresh it.
test_file = "/content/drive/My Drive/kaggle_noisy/dog/00f0204f_nohash_2.wav"

# Make a prediction (class index), then translate it to its label name.
predicted_label_index = predict(test_file, model, device)
predicted_label = label_mapping[predicted_label_index]

print(f"Predicted Label: {predicted_label}")


Predicted Label: dog
