In [13]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

def audio_to_spectrogram(audio_path, save_path):
    """
    Convert an audio file to a spectrogram and save it as a PNG.
    
    Args:
        audio_path (str): Path to the input audio file.
        save_path (str): Path to save the spectrogram image.
    """
    try:
        # Load the audio file
        y, sr = librosa.load(audio_path, sr=None)

        # Generate a Mel spectrogram
        mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
        mel_spect_db = librosa.power_to_db(mel_spect, ref=np.max)

        # Plot the spectrogram
        plt.figure(figsize=(10, 4))
        librosa.display.specshow(mel_spect_db, sr=sr, x_axis='time', y_axis='mel', fmax=8000, cmap='viridis')
        plt.axis('off')  # Hide axes for cleaner images
        plt.tight_layout()

        # Save the plot as a PNG image
        plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
        plt.close()
        print(f"Saved spectrogram: {save_path}")
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")


In [14]:
import os

dataset_path = "dataset"
spectrograms_path = "spectrograms"

# Ensure the audio_to_spectrogram function is defined before calling
for label in ["scream", "non_scream"]:
    audio_files = os.listdir(os.path.join(dataset_path, label))
    for audio_file in audio_files:
        audio_path = os.path.join(dataset_path, label, audio_file)
        save_path = os.path.join(spectrograms_path, label, f"{audio_file.split('.')[0]}.png")
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        audio_to_spectrogram(audio_path, save_path)


Saved spectrogram: spectrograms\scream\1.png
Saved spectrogram: spectrograms\scream\10.png
Saved spectrogram: spectrograms\scream\100.png
Saved spectrogram: spectrograms\scream\1000.png
Saved spectrogram: spectrograms\scream\1001.png
Saved spectrogram: spectrograms\scream\1002.png
Saved spectrogram: spectrograms\scream\1003.png
Saved spectrogram: spectrograms\scream\1004.png
Saved spectrogram: spectrograms\scream\1005.png
Saved spectrogram: spectrograms\scream\1006.png
Saved spectrogram: spectrograms\scream\1007.png
Saved spectrogram: spectrograms\scream\1008.png
Saved spectrogram: spectrograms\scream\101.png
Saved spectrogram: spectrograms\scream\1010.png
Saved spectrogram: spectrograms\scream\1011.png
Saved spectrogram: spectrograms\scream\1012.png
Saved spectrogram: spectrograms\scream\1013.png
Saved spectrogram: spectrograms\scream\1014.png
Saved spectrogram: spectrograms\scream\1015.png
Saved spectrogram: spectrograms\scream\1016.png
Saved spectrogram: spectrograms\scream\1017.png

In [16]:
# Transform and load data
import torch
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor()
])

dataset = datasets.ImageFolder(spectrograms_path, transform=transform)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

In [18]:
# Define the CNN model
import torch.nn as nn
class ScreamCNN(nn.Module):
    def __init__(self):
        super(ScreamCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 32 * 32, 512)
        self.fc2 = nn.Linear(512, 2)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.max_pool2d(x, 2)
        x = torch.relu(self.conv2(x))
        x = torch.max_pool2d(x, 2)
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [20]:
# Set device, model, loss, and optimizer
import torch.optim as optim
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ScreamCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [21]:
epochs = 20
best_val_loss = float('inf')
patience = 3  # Number of epochs to wait before stopping if validation loss doesn't improve
patience_counter = 0

for epoch in range(epochs):
    model.train()
    train_loss = 0.0

    # Training phase
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Average loss for this epoch
    train_loss /= len(dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {train_loss:.4f}")

    # Early stopping (if validation loss does not improve)
    if train_loss < best_val_loss:
        best_val_loss = train_loss
        patience_counter = 0
        # Save the model when the loss improves
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping: No improvement in validation loss for 3 epochs.")
            break

# Load the best model after training
model.load_state_dict(torch.load('best_model.pth'))

Epoch 1/20, Training Loss: 0.4292
Epoch 2/20, Training Loss: 0.1858
Epoch 3/20, Training Loss: 0.1383
Epoch 4/20, Training Loss: 0.0955
Epoch 5/20, Training Loss: 0.0759
Epoch 6/20, Training Loss: 0.0761
Epoch 7/20, Training Loss: 0.0448
Epoch 8/20, Training Loss: 0.0259
Epoch 9/20, Training Loss: 0.0431
Epoch 10/20, Training Loss: 0.0162
Epoch 11/20, Training Loss: 0.0228
Epoch 12/20, Training Loss: 0.0081
Epoch 13/20, Training Loss: 0.0040
Epoch 14/20, Training Loss: 0.0032
Epoch 15/20, Training Loss: 0.0004
Epoch 16/20, Training Loss: 0.0003
Epoch 17/20, Training Loss: 0.0001
Epoch 18/20, Training Loss: 0.0001
Epoch 19/20, Training Loss: 0.0001
Epoch 20/20, Training Loss: 0.0001


  model.load_state_dict(torch.load('best_model.pth'))


<All keys matched successfully>

In [25]:
import torch
import pyaudio
import librosa
import numpy as np
import torch.nn as nn
from torchvision import transforms
import librosa.display
import matplotlib.pyplot as plt

# PyAudio parameters
CHUNK = 1024  # Number of frames per buffer
FORMAT = pyaudio.paInt16  # Audio format
CHANNELS = 1  # Mono
RATE = 22050  # Sampling rate
DEVICE_INDEX = 0  # Select input device (0 is usually the default)

# Define the CNN model (assuming your ScreamCNN is already defined)
model = ScreamCNN().to(device)
model.eval()  # Set the model to evaluation mode

# Define the transformation for input data
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor()
])

# Initialize PyAudio
p = pyaudio.PyAudio()

def capture_audio():
    """Capture live audio from the microphone."""
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK,
                    input_device_index=DEVICE_INDEX)
    
    print("Recording...")

    frames = []
    for _ in range(0, int(RATE / CHUNK * 1)):  # Record for 1 second
        data = stream.read(CHUNK)
        frames.append(data)
    
    print("Recording stopped.")
    stream.stop_stream()
    stream.close()
    
    audio_data = b''.join(frames)
    
    # Convert audio data from int16 to float32
    audio_data = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
    
    # Normalize the audio data to be in the range of [-1, 1]
    audio_data /= np.max(np.abs(audio_data), axis=0)
    
    return audio_data

def audio_to_mfcc(y, sr=RATE, n_mfcc=13):
    """Convert audio to MFCC features."""
    # Librosa expects audio in float32 or float64 format, normalized to [-1, 1].
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    return mfcc

def plot_mfcc(mfcc, save_path='mfcc.png'):
    """Plot the MFCC features as a spectrogram."""
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mfcc, x_axis='time', y_axis='mel', cmap='viridis')
    plt.colorbar(format='%+2.0f dB')
    plt.title('MFCC')
    plt.savefig(save_path)
    plt.close()

def classify_audio(mfcc):
    """Classify audio using the trained CNN model."""
    # Transform MFCC into a tensor suitable for CNN input
    mfcc_tensor = torch.tensor(mfcc).unsqueeze(0).unsqueeze(0).float().to(device)  # Adding batch and channel dimensions

    # Run the model
    with torch.no_grad():
        output = model(mfcc_tensor)
    
    # Get predicted class
    _, predicted = torch.max(output, 1)
    label = 'scream' if predicted.item() == 0 else 'non_scream'
    
    return label

def continuous_audio_detection():
    """Continuously capture audio and classify until a scream is detected."""
    while True:
        # Capture audio input
        audio_input = capture_audio()

        # Convert to MFCC
        mfcc = audio_to_mfcc(audio_input, sr=RATE)

        # Plot the MFCC spectrogram
        plot_mfcc(mfcc, save_path='live_mfcc.png')

        # Classify the audio input
        prediction = classify_audio(mfcc)
        print(f"Predicted Label: {prediction}")

        # If a scream is detected, stop the loop
        if prediction == 'scream':
            print("Scream detected! Stopping the loop.")
            break

# Start the continuous detection
continuous_audio_detection()


Recording...
Recording stopped.


RuntimeError: Given groups=1, weight of size [32, 3, 3, 3], expected input[1, 1, 13, 43] to have 3 channels, but got 1 channels instead