In [None]:
# for Colab - ignore when running locally

In [None]:
%cd ..
from google.colab import drive
drive.mount('/content/gdrive')
!ln -s /content/gdrive/My\ Drive/ /mydrive
!ls /mydrive

In [None]:
# Copy the dataset archive from the mounted Drive into /content.
!cp /content/gdrive/MyDrive/UrbanSound_mini.zip /content/

In [None]:
# Relative path: relies on the earlier `%cd ..` having left the kernel in the
# parent directory of `content`.
%cd content

In [None]:
!unzip UrbanSound_mini.zip

In [None]:
#importing libraries

In [None]:
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from pydub import AudioSegment
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
#configuration

In [None]:
# Root folder of the dataset; every sub-directory is treated as one sound class.
# NOTE(review): the archive unpacked earlier is UrbanSound_mini.zip and the
# file-inference cell reads from /content/UrbanSound_mini/... - confirm that
# this folder name matches what the archive actually extracts to.
DATASET_PATH = "./UrbanSound/data"
# Sub-directory whose clips are labelled 1; clips from all other classes get 0.
POSITIVE_CLASS = "gun_shot"
# Sample rate (Hz) every clip is resampled to before feature extraction.
SAMPLE_RATE = 22050
# Clip length in seconds; shorter clips are zero-padded, longer ones truncated.
DURATION = 2.0
# Number of MFCC coefficients; also the number of mel rows kept in the second
# feature channel.
MFCC_NUM = 40
# FFT window size passed to the librosa feature functions.
N_FFT = 2048
# Hop length (samples between successive frames) passed to librosa.
HOP_LENGTH = 512
# Number of mel bands computed for the mel spectrogram.
N_MELS = 128

In [None]:
# audio and feature extraction

In [None]:
def load_audio_file(file_path, target_sr=SAMPLE_RATE, duration=DURATION):
    """Decode an audio file into a fixed-length mono float32 waveform.

    The file is decoded with pydub, down-mixed to one channel, resampled to
    ``target_sr`` and scaled to the range [-1.0, 1.0). The result is then
    zero-padded at the end or truncated so it holds exactly
    ``int(target_sr * duration)`` samples.

    Args:
        file_path: Path of the audio file to decode.
        target_sr: Sample rate in Hz to resample to.
        duration: Length of the returned clip in seconds.

    Returns:
        ``(samples, target_sr)`` on success, or ``(None, None)`` if the file
        could not be decoded (the error is printed, not raised, so one bad
        file does not abort a whole dataset scan).
    """
    try:
        audio = AudioSegment.from_file(file_path)
        audio = audio.set_channels(1)
        audio = audio.set_frame_rate(target_sr)
        # Scale by the clip's real bit depth instead of assuming 16-bit PCM:
        # sample_width is in bytes, so full scale is 2 ** (8 * width - 1).
        # For 16-bit input this is exactly the previous 2 ** 15.
        full_scale = float(1 << (8 * audio.sample_width - 1))
        samples = np.array(audio.get_array_of_samples()).astype(np.float32) / full_scale
        target_length = int(target_sr * duration)
        if len(samples) < target_length:
            samples = np.pad(samples, (0, target_length - len(samples)))
        else:
            samples = samples[:target_length]
        return samples, target_sr
    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return None, None

def extract_features(audio, sr=SAMPLE_RATE, n_mfcc=MFCC_NUM, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS):
    """Build the two-channel feature map fed to the CNN.

    Channel 0 holds the MFCCs. Channel 1 holds the first ``n_mfcc`` rows of
    the dB-scaled mel spectrogram. Both are passed through
    ``librosa.util.normalize`` with its default settings before stacking.

    Args:
        audio: 1-D waveform.
        sr: Sample rate of ``audio`` in Hz.
        n_mfcc: Number of MFCC coefficients, and number of mel rows kept.
        n_fft: FFT window size.
        hop_length: Number of samples between successive frames.
        n_mels: Number of mel bands computed before the row slice.

    Returns:
        Array with the two feature maps stacked along a new last axis.
    """
    spectral_args = dict(y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length)

    mfcc_channel = librosa.util.normalize(
        librosa.feature.mfcc(n_mfcc=n_mfcc, **spectral_args)
    )

    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(n_mels=n_mels, **spectral_args),
        ref=np.max,
    )
    # Normalise over all mel bands first, then keep only as many rows as there
    # are MFCCs so that both channels have the same height.
    mel_channel = librosa.util.normalize(mel_db)[:n_mfcc, :]

    return np.stack([mfcc_channel, mel_channel], axis=-1)

In [None]:
#Load Dataset for Binary Classification

In [None]:
def load_dataset_binary(dataset_path, positive_class="gun_shot", max_files_per_class=None):
    """Load every class folder under ``dataset_path`` as a binary-labelled set.

    Each sub-directory of ``dataset_path`` is one sound class. Clips from the
    ``positive_class`` folder get label 1, clips from all other folders get
    label 0. Class folders and files are visited in sorted order so that the
    ``max_files_per_class`` subset and the sample order are the same on every
    run and machine (``os.listdir`` order is arbitrary).

    Args:
        dataset_path: Directory containing one sub-directory per class.
        positive_class: Name of the sub-directory to label as 1.
        max_files_per_class: Upper bound on files read per class; ``None``
            (or any falsy value) means no limit.

    Returns:
        ``(features, labels)`` as NumPy arrays. Files that fail to load or to
        convert are reported with ``print`` and skipped.
    """
    audio_extensions = ('.wav', '.mp3', '.aif', '.flac')
    features = []
    labels = []
    all_classes = sorted(
        d for d in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, d))
    )
    for class_name in all_classes:
        class_dir = os.path.join(dataset_path, class_name)
        # Case-insensitive match so files such as "clip.WAV" are not dropped.
        audio_files = sorted(
            f for f in os.listdir(class_dir)
            if f.lower().endswith(audio_extensions)
        )
        if max_files_per_class:
            audio_files = audio_files[:max_files_per_class]
        label = 1 if class_name == positive_class else 0
        for audio_file in tqdm(audio_files, desc=f"Processing {class_name}"):
            file_path = os.path.join(class_dir, audio_file)
            audio, sr = load_audio_file(file_path)
            if audio is None:
                # load_audio_file already reported the failure.
                continue
            try:
                feature = extract_features(audio, sr)
                features.append(feature)
                labels.append(label)
            except Exception as e:
                print(f"Error processing {file_path}: {str(e)}")
                continue
    return np.array(features), np.array(labels)

In [None]:
# Prepare Data

In [None]:
# Load features/labels for the whole dataset (capped per class) and carve out
# a stratified 20% test split.
print("Loading dataset...")
X, y = load_dataset_binary(DATASET_PATH, positive_class=POSITIVE_CLASS, max_files_per_class=200)
if len(X) == 0:
    raise ValueError("No data was loaded. Please check your dataset path and file formats.")
print(f"Loaded {len(X)} samples")
print(f"Feature shape: {X[0].shape}")

# Fail fast when a class is missing: if POSITIVE_CLASS matches no folder, every
# label is 0 and the model would "train" to a meaningless near-perfect score.
n_positive = int(np.sum(y == 1))
n_negative = len(y) - n_positive
print(f"Class balance: {n_positive} positive ('{POSITIVE_CLASS}'), {n_negative} negative")
if n_positive < 2 or n_negative < 2:
    raise ValueError(
        "Need at least 2 samples of each class for a stratified split; "
        f"check that POSITIVE_CLASS='{POSITIVE_CLASS}' names a folder under {DATASET_PATH}."
    )

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

In [None]:
# Build and Train the Model

In [None]:
def build_model(input_shape):
    """Assemble the (uncompiled) CNN used for gunshot / not-gunshot scoring.

    Architecture: three convolutional stages with 32, 64 and 128 filters. Each
    stage is two 3x3 'same'-padded ReLU convolutions, each followed by batch
    normalisation, then 2x2 max pooling and 25% dropout. These are followed by
    a flatten, two dense ReLU layers (256 and 128 units, each with batch
    normalisation and 50% dropout) and a single sigmoid output unit.

    Args:
        input_shape: Shape of one input sample, passed to the first layer.

    Returns:
        The ``Sequential`` model.
    """
    def conv_stage(filters, **first_conv_kwargs):
        # Only the very first convolution of the network carries input_shape.
        return [
            Conv2D(filters, (3, 3), activation='relu', padding='same', **first_conv_kwargs),
            BatchNormalization(),
            Conv2D(filters, (3, 3), activation='relu', padding='same'),
            BatchNormalization(),
            MaxPooling2D((2, 2)),
            Dropout(0.25),
        ]

    stack = conv_stage(32, input_shape=input_shape)
    for filters in (64, 128):
        stack += conv_stage(filters)

    stack.append(Flatten())
    for units in (256, 128):
        stack += [Dense(units, activation='relu'), BatchNormalization(), Dropout(0.5)]
    stack.append(Dense(1, activation='sigmoid'))

    return Sequential(stack)

# Every sample has the same shape, so the first one defines the network input.
input_shape = X_train[0].shape
model = build_model(input_shape)
# Binary cross-entropy matches the single sigmoid output unit.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
model.summary()

# Stop once validation loss has not improved for 15 epochs and roll back to the
# best weights; halve the learning rate after 5 stagnant epochs (floor 1e-6).
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

print("Training the model...")
# validation_split holds out 20% of X_train for the val_* metrics the
# callbacks monitor; the test split is not touched here.
# NOTE(review): no class_weight is passed - if the negative classes heavily
# outnumber the positive one, consider weighting; confirm against the class
# balance of the loaded data.
history = model.fit(
    X_train, y_train,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

In [None]:
# Evaluate and Visualize

In [None]:
# Score the trained model on the held-out test split.
print("Evaluating the model...")
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test accuracy: {test_accuracy:.4f}")

# Learning curves: one panel per tracked metric, training vs. validation.
curve_panels = [
    # (history key, panel title, y-axis label, legend position)
    ('accuracy', 'Model Accuracy', 'Accuracy', 'lower right'),
    ('loss', 'Model Loss', 'Loss', 'upper right'),
]
plt.figure(figsize=(12, 4))
for panel_idx, (metric, panel_title, y_label, legend_loc) in enumerate(curve_panels, start=1):
    plt.subplot(1, 2, panel_idx)
    for series in (metric, 'val_' + metric):
        plt.plot(history.history[series])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc=legend_loc)
plt.tight_layout()
plt.show()

from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Display names for label 0 and label 1, shared by the heatmap and the report.
class_names = ['Not Gunshot', 'Gunshot']

# Turn the sigmoid outputs into hard 0/1 decisions at a 0.5 threshold.
y_pred = model.predict(X_test)
y_pred_classes = (y_pred.ravel() > 0.5).astype(int)

cm = confusion_matrix(y_test, y_pred_classes)
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes, target_names=class_names))

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated confusion matrix for the held-out test split.
# Inputs are the ground-truth labels (y_test) and the thresholded 0/1
# predictions (y_pred_classes) computed in the evaluation cell. The raw
# sigmoid outputs in y_pred are continuous and are rejected by
# confusion_matrix.

# labels=[0, 1] pins the 2x2 layout even if one class is absent from the split,
# so the quadrant tags below always line up.
cm = confusion_matrix(y_test, y_pred_classes, labels=[0, 1])
# cm layout for binary classification:
# [[TN, FP],
#  [FN, TP]]

plt.figure(figsize=(6,5))
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                 xticklabels=['Not Gunshot', 'Gunshot'],
                 yticklabels=['Not Gunshot', 'Gunshot'])

# Tag each quadrant with TN / FP / FN / TP. The counts are drawn at the cell
# centres (col + 0.5, row + 0.5), so the tags sit a quarter cell above them
# instead of on top of the numbers.
quadrant_tags = [['TN', 'FP'],
                 ['FN', 'TP']]
for row, row_tags in enumerate(quadrant_tags):
    for col, tag in enumerate(row_tags):
        ax.text(col + 0.5, row + 0.25, tag,
                ha='center', va='center', color='darkorange', fontsize=14)

plt.title('Confusion Matrix with TP, FP, TN, FN')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

In [None]:
# save the model

In [None]:
# Persist the trained network in both formats; the later cells load one each.
model.save("gunshot_cnn_model.h5")  # HDF5 file, reloaded by the "loading model for inference" cell
model.save('gunshot_cnn_model.keras')  # newer .keras format, used by the audio-file inference cell

In [None]:
# loading model for inference : 

In [None]:
from tensorflow.keras.models import load_model
# Reload the HDF5 checkpoint from disk. This rebinds the global `model`, so the
# cells below run against the saved weights rather than the in-memory network.
model = load_model("gunshot_cnn_model.h5")

In [None]:
## for audio stream processing: [not tested] 

In [None]:
import queue

import sounddevice as sd
import numpy as np

DURATION = 2.0  # seconds
SAMPLE_RATE = 22050

# Captured blocks wait here until the main thread scores them. Keeping feature
# extraction and model.predict out of the audio callback matters because the
# callback runs on the real-time audio thread and must return quickly.
audio_blocks = queue.Queue()

def audio_callback(indata, frames, time, status):
    """Receive one captured block from sounddevice and queue it for scoring.

    Args:
        indata: Captured samples, shape (frames, channels).
        frames: Number of frames in ``indata``.
        time: Timing information supplied by sounddevice (unused).
        status: Flags reporting input overflow/underflow, printed if set.
    """
    if status:
        print(status)
    # Copy the mono channel: the buffer behind `indata` is only guaranteed to
    # be valid for the duration of the callback.
    audio_blocks.put(indata[:, 0].copy())

# Start streaming
try:
    with sd.InputStream(channels=1, samplerate=SAMPLE_RATE, callback=audio_callback, blocksize=int(SAMPLE_RATE * DURATION)):
        print("Listening for gunshots...")
        while True:
            # A blocking get with a timeout keeps the stream alive without
            # spinning a CPU core, and still lets an interrupt through.
            try:
                audio = audio_blocks.get(timeout=1.0)
            except queue.Empty:
                continue
            # Preprocess and extract features as in training
            features = extract_features(audio, SAMPLE_RATE)
            features = np.expand_dims(features, axis=0)
            # verbose=0: no progress bar for every 2-second block.
            prediction = model.predict(features, verbose=0)[0][0]
            if prediction > 0.5:
                print("Gunshot detected!")
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop listening.
    print("Stopped listening.")

In [None]:
# on audio file

In [None]:
import numpy as np
import librosa
from tensorflow.keras.models import load_model

# --- CONFIGURATION ---
# The audio/feature constants repeat the values from the training configuration
# cell so this cell can run on its own; they must stay identical to the values
# the model was trained with.
MODEL_PATH = "gunshot_cnn_model.keras"
AUDIO_PATH = "/content/UrbanSound_mini/data/jack_hammer/105029.wav"  # Change to your file
SAMPLE_RATE = 22050
DURATION = 2.0  # seconds
MFCC_NUM = 40
N_FFT = 2048
HOP_LENGTH = 512
N_MELS = 128
THRESHOLD = 0.5  # predicted probability above which the clip is reported as a gunshot

# --- LOAD MODEL ---
# Rebinds the global `model` to the network stored at MODEL_PATH.
model = load_model(MODEL_PATH)

# --- FEATURE EXTRACTION (same as training) ---
def extract_features(audio, sr=SAMPLE_RATE, n_mfcc=MFCC_NUM, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS):
    """Build the two-channel feature map expected by the trained CNN.

    Repeats the training-time extractor so this cell is self-contained; keep
    the two definitions in sync. Channel 0 holds the MFCCs. Channel 1 holds
    the first ``n_mfcc`` rows of the dB-scaled mel spectrogram. Both are
    passed through ``librosa.util.normalize`` with its default settings.

    Args:
        audio: 1-D waveform.
        sr: Sample rate of ``audio`` in Hz.
        n_mfcc: Number of MFCC coefficients, and number of mel rows kept.
        n_fft: FFT window size.
        hop_length: Number of samples between successive frames.
        n_mels: Number of mel bands computed before the row slice.

    Returns:
        Array with the two feature maps stacked along a new last axis.
    """
    spectral_args = dict(y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length)

    mfcc_channel = librosa.util.normalize(
        librosa.feature.mfcc(n_mfcc=n_mfcc, **spectral_args)
    )

    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(n_mels=n_mels, **spectral_args),
        ref=np.max,
    )
    # Normalise over all mel bands first, then keep only as many rows as there
    # are MFCCs so that both channels have the same height.
    mel_channel = librosa.util.normalize(mel_db)[:n_mfcc, :]

    return np.stack([mfcc_channel, mel_channel], axis=-1)

# --- LOAD AND PREPROCESS AUDIO FILE ---
# Decode as mono at the training sample rate, then force the clip to exactly
# DURATION seconds: zero-pad the tail if it is short, cut it if it is long.
audio, sr = librosa.load(AUDIO_PATH, sr=SAMPLE_RATE, mono=True)
target_length = int(SAMPLE_RATE * DURATION)
missing_samples = target_length - len(audio)
audio = np.pad(audio, (0, missing_samples)) if missing_samples > 0 else audio[:target_length]

# The model expects a leading batch axis.
features = extract_features(audio, SAMPLE_RATE)[np.newaxis, ...]

# --- INFERENCE ---
prob = model.predict(features)[0][0]
verdict = (
    f"Gunshot detected! (confidence: {prob:.2f})"
    if prob > THRESHOLD
    else f"Not gunshot (confidence: {1-prob:.2f})"
)
print(verdict)

In [None]:
# More (related notebook):
# https://github.com/hasnainnaeem/Gunshot-Detection-in-Audio/blob/master/US8K-Binary%20Visualization%2C%20Training%20%26%20Predictions-updated.ipynb