In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Define the model architecture
class SoundCNN(nn.Module):
    """Small convolutional classifier for single-channel 2-D inputs.

    Architecture: three 3x3 convolutions (1 -> 16 -> 32 -> 64 channels), each
    followed by ReLU, with 2x2 max-pooling after the first two; global average
    pooling down to 1x1; then a 64 -> 128 -> ``num_classes`` fully connected
    head with dropout before each linear layer.

    Because the spatial dimensions are collapsed by adaptive pooling, the
    input height and width are not fixed by the architecture (they only have
    to be large enough to survive the convolutions and pools).

    The submodule attribute names (``conv1`` ... ``fc2``) are the keys of the
    model's ``state_dict``; they must not be renamed or existing checkpoints
    will no longer load.

    Args:
        num_classes: Number of output scores. Defaults to 50, the value that
            was previously hard-coded.
        dropout: Dropout probability. Defaults to 0.3, the value that was
            previously hard-coded.
    """

    def __init__(self, num_classes=50, dropout=0.3):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3)
        self.dropout = nn.Dropout(dropout)
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc1 = nn.Linear(64 * 1 * 1, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return raw class scores of shape (batch, num_classes).

        Args:
            x: Input tensor of shape (batch, 1, height, width).
        """
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
        x = F.relu(self.conv3(x))
        x = self.global_pool(x)
        x = self.dropout(x)
        # Collapse (batch, 64, 1, 1) to (batch, 64); flatten does not require
        # the tensor to be contiguous the way view does.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)

# Load the model and weights.
# - map_location='cpu' lets a checkpoint that was saved from a GPU run load on
#   a machine without CUDA.
# - weights_only=True restricts unpickling to tensors and plain containers, so
#   a tampered .pth file cannot execute arbitrary code on load.
model = SoundCNN()
model.load_state_dict(
    torch.load('final_model.pth', map_location='cpu', weights_only=True)
)
model.eval()  # Set the model to evaluation mode


  model.load_state_dict(torch.load('final_model.pth'))


SoundCNN(
  (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (dropout): Dropout(p=0.3, inplace=False)
  (global_pool): AdaptiveAvgPool2d(output_size=(1, 1))
  (fc1): Linear(in_features=64, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=50, bias=True)
)

In [14]:
import librosa
import numpy as np
import torch



In [15]:
# Define the feature extraction method (same as training)
def extract_features(data, sr, max_frames=512, n_mels=128):
    """Convert a waveform into a fixed-width, standardised log-mel spectrogram.

    Args:
        data: Audio samples, passed to librosa as ``y``.
        sr: Sampling rate of ``data`` in Hz.
        max_frames: Number of time frames (columns) in the result; shorter
            spectrograms are padded and longer ones are truncated.
        n_mels: Number of mel bands requested from librosa. Defaults to 128,
            the value that was previously hard-coded.

    Returns:
        2-D array with ``max_frames`` columns, shifted to zero mean and, when
        the spectrogram is not constant, scaled to unit standard deviation.
    """
    mel_power = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=n_mels)
    # Decibels relative to the largest value in the spectrogram.
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    mel_db = pad_or_truncate_spectrogram(mel_db, max_frames)

    centred = mel_db - np.mean(mel_db)
    spread = np.std(mel_db)
    if spread == 0:
        # Constant spectrogram (e.g. digital silence): dividing by zero would
        # fill the features with NaNs, so return the centred values as-is.
        return centred
    return centred / spread

# Padding or truncating the spectrogram to a fixed size
def pad_or_truncate_spectrogram(spectrogram, max_frames):
    """Force a 2-D spectrogram to exactly ``max_frames`` columns.

    Columns beyond ``max_frames`` are dropped; if there are fewer, constant
    padding is appended on the right of the second axis. The first axis is
    never modified.
    """
    missing = max_frames - spectrogram.shape[1]
    if missing > 0:
        # NOTE(review): np.pad's constant mode fills with 0. extract_features
        # passes dB values computed with ref=np.max, where 0 is the peak
        # level, so the padding is as "loud" as the loudest bin. Confirm this
        # matches what the model saw during training before changing it.
        return np.pad(spectrogram, ((0, 0), (0, missing)), mode='constant')
    return spectrogram[:, :max_frames]



In [16]:


# Clip to classify
audio_file = 'coding-fast-typing-on-keyboard-sound-247411.mp3'  # Replace with your audio file path

# Decode the clip; sr=22050 is librosa's default target rate, written out so
# it is obvious that the value printed below is the rate the samples were
# loaded at.
audio_data, sampling_rate = librosa.load(audio_file, sr=22050)

# Print the sampling rate
print(f"Sampling Rate: {sampling_rate} Hz")

Sampling Rate: 22050 Hz


In [17]:

# Decode the clip at 22050 Hz and turn it into a standardised mel-spectrogram
data, sr = librosa.load(path=audio_file, sr=22050)
features = extract_features(data, sr)
# Prepend two singleton axes -> (1, 1, rows, cols): a batch of one,
# single-channel input as expected by the model's first Conv2d(1, ...) layer
features = features[np.newaxis, np.newaxis, :, :]

In [18]:
# Convert the 4-D NumPy feature array built in the previous cell into a
# float32 tensor that can be fed to the model.
features_tensor = torch.tensor(features, dtype=torch.float32)

In [19]:
import pandas as pd

# Load the dataset metadata table (presumably the ESC-50 index, going by the
# file name). Its 'target' and 'category' columns are used in the inference
# cell to translate a predicted class id into a readable label.
metadata = pd.read_csv('archive-3/esc50.csv')


In [20]:
# Perform inference; no_grad skips autograd bookkeeping for a forward-only pass
with torch.no_grad():
    output = model(features_tensor)
    # Index of the highest score along the class dimension. argmax is used
    # instead of `_, predicted = torch.max(output, 1)` because binding `_` in
    # a notebook shadows IPython's "last output" variable.
    predicted = output.argmax(dim=1)

# Map the predicted label to the corresponding category
decoder = dict(zip(metadata['target'], metadata['category']))
# .item() requires a single-element tensor, i.e. a batch of exactly one clip
predicted_category = decoder[predicted.item()]
print(f'Predicted Category: {predicted_category}')


Predicted Category: keyboard_typing
