In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install python_speech_features

Collecting python_speech_features
  Downloading python_speech_features-0.6.tar.gz (5.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: python_speech_features
  Building wheel for python_speech_features (setup.py) ... [?25l[?25hdone
  Created wheel for python_speech_features: filename=python_speech_features-0.6-py3-none-any.whl size=5867 sha256=2d3849129fd80f3dff07f3741691ac790dfc45cafaec2560609ce893b408b672
  Stored in directory: /root/.cache/pip/wheels/5a/9e/68/30bad9462b3926c29e315df16b562216d12bdc215f4d240294
Successfully built python_speech_features
Installing collected packages: python_speech_features
Successfully installed python_speech_features-0.6


In [4]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# Define the CNN model for spectrogram images
class SpectrogramCNN(nn.Module):
    def __init__(self, num_classes):
        super(SpectrogramCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * 32 * 32, 256)  # Adjust the size based on your input image dimensions
        self.fc2 = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

# Load the augmented spectrogram images
def load_spectrogram_images(directory, transform=None):
    dataset = datasets.ImageFolder(root=directory, transform=transform)
    return dataset

# Define transformations for the spectrogram images
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),  # Convert images to grayscale
    transforms.Resize((128, 128)),  # Resize images to a fixed size
    transforms.ToTensor(),  # Convert images to PyTorch tensors
])

# Load the dataset
directory = "/content/drive/MyDrive/Augumented Spectogram"
dataset = load_spectrogram_images(directory, transform=transform)

# Split the dataset into training and testing sets
train_size = int(0.67 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

# Encode the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(dataset.targets)

# Define the neural network hyperparameters
num_classes = len(dataset.classes)

# Create the CNN model
model = SpectrogramCNN(num_classes)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Learning rate scheduler
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

# Train the CNN model
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)

    scheduler.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader.dataset):.4f}")

# Save the model after training
model_save_path = "/content/drive/MyDrive/cnn_model.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

# Evaluate the CNN model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print("Accuracy:", accuracy)


Epoch [1/50], Loss: 2.2169
Epoch [2/50], Loss: 1.9761
Epoch [3/50], Loss: 1.8484
Epoch [4/50], Loss: 1.7462
Epoch [5/50], Loss: 1.6824
Epoch [6/50], Loss: 1.5917
Epoch [7/50], Loss: 1.5470
Epoch [8/50], Loss: 1.5314
Epoch [9/50], Loss: 1.4437
Epoch [10/50], Loss: 1.3951
Epoch [11/50], Loss: 1.3452
Epoch [12/50], Loss: 1.2954
Epoch [13/50], Loss: 1.2495
Epoch [14/50], Loss: 1.1963
Epoch [15/50], Loss: 1.1308
Epoch [16/50], Loss: 1.1263
Epoch [17/50], Loss: 1.1078
Epoch [18/50], Loss: 1.0622
Epoch [19/50], Loss: 1.0046
Epoch [20/50], Loss: 0.9477
Epoch [21/50], Loss: 0.8896
Epoch [22/50], Loss: 0.8888
Epoch [23/50], Loss: 0.8378
Epoch [24/50], Loss: 0.8292
Epoch [25/50], Loss: 0.7603
Epoch [26/50], Loss: 0.7306
Epoch [27/50], Loss: 0.6978
Epoch [28/50], Loss: 0.6854
Epoch [29/50], Loss: 0.6494
Epoch [30/50], Loss: 0.6124
Epoch [31/50], Loss: 0.5925
Epoch [32/50], Loss: 0.5280
Epoch [33/50], Loss: 0.4929
Epoch [34/50], Loss: 0.4571
Epoch [35/50], Loss: 0.4525
Epoch [36/50], Loss: 0.4220
E

In [6]:
import torch
import torch.nn as nn
from torchvision import transforms
from PIL import Image

class SpectrogramCNN(nn.Module):
    def __init__(self, num_classes):
        super(SpectrogramCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * 32 * 32, 256)
        self.fc2 = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

# Load the model
num_classes = 10  # Replace with your number of classes
model = SpectrogramCNN(num_classes)
model_load_path = "/content/drive/MyDrive/cnn_model.pth"
model.load_state_dict(torch.load(model_load_path))
model.eval()

# Define the image transformation
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

# Load and transform the image
image_path = "/content/drive/MyDrive/Augumented Spectogram/jazz/flipped_jazz.00004.png"
image = Image.open(image_path)
image = transform(image)
image = image.unsqueeze(0)  # Add a batch dimension

# Move to device
image = image.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# Make a prediction
with torch.no_grad():
    output = model(image)
    _, predicted_class = torch.max(output, 1)

# Assuming you have the label encoder from training
labels = dataset.classes  # Replace with your label list
predicted_label = labels[predicted_class.item()]
print(f"Predicted label: {predicted_label}")


Predicted label: jazz


  model.load_state_dict(torch.load(model_load_path))
