In [None]:
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

# Directory containing the input WAV files
audio_dir = '/content/drive/MyDrive/Testing'

# Directory that receives one PNG spectrogram per WAV file
spectrogram_dir = '/content/drive/MyDrive'

# Create the output directory if it doesn't exist
os.makedirs(spectrogram_dir, exist_ok=True)

# Walk the audio directory in sorted order so repeated runs process files
# in the same sequence.
for file in sorted(os.listdir(audio_dir)):
    # Compare the extension case-insensitively so '.WAV' files are not skipped.
    if not file.lower().endswith('.wav'):
        continue

    try:
        # Load the audio file (librosa.load is called with its default settings)
        audio_path = os.path.join(audio_dir, file)
        audio, sr = librosa.load(audio_path)

        # Mel spectrogram converted to decibels relative to its own maximum
        spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
        spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)

        # Plot and save the spectrogram image. The figure is closed in a
        # `finally` so a failure while drawing or saving does not leave open
        # figures accumulating across the loop.
        fig = plt.figure(figsize=(10, 4))
        try:
            librosa.display.specshow(spectrogram_db, sr=sr, x_axis='time', y_axis='mel')
            plt.colorbar(format='%+2.0f dB')
            plt.title('Spectrogram - {}'.format(file))
            plt.tight_layout()
            output_name = '{}.png'.format(os.path.splitext(file)[0])
            plt.savefig(os.path.join(spectrogram_dir, output_name))
        finally:
            plt.close(fig)

    except (librosa.LibrosaError, FileNotFoundError) as e:
        # Report the failure and continue with the next file.
        print(f"Error processing file '{file}': {str(e)}")


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import transforms
from PIL import Image
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Root folder of the spectrogram images (one sub-folder per genre is expected
# by SpectrogramDataset below)
spectrogram_dir = '/content/drive/MyDrive/Deep Learning/Augumented Spectogram'

# Target musical genres
genres = [
    'blues', 'classical', 'country', 'disco', 'hiphop',
    'jazz', 'metal', 'pop', 'reggae', 'rock',
]

# Preprocessing pipeline applied to every image:
# resize to 224x224, convert to a tensor, then normalise each channel.
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Custom dataset class for loading the spectrogram images
class SpectrogramDataset(Dataset):
    """Dataset of spectrogram images stored as ``<spectrogram_dir>/<genre>/<file>``.

    Args:
        spectrogram_dir: Root directory containing one sub-directory per genre.
        genres: Genre names; each is both a sub-directory name and a class label.
        transform: Optional callable applied to the RGB PIL image of every sample.

    Attributes:
        image_files: File names (without directory) of all collected images.
        labels: Genre name of each entry in ``image_files`` (same order).
        encoded_labels: Integer class id of each entry, produced once by
            ``label_encoder``.
        label_encoder: ``LabelEncoder`` fitted on ``genres``.

    Genres whose directory does not exist are skipped without error.
    """

    # Only entries with one of these extensions are treated as samples, so
    # stray entries in a genre folder (hidden files, checkpoint folders, text
    # notes) cannot reach Image.open and abort an epoch.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff', '.webp')

    def __init__(self, spectrogram_dir, genres, transform=None):
        self.spectrogram_dir = spectrogram_dir
        self.genres = genres
        self.transform = transform
        self.image_files = []
        self.labels = []

        # Initialize label encoder
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(genres)

        # Collect the image files of every genre folder
        for genre in genres:
            genre_dir = os.path.join(spectrogram_dir, genre)
            if not os.path.isdir(genre_dir):
                continue
            # Sorted so the sample order does not depend on the file system.
            files = sorted(
                name for name in os.listdir(genre_dir)
                if name.lower().endswith(self.IMAGE_EXTENSIONS)
            )
            self.image_files.extend(files)
            self.labels.extend([genre] * len(files))

        # Encode all labels once here instead of calling transform() for
        # every sample in __getitem__.
        if self.labels:
            self.encoded_labels = list(self.label_encoder.transform(self.labels))
        else:
            self.encoded_labels = []

    def __len__(self):
        """Return the number of collected image files."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        ``image`` is the RGB image after ``transform`` (if one was given);
        ``label`` is the integer class id from ``label_encoder``.
        """
        img_path = os.path.join(self.spectrogram_dir, self.labels[idx], self.image_files[idx])
        # Context manager closes the underlying file; convert() returns an
        # independent, fully loaded image.
        with Image.open(img_path) as source_image:
            image = source_image.convert('RGB')
        if self.transform:
            image = self.transform(image)

        label = self.encoded_labels[idx]

        return image, label

# Build the dataset over the genre folders, then a shuffling loader that
# yields batches of 32 samples.
dataset = SpectrogramDataset(
    spectrogram_dir=spectrogram_dir,
    genres=genres,
    transform=transform,
)
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

# Define the ResNet-style CNN (a plain convolutional stack; no residual/skip connections)
class ResNet(nn.Module):
    """Convolutional image classifier laid out like a small ResNet.

    The network is a 7x7 stem convolution with max pooling, four stages of
    3x3 conv + batch-norm + ReLU units (64, 128, 256 and 512 channels), a
    global average pool and a linear classifier. Despite the name, the stages
    are purely sequential: no skip connections are added in ``forward``.

    Args:
        num_classes: Size of the output layer.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Stem
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Stages: (input channels, output channels, stride of the first conv).
        # They are registered as layer1..layer4, in this order.
        stage_plan = [(64, 64, 1), (64, 128, 2), (128, 256, 2), (256, 512, 2)]
        for number, (c_in, c_out, first_stride) in enumerate(stage_plan, start=1):
            setattr(self, f'layer{number}', self._make_layer(c_in, c_out, 2, stride=first_stride))

        # Head
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, in_channels, out_channels, blocks, stride=1):
        """Build one stage of ``blocks`` conv/batch-norm/ReLU units.

        Only the first unit changes the channel count and applies ``stride``;
        the remaining units keep ``out_channels`` and use stride 1.
        """

        def unit(c_in, conv_stride):
            return [
                nn.Conv2d(c_in, out_channels, kernel_size=3, stride=conv_stride, padding=1, bias=False),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
            ]

        modules = unit(in_channels, stride)
        for _ in range(blocks - 1):
            modules.extend(unit(out_channels, 1))

        return nn.Sequential(*modules)

    def forward(self, x):
        """Return the class logits for a batch of 3-channel images ``x``."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        x = torch.flatten(self.avgpool(x), 1)
        return self.fc(x)

# One output unit per genre
num_classes = len(genres)

# Create the ResNet model
model = ResNet(num_classes)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Train the model
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fail early with a readable message: with no samples the loop below would
# do nothing and the per-epoch averages would divide by zero.
if len(dataset) == 0:
    raise RuntimeError(
        f"No spectrogram images found under '{spectrogram_dir}'; nothing to train on."
    )

# Make the training mode explicit so batch-norm uses batch statistics even if
# the model was switched to eval mode elsewhere.
model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for images, labels in dataloader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Track the loss; the batch mean is weighted by batch size so the
        # last, possibly smaller, batch is averaged correctly.
        batch_size = labels.size(0)
        running_loss += loss.item() * batch_size

        # Track the accuracy (detached: no gradient is needed for the metric)
        predicted_labels = outputs.detach().argmax(dim=1)
        total_predictions += batch_size
        correct_predictions += (predicted_labels == labels).sum().item()

    # Both metrics are computed on the training batches themselves; no
    # held-out data is evaluated here.
    epoch_loss = running_loss / total_predictions
    epoch_accuracy = (correct_predictions / total_predictions) * 100

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

# Save the trained model
torch.save(model.state_dict(), '/content/drive/MyDrive/Deep Learning/resnetmodel.pth')


Epoch [1/10], Loss: 1.9401, Accuracy: 31.88%
Epoch [2/10], Loss: 1.3877, Accuracy: 57.76%
Epoch [3/10], Loss: 0.9924, Accuracy: 74.32%
Epoch [4/10], Loss: 0.5324, Accuracy: 89.54%
Epoch [5/10], Loss: 0.3278, Accuracy: 94.79%
Epoch [6/10], Loss: 0.1701, Accuracy: 98.50%
Epoch [7/10], Loss: 0.1045, Accuracy: 99.55%
Epoch [8/10], Loss: 0.0696, Accuracy: 99.90%
Epoch [9/10], Loss: 0.0468, Accuracy: 100.00%
Epoch [10/10], Loss: 0.0384, Accuracy: 99.95%


In [3]:
import torch
import torch.nn as nn
from torchvision.transforms import transforms
from PIL import Image

# Genre names indexed by predicted class id.
# NOTE(review): position i must match the class id the training LabelEncoder
# assigned to that genre; confirm against `label_encoder.classes_`.
genre_names = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
num_classes = len(genre_names)

# Rebuild the network and load the trained weights.
# `ResNet` is the class defined in the training cell, which must have been run.
# map_location lets a checkpoint saved from a CUDA model load on a CPU-only
# runtime; inference below runs on the CPU.
model = ResNet(num_classes)
model.load_state_dict(
    torch.load(
        '/content/drive/MyDrive/Deep Learning/resnetmodel.pth',
        map_location=torch.device('cpu'),
    )
)
model.eval()

# Define the preprocessing transformations (same steps as the training transform)
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load the input image as RGB; the context manager closes the file handle.
with Image.open('/content/drive/MyDrive/01 Who R U.png') as source_image:  # Replace with the path to your input image
    image = source_image.convert('RGB')

# Preprocess and add a batch dimension
preprocessed_image = preprocess(image).unsqueeze(0)

# Make predictions
with torch.no_grad():
    output = model(preprocessed_image)
    predicted_class = output.argmax(dim=1)

# Get the predicted genre name
predicted_genre = genre_names[predicted_class.item()]

print(predicted_genre)


hiphop
