In [None]:
# Mount Google Drive inside the Colab runtime so that the paths under
# /content/drive used later in this notebook (dataset folder, saved model)
# are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install python_speech_features

Collecting python_speech_features
  Downloading python_speech_features-0.6.tar.gz (5.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: python_speech_features
  Building wheel for python_speech_features (setup.py) ... [?25l[?25hdone
  Created wheel for python_speech_features: filename=python_speech_features-0.6-py3-none-any.whl size=5867 sha256=21f1b50174a068da8c821cebdfc27aab3c74b138ed036df7e8c49138aabff349
  Stored in directory: /root/.cache/pip/wheels/5a/9e/68/30bad9462b3926c29e315df16b562216d12bdc215f4d240294
Successfully built python_speech_features
Installing collected packages: python_speech_features
Successfully installed python_speech_features-0.6


In [None]:
import os
import numpy as np
import scipy.io.wavfile as wav
from python_speech_features import mfcc
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Pick the compute device: CUDA when a GPU is visible to PyTorch, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Define the CRNN model
class CRNN(nn.Module):
    """Convolutional-recurrent classifier over feature sequences.

    Two Conv1d -> ReLU -> MaxPool1d stages run along the time axis, a
    bidirectional LSTM reads the pooled sequence, and a linear layer maps the
    LSTM output at the final time step to one score per class.

    Args:
        input_size: Number of features per time step (Conv1d input channels).
        num_filters: Output channels of the first convolution; the second
            convolution produces twice as many.
        rnn_hidden_size: Hidden size of each LSTM direction.
        output_size: Number of scores produced per sample.
    """

    def __init__(self, input_size, num_filters, rnn_hidden_size, output_size):
        super().__init__()
        conv_channels = num_filters * 2
        self.conv1 = nn.Conv1d(input_size, num_filters, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(num_filters, conv_channels, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.rnn = nn.LSTM(conv_channels, rnn_hidden_size, batch_first=True, bidirectional=True)
        # Both LSTM directions are concatenated, hence twice the hidden size
        self.fc = nn.Linear(2 * rnn_hidden_size, output_size)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return class scores for a batch shaped (batch, seq_len, features).

        Returns:
            Tensor of shape (batch, output_size) holding unnormalised scores.
        """
        # Conv1d expects channels first: (batch, seq_len, features) -> (batch, features, seq_len)
        feats = x.transpose(1, 2)
        # Each pooling stage halves the time axis
        feats = self.pool(torch.relu(self.conv1(feats)))
        feats = self.pool(torch.relu(self.conv2(feats)))

        # Back to (batch, steps, channels) for the batch_first LSTM
        seq_out, _ = self.rnn(feats.transpose(1, 2))
        # Output at the final time step.
        # NOTE(review): for the backward direction this position is the first
        # step it processes, so that half of the vector has seen one step only.
        last_step = seq_out[:, -1, :]

        return self.fc(self.dropout(last_step))

# Load the audio dataset
def loadDataset(directory, max_folders):
    """Extract MFCC features from WAV files grouped in per-label sub-folders.

    Args:
        directory: Root directory. Every immediate sub-directory is treated as
            one label, and its name is used as the label string.
        max_folders: Maximum number of label sub-directories to read.

    Returns:
        A list of ``(mfcc_features, label)`` tuples, one per file that could
        be read, where ``mfcc_features`` is the array returned by ``mfcc`` and
        ``label`` is the name of the containing sub-directory.
    """
    dataset = []
    # Only real sub-directories are labels; stray files in the root (for
    # example a top-level .DS_Store) are ignored instead of crashing
    # os.listdir below or using up the max_folders budget. Sorting keeps the
    # selected folders and the sample order independent of the arbitrary
    # order os.listdir returns.
    folders = sorted(
        name for name in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, name))
    )
    for count, folder in enumerate(folders, start=1):
        if count > max_folders:
            break
        folder_path = os.path.join(directory, folder)
        for file in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file)
            if file == ".DS_Store" or os.path.isdir(file_path):
                continue
            try:
                (rate, sig) = wav.read(file_path)
                mfcc_feat = mfcc(sig, rate, winlen=0.020, appendEnergy=False)
                dataset.append((mfcc_feat, folder))
            except ValueError as e:
                # Best effort: report the unreadable file and carry on
                print(f"Skipping file '{file_path}': {str(e)}")
    return dataset

# Read every genre folder from the mounted Drive into (mfcc, label) pairs
directory = "/content/drive/MyDrive/Deep Learning/genres_original"
dataset = loadDataset(directory, max_folders=100)

# Longest MFCC sequence (number of frames) among all loaded clips
max_length = max(features.shape[0] for features, _ in dataset)

# Pad or truncate sequences to the maximum length
def pad_or_truncate(feature, max_length):
    """Force a 2-D feature array to exactly ``max_length`` rows.

    Args:
        feature: Array of shape (n_frames, n_features).
        max_length: Required number of rows in the result.

    Returns:
        ``feature`` itself when it already has ``max_length`` rows, its first
        ``max_length`` rows when it is longer, or a new zero-filled float
        array with ``feature`` copied into the leading rows when it is shorter.
    """
    n_frames = feature.shape[0]
    if n_frames == max_length:
        return feature
    if n_frames > max_length:
        return feature[:max_length]
    # Shorter than requested: zero-pad at the end
    out = np.zeros((max_length, feature.shape[1]))
    out[:n_frames] = feature
    return out

# Stack the clips into one array (every clip padded to max_length frames)
# together with a parallel array of label strings
X = np.array([pad_or_truncate(features, max_length) for features, _ in dataset])
y = np.array([label for _, label in dataset])

# Standardise each clip on its own: the scaler is re-fitted for every clip
scaler = StandardScaler()
X_scaled = np.array([scaler.fit_transform(clip) for clip in X])

# Map the label strings to integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Build the PyTorch tensors directly on the selected device
X_tensor = torch.tensor(X_scaled, dtype=torch.float32, device=device)
y_tensor = torch.tensor(y_encoded, dtype=torch.long, device=device)

# Hold out 33% of the clips for evaluation; random_state pins the split
X_train, X_test, y_train, y_test = train_test_split(
    X_tensor,
    y_tensor,
    test_size=0.33,
    random_state=1,
)

# Wrap both partitions as datasets of (features, label) pairs
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Mini-batches of 32: training batches are reshuffled every epoch, the test
# order stays fixed
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Seed PyTorch's global RNG before anything random happens so that weight
# initialisation and DataLoader shuffling are repeatable, in line with the
# fixed random_state=1 used for the train/test split.
torch.manual_seed(1)

# Define the neural network hyperparameters
input_size = X_scaled[0].shape[1]  # Number of features (13 for MFCC)
num_filters = 32  # Reduced for faster training
rnn_hidden_size = 64  # Reduced for faster training
output_size = len(np.unique(y_encoded))  # One output score per distinct label

# Create the CRNN model on the selected device
model = CRNN(input_size, num_filters, rnn_hidden_size, output_size).to(device)

# Cross-entropy on the raw class scores, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Optimise the CRNN for a fixed number of passes over the training set
num_epochs = 50  # Reduced for faster experimentation
for epoch in range(num_epochs):
    model.train()  # training mode, so dropout is active
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass and loss for this mini-batch
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backpropagate and update the weights
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean mini-batch loss of the epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}")

# Persist the trained weights (state_dict only, not the model object) to Drive
model_save_path = "/content/drive/MyDrive/Deep Learning/crnn_model.pth"
torch.save(obj=model.state_dict(), f=model_save_path)
print(f"Model saved to {model_save_path}")

# Measure classification accuracy on the held-out test set
model.eval()  # evaluation mode, so dropout is disabled
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        # Predicted class = index of the highest score in each row
        predicted = outputs.argmax(dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    # Fraction of test clips whose predicted class matches the label
    accuracy = correct / total
    print("Accuracy:", accuracy)


Epoch [1/50], Loss: 2.2719
Epoch [2/50], Loss: 2.0430
Epoch [3/50], Loss: 1.7489
Epoch [4/50], Loss: 1.5983
Epoch [5/50], Loss: 1.4840
Epoch [6/50], Loss: 1.4081
Epoch [7/50], Loss: 1.4039
Epoch [8/50], Loss: 1.3462
Epoch [9/50], Loss: 1.2623
Epoch [10/50], Loss: 1.2477
Epoch [11/50], Loss: 1.2159
Epoch [12/50], Loss: 1.2023
Epoch [13/50], Loss: 1.1978
Epoch [14/50], Loss: 1.1491
Epoch [15/50], Loss: 1.1196
Epoch [16/50], Loss: 1.1578
Epoch [17/50], Loss: 1.0827
Epoch [18/50], Loss: 1.0502
Epoch [19/50], Loss: 1.0371
Epoch [20/50], Loss: 1.0259
Epoch [21/50], Loss: 0.9903
Epoch [22/50], Loss: 0.9470
Epoch [23/50], Loss: 0.9843
Epoch [24/50], Loss: 0.9525
Epoch [25/50], Loss: 0.9522
Epoch [26/50], Loss: 0.9239
Epoch [27/50], Loss: 0.9303
Epoch [28/50], Loss: 0.8749
Epoch [29/50], Loss: 0.8503
Epoch [30/50], Loss: 0.9045
Epoch [31/50], Loss: 0.9155
Epoch [32/50], Loss: 0.8782
Epoch [33/50], Loss: 0.8124
Epoch [34/50], Loss: 0.8192
Epoch [35/50], Loss: 0.8059
Epoch [36/50], Loss: 0.7781
E

In [None]:
import torch
import numpy as np
import scipy.io.wavfile as wav
from python_speech_features import mfcc
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from torchvision import transforms

# Define the CRNN model (same as used during training)
class CRNN(nn.Module):
    """CRNN classifier rebuilt for inference.

    The layer names and sizes mirror the definition used for training, so a
    state_dict saved from that model can be loaded into this one.

    Args:
        input_size: Number of features per time step (Conv1d input channels).
        num_filters: Output channels of the first convolution; the second
            convolution produces twice as many.
        rnn_hidden_size: Hidden size of each LSTM direction.
        output_size: Number of scores produced per sample.
    """

    def __init__(self, input_size, num_filters, rnn_hidden_size, output_size):
        super().__init__()
        doubled_filters = 2 * num_filters
        self.conv1 = nn.Conv1d(
            in_channels=input_size, out_channels=num_filters, kernel_size=3, padding=1
        )
        self.conv2 = nn.Conv1d(
            in_channels=num_filters, out_channels=doubled_filters, kernel_size=3, padding=1
        )
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.rnn = nn.LSTM(
            input_size=doubled_filters,
            hidden_size=rnn_hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward LSTM outputs are concatenated
        self.fc = nn.Linear(2 * rnn_hidden_size, output_size)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return class scores for a batch shaped (batch, seq_len, features).

        Returns:
            Tensor of shape (batch, output_size) holding unnormalised scores.
        """
        relu = nn.functional.relu

        # (batch, seq_len, features) -> (batch, features, seq_len) for Conv1d
        h = x.permute(0, 2, 1)
        for conv in (self.conv1, self.conv2):
            # conv -> ReLU -> pool; every pool halves the time axis
            h = self.pool(relu(conv(h)))

        # (batch, channels, steps) -> (batch, steps, channels) for the LSTM
        h = h.permute(0, 2, 1)
        h, _ = self.rnn(h)
        h = h[:, -1, :]  # LSTM output at the final time step

        h = self.dropout(h)
        return self.fc(h)

# Rebuild the CRNN with the hyperparameters used for training and load the
# saved weights into it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size = 13  # Number of MFCC features
num_filters = 32
rnn_hidden_size = 64
output_size = len(label_encoder.classes_)  # Number of classes
model = CRNN(input_size, num_filters, rnn_hidden_size, output_size).to(device)

model_load_path = "/content/drive/MyDrive/Deep Learning/crnn_model.pth"
# map_location lets a checkpoint saved on a GPU runtime load on a CPU-only one.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict holds, and avoids torch.load's FutureWarning
# about the implicit pickle default.
state_dict = torch.load(model_load_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.eval()  # inference mode, so dropout is disabled

# Define preprocessing and transformation functions
def preprocess_audio(file_path, max_length=1000):
    """Read a WAV file and return its MFCC matrix with a fixed frame count.

    Args:
        file_path: Path of the WAV file to read.
        max_length: Number of frames in the result; longer clips are cut and
            shorter clips are zero-padded. Defaults to 1000.
            NOTE(review): presumably this should equal the sequence length
            the model was trained with; confirm against the training cell.

    Returns:
        Array with ``max_length`` rows of MFCC coefficients.
    """
    sample_rate, signal = wav.read(file_path)
    # Same MFCC settings as the dataset loader: 20 ms windows, no energy term
    features = mfcc(signal, sample_rate, winlen=0.020, appendEnergy=False)
    return pad_or_truncate(features, max_length)

def pad_or_truncate(feature, max_length):
    """Return ``feature`` cut or zero-padded to exactly ``max_length`` rows.

    Args:
        feature: Array of shape (n_frames, n_features).
        max_length: Required number of rows in the result.

    Returns:
        A new zero-filled float array with ``feature`` in its leading rows
        when the input is too short, the first ``max_length`` rows when it is
        too long, and ``feature`` unchanged when the length already matches.
    """
    length = feature.shape[0]
    if length < max_length:
        padded = np.zeros((max_length, feature.shape[1]))
        padded[:length] = feature
        return padded
    return feature[:max_length] if length > max_length else feature

# Load and preprocess the audio file
audio_path = "/content/drive/MyDrive/Deep Learning/genres_original/pop/pop.00005.wav"
# Pad/truncate to the sequence length the training clips were padded to
# (max_length from the training cell) instead of preprocess_audio's default
# of 1000 frames, so the model sees inputs shaped and scaled as in training.
mfcc_features = preprocess_audio(audio_path, max_length=max_length)
mfcc_features = StandardScaler().fit_transform(mfcc_features)  # Per-clip normalisation, as in training
# unsqueeze(0) adds the batch dimension the model expects
mfcc_features_tensor = torch.tensor(mfcc_features, dtype=torch.float32).unsqueeze(0).to(device)

# Make a prediction
with torch.no_grad():
    output = model(mfcc_features_tensor)
    predicted_class = output.argmax(dim=1)  # Index of the highest class score

# Map the class index back to its label string
predicted_label = label_encoder.inverse_transform([predicted_class.item()])[0]
print(f"Predicted label: {predicted_label}")


Predicted label: blues


  model.load_state_dict(torch.load(model_load_path))
