In [None]:
# Install python_speech_features, which supplies the mfcc() function imported below.
!pip install python_speech_features

In [None]:
# Mount Google Drive at /content/drive; the dataset directory used later in
# this notebook is located under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import pickle
import random
import numpy as np
import scipy.io.wavfile as wav
from python_speech_features import mfcc
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Define a neural network model
class NeuralNetwork(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    The forward pass returns the raw output of the second linear layer;
    no softmax is applied inside the module.

    Args:
        input_size: Number of input features of the first linear layer.
        hidden_size: Number of units in the hidden layer.
        output_size: Number of output units of the second linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Layers are created in the same order in which forward() applies them.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Apply fc1, ReLU and fc2 to ``x`` in turn and return the result."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Define a function to load the audio dataset
def loadDataset(directory, max_folders):
    """Extract MFCC summary features from the audio files under ``directory``.

    Every sub-directory of ``directory`` is treated as one class: its name is
    stored as the label of each file read from it. Entries of ``directory``
    that are not directories (for example a stray ``.DS_Store``) are ignored
    and do not count against ``max_folders``.

    Args:
        directory: Path whose immediate sub-directories contain the WAV files.
        max_folders: Maximum number of sub-directories to read.

    Returns:
        A list of ``(mean_matrix, covariance, folder)`` tuples, one per file
        that was read successfully: the mean of the MFCC array over axis 0,
        the covariance of the transposed MFCC array, and the name of the
        containing folder. Files that raise ``ValueError`` while being read
        or processed are reported on stdout and skipped.
    """
    dataset = []
    folders_read = 0
    # os.listdir() returns entries in arbitrary order; sorting makes both the
    # folders selected under max_folders and the sample order reproducible.
    for folder in sorted(os.listdir(directory)):
        folder_path = os.path.join(directory, folder)
        # A plain file at this level would make the inner listdir() raise
        # NotADirectoryError, so skip it before it consumes the folder budget.
        if not os.path.isdir(folder_path):
            continue
        if folders_read >= max_folders:
            break
        folders_read += 1

        for file in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file)
            if file == ".DS_Store" or os.path.isdir(file_path):
                continue
            try:
                (rate, sig) = wav.read(file_path)
                mfcc_feat = mfcc(sig, rate, winlen=0.020, appendEnergy=False)
                covariance = np.cov(np.transpose(mfcc_feat))
                mean_matrix = mfcc_feat.mean(0)
                feature = (mean_matrix, covariance, folder)
                dataset.append(feature)
            except ValueError as e:
                # Best effort: an unreadable file is reported and skipped
                # instead of aborting the whole load.
                print(f"Skipping file '{file_path}': {str(e)}")
    return dataset

# Load the audio dataset
directory = "/content/drive/MyDrive/Deep Learning/genres_original"
dataset = loadDataset(directory, max_folders=100)
if not dataset:
    # Fail early with a clear message instead of an obscure shape error below.
    raise RuntimeError(f"No audio features could be loaded from '{directory}'")

# Split the dataset into features (X) and labels (y).
# X holds the first tuple element (the MFCC mean vector) of every sample and
# y the third (the folder name); the covariance element is not used here.
X = np.array([data[0] for data in dataset])
y = np.array([data[2] for data in dataset])

# Convert labels to integers using LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Encoded labels of the full dataset as a PyTorch tensor
y_tensor = torch.tensor(y_encoded, dtype=torch.long)

# Split BEFORE scaling so that the scaler never sees the test samples.
# Fitting StandardScaler on the full dataset would leak test-set statistics
# into the training features and bias the reported accuracy.
X_train_raw, X_test_raw, y_train_encoded, y_test_encoded = train_test_split(
    X, y_encoded, test_size=0.33, random_state=1
)

# Normalize the input features using statistics of the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full feature matrix scaled with the training-split statistics; kept so the
# name X_scaled stays available to any code that refers to it.
X_scaled = scaler.transform(X)

# Convert the dataset into PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train_encoded, dtype=torch.long)
y_test = torch.tensor(y_test_encoded, dtype=torch.long)

# Network dimensions: one input unit per feature column of X and one output
# unit per distinct encoded label.
hidden_size = 100
input_size = X.shape[1]
output_size = np.unique(y_encoded).size

# Model, loss function and optimizer
model = NeuralNetwork(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Train the network: each epoch performs a single gradient step computed on
# the whole training tensor.
num_epochs = 10000
for epoch in range(num_epochs):
    # Clear gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass and loss
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    # Report the loss once every 100 epochs (epoch numbers are printed 1-based)
    if epoch % 100 == 99:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Evaluate the network on the test tensors with gradient tracking disabled
with torch.no_grad():
    outputs = model(X_test_tensor)
    # Predicted class = index of the largest output along dimension 1
    _, predicted = outputs.max(dim=1)

    # Accuracy = number of matching predictions divided by the number of labels
    num_correct = (predicted == y_test).sum().item()
    accuracy = num_correct / len(y_test)
    print("Accuracy:", accuracy)


Skipping file '/content/drive/MyDrive/Deep Learning/genres_original/jazz/jazz.00054.wav': File format b'\xcb\x15\x1e\x16' not understood. Only 'RIFF' and 'RIFX' supported.
Epoch [100/10000], Loss: 2.0885
Epoch [200/10000], Loss: 1.9039
Epoch [300/10000], Loss: 1.7735
Epoch [400/10000], Loss: 1.6753
Epoch [500/10000], Loss: 1.5980
Epoch [600/10000], Loss: 1.5353
Epoch [700/10000], Loss: 1.4832
Epoch [800/10000], Loss: 1.4389
Epoch [900/10000], Loss: 1.4008
Epoch [1000/10000], Loss: 1.3675
Epoch [1100/10000], Loss: 1.3381
Epoch [1200/10000], Loss: 1.3119
Epoch [1300/10000], Loss: 1.2882
Epoch [1400/10000], Loss: 1.2667
Epoch [1500/10000], Loss: 1.2470
Epoch [1600/10000], Loss: 1.2289
Epoch [1700/10000], Loss: 1.2120
Epoch [1800/10000], Loss: 1.1962
Epoch [1900/10000], Loss: 1.1814
Epoch [2000/10000], Loss: 1.1674
Epoch [2100/10000], Loss: 1.1542
Epoch [2200/10000], Loss: 1.1416
Epoch [2300/10000], Loss: 1.1296
Epoch [2400/10000], Loss: 1.1181
Epoch [2500/10000], Loss: 1.1071
Epoch [2600/