In [1]:

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from google.colab import drive
from torchvision import models, transforms
from torch.utils.data import DataLoader, TensorDataset
from PIL import Image
import requests
import tqdm as tqdm
import json
import os
import sys


In [2]:
# Mount Google Drive
drive.mount('/content/drive')

# Import custom functions
sys.path.append('/content/drive/My Drive/Colab Notebooks/final project/AudioNNRep/functions')
import functions as f

# Define the path to the data
path = '/content/drive/My Drive/Colab Notebooks/final project/audio/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Define the spectrogram to RGB conversion function
def spectrogram_to_rgb(spectrogram, eps=1e-6):
    # Min-max scale to fit inside 8-bit RGB
    img = 255 * (spectrogram - spectrogram.min()) / (spectrogram.max() - spectrogram.min() + eps)
    # Convert to uint8
    img = img.astype(np.uint8)

    # Stack the image to create a 3-channel image
    img = np.stack([img] * 3, axis=-1)
    return img

# Load resnet18 model with pretrained weights
def get_model():
    model = models.resnet18(pretrained=True)
    for param in model.parameters():
        param.require_grad = False
    model.avgpool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
    model.fc = nn.Sequential(
        nn.Flatten(),
        nn.Linear(512, 128),  # 512 for resnet18 or 2048 for resnet 50
        nn.ReLU(inplace=True),
        nn.Dropout(.2),
        nn.Linear(128, 6),
        nn.Sigmoid()
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    return model, optimizer

def loss_fn(y_pred, y_true):
    return torch.sum(torch.abs(y_pred - y_true)) / y_pred.shape[0]

In [None]:
# Generate spectrograms
mixed_spectograms, labels = f.generate_mixed_spectrograms(10000, 3, path=path)

# Convert spectrograms to RGB
X_rgb = np.array([spectrogram_to_rgb(s) for s in mixed_spectograms])

# Split into training, validation, and test sets (80/10/10)
X_train, X_val, X_test, y_train, y_val, y_test = f.split_data(X_rgb, labels, 0.1, 0.1)

# Convert numpy arrays to torch tensors and permute dimensions
X_train_rgb = torch.tensor(X_train).permute(0, 3, 1, 2).float()
y_train = torch.tensor(y_train).float()
X_val_rgb = torch.tensor(X_val).permute(0, 3, 1, 2).float()
y_val = torch.tensor(y_val).float()
X_test_rgb = torch.tensor(X_test).permute(0, 3, 1, 2).float()
y_test = torch.tensor(y_test).float()

# Scale the data to be between 0 and 1
def torch_min_max_normalization(X):
    X = (X - X.min()) / (X.max() - X.min())
    return X

X_train_rgb = torch_min_max_normalization(X_train_rgb)
X_val_rgb = torch_min_max_normalization(X_val_rgb)
X_test_rgb = torch_min_max_normalization(X_test_rgb)

# Create data loaders
batch_size = 16
train_dataset = TensorDataset(X_train_rgb, y_train)
val_dataset = TensorDataset(X_val_rgb, y_val)
test_dataset = TensorDataset(X_test_rgb, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)





  6%|▌         | 603/10000 [00:49<01:46, 88.56it/s]

In [None]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the model
model, optimizer = get_model()

# Function to calculate accuracy
def get_accuracy(y_pred, y_true):
    y_pred = y_pred.detach().cpu().numpy()
    y_true = y_true.detach().cpu().numpy()
    y_pred = np.round(y_pred)
    accuracy = np.mean(y_pred == y_true)
    return accuracy

# Function to train the model
def train_model(model, train_loader, val_loader, optimizer, loss_fn, num_epochs=100, patience=5):
    train_loss = []
    train_acc = []
    val_loss = []
    val_acc = []
    scaler = torch.cuda.amp.GradScaler()

    for epoch in range(num_epochs):
        model.train()
        epoch_train_loss = 0
        epoch_train_acc = 0
        for x_batch, y_batch in train_loader:
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():
                y_pred = model(x_batch.to(device))
                loss = loss_fn(y_pred, y_batch.to(device))
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            epoch_train_loss += loss.item()
            epoch_train_acc += get_accuracy(y_pred, y_batch)

        train_loss.append(epoch_train_loss / len(train_loader))
        train_acc.append(epoch_train_acc / len(train_loader))

        model.eval()
        epoch_val_loss = 0
        epoch_val_acc = 0
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                y_pred = model(x_batch.to(device))
                loss = loss_fn(y_pred, y_batch.to(device))
                epoch_val_loss += loss.item()
                epoch_val_acc += get_accuracy(y_pred, y_batch)

        val_loss.append(epoch_val_loss / len(val_loader))
        val_acc.append(epoch_val_acc / len(val_loader))

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss[-1]}, Train Acc: {train_acc[-1]}, Val Loss: {val_loss[-1]}, Val Acc: {val_acc[-1]}")

        if epoch > patience and val_loss[-1] > min(val_loss[-patience:]):
            print("Early stopping due to no improvement in validation loss.")
            break

    return model, train_loss, train_acc, val_loss, val_acc

# Train the model
trained_model, train_loss, train_acc, val_loss, val_acc = train_model(model, train_loader, val_loader, optimizer, loss_fn, num_epochs=100, patience=10)

In [None]:

final_train_loss = train_loss[-1]
final_train_acc = train_acc[-1]
final_val_loss = val_loss[-1]
final_val_acc = val_acc[-1]

# Plotting with customization
plt.figure(figsize=(10, 5))

# Plot the training and validation loss
plt.subplot(1, 2, 1)
plt.plot(train_loss, label='train loss', color='lightblue', linewidth=2)
plt.plot(val_loss, label='validation loss', color='darkred', linewidth=2)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('ResNet50, Loss Value')
plt.legend()

# Annotate final values
plt.text(len(train_loss)-1, final_train_loss, f'{final_train_loss:.2f}', bbox=dict(facecolor='lightblue', alpha=0.5))
plt.text(len(val_loss)-1, final_val_loss, f'{final_val_loss:.2f}', bbox=dict(facecolor='darkred', alpha=0.5))

# Plot the training and validation accuracy
plt.subplot(1, 2, 2)
plt.plot(train_acc, label='train accuracy', color='lightblue', linewidth=2)
plt.plot(val_acc, label='validation accuracy', color='darkred', linewidth=2)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('ResNet50, Accuracy Value')
plt.legend()

# Annotate final values
plt.text(len(train_acc)-1, final_train_acc, f'{final_train_acc:.2f}', bbox=dict(facecolor='lightblue', alpha=0.5))
plt.text(len(val_acc)-1, final_val_acc, f'{final_val_acc:.2f}', bbox=dict(facecolor='darkred', alpha=0.5))

plt.tight_layout()
plt.show()

In [None]:
trained_model.eval()
with torch.no_grad():
    y_pred_test = trained_model(X_test_rgb.to(device))
    test_loss = loss_fn(y_pred_test, y_test.to(device))
    test_accuracy = get_accuracy(y_pred_test, y_test.to(device))

In [None]:
# Generate spectrograms
mixed_spectograms, labels = f.generate_mixed_spectrograms(1000, 3, path=path)

# Convert spectrograms to RGB
X_rgb = np.array([spectrogram_to_rgb(s) for s in mixed_spectograms])

# Split into training, validation, and test sets (80/10/10)
X_train, X_val, X_test, y_train, y_val, y_test = f.split_data(X_rgb, labels, 0.1, 0.1)

# Convert numpy arrays to torch tensors and permute dimensions
X_train_rgb = torch.tensor(X_train).permute(0, 3, 1, 2).float()
y_train = torch.tensor(y_train).float()
X_val_rgb = torch.tensor(X_val).permute(0, 3, 1, 2).float()
y_val = torch.tensor(y_val).float()
X_test_rgb = torch.tensor(X_test).permute(0, 3, 1, 2).float()
y_test = torch.tensor(y_test).float()

# Scale the data to be between 0 and 1
def torch_min_max_normalization(X):
    X = (X - X.min()) / (X.max() - X.min())
    return X

X_train_rgb = torch_min_max_normalization(X_train_rgb)
X_val_rgb = torch_min_max_normalization(X_val_rgb)
X_test_rgb = torch_min_max_normalization(X_test_rgb)

# Create data loaders
batch_size = 16
train_dataset = TensorDataset(X_train_rgb, y_train)
val_dataset = TensorDataset(X_val_rgb, y_val)
test_dataset = TensorDataset(X_test_rgb, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

100%|██████████| 1000/1000 [00:10<00:00, 97.77it/s]


In [None]:
y_pred_test = trained_model(X_train_rgb.to(device))
def get_accuracy(y_pred, y_true):
    y_pred = y_pred.detach().cpu().numpy()
    y_true = y_true.detach().cpu().numpy()
    y_pred = np.round(y_pred)
    accuracy = np.mean(y_pred == y_true)
    return accuracy
get_accuracy(y_pred_test, y_train)

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.14 GiB. GPU 

In [None]:
y_pred_test