<a href="https://colab.research.google.com/github/skj092/learn_sound_classification/blob/main/Sound_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub

# Download the UrbanSound8K dataset through kagglehub and keep the
# location it reports in `chrisfilo_urbansound8k_path`.
chrisfilo_urbansound8k_path = kagglehub.dataset_download(
    'chrisfilo/urbansound8k'
)
print('Data source import complete.')

Data source import complete.


In [None]:
from fastai.vision.all import *
import librosa
import numpy as np
import pandas as pd
import wandb
from fastai.callback.wandb import *
from dataclasses import dataclass
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Define dataclass for hyperparameters
@dataclass
class HyperParams:
    """Single container for the tunable settings used by this notebook.

    Every field has a default, so ``HyperParams()`` yields the baseline
    configuration. The whole instance is also pushed to the W&B run config
    when the notebook is not in debug mode.
    """
    # --- Spectrogram settings: used as the default arguments of
    # --- audio_to_spectrogram, which forwards them to librosa.feature.melspectrogram
    n_mels: int = 64
    n_fft: int = 1024
    hop_length: int = 512
    # Upper frequency bound for the mel filter bank; set explicitly in an
    # attempt to avoid librosa's empty-filter warning.
    fmax: int = 8000
    # --- Data loading
    batch_size: int = 32  # passed as `bs` when the DataLoaders are built
    image_size: int = 224  # target size for Resize and aug_transforms
    # --- Model / optimisation
    architecture: str = "resnet18"  # name of the backbone architecture
    epochs: int = 8  # fine_tune epochs for a normal run
    lr: float = 3e-3  # base learning rate for a normal run
    # --- Values used instead of epochs/lr when `debug` is True
    debug_epochs: int = 1
    debug_lr: float = 1e-3

# Baseline hyperparameters (all dataclass defaults)
hparams = HyperParams()

# Debug switch: True -> short training run, no experiment tracking
debug = False

# Experiment tracking is only set up for real (non-debug) runs
if not debug:
    from google.colab import userdata

    # The API key is read from the Colab secret named 'wandb' and exposed
    # to the wandb client through the environment.
    key = userdata.get('wandb')
    os.environ['WANDB_API_KEY'] = key

    wandb.init(
        project="urbansound8k-fastai",
        name="mel-spectrogram-cnn",
        reinit=True,
    )
    # Store every hyperparameter field alongside the run
    wandb.config.update(vars(hparams))

# Define paths and load metadata
# Use the directory reported by kagglehub.dataset_download (first cell)
# instead of a hard-coded mount point: depending on the environment
# kagglehub may place the dataset somewhere other than /kaggle/input.
path = Path(chrisfilo_urbansound8k_path)
df = pd.read_csv(path/'UrbanSound8K.csv')

# Audio to spectrogram function
def audio_to_spectrogram(fn, n_mels=hparams.n_mels, n_fft=hparams.n_fft, hop_length=hparams.hop_length, fmax=hparams.fmax, sr=None):
    """Load an audio file and return its mel spectrogram on a decibel scale.

    Args:
        fn: Path of the audio file; handed to ``librosa.load``.
        n_mels: Number of mel bands.
        n_fft: FFT window size.
        hop_length: Hop between successive frames.
        fmax: Upper frequency bound of the mel filter bank. ``None`` means
            half of the sample rate the audio was loaded at.
        sr: Sample rate to load the audio at. ``None`` (the default, and the
            previous hard-coded behaviour) keeps each file's native rate.
            Passing a fixed rate makes librosa resample every clip, so the
            same filter bank is applied to all files.
            NOTE(review): librosa's "empty filters" warning is presumably
            triggered by clips whose native rate does not suit
            ``n_fft``/``n_mels``/``fmax`` -- confirm, and consider a fixed
            ``sr`` if so.

    Returns:
        The mel power spectrogram converted with ``librosa.power_to_db``
        relative to its own maximum (``ref=np.max``).
    """
    y, sr = librosa.load(fn, sr=sr)
    if fmax is None:
        fmax = sr // 2
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length, fmax=fmax)
    S_db = librosa.power_to_db(S, ref=np.max)
    return S_db

def spectrogram_to_image(S_db):
    """Min-max scale a 2-D array to [0, 1] and replicate it into 3 channels.

    Args:
        S_db: Array of spectrogram values.

    Returns:
        Array with a trailing channel axis of length 3 in which every
        channel holds the same scaled values. A constant input (for example
        the spectrogram of a completely silent clip) maps to all zeros
        instead of the NaNs a 0/0 division would produce.
    """
    lo = S_db.min()
    span = S_db.max() - lo
    if span == 0:
        # No dynamic range: (x - min) / 0 would yield NaN, which turns into
        # arbitrary pixel values once the caller casts to uint8.
        scaled = np.zeros_like(S_db, dtype=float)
    else:
        scaled = (S_db - lo) / span
    return np.stack([scaled, scaled, scaled], axis=-1)

def get_x(row):
    """Return the audio file location for a metadata row.

    The result is ``path / 'fold<fold>' / <slice_file_name>``, built from the
    row's 'fold' and 'slice_file_name' entries and the module-level ``path``.
    """
    fold_dir = f'fold{row["fold"]}'
    file_name = row['slice_file_name']
    return path / fold_dir / file_name

def get_y(row):
    """Return the label of a metadata row, i.e. its 'class' entry."""
    label = row['class']
    return label

def get_spectrogram(row):
    """Turn a metadata row into a spectrogram image.

    The row's audio file is converted with ``audio_to_spectrogram``, scaled
    to a 3-channel array by ``spectrogram_to_image``, multiplied by 255, cast
    to uint8 and wrapped with ``PILImage.create``.

    Returns:
        The created image, or ``None`` when any step raised; in that case
        the failing file and the error are printed.
    """
    try:
        rgb = spectrogram_to_image(audio_to_spectrogram(get_x(row)))
        pixels = (rgb * 255).astype(np.uint8)
        image = PILImage.create(pixels)
    except Exception as err:
        print(f"Error processing {get_x(row)}: {err}")
        return None
    return image

# Resolve every row to its audio file, report how many of those files are
# absent on disk, and keep only the rows whose file exists.
df['file_path'] = df.apply(get_x, axis=1)
df['file_exists'] = df['file_path'].map(lambda p: p.exists())
print(f"Missing files: {int((~df['file_exists']).sum())}")
df = df.loc[df['file_exists']].reset_index(drop=True)

# Custom splitter
def custom_splitter(df, valid_fold=10):
    """Split rows into train/validation sets by their 'fold' value.

    Args:
        df: DataFrame with a 'fold' column.
        valid_fold: Fold value held out for validation (default 10, the
            value that was previously hard-coded).

    Returns:
        ``(train_idx, valid_idx)``: two lists of integer row *positions*.
        Positions are returned rather than index labels, so the split is
        correct even when ``df`` does not carry a fresh 0..n-1 index.
    """
    is_valid = (df['fold'] == valid_fold).to_numpy()
    train_idx = np.flatnonzero(~is_valid).tolist()
    valid_idx = np.flatnonzero(is_valid).tolist()
    return train_idx, valid_idx

# DataBlock: each metadata row becomes a (spectrogram image, class label) pair
# NOTE(review): aug_transforms is called with its defaults apart from `size`;
# confirm those augmentations are appropriate for spectrogram images.
dblock = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    splitter=custom_splitter,
    get_y=get_y,
    get_x=get_spectrogram,
    item_tfms=Resize(hparams.image_size),
    batch_tfms=[
        *aug_transforms(size=hparams.image_size),
        Normalize.from_stats(*imagenet_stats),
    ],
)

# Build the train/validation DataLoaders from the metadata frame
dls = dblock.dataloaders(df, bs=hparams.batch_size)
print(f"Training batches: {len(dls.train)}, Validation batches: {len(dls.valid)}")

# Create learner
# Resolve the backbone from hparams.architecture so that the model actually
# trained is the one recorded in the hyperparameters (previously resnet18 was
# hard-coded and the setting was ignored). The name must refer to a model
# constructor available in the notebook namespace, e.g. `resnet18` provided
# by the fastai star import.
arch = globals().get(hparams.architecture)
if not callable(arch):
    raise ValueError(f"Unknown architecture: {hparams.architecture!r}")

cbs = [] if debug else [WandbCallback()]
learn = vision_learner(dls, arch, metrics=accuracy, cbs=cbs)

# Train: a short run with the debug settings, or the full schedule otherwise
if debug:
    print("Running in DEBUG mode ⚡️")
    learn.fine_tune(hparams.debug_epochs, base_lr=hparams.debug_lr)
else:
    learn.fine_tune(hparams.epochs, base_lr=hparams.lr)

# Evaluate
learn.show_results()
interp = ClassificationInterpretation.from_learner(learn)

# Plot confusion matrix (and optionally log to wandb)
preds, targs = learn.get_preds()
pred_labels = preds.argmax(dim=1)
vocab = dls.vocab
# Pin the label set to the full vocabulary: without `labels`, sklearn only
# uses the classes that occur in targs/pred_labels, so a class missing from
# both would shrink the matrix and no longer match `display_labels`.
cm = confusion_matrix(targs, pred_labels, labels=list(range(len(vocab))))

# Plot using sklearn
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=vocab)
fig, ax = plt.subplots(figsize=(10, 10))
disp.plot(ax=ax, xticks_rotation=90, cmap="Blues", colorbar=True)
plt.title("Confusion Matrix")

# Log to wandb if not debug
if not debug:
    wandb.log({"confusion_matrix": wandb.Image(fig)})

# Release the figure once it has been logged
plt.close(fig)

# Export model
learn.export('urbansound8k_model.pkl')

# Finish wandb run
if not debug:
    wandb.finish()


Missing files: 0
Training batches: 246, Validation batches: 27


epoch,train_loss,valid_loss,accuracy,time
0,1.889669,1.339084,0.571087,01:06


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


epoch,train_loss,valid_loss,accuracy,time


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
