In [None]:
import librosa
import librosa.display
import os
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Show what the competition's input directory contains before building paths into it.
print(os.listdir(path="../input"))

In [None]:
# Root of the input data; the train/test audio folders are derived from it
# so the location only has to be changed in one place.
data_path = '../input/'
train_root = data_path + 'audio_train/'
test_root = data_path + 'audio_test/'

# Why spectrograms?

Spectrograms of sounds turn out to be quite useful for training 2D convolutional networks. My current ensemble in the Freesound competition includes models trained with spectrogram data achieving accuracy scores between 60% and 70%. The results I get are a little bit better when I use models which have been pretrained on ImageNet. If you'd like to try it, here's how to create spectrograms:

In [None]:
def to_log_S(fname, PATH):
    """Load an audio file and return its log-scaled (dB) mel spectrogram.

    Parameters
    ----------
    fname : str
        File name of the audio clip, relative to ``PATH``.
    PATH : str
        Directory that contains ``fname``.

    Returns
    -------
    numpy.ndarray
        Mel spectrogram with 128 mel bands, converted to decibels relative
        to the clip's own maximum (so the loudest bin is 0 dB).
    """
    y, sr = librosa.load(os.path.join(PATH, fname))
    # Audio must be passed by keyword: recent librosa releases made the
    # arguments of melspectrogram keyword-only.
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    # melspectrogram yields a *power* spectrogram by default, so power_to_db is
    # the matching conversion; amplitude_to_db would square the values again
    # and double every dB figure.
    log_S = librosa.power_to_db(S, ref=np.max)
    return log_S

In [None]:
# Peek at the first 10 rows and first 4 columns of one training clip's log-mel spectrogram.
to_log_S(fname='65b299e9.wav', PATH=train_root)[0:10, 0:4]

Instead of the per-channel `mean=[0.485, 0.456, 0.406]` and `std=[0.229, 0.224, 0.225]` described in https://pytorch.org/docs/master/torchvision/models.html, I use the average of the three means and the average of the three stds, since I've got only 1 channel.

In [None]:
# Collapse the three per-channel statistics into one value each,
# because the spectrogram has a single channel.
mean = (0.485 + 0.456 + 0.406) / 3  # average of the three channel means
std = (0.229 + 0.224 + 0.225) / 3   # average of the three channel stds
(mean, std)

In [None]:
def normalize(x, center=None, scale=None, top_db=80.0):
    """Rescale a dB spectrogram and standardise it.

    The input is first mapped with ``-x / top_db`` (so 0 dB -> 0 and
    ``-top_db`` dB -> 1) and then standardised as ``(value - center) / scale``.

    Parameters
    ----------
    x : number or numpy.ndarray
        Spectrogram values in dB. Presumably in ``[-top_db, 0]``, as produced
        by a dB conversion with ``ref=np.max`` -- values outside that range
        are not clipped here.
    center : float, optional
        Value subtracted after rescaling. Defaults to the notebook-level
        ``mean``.
    scale : float, optional
        Value divided by after centering. Defaults to the notebook-level
        ``std``.
    top_db : float, optional
        Dynamic range in dB used for the rescaling step (default 80).

    Returns
    -------
    Same type as ``x``
        The normalised values. (The original version computed them but
        never returned them, so callers always received ``None``.)
    """
    # Fall back to the notebook-level statistics only when no override is given.
    if center is None:
        center = mean
    if scale is None:
        scale = std
    x = -x / top_db
    return (x - center) / scale

Here's how to visualize a spectrogram:

In [None]:
def display_spectogram(log_S, sr=22050):
    """Plot a log-scaled mel spectrogram with a dB colorbar.

    Parameters
    ----------
    log_S : numpy.ndarray
        2-D spectrogram in dB, e.g. the output of ``to_log_S``.
    sr : int, optional
        Sampling rate of the audio the spectrogram was computed from; used to
        label the time and mel-frequency axes. Defaults to 22050, which is
        the rate ``librosa.load`` resamples to when no ``sr`` is given.
    """
    plt.figure(figsize=(12, 4))
    librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')
    plt.title('mel power spectrogram')
    # Colorbar tick labels are formatted as signed whole decibels.
    plt.colorbar(format='%+02.0f dB')
    plt.tight_layout()

In [None]:
# Plot only the first 10 rows and first 4 columns of the clip's spectrogram.
display_spectogram(to_log_S(fname='65b299e9.wav', PATH=train_root)[0:10, 0:4])

In [None]:
# Plot the full log-mel spectrogram of the same training clip.
display_spectogram(to_log_S(fname='65b299e9.wav', PATH=train_root))