In [65]:
import numpy as np
import torch
import torch.nn as nn
import torchaudio
import torchaudio.transforms as aud_transforms
import pandas as pd
import copy
from tqdm import tqdm

In [6]:
# Toy 2x2 array with a single non-zero entry in the top-left corner,
# used below to sanity-check how .mean() averages over every element.
temp_img = np.zeros(shape=(2, 2), dtype=float)
temp_img.flat[0] = 1.0
temp_img

array([[1., 0.],
       [0., 0.]])

In [7]:
# Mean over all elements of the toy array (one 1 among four cells).
np.mean(temp_img)

0.25

In [81]:
# need to grab all the songs and calculate mean for all of them
def get_data_frame(data_path):
    """Build the song index table from ``<data_path>/features_30_sec.csv``.

    The CSV must provide ``filename`` and ``label`` columns. For every row:

    * the last two characters of ``filename`` are replaced by ``'wav'``;
    * ``filePath`` becomes ``<data_path>/WAV/<label>/norm/<filename>``;
    * ``ID`` becomes ``id-<first two chars of the first dot-separated part>``
      ``<second dot-separated part>-original`` (e.g. ``id-bl00000-original``).

    Parameters
    ----------
    data_path : str
        Directory containing ``features_30_sec.csv``.

    Returns
    -------
    tuple
        ``(table, n_rows)`` where ``table`` holds only the ``ID``,
        ``filePath`` and ``label`` columns and ``n_rows`` is the number of
        rows read from the CSV.
    """
    songs = pd.read_csv(f"{data_path}/features_30_sec.csv")

    # Swap the trailing two characters of each file name for 'wav'.
    wav_names = songs['filename'].str[:-2] + 'wav'
    songs['filename'] = wav_names
    songs['filePath'] = data_path + '/WAV/' + songs['label'] + '/norm/' + wav_names

    def _song_id(name):
        # Split on dots and stitch the ID from the first two pieces.
        parts = name.split('.')
        return f"id-{parts[0][0:2]}{parts[1]}-original"

    songs['ID'] = [_song_id(name) for name in wav_names]

    selected = songs.loc[:, ['ID', 'filePath', 'label']]
    return selected, len(songs)

In [82]:
# Load the song index (ID, filePath, label) and its row count from ./data.
df, leng = get_data_frame(data_path='data')

In [83]:
# Preview the first five rows of the song index.
df.head(n=5)

Unnamed: 0,ID,filePath,label
0,id-bl00000-original,data/WAV/blues/norm/blues.00000.wav,blues
1,id-bl00001-original,data/WAV/blues/norm/blues.00001.wav,blues
2,id-bl00002-original,data/WAV/blues/norm/blues.00002.wav,blues
3,id-bl00003-original,data/WAV/blues/norm/blues.00003.wav,blues
4,id-bl00004-original,data/WAV/blues/norm/blues.00004.wav,blues


In [84]:
# Row count returned by get_data_frame (len of the frame read from the CSV).
leng

1000

In [85]:
def splitsongs(wd, overlap = 0.5, chunk = 33000):
    """Cut a signal into fixed-length, overlapping windows along axis 0.

    Only full-length windows are produced; a trailing remainder shorter
    than ``chunk`` is dropped.

    Parameters
    ----------
    wd : array-like
        Signal to split. It is sliced along its first axis and must expose
        ``.shape``.
    overlap : float, optional
        Fraction of each window shared with the next one. The step between
        window starts is ``int(chunk * (1 - overlap))``. Default 0.5.
    chunk : int, optional
        Window length in samples. Default 33000 (the value that used to be
        hard-coded).

    Returns
    -------
    numpy.ndarray
        Stacked windows, one per row. Empty if the signal is shorter than
        ``chunk``.

    Raises
    ------
    ValueError
        If ``chunk`` is not positive or ``overlap`` yields a step below 1
        (e.g. ``overlap >= 1``), which could never advance through the signal.
    """
    if chunk <= 0:
        raise ValueError(f"chunk must be positive, got {chunk}")

    offset = int(chunk * (1. - overlap))
    if offset <= 0:
        raise ValueError(
            f"overlap={overlap} with chunk={chunk} gives a non-positive step; "
            "overlap must be < 1"
        )

    n_samples = wd.shape[0]

    # The last admissible start index is n_samples - chunk, so every slice
    # is exactly `chunk` long and no post-filtering of short tails is needed.
    windows = [wd[start:start + chunk]
               for start in range(0, n_samples - chunk + 1, offset)]

    return np.array(windows)


In [86]:
# Spectrogram settings. Only "bands", "window_size" and "hop_size" are
# consumed by the transform below; "frames" and "e0" are not referenced in
# the cells visible here.
dataset_params = {
    "frames": 256,
    "bands": 128,
    "window_size": 1024,
    "hop_size": 256,
    "e0": 1e-3,
}

# Mel-spectrogram pipeline for 16 kHz audio (dB conversion left disabled).
mel_transform = nn.Sequential(
    aud_transforms.MelSpectrogram(
        sample_rate=16000,
        n_fft=dataset_params["window_size"],
        hop_length=dataset_params["hop_size"],
        n_mels=dataset_params["bands"],
    ),
    # aud_transforms.AmplitudeToDB()
)

In [87]:
def create_full_spec(path, transform, target_sr=16000):
    """Load an audio file, resample it and run it through ``transform``.

    Parameters
    ----------
    path : str
        Audio file passed to ``torchaudio.load``.
    transform : callable
        Applied to the resampled waveform; its result is returned as-is.
    target_sr : int, optional
        Sample rate the audio is resampled to before the transform.
        Default 16000 (the previously hard-coded value). It should match the
        sample rate ``transform`` was configured for.

    Returns
    -------
    Whatever ``transform`` returns for the resampled waveform.

    Notes
    -----
    Channels are not mixed down: ``squeeze()`` only drops size-1 dimensions,
    so a single-channel file yields a 1-D waveform while a multi-channel file
    keeps its channel dimension.
    """
    wd, sr = torchaudio.load(path)
    # audio_mono = wd.mean(dim=0)
    resampler = aud_transforms.Resample(sr, target_sr)
    audio = resampler(wd).squeeze()

    return transform(audio)
def get_metrics(spec):
    """Return ``(mean, std, min, max)`` of ``spec`` computed over all elements.

    ``spec`` must expose ``.numpy()``; the statistics are taken on the
    resulting NumPy array.
    """
    values = spec.numpy()
    return tuple(getattr(values, stat)() for stat in ("mean", "std", "min", "max"))
# (30.34647, 167.83447, 0.0001938382, 15812.163)


In [89]:
# Spot-check the pipeline on a single song before running the full dataset.
temp_spec = create_full_spec(
    path='data/WAV/blues/norm/blues.00000.wav',
    transform=mel_transform,
)
get_metrics(temp_spec)

(12.362552, 68.37243, 7.779165e-05, 6441.5522)

In [72]:
# Aggregate mel-spectrogram statistics over every song listed in `df`.
#
# `mean` and `std` hold running SUMS of the per-song values; they are divided
# by the number of songs only when printed.
# NOTE(review): the reported std is the average of per-song stds, not the
# pooled std over all spectrogram values - confirm this is what the
# downstream normalisation expects.
n_songs = len(df)
assert n_songs > 0, "df is empty - nothing to aggregate"

mean = 0.0
std = 0.0
# Start from +/- infinity so the extremes are right whatever the data range.
most_min = float('inf')
most_max = float('-inf')

# Iterating the column (which has a length) lets tqdm show real progress.
for file_path in tqdm(df['filePath'], total=n_songs):
    temp_mean, temp_std, temp_min, temp_max = get_metrics(
        create_full_spec(file_path, mel_transform)
    )
    mean += temp_mean
    std += temp_std
    most_min = min(most_min, temp_min)
    most_max = max(most_max, temp_max)


# Divide by the actual number of songs rather than a hard-coded 1000.
print(mean / n_songs)
print(std / n_songs)
print(most_min)
print(most_max)


1000it [07:01,  2.37it/s]37.13857212095708
203.2593863657117
0.0
52251.395

