<a href="https://colab.research.google.com/github/smhall97/hallucinating_GANs/blob/main/wav2spectrograms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title Install dependencies
# ffmpeg is a system package installed through apt; torchaudio is installed from PyPI.
# Both commands use --quiet to keep the cell output short.
!sudo apt-get install -y ffmpeg --quiet
!pip install torchaudio --quiet

Reading package lists...
Building dependency tree...
Reading state information...
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 40 not upgraded.
[K     |████████████████████████████████| 1.9 MB 31.3 MB/s 
[?25h

In [2]:
import torch
#import torch.nn as nn
import torchaudio
from tqdm.notebook import tqdm
import numpy as np
import matplotlib.pyplot as plt
#from IPython.display import display, Audio
import librosa
#import librosa.display
import os
import pickle
from tqdm import tqdm

#print(torch.__version__)
# Record the torchaudio version used for this run.
print(torchaudio.__version__)

# Mount Google Drive (Colab only) so files under MyDrive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive') # it will ask you for a verification code

# Base data directory on the mounted drive; the conversion loop reads '<data_path>genres/'
# and writes its output directories under this same path.
data_path = '/content/drive/MyDrive/HallucinatingGANs/Code/data/'

0.9.0
Mounted at /content/drive


In [17]:
# Spectrogram types to generate; each name is passed to config_transform().
transforms = ['mel','stft']

# Settings shared by both transforms.
n_fft = 1024
n_mels = 128 # number of mel bands (used by the 'mel' transform only)
hop_length = 256 # smaller hop size leads to better reconstruction but takes longer to compute
power = 2.0 # squared power spectrogram (used by 'mel' only; 'stft' is configured with power=None)
samplerate =  22050 # sample rate handed to the mel transform


def config_transform(t):
  """Build the torchaudio transform and the parameter tag for one spectrogram type.

  The transform is configured from the module-level settings ``n_fft``,
  ``hop_length``, ``n_mels``, ``power`` and ``samplerate``.

  Args:
    t: Name of the transform, either ``'stft'`` or ``'mel'``.

  Returns:
    A tuple ``(params, get_spectro)`` where ``params`` is a string tag
    (``'<n_fft>_<hop_length>'`` for ``'stft'``,
    ``'<n_fft>_<hop_length>_<n_mels>'`` for ``'mel'``) and ``get_spectro``
    is the configured torchaudio transform object.

  Raises:
    ValueError: If ``t`` is not one of the supported transform names.
  """
  if t == 'stft':
    # power=None keeps the complex STFT rather than a magnitude/power spectrogram.
    # return_complex=False stores it with a trailing (real, imag) axis of size 2.
    # torchaudio warns that this pseudo-complex layout is deprecated; it is kept
    # here so the shape of the saved tensors does not change.
    get_spectro = torchaudio.transforms.Spectrogram(
        n_fft=n_fft,
        win_length=None,
        hop_length=hop_length,
        center=True,
        pad_mode="reflect",
        power=None,
        return_complex=False
    )

    params = '{}_{}'.format(str(n_fft), str(hop_length))

  elif t == 'mel':
    get_spectro = torchaudio.transforms.MelSpectrogram(
        sample_rate=samplerate,
        n_fft=n_fft,
        win_length=None,
        hop_length=hop_length,
        center=True,
        pad_mode="reflect",
        power=power,
        norm='slaney',
        onesided=True,
        n_mels=n_mels,
        mel_scale="htk",
    )

    params = '{}_{}_{}'.format(str(n_fft), str(hop_length), n_mels)

  else:
    # Without this branch an unknown name fell through to the return statement
    # and failed with an UnboundLocalError that did not mention the bad value.
    raise ValueError(
        "Unknown transform {!r}; expected 'stft' or 'mel'".format(t)
    )

  return params, get_spectro

In [18]:
def scale_minmax(X, min=0.0, max=1.0):
    """Linearly rescale ``X`` so its values span ``[min, max]``.

    Args:
        X: Array-like object supporting ``.min()``, ``.max()`` and elementwise
            arithmetic (e.g. a NumPy array or a torch tensor).
        min: Lower bound of the output range. The name shadows the builtin
            but is kept so existing keyword calls continue to work.
        max: Upper bound of the output range (same naming note as ``min``).

    Returns:
        An object of the same shape as ``X`` with its smallest value mapped
        to ``min`` and its largest to ``max``. If every element of ``X`` is
        equal, all outputs are ``min`` (instead of NaN from a 0/0 division).
    """
    lowest = X.min()
    span = X.max() - lowest

    # Constant input: (X - lowest) is all zeros, so shifting it by `min`
    # gives a well-defined result without dividing by zero.
    if span == 0:
        return (X - lowest) + min

    X_std = (X - lowest) / span
    X_scaled = X_std * (max - min) + min
    return X_scaled

# Run switches and output locations for the conversion loop.
OVERWRITE = True # when False, already existing .pkl files are skipped (outside of test runs)
TEST_RUN = True # test to see output, only write first file of each genre
test_dir = 'spectrograms_dummy' # directory to write test outputs
output_dir = 'spectrograms' # directory to write the real (non-test) outputs


# Convert every audio file under <data_path>/genres/<genre>/ into one pickled
# spectrogram tensor per configured transform.
#
# Output layout: <data_path>/<test_dir|output_dir>/<transform>/<params>/<genre>/<track>.pkl

# Neither the source tree nor the output root depends on the transform, so
# they are resolved once instead of on every iteration.
src_path = data_path + 'genres/'
out_root = os.path.join(
    os.path.abspath(data_path),
    test_dir if TEST_RUN else output_dir,  # test runs never touch the real output
)

for transform in transforms:
    params, get_spectro = config_transform(transform)

    dst_path = os.path.join(out_root, transform, params)
    print("Test Run: ", TEST_RUN)
    print("Source path: ", src_path)
    print("Dest path: ", dst_path)

    # Creates any missing intermediate directories and is a no-op if they exist.
    os.makedirs(dst_path, exist_ok=True)

    for genre in tqdm(os.listdir(src_path)):

        current_src_dir = os.path.join(src_path, genre)
        current_dst_dir = os.path.join(dst_path, genre)

        # Ignore stray files sitting next to the genre folders.
        if not os.path.isdir(current_src_dir):
            continue

        print("Genre: ", genre)

        os.makedirs(current_dst_dir, exist_ok=True)

        for i, wav_filename in enumerate(os.listdir(current_src_dir)):

            # A test run converts only the first listed file of each genre, so
            # stop here rather than stepping through the rest of the listing.
            if TEST_RUN and i > 0:
                break

            pkl_filename = os.path.splitext(wav_filename)[0] + '.pkl'
            src_file = os.path.join(current_src_dir, wav_filename)
            dst_file = os.path.join(current_dst_dir, pkl_filename)

            # Existing results are kept only for real runs with OVERWRITE disabled.
            if not OVERWRITE and os.path.exists(dst_file) and not TEST_RUN:
                print("Skipping existing file ", dst_file)
                continue

            # 1. load the audio file as mono, explicitly at the configured rate so
            #    the waveform always matches the sample rate given to the mel transform
            waveform, _ = librosa.load(src_file, sr=samplerate, mono=True)

            # 2. compute the spectrogram with the configured transform
            # TODO: is squared power necessary? converting to uint8 already loses a lot of nuance
            # Add a leading axis of size 1 (see the printed shapes) before the transform.
            waveform = torch.as_tensor(waveform, dtype=torch.float32).unsqueeze(0)
            spectro = get_spectro(waveform)
            print(f"spec1 shape: {spectro.shape}  dtype: {spectro.dtype}")

            # Normalization to the 0-1 range is currently disabled:
            # spectro = scale_minmax(spectro)

            # 3. save as pickle file
            print('writing file: ' + dst_file)
            with open(dst_file, 'wb') as f:
                pickle.dump(spectro, f, pickle.HIGHEST_PROTOCOL)

 64%|██████▎   | 14/22 [00:00<00:00, 99.95it/s]

Test Run:  True
Source path:  /content/drive/MyDrive/HallucinatingGANs/Code/data/genres/
Dest path:  /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/mel/1024_256_128
Genre:  rock
spec1 shape: torch.Size([1, 128, 2586])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/mel/1024_256_128/rock/rock.00009.pkl
Genre:  reggae
spec1 shape: torch.Size([1, 128, 2586])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/mel/1024_256_128/reggae/reggae.00006.pkl
Genre:  pop
spec1 shape: torch.Size([1, 128, 2585])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/mel/1024_256_128/pop/pop.00002.pkl
Genre: 

 77%|███████▋  | 17/22 [00:00<00:00, 41.82it/s]

 metal
spec1 shape: torch.Size([1, 128, 2585])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/mel/1024_256_128/metal/metal.00001.pkl
Genre:  jazz
spec1 shape: torch.Size([1, 128, 2586])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/mel/1024_256_128/jazz/jazz.00002.pkl
Genre:  hiphop
spec1 shape: torch.Size([1, 128, 2585])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/mel/1024_256_128/hiphop/hiphop.00003.pkl
Genre:  disco


100%|██████████| 22/22 [00:00<00:00, 36.82it/s]
  0%|          | 0/22 [00:00<?, ?it/s]

spec1 shape: torch.Size([1, 128, 2603])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/mel/1024_256_128/disco/disco.00001.pkl
Genre:  country
spec1 shape: torch.Size([1, 128, 2586])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/mel/1024_256_128/country/country.00001.pkl
Genre:  classical
spec1 shape: torch.Size([1, 128, 2586])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/mel/1024_256_128/classical/classical.00004.pkl
Genre:  blues
spec1 shape: torch.Size([1, 128, 2586])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/mel/1024_256_128/blues/blues.00005.pkl
Test Run:  True
Source path:  /content/drive/MyDrive/HallucinatingGANs/Code/data/genres/
Dest path:  /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/stft/1024_256
Genre:  rock


  "The use of pseudo complex type in spectrogram is now deprecated."
 64%|██████▎   | 14/22 [00:00<00:00, 118.51it/s]

spec1 shape: torch.Size([1, 513, 2586, 2])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/stft/1024_256/rock/rock.00009.pkl
Genre:  reggae
spec1 shape: torch.Size([1, 513, 2586, 2])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/stft/1024_256/reggae/reggae.00006.pkl
Genre:  pop
spec1 shape: torch.Size([1, 513, 2585, 2])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/stft/1024_256/pop/pop.00002.pkl


 82%|████████▏ | 18/22 [00:00<00:00, 14.27it/s] 

Genre:  metal
spec1 shape: torch.Size([1, 513, 2585, 2])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/stft/1024_256/metal/metal.00001.pkl
Genre:  jazz
spec1 shape: torch.Size([1, 513, 2586, 2])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/stft/1024_256/jazz/jazz.00002.pkl
Genre:  hiphop
spec1 shape: torch.Size([1, 513, 2585, 2])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/stft/1024_256/hiphop/hiphop.00003.pkl
Genre:  disco
spec1 shape: torch.Size([1, 513, 2603, 2])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/stft/1024_256/disco/disco.00001.pkl


100%|██████████| 22/22 [00:01<00:00, 18.21it/s]

Genre:  country
spec1 shape: torch.Size([1, 513, 2586, 2])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/stft/1024_256/country/country.00001.pkl
Genre:  classical
spec1 shape: torch.Size([1, 513, 2586, 2])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/stft/1024_256/classical/classical.00004.pkl
Genre:  blues
spec1 shape: torch.Size([1, 513, 2586, 2])  dtype: torch.float32
writing file: /content/drive/MyDrive/HallucinatingGANs/Code/data/spectrograms_dummy/stft/1024_256/blues/blues.00005.pkl



