# PyTorch Audio Autoencoder

In [None]:
# Numbers
import numpy as np

# Visualization
import matplotlib.pyplot as plt
from IPython.display import Image, Audio, HTML
import librosa.display

# Machine learning
import torch
import torchaudio.transforms as T
from sklearn.model_selection import train_test_split
from torch import nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader, random_split, TensorDataset, DataLoader

# Audio
import torchaudio
import librosa

## Autoencoders
Encoder

$Z = f(X)$

$Z$ = Latent space

Decoder

$\hat{X} = f^{-1}(Z)$ (the decoder learns an approximate inverse of the encoder, so $\hat{X} \approx X$)

## Short-Time Fourier Transform (STFT)
<img src="https://www.researchgate.net/publication/346243843/figure/fig1/AS:961807523000322@1606324191138/Short-time-Fourier-transform-STFT-overview.png" width="500px"/>

In [None]:
# --- Analysis parameters ---------------------------------------------------
Sclip = -60  # dB floor: log-power values below this are clipped to it
hop_length_ms = 20  # hop between consecutive STFT frames, in milliseconds
duration = 120 # seconds
sample_rate = 22050 # Hz

# Hop and window sizes in samples; the window spans four hops.
hop_length = int(sample_rate * hop_length_ms / 1000)
win_length = 4 * hop_length

# Per-file accumulators (stacked into single arrays below).
X = []
y = []
phases = []

# TODO: eventually load every file in the 'wavs' directory
x, sr = librosa.load("wavs/audio.wav", sr=sample_rate, mono=True, duration=duration)
x = np.trim_zeros(x)  # drop leading/trailing zero samples

# Transposed so that each row is one STFT frame and each column a frequency bin.
F = librosa.stft(x, n_fft=win_length, hop_length=hop_length).T
phases.append(np.angle(F))

# Log-power spectrum in dB. The power is floored at the smallest positive
# normal float so that exactly-zero bins do not produce -inf (and a
# divide-by-zero RuntimeWarning); such bins end up at the clip floor either way.
power = np.abs(F)**2
S = 10*np.log10(np.maximum(power, np.finfo(power.dtype).tiny))
# Clip at Sclip dB and shift so that the floor maps to 0.
S = S.clip(Sclip, None)-Sclip
y.append(np.zeros(S.shape[0]))  # placeholder label (0) for every frame of this file
X.append(S)

phases = np.vstack(phases)
X = np.vstack(X)
# Scale by the global maximum; X_max is kept to undo the scaling later.
X_max = X.max()
X = X / X_max
y = np.hstack(y)

In [None]:
# Summarise the analysis settings and the size of the feature matrix
# (rows are STFT frames, columns are frequency bins).
n_frames, n_features = X.shape
print(f'Win length: {win_length} , Hop length: {hop_length}')
print(f'Data shape: {X.shape}')
# Each frame advances by hop_length_ms milliseconds, so this is the duration in minutes.
print('Length min:', n_frames*hop_length_ms/1000/60)

In [None]:
# Show the normalised input spectrogram; X is transposed back to the
# (bins, frames) layout that librosa.stft originally returned.
fig, ax = plt.subplots(figsize=(14, 4))
librosa.display.specshow(X.T, y_axis='linear', x_axis='time', hop_length=hop_length, ax=ax);

In [None]:
# Undo the normalisation and the dB scaling to recover the STFT magnitude:
# X*X_max + Sclip is the (clipped) log-power in dB, and sqrt(10**(dB/10))
# converts that power back to a magnitude. The same magnitude is used for both
# reconstructions below; only the phase differs.
magnitude = np.sqrt(10**((X*X_max+Sclip)/10))

# Reconstruction with the original phase.
X_ = magnitude*np.exp(1j*phases)
x_ = librosa.istft(X_.T, hop_length=hop_length, win_length=win_length)

plt.plot(x_)
display('Fase original')
display(Audio(x_,rate=sr))

# Reconstruction with a uniformly random phase in [0, 2*pi) for comparison.
phases_ = np.random.rand(*X.shape)*2*np.pi
X_ = magnitude*np.exp(1j*phases_)
x_ = librosa.istft(X_.T,hop_length=hop_length, win_length=win_length)

display('Fase random')
display(Audio(x_,rate=sr))

In [None]:
# Hold out 5% of the frames (shuffled) for validation.
split = train_test_split(X, y, test_size=0.05, shuffle=True)
X_train, X_val, y_train, y_val = split

# Amount of audio in the training split, in minutes.
train_minutes = X_train.shape[0]*hop_length_ms/1000/60
print('Length mins:', train_minutes)

In [None]:
# Width of every layer, from the input spectrum (883 bins) down to the latent code (8).
layers_size = [883, 512, 256, 128, 64, 32, 16, 8]


class Encoder(nn.Module):
    """Fully connected encoder built from ``layers_size``.

    Each consecutive pair of widths becomes one Linear -> BatchNorm1d -> ELU
    stage, so the input width is ``layers_size[0]`` and the output width is
    ``layers_size[-1]``.
    """

    def __init__(self):
        super().__init__()
        stages = []
        for n_in, n_out in zip(layers_size[:-1], layers_size[1:]):
            stages += [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out), nn.ELU()]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through every stage and return the result."""
        return self.layers(x)

In [None]:
class Decoder(nn.Module):
    """Fully connected decoder that walks ``layers_size`` in reverse.

    Every hidden stage is Linear -> BatchNorm1d -> ELU; the final stage, which
    maps back to ``layers_size[0]`` features, ends in ReLU instead of ELU.
    """

    def __init__(self):
        super().__init__()
        widths = layers_size[::-1]  # latent width first, input width last
        stages = []
        # Hidden stages: every consecutive pair except the last one.
        for n_in, n_out in zip(widths[:-2], widths[1:-1]):
            stages += [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out), nn.ELU()]
        # Output stage back to the input width, with a ReLU activation.
        stages += [
            nn.Linear(widths[-2], widths[-1]),
            nn.BatchNorm1d(widths[-1]),
            nn.ReLU(),
        ]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through every stage and return the result."""
        return self.layers(x)

In [None]:
class AutoEncoder(pl.LightningModule):
    """Lightning module chaining ``Encoder`` and ``Decoder``.

    Trained to reproduce its own input under a mean-squared-error loss.
    """

    def __init__(self):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()

    def forward(self, x):
        """Encode ``x`` and decode it again."""
        return self.decoder(self.encoder(x))

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the reconstruction loss."""
        # The second element of the batch is not used: the target is the input itself.
        inputs, _ = batch
        reconstruction = self(inputs)
        criterion = nn.MSELoss()
        loss = criterion(reconstruction, inputs)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters (lr=0.002)."""
        return torch.optim.Adam(self.parameters(), lr=0.002)


In [None]:
# Augment the training set with all-zero ("silent") frames, 10% of its size.
# After the clip-and-shift in preprocessing, zero is the value of the dB floor.
silence = np.zeros([int(X_train.shape[0]*0.1), X_train.shape[1]])
X_train2 = np.vstack([silence, X_train])

autoencoder = AutoEncoder()
trainer = pl.Trainer(max_epochs=120)

# Input and target are the same data, so convert once and reuse the tensor.
train_tensor = torch.tensor(X_train2).float()
dataset = TensorDataset(train_tensor, train_tensor)

# shuffle=True is required here: the silence frames sit in one contiguous run
# at the start of X_train2, so without shuffling the first batches of every
# epoch would consist solely of zeros (degenerate for the BatchNorm layers)
# and every epoch would see the frames in the same order.
dataloader = DataLoader(dataset, batch_size=512, shuffle=True)
trainer.fit(model=autoencoder, train_dataloaders=dataloader)

In [None]:
# Reconstruct the whole dataset with the trained model.
# eval() makes the BatchNorm layers use their learned running statistics
# instead of the statistics of this one large batch (and stops them from being
# updated); no_grad() avoids building an autograd graph for the full dataset.
autoencoder.eval()
with torch.no_grad():
    prediction = autoencoder(torch.tensor(X).float())
prediction = prediction.detach().numpy()
# Shift so that the smallest predicted value is 0.
prediction -= prediction.min()

In [None]:
# Show the reconstructed spectrogram; prediction is transposed so that rows
# are frequency bins and columns are frames, as for the input plot.
fig, ax = plt.subplots(figsize=(14, 4))
librosa.display.specshow(prediction.T, y_axis='linear', x_axis='time', hop_length=hop_length, ax=ax);

In [None]:
# Convert the predicted (normalised, shifted) log-power back to a magnitude,
# attach the original phases, and invert the STFT to obtain audio.
predicted_db = prediction*X_max+Sclip
predicted_magnitude = np.sqrt(10**(predicted_db/10))
X_ = predicted_magnitude*np.exp(1j*phases)
x_ = librosa.istft(X_.T,hop_length=hop_length, win_length=win_length)
display(Audio(x_,rate=sr))

In [None]:
# Encode the whole dataset into the latent space.
# eval() makes the BatchNorm layers use their learned running statistics
# instead of the statistics of this one large batch (and stops them from being
# updated); no_grad() avoids building an autograd graph for the full dataset.
autoencoder.eval()
with torch.no_grad():
    Z = autoencoder.encoder(torch.tensor(X).float())
Z = Z.detach().numpy()

# One trace per latent dimension, each offset vertically by 5 so they do not overlap.
plt.figure(figsize=(14, 6))
plt.plot(Z+np.arange(Z.shape[1])*5);