# STFT Model #

An STFT analysis and synthesis notebook.

First we set up the environment.

In [1]:
%matplotlib inline

import math, copy, sys, os
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
import IPython.display as ipd
import glob
from scipy.fftpack import fft, ifft, fftshift
from scipy.signal import blackmanharris, triang, get_window
from scipy.io.wavfile import write, read
from sys import platform
from ipywidgets import interact, interact_manual, interactive

# Real/imaginary spectrum parts smaller than this are zeroed before the phase is computed
tol = 1e-14

# Largest positive value representable by each signed integer sample format
INT16_FAC = 2**15 - 1
INT32_FAC = 2**31 - 1
INT64_FAC = 2**63 - 1

# Divisor applied to raw samples, keyed by the numpy dtype name of the samples
norm_fact = {
    'int16': INT16_FAC,
    'int32': INT32_FAC,
    'int64': INT64_FAC,
    'float32': 1.0,
    'float64': 1.0,
}

# Names shared between the notebook cells. These declarations only document the
# shared state; the values are assigned by read_input_file and stft_system.
global iF, xR, x, fs   # input file name, raw samples, normalized samples, sample rate
global N, w, wN, M, H  # FFT size, window, window name, window size, hop size
global mX, pX          # magnitude and phase spectra of the input
global y, yR           # re-synthesized output and its raw (integer) version

Now we define the functions that perform the different steps of the model.

***dft_analysis***

Analysis of a signal using the discrete Fourier transform 

Params

* x: input signal 
* w: analysis window
* N: FFT size 

Returns 

* mX: magnitude spectrum
* pX: phase spectrum

In [2]:
def dft_analysis(x, w, N):
    """Compute the magnitude and phase spectrum of one windowed frame.

    Params

    * x: input frame, expected to have the same length as w
    * w: analysis window (normalized here so that it sums to 1)
    * N: FFT size, must be at least w.size

    Returns

    * mX: magnitude spectrum of the N//2+1 positive-frequency bins, in dB
    * pX: unwrapped phase spectrum of the same bins

    Raises ValueError if the window is longer than the FFT size.
    """
    M = w.size
    if M > N:                                               # raise error if window size bigger than fft size
        raise ValueError("Window size (M) is bigger than FFT size")

    hN = (N//2)+1                                           # size of positive spectrum, it includes sample 0
    hM1 = (M+1)//2                                          # half analysis window size by rounding
    hM2 = M//2                                              # half analysis window size by floor
    eps = np.finfo(float).eps                               # floor that keeps log10 away from zero

    xw = x * (w / np.sum(w))                                # window the input with the normalized window
    fftbuffer = np.zeros(N)                                 # initialize buffer for FFT
    fftbuffer[:hM1] = xw[hM2:]                              # zero-phase window in fftbuffer
    fftbuffer[N-hM2:] = xw[:hM2]                            # N-hM2 rather than -hM2: -0 would select the whole buffer when M == 1
    X = fft(fftbuffer)[:hN]                                 # compute FFT and keep the positive side
    mX = 20 * np.log10(np.maximum(np.abs(X), eps))          # magnitude spectrum of positive frequencies in dB
    X.real[np.abs(X.real) < tol] = 0.0                      # for phase calculation set to 0 the small values
    X.imag[np.abs(X.imag) < tol] = 0.0                      # for phase calculation set to 0 the small values
    pX = np.unwrap(np.angle(X))                             # unwrapped phase spectrum of positive frequencies
    return mX, pX

***stft_analysis*** 

Analysis of a sound using the short-time Fourier transform

Params

* x: input sound (array of samples)
* w: analysis window
* N: FFT size
* H: hop size

Returns 

* xmX: magnitude spectra
* xpX: phase spectra

In [3]:
def stft_analysis(x, w, N, H) :
    """Analyse a sound frame by frame with the short-time Fourier transform.

    The sound is zero-padded by half a window on both sides so that the first
    frame is centered on sample 0, then one spectrum is computed every H samples.

    Params

    * x: input sound (1-D array of samples)
    * w: analysis window
    * N: FFT size, must be at least w.size
    * H: hop size, a positive integer

    Returns

    * xmX: magnitude spectra, shape (number of frames, N//2+1)
    * xpX: phase spectra, same shape

    Both results are 2-D even when the input yields no frame at all.

    Raises ValueError if H is not positive or the window is longer than N.
    """
    if (H <= 0):                                    # raise error if hop size 0 or negative
        raise ValueError("Hop size (H) smaller or equal to 0")

    M = w.size                                      # size of analysis window
    if (M > N):                                     # fail up front instead of inside the first frame
        raise ValueError("Window size (M) is bigger than FFT size")

    hN = (N//2)+1                                   # size of positive spectrum, it includes sample 0
    hM1 = (M+1)//2                                  # half analysis window size by rounding
    hM2 = M//2                                      # half analysis window size by floor
    pad = np.zeros(hM2)
    x = np.concatenate((pad, np.ravel(x), pad))     # zeros at both ends: center first window at sample 0, analyze last sample
    w = w / np.sum(w)                               # normalize analysis window
    centers = range(hM1, x.size-hM1+1, H)           # frame centers, from the first to the last full frame
    xmX = np.empty((len(centers), hN))              # one row per frame, even when there are no frames
    xpX = np.empty((len(centers), hN))
    for i, pin in enumerate(centers):
        x1 = x[pin-hM1:pin+hM2]                     # select one frame of input sound
        xmX[i], xpX[i] = dft_analysis(x1, w, N)     # compute dft
    return xmX, xpX

***dft_synthesis***

Synthesis of a signal using the discrete Fourier transform

Params

* mX: magnitude spectrum 
* pX: phase spectrum
* M: window size

Returns 

* y: output signal

In [4]:
def dft_synthesis(mX, pX, M):
    """Rebuild one frame of signal from its positive-frequency spectrum.

    Params

    * mX: magnitude spectrum in dB (positive frequencies, including bin 0)
    * pX: phase spectrum, same length as mX
    * M: window size, i.e. number of output samples

    Returns

    * y: output signal of M samples

    The FFT size is taken to be (mX.size-1)*2, so an even FFT size is assumed.

    Raises ValueError if M is bigger than that FFT size.
    """
    hN = mX.size                                            # size of positive spectrum, it includes sample 0
    N = (hN-1)*2                                            # FFT size
    if (M > N):                                             # same constraint as the analysis side
        raise ValueError("Window size (M) is bigger than FFT size")

    hM1 = (M+1)//2                                          # half analysis window size by rounding
    hM2 = M//2                                              # half analysis window size by floor
    Y = np.zeros(N, dtype = complex)                        # clean output spectrum
    Y[:hN] = 10**(mX/20) * np.exp(1j*pX)                    # generate positive frequencies
    Y[hN:] = 10**(mX[-2:0:-1]/20) * np.exp(-1j*pX[-2:0:-1]) # generate negative frequencies
    fftbuffer = np.real(ifft(Y))                            # compute inverse FFT
    y = np.zeros(M)                                         # initialize output array
    y[:hM2] = fftbuffer[N-hM2:]                             # undo zero-phase window; N-hM2 because -0 would select everything when M == 1
    y[hM2:] = fftbuffer[:hM1]
    return y

***stft_synthesis***

Synthesis of a sound using the short-time Fourier transform

Params

* mY: magnitude spectra
* pY: phase spectra
* M: window size 
* H: hop-size

Returns 

* y: output sound

In [5]:
def stft_synthesis(mY, pY, M, H) :
    """Re-synthesize a sound from its STFT by overlap-adding the inverse DFT of every frame.

    Params

    * mY: magnitude spectra, one row per frame
    * pY: phase spectra, one row per frame
    * M: window size
    * H: hop size

    Returns

    * y: output sound, with the half windows of padding at both ends removed
    """
    half_up = (M+1)//2                                # half window size, rounded up
    half_down = M//2                                  # half window size, rounded down
    frame_count = mY[:,0].size                        # number of frames
    total = frame_count*H + half_up + half_down       # room for every hop plus one full window
    out = np.zeros(total)
    for k in range(frame_count):
        start = k*H                                   # frame k begins k hops into the buffer
        frame = dft_synthesis(mY[k,:], pY[k,:], M)    # inverse DFT of this frame
        out[start:start+half_up+half_down] += H*frame # overlap-add, scaled by the hop size
    # drop the leading half_down and trailing half_up samples
    return out[half_down:total-half_up].copy()

***stft_system***

STFT analysis and re-synthesis system. Performs an STFT analysis of a signal and then re-synthesizes it

Params

* p_N:  The FFT size
* p_M:  The window size
* p_H:  The hop size
* p_wN: The name of the window function to use

Returns void

Plots the input waveform, the magnitude spectrogram, the derivative of the phase spectrogram, and the re-synthesized output waveform, and lets you play back the output.


In [6]:
def stft_system(p_N, p_M, p_H, p_wN):
    """Analyse the loaded sound with the STFT, re-synthesize it and show the results.

    Reads the module-level input `x` and sample rate `fs` (set by
    read_input_file) and stores the parameters and results in the module-level
    names N, M, H, wN, w, mX, pX, y and yR.

    Params

    * p_N:  The FFT size
    * p_M:  The window size (limited to the FFT size)
    * p_H:  The hop size (limited to half the window size)
    * p_wN: The name of the window function to use

    Returns nothing. Plots the input waveform, the magnitude spectrogram, the
    phase-derivative spectrogram and the output waveform, then displays an
    audio player for the output.
    """
    global N, M, H, wN, w, mX, pX, y, yR

    # Set the analysis parameters
    N = p_N
    M = min(p_M, N)                               # window cannot be longer than the FFT
    H = min(p_H, M//2)                            # hop cannot be longer than half the window
    wN = p_wN
    w = get_window(wN, M)

    # Do the analysis step
    mX, pX = stft_analysis(x, w, N, H)

    # Do the synthesis step
    y = stft_synthesis(mX, pX, M, H)
    # Scale the floating point -1 to 1 range signal to the int16 range. Clip
    # first: overlap-add can overshoot +/-1 and the cast would wrap around.
    yR = np.int16(np.clip(y, -1.0, 1.0) * INT16_FAC)

    # frequency range to plot
    maxplotfreq = 5000.0

    # Number of bins shown: up to maxplotfreq, at least 2 so the phase
    # derivative has a row, and never more than the spectrum contains
    # (maxplotfreq can lie above fs/2 for low sample rates).
    numFrames = mX.shape[0]
    numBins = min(max(int(N*maxplotfreq/fs) + 1, 2), mX.shape[1])

    # Cell edges for pcolormesh: one more edge than cells along each axis, so
    # the coordinate arrays always agree with the plotted matrix.
    frmEdges = H*np.arange(numFrames + 1)/float(fs)
    binEdges = fs*np.arange(numBins + 1)/float(N)

    # create figure to plot
    fig, axes = plt.subplots(4, 1, figsize=(17, 20))

    # plot the input sound
    ax = axes[0]
    ax.plot(np.arange(x.size)/float(fs), x)
    ax.axis([0, x.size/float(fs), x.min(), x.max()])
    ax.set_ylabel('amplitude')
    ax.set_xlabel('time (sec)')
    ax.set_title('input sound: x')

    # plot magnitude spectrogram
    ax = axes[1]
    ax.pcolormesh(frmEdges, binEdges, np.transpose(mX[:,:numBins]))
    ax.set_xlabel('time (sec)')
    ax.set_ylabel('frequency (Hz)')
    ax.set_title('magnitude spectrogram')
    ax.autoscale(tight=True)

    # plot the phase spectrogram; the derivative has one bin less than the spectrum
    ax = axes[2]
    ax.pcolormesh(frmEdges, binEdges[:numBins], np.transpose(np.diff(pX[:,:numBins],axis=1)))
    ax.set_xlabel('time (sec)')
    ax.set_ylabel('frequency (Hz)')
    ax.set_title('phase spectrogram (derivative)')
    ax.autoscale(tight=True)

    # plot the output sound
    ax = axes[3]
    ax.plot(np.arange(y.size)/float(fs), y)
    ax.axis([0, y.size/float(fs), y.min(), y.max()])
    ax.set_ylabel('amplitude')
    ax.set_xlabel('time (sec)')
    ax.set_title('output sound: y')

    fig.tight_layout()
    plt.ion()
    plt.show()

    display(ipd.Audio(yR, rate=fs))

# Playground

Here you can play with a few different inputs, change some parameters and listen to the results

In [7]:
def read_input_file(p_iF):
    """Load a WAV file into the notebook's shared state and display a player for it.

    Sets the module-level names iF (file name), fs (sample rate), xR (raw
    samples as returned by the WAV reader) and x (samples normalized with
    norm_fact, always mono).

    Params

    * p_iF: path of the WAV file to read

    Returns nothing. Raises KeyError if the sample format of the file has no
    entry in norm_fact.
    """
    global iF, fs, xR, x
    iF = p_iF
    # Read the input file now
    fs, xR = read(iF)
    fmt = xR.dtype.name
    if fmt == 'uint8':
        # 8-bit WAV samples are unsigned with the zero level at 128
        samples = (np.float32(xR) - 128.0)/128.0
    else:
        samples = np.float32(xR)/norm_fact[fmt]
    # The STFT functions work on a single channel: average the channels of a
    # multi-channel file (samples then has shape (n_samples, n_channels)).
    x = samples.mean(axis=1) if samples.ndim > 1 else samples
    # Audio expects (channels, samples); the transpose is a no-op for mono
    display(ipd.Audio(samples.T, rate=fs))
    
# Sorted so the dropdown order (and the initially selected file) is the same on every run
files = sorted(glob.glob('audio/*.wav'))
# Sizes offered by the three sliders: powers of two from 16 to 4096
pow2_sizes = [2**i for i in range(4,13)]
interact(read_input_file, p_iF = widgets.Dropdown(options=files,description='Audio File:'))
# 'hann' is SciPy's canonical name for the Hann window; the 'hanning' alias is
# not accepted by every SciPy version.
# The trailing semicolon keeps the returned function from being echoed as cell output.
interact_manual(stft_system,
         p_wN = widgets.Dropdown(options=['blackmanharris', 'blackman', 'hamming', 'hann', 'rectangular' ],description='Window Type'),
         p_M=widgets.SelectionSlider(options=pow2_sizes,value=512,description='Window Size'),
         p_N=widgets.SelectionSlider(options=pow2_sizes,value=1024,description='FFT Size'),
         p_H=widgets.SelectionSlider(options=pow2_sizes,value=128,description='Hop Size'));

<function __main__.stft_system>