In [249]:
%matplotlib inline

In [250]:
import numpy as np
import matplotlib.pyplot as plt
import librosa as lb
import IPython.display as ipd
from scipy.signal import medfilt
import scipy.io as sio
import math
import pyaudio
import wave
import ipywidgets as widgets
import soundfile as sf
import io

### Implement TSM with dynamically changing time stretch factor in an offline setting 
In hw10 we implemented the following function:
y = tsm_hybrid(x, alpha=1.0, sr=22050)
where x is the input audio, y is the output audio, alpha is the global TSM factor, and sr is the sample rate.
* You will implement the following function
y = tsm_hybrid_variable(x, sr, t_alpha, alpha)
x, sr, and y are same as above
alpha is a 1-d array specifying the desired TSM factors at different points in time
t_alpha is a 1-d array of the same size as alpha that specifies the time (in sec, in the timeline of the original unmodified audio recording) when the TSM factor should be changed/set; the first value of this array should always be zero (in order to set the initial TSM factor)
* You can think of t_alpha and alpha as a way of simulating a person controlling the slider, where they make a discrete number of changes over the length of the recording
E.g. if t_alpha = [0, 2.5, 7.5] and alpha = [1.0, 2.0, 0.5], then the first 2.5 sec of the original audio should be played with no TSM, the next 7.5 - 2.5 = 5 sec of the original audio should be played with time stretch factor 2.0, and the rest of the recording should be played with time stretch factor 0.5.

Use a simple example like this to verify that your code is working properly – you should be able to double check that the total duration is what is expected, and also listen to confirm that it is doing the time stretch properly


### OLA Method

In [251]:
def tsm_overlap_add(x, alpha = 1.0, L = 220):
    '''
    Time stretches the input signal using the overlap-add method.  Uses a synthesis hop size that is half the
    value of L.
    
    Inputs
    x: the input signal
    alpha: the time stretch factor, which is defined as the ratio of the synthesis hop size to the analysis hop size
           (must be positive)
    L: the length of each analysis frame in samples (must be even)
    
    Returns the time-stretched signal y.  If x is shorter than one analysis frame, an empty array is returned.
    
    Raises ValueError if alpha is not positive.
    '''
    assert(L % 2 == 0), "Frame length must be even."
    if alpha <= 0:
        raise ValueError("alpha must be positive.")
    Hs = L // 2
    
    # analysis hop size; clamped to one sample so that a very large alpha cannot round it down to zero,
    # which would make the frame count below divide by zero
    Ha = max(1, int(np.round(Hs/alpha)))
    
    # not even one complete analysis frame: nothing to overlap-add
    if len(x) < L:
        return np.zeros(0)
    
    # compute analysis frames (one frame per column)
    numFrames = (len(x) - L) // Ha + 1
    analysisFrames = np.zeros((L, numFrames))
    for i in range(numFrames):
        offset = i * Ha
        analysisFrames[:, i] = x[offset: offset + L]
    
    # reconstruction: window each frame and overlap-add at the synthesis hop size
    synthesisFrames = analysisFrames * hann_window(L).reshape((-1,1)) # use broadcasting
    y = np.zeros(Hs * (numFrames-1) + L)
    for i in range(numFrames):
        offset = i * Hs
        y[offset:offset+L] += synthesisFrames[:,i]
    
    return y

In [252]:
def hann_window(L):
    '''
    Returns a length-L Hann window evaluated at n/L for n = 0, ..., L-1.
    
    Inputs
    L: the window length in samples
    
    Returns a 1-d array w with w[n] = 0.5 * (1 - cos(2*pi*n/L)).
    '''
    n = np.arange(L)
    phase = 2*np.pi * n / L
    return .5 * (1 - np.cos(phase))

### Phase Vocoder

In [253]:
def tsm_phase_vocoder(x, alpha = 1.0, L = 2048, sr = 22050):
    '''
    Phase-vocoder time-scale modification.  The synthesis hop size is fixed at L/4 and the analysis hop
    size is derived from it and alpha.
    
    Inputs
    x: the input signal
    alpha: the time stretch factor, which is defined as the ratio of the synthesis hop size to the analysis hop size
    L: the length of each analysis frame in samples (must be divisible by four)
    sr: sampling rate
    
    Returns the time-stretched signal y.
    '''
    assert(L % 4 == 0), "Frame length must be divisible by four."
    synth_hop = L // 4
    analysis_hop = int(np.round(synth_hop/alpha))
    
    # analysis: STFT taken at the analysis hop size
    win = hann_window(L)
    stft_in = lb.core.stft(x, n_fft = L, hop_length = analysis_hop, window=win, center=False)
    
    # phase advance per synthesis hop, from the instantaneous frequency estimated between
    # consecutive analysis frames
    phase_step = estimateIF(stft_in, sr, analysis_hop) * synth_hop / sr
    
    # keep the first frame's phase, then accumulate the advances frame by frame
    new_phase = np.zeros(stft_in.shape)
    new_phase[:,0] = np.angle(stft_in[:,0])
    for frame in range(1, new_phase.shape[1]):
        new_phase[:,frame] = new_phase[:,frame-1] + phase_step[:,frame-1]
    
    # original magnitudes combined with the re-accumulated phases
    stft_out = np.abs(stft_in) * np.exp(1j * new_phase)
    
    # synthesis at the synthesis hop size
    return invert_stft(stft_out, synth_hop, win)

In [254]:
def estimateIF(S, sr, hop_samples):
    '''
    Estimates the instantaneous (angular) frequency at every time-frequency bin of a STFT matrix.
    
    Inputs
    S: the STFT matrix, should only contain the lower half of the frequency bins
    sr: sampling rate
    hop_samples: the hop size of the STFT analysis in samples
    
    Returns a matrix of instantaneous frequencies, one row per frequency bin and one column per pair of
    consecutive frames, so it has one less column than S.
    '''
    num_bins = S.shape[0]
    fft_size = (num_bins - 1) * 2
    hop_sec = hop_samples / sr
    
    # nominal angular frequency of each bin, as a column so that it broadcasts across frames
    w_nom = (np.arange(num_bins) * sr / fft_size * 2 * np.pi).reshape((-1,1))
    
    # measured frame-to-frame phase advance minus the advance predicted by the nominal frequency
    phase = np.angle(S)
    deviation = phase[:,1:] - phase[:,0:-1] - w_nom * hop_sec
    
    # map the deviation to its principal value in [-pi, pi)
    deviation = (deviation + np.pi) % (2 * np.pi) - np.pi
    
    return w_nom + deviation / hop_sec

In [255]:
def invert_stft(S, hop_length, window):
    '''
    Reconstruct a signal from a modified STFT matrix.
    
    Inputs
    S: modified STFT matrix holding only the non-negative frequency bins (fft_size/2 + 1 rows, one column per frame)
    hop_length: the synthesis hop size in samples
    window: an array specifying the window used for FFT analysis; its length must equal the FFT size implied by S
    
    Returns a time-domain signal y whose STFT is closest to S in squared error distance.
    
    Raises ValueError if the window length does not match the FFT size implied by S.
    '''
    window = np.asarray(window)
    L = len(window)
    fft_size = (S.shape[0] - 1) * 2
    if L != fft_size:
        raise ValueError("Window length must equal the FFT size implied by S.")
    
    # Inverse real FFT of every column at once.  This is equivalent to mirroring the spectrum into its
    # conjugate-symmetric upper half, taking a full inverse FFT and keeping the real part, but it avoids
    # building the full matrix and does not truncate the spectrum to single-precision complex.
    frames = np.fft.irfft(S, n=fft_size, axis=0)
    
    # synthesis frames: window each frame and normalise by the sum of shifted squared windows
    den = calc_sum_squared_window(window, hop_length)
    frames = frames * window.reshape((-1,1)) / den.reshape((-1,1))
    
    # reconstruction: overlap-add the synthesis frames at the synthesis hop size
    y = np.zeros(hop_length*(frames.shape[1]-1) + L)
    for i in range(frames.shape[1]):
        offset = i * hop_length
        y[offset:offset+L] += frames[:,i]
    
    return y

In [256]:
def calc_sum_squared_window(window, hop_length):
    '''
    Computes the normalization term used when forming synthesis frames: the squared window summed over
    all circular shifts by multiples of the hop size.
    
    Inputs
    window: array specifying the window used in FFT analysis
    hop_length: the synthesis hop size in samples; must divide the window length evenly
    
    Returns an array, the same shape as window, specifying the normalization factor.
    '''
    assert (len(window) % hop_length == 0), "Hop length does not divide the window evenly."
    
    # square once, then accumulate one circularly shifted copy per hop position
    squared = np.square(window)
    total = np.zeros_like(window)
    for shift in range(0, len(window), hop_length):
        total += np.roll(squared, shift)
    
    return total

### Hybrid method

In [257]:
def harmonic_percussive_separation(x, sr=22050, fft_size = 2048, hop_length=512, lh=6, lp=6):
    '''
    Splits a signal into harmonic and percussive components by median filtering its STFT magnitude.
    
    Inputs
    x: the input signal
    sr: sampling rate (accepted for interface compatibility; not used by the computation)
    fft_size: the STFT frame length in samples
    hop_length: the STFT hop size in samples, used for both analysis and resynthesis
    lh: half-length of the median filter applied along the time axis (filter spans 2*lh+1 frames)
    lp: half-length of the median filter applied along the frequency axis (filter spans 2*lp+1 bins)
    
    Returns xh, xp, Xh, Xp: the harmonic and percussive time-domain signals followed by their masked STFTs.
    '''
    
    window = hann_window(fft_size)
    # analysis must use the same hop size as the resynthesis below (it was previously fixed at 512
    # regardless of the hop_length argument)
    X = lb.core.stft(x, n_fft=fft_size, hop_length=hop_length, window=window, center=False)
    Y = np.abs(X)
    Yh = medfilt(Y, (1, 2*lh+1))  # median across frames within each frequency bin
    Yp = medfilt(Y, (2*lp+1, 1))  # median across frequency bins within each frame
    
    # binary masks: every time-frequency bin is assigned to exactly one component
    Mh = (Yh > Yp)
    Mp = np.logical_not(Mh)
    Xh = X * Mh
    Xp = X * Mp
    xh = invert_stft(Xh, hop_length, window)
    xp = invert_stft(Xp, hop_length, window)
    
    return xh, xp, Xh, Xp

In [258]:
def mix_recordings(x1, x2):
    '''
    Averages two signals sample by sample.
    
    Inputs
    x1, x2: the signals to mix; the longer one is truncated to the length of the shorter one
    
    Returns the mixed signal, whose length is that of the shorter input.
    '''
    n = min(len(x1), len(x2))
    return .5 * (x1[:n] + x2[:n])

In [259]:
def tsm_hybrid(x, alpha=1.0, sr=22050):
    '''
    Time stretches the input signal using a hybrid method that combines overlap-add and phase vocoding.
    The signal is split into harmonic and percussive parts; the harmonic part is stretched with the phase
    vocoder, the percussive part with overlap-add, and the two results are mixed.
    
    Inputs
    x: the input signal
    alpha: the time stretch factor, which is defined as the ratio of the synthesis hop size to the analysis hop size
    sr: sampling rate
    
    Returns the time-stretched signal y.
    '''
    
    # forward the caller's sampling rate instead of silently relying on the helpers' 22050 default
    xh, xp, _, _ = harmonic_percussive_separation(x, sr=sr)
    xh_stretched = tsm_phase_vocoder(xh, alpha, sr=sr)
    xp_stretched = tsm_overlap_add(xp, alpha)
    
    # the two stretched parts can differ in length; mix_recordings truncates to the shorter one
    y = mix_recordings(xh_stretched, xp_stretched)
    
    return y

### Including dynamic variable

#### tsm_ola_variable, applying OLA when we have a dynamic variable $\alpha$

In [295]:
# Load the two test recordings; each lb.load call returns the samples and their sample rate.
choir, c_sr = lb.load("choir.wav")
beatbox, bb_sr = lb.load("beatbox.wav")

In [296]:
def init_y(t_alpha, alpha, x, sr):
    '''
    Allocates a zero-filled output buffer sized for a piecewise-constant time stretch.
    
    Inputs
    t_alpha: times (in sec, in the timeline of the original audio) at which the stretch factor changes
    alpha: the stretch factor that applies from the corresponding entry of t_alpha onwards
    x: the input signal
    sr: sampling rate
    
    Returns an array of zeros whose length is the sum over all segments of
    (segment duration * stretch factor * sr), rounded to the nearest integer.
    '''
    total = 0
    
    # segments enclosed by two consecutive change points
    for i in range(len(t_alpha) - 1):
        total += (t_alpha[i+1] - t_alpha[i]) * alpha[i] * sr
    
    # final segment: from the last change point to the end of the recording
    total += ((len(x)/sr) - t_alpha[-1]) * alpha[-1] * sr
    
    return np.zeros(int(np.round(total)))

In [297]:
def tsm_ola_variable(x, sr, t_alpha, alpha, L=2048):
    '''
    Time stretches the input signal using overlap-add with variable stretch factors.
    
    The synthesis hop size is fixed at L/2.  The analysis hop size is recomputed for every frame from the
    stretch factor in force at that frame's start position, where the stretch factor is the ratio of the
    synthesis hop size to the analysis hop size.
    
    Inputs:
    x: input signal
    sr: sample rate
    t_alpha: array of time points (seconds, in the timeline of the input) when stretch factors change
    alpha: array of positive stretch factors corresponding to t_alpha (same length as t_alpha)
    L: frame length (must be even)
    
    Returns time-stretched signal y (an empty array if x is shorter than one frame)
    
    Raises ValueError if t_alpha and alpha differ in length, are empty, or alpha has a non-positive entry.
    '''
    assert L % 2 == 0, "Frame length must be even."
    if len(t_alpha) != len(alpha):
        raise ValueError("t_alpha and alpha must have the same length.")
    if len(alpha) == 0:
        raise ValueError("At least one stretch factor is required.")
    if any(a <= 0 for a in alpha):
        raise ValueError("Stretch factors must be positive.")
    Hs = L // 2
    
    # Change points converted to sample offsets in the input
    boundaries = [int(round(t * sr)) for t in t_alpha]
    
    # Analysis phase
    segment = 0
    curr_offset = 0
    analysis_frames = []
    while curr_offset <= len(x) - L:
        # Move past every change point at or before this frame, so that several changes falling
        # inside a single hop are all consumed rather than only the first one
        while segment + 1 < len(boundaries) and curr_offset >= boundaries[segment + 1]:
            segment += 1
        
        analysis_frames.append(x[curr_offset:curr_offset + L])
        
        # Hop by the analysis hop of the current factor; at least one sample so that the loop
        # always advances even when a very large factor rounds the hop down to zero
        Ha = max(1, int(np.round(Hs / alpha[segment])))
        curr_offset += Ha
    
    num_frames = len(analysis_frames)
    if num_frames == 0:
        return np.zeros(0)
    
    # one frame per column
    analysis_frames = np.array(analysis_frames).T
    
    # Synthesis phase: window each frame and overlap-add at the fixed synthesis hop size
    synthesis_frames = analysis_frames * hann_window(L).reshape((-1, 1))
    y = np.zeros(Hs * (num_frames-1) + L)
    
    for i in range(num_frames):
        offset = i * Hs
        y[offset:offset + L] += synthesis_frames[:, i]
    
    return y

In [298]:
def hann_window(L):
    '''
    Returns a length-L Hann window evaluated at n/L for n = 0, ..., L-1.
    
    NOTE: this repeats, unchanged, the hann_window definition given earlier in this notebook.
    
    Inputs
    L: the window length in samples
    
    Returns a 1-d array w with w[n] = 0.5 * (1 - cos(2*pi*n/L)).
    '''
    n = np.arange(L)
    phase = 2*np.pi * n / L
    return .5 * (1 - np.cos(phase))

In [305]:
# Single change point at t = 0, i.e. one stretch factor applied to the whole recording.
t_alpha = [0]
alpha = [0.75]
y = tsm_ola_variable(beatbox, bb_sr, t_alpha, alpha)

In [306]:
# Audio player for the time-stretched beatbox recording.
ipd.Audio(y, rate=bb_sr)

In [310]:
# Same stretch settings applied to the choir recording (reuses t_alpha and alpha from the beatbox cell).
choir_fast = tsm_ola_variable(choir, c_sr, t_alpha, alpha)

In [311]:
# Audio player for the time-stretched choir recording.
ipd.Audio(choir_fast, rate=c_sr)

In [309]:
# Display the sample rate returned when the choir recording was loaded.
c_sr

22050

#### tsm_phasevocoder_variable, applying phase vocoder when we have a dynamic variable $\alpha$

In [None]:
def tsm_phasevocoder_variable(x, sr, t_alpha, alpha):
    '''
    Placeholder for phase-vocoder time stretching with variable stretch factors.
    
    Not implemented yet: all arguments are ignored.
    
    Inputs
    x: input signal
    sr: sample rate
    t_alpha: array of time points when stretch factors change
    alpha: array of stretch factors corresponding to t_alpha
    
    Returns None.
    '''
    # TODO: implement
    return None

You will want to keep track of where you are in the original audio recording (i.e. the sample offset where you are currently located); this will form the outer loop of the algorithm:
- Figure out what data needs to be accessed in order to compute the next chunk of the output y
- Then process just that chunk and append the result to y

In order to make this more efficient, think about what can be pre-computed beforehand and what has to be computed in real-time:
- E.g. you have access to the whole recording beforehand, but you don’t know the desired TSM factor that will be adjusted in real-time (though at this stage we will assume that it is constant)
- So you could, for example, pre-compute the STFT beforehand
- We may also want to consider approximations that will greatly speed up the runtime efficiency
    - e.g. you could pre-compute the STFT and instantaneous frequencies beforehand for the whole recording; at runtime the exact current sample location may not match exactly with the offset of the pre-computed STFT (since the analysis hop size will vary depending on the current TSM factor) but it could be a close enough approximation to just find the nearest neighbor and use its pre-computed instantaneous frequency values.  This approach will not be exactly the same as an offline implementation, but it could be much faster at runtime since a lot of the work could be pre-computed beforehand.  Let’s be open to trying such approximations and comparing the quality of the generated time domain signals.
- Once you implement a simulated real-time version of this, you can compare the output to the hw10 offline implementations – they should match either exactly or very close (they should be indistinguishable to the human ear)
    - Have demos of the hw10 offline implementations and your simulated real-time implementation to verify that they match in behavior
- Find most commonly used packages for real-time audio processing
    - Present pros/cons of different options
    - Select what you think is the best option and justify your choice
    - Show code example(s) of real-time audio processing with that package


In [268]:
def tsm_ola_simRealTime(x, sr, alpha):
    '''
    Placeholder for a simulated real-time version of overlap-add time stretching.
    
    Not implemented yet: all arguments are ignored.
    
    Inputs
    x: input signal
    sr: sample rate
    alpha: time stretch factor
    
    Returns None.
    '''
    # TODO: implement
    return None

In [269]:
def tsm_phaseVocoder_simRealTime(x, sr, alpha):
    '''
    Placeholder for a simulated real-time version of phase-vocoder time stretching.
    
    Not implemented yet: all arguments are ignored.
    
    Inputs
    x: input signal
    sr: sample rate
    alpha: time stretch factor
    
    Returns None.
    '''
    # TODO: implement
    return None

In [270]:
def tsm_hybrid_simRealTime(x, sr, alpha):
    '''
    Placeholder for a simulated real-time version of hybrid time stretching.
    
    Not implemented yet: all arguments are ignored.
    
    Inputs
    x: input signal
    sr: sample rate
    alpha: time stretch factor
    
    Returns None.
    '''
    # TODO: implement
    return None