In [4]:
%matplotlib inline

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import librosa as lb
import IPython.display as ipd
from scipy.signal import medfilt
import math

### Implement TSM with dynamically changing time stretch factor in an offline setting 
In hw10 we implemented the following function
y = tsm_hybrid(x, alpha=1.0, sr=22050)
Where x is the input audio, y is the output audio, alpha is the global TSM factor, and sr is the sample rate
* You will implement the following function
y = tsm_hybrid_variable(x, sr, t_alpha, alpha)
x, sr, and y are same as above
alpha is a 1-d array specifying the desired TSM factors at different points in time
t_alpha is a 1-d array of the same size as alpha that specifies the time (in sec, in the timeline of the original unmodified audio recording) when the TSM factor should be changed/set; the first value of this array should always be zero (in order to set the initial TSM factor)
* You can think of t_alpha and alpha as a way of simulating a  person controlling the slider where they make a discrete number of changes over the length of the recording
E.g. if t_alpha = [0,2.5, 7.5] and alpha = [1.0, 2.0, 0.5], then the first 2.5 sec of the original audio should be played with no TSM, the next 7.5 - 2.5 = 5 sec of the original audio should be played with time stretch factor 2x, and the rest of the recording should be played with time stretch factor 0.5.

Use a simple example like this to verify that your code is working properly – you should be able to double check that the total duration is what is expected, and also listen to confirm that it is doing the time stretch properly


#### OLA Method

In [6]:
def tsm_overlap_add(x, alpha=1, L=220):
    """
    Returns a time-stretched signal

    Inputs
     - x: input signal
     - alpha: time stretch factor
     - L: frame length

    Outputs
      - y: time-stretched signal
    """
    Hs = L // 2  
    Ha = int(Hs // alpha)

    output = np.zeros(int(len(x) * alpha))
    window_sum = np.zeros(len(output))

    window = np.hanning(L)

    for n in range(0, len(x) - L, Ha):
        frame = x[n:n+L] * window
        m = int(n * alpha)
        if m + L > len(output):
            break
        output[m:m+L] += frame
        window_sum[m:m+L] += window

    window_sum[window_sum < 1e-10] = 1e-10
    output = output/window_sum

    return output

### Phase Vocoder

In [7]:
def principle_arg(angle):
    """Wrap phase angle to [-π, π] range"""
    return math.fmod(angle + np.pi, 2*np.pi) - np.pi

In [8]:
def getInstantFreq(phase, m, sr, L, Ha):
    """
    Compute instantaneous frequency for phase vocoder
    
    Args:
        phase: Phase matrix (frequency_bins × frames)
        m: Current frame index
        sr: Sample rate
        L: FFT size
        Ha: Analysis hop size
        
    Returns:
        Array of instantaneous frequencies (radians/sample)
    """
    freq_bins = np.arange(phase.shape[0]) * sr / L
    w_nom = 2 * np.pi * freq_bins * Ha / sr  # Nominal frequency
    delta_phi = phase[:, m+1] - phase[:, m]  # Phase difference
    
    # Phase deviation from nominal frequency
    princ_arg = np.vectorize(principle_arg)(delta_phi - w_nom)
    
    # Instantaneous frequency
    w_if = w_nom + princ_arg / Ha
    return w_if

In [9]:
def reconstruction(X_mod, L, Hs, alpha = None):
    """
    Custom inverse STFT implementation
    
    Args:
        X_mod: Modified STFT matrix
        L: Window length
        Hs: Synthesis hop size
        
    Returns:
        Time-domain signal
    """    
    window = np.hanning(L)
    output_length = int((X_mod.shape[1] - 1) * Hs)
    y = np.zeros(output_length, dtype=np.complex128)
    window_sum = np.zeros(output_length)
    
    for i in range(X_mod.shape[1]):
        start = i * Hs
        frame = np.fft.irfft(X_mod[:, i], n=L)
        
        if start + L > len(y):
            break
            
        y[start:start+L] += frame * window
        window_sum[start:start+L] += window
    
    window_sum[window_sum < 1e-10] = 1e-10
    y = np.real(y / window_sum)
    
    return y

In [10]:
def tsm_phase_vocoder(x, alpha=1, L=2048, sr=22050):
    """
    Phase vocoder time-scale modification
    
    Args:
        x: Input signal
        alpha: Time stretch factor
        L: Frame length
        sr: Sample rate
        
    Returns:
        Time-stretched signal
    """
    Hs = int(L // 4)
    Ha = int(Hs / alpha) 
    
    # Compute STFT
    S = lb.stft(x, n_fft=L, hop_length=Ha, win_length=L, window='hann')
    S_mag = np.abs(S)
    S_phase = np.angle(S)
    
    # Initialize modified phase
    phase_mod = np.zeros_like(S_phase)
    phase_mod[:, 0] = S_phase[:, 0]
    
    # Phase propagation
    for m in range(S_phase.shape[1] - 1):
        w_if = getInstantFreq(S_phase, m, sr, L, Ha)
        phase_mod[:, m+1] = phase_mod[:, m] + w_if * Hs
    
    # Reconstruct signal
    X_mod = S_mag * np.exp(1j * phase_mod)
    y = reconstruction(X_mod, L, Hs)

    return y

### Hybrid method

In [11]:
def median_filter(Y, lh=6, lp=6):
    """
    Applys median filter on magnitude spectrograms

    Inputs
     - Y: magnitude spectrogram 
     - lh: harmonic median filter half length
     - lp: percussive median filter half length

    Outputs
      - Yh: filtered harmonic magnitude spectrogram
      - Yp: filtered percussive magnitude spectrogram
    """

    window_h = lh + 1
    window_p = lp + 1

    # Apply horizontal median filter
    Yh = medfilt(Y, kernel_size=(1, window_h))

    # Apply vertical median filter
    Yp = medfilt(Y, kernel_size=(window_p, 1))

    return Yh, Yp

In [12]:
def harmonic_percussive_separation(x, sr=22050, fft_size=2048, hop_length=512, lh=6, lp=6):
    """
    Returns frequency and time domain representaions of the harmonic and percussive signals separated

    Inputs
     - x: input signal
     - fft_size: size of FFT 
     - hop_length: hop size
     - lh: harmonic median filter half length
     - lp: percussive median filter half length

    Outputs
      - xh: time domain harmonic signal
      - xp: time domain percussive signal
      - Xh: harmonic modified STFT
      - Xp: percussive modified STFT
    """

    X = lb.stft(x, n_fft=fft_size, hop_length=hop_length, win_length=fft_size)

    # filter magnitude spectrum
    Y = np.abs(X)
    (m,k) = Y.shape
    Yh, Yp = median_filter(Y, lh, lp) 

    # Generate Binary Masks
    Mh = np.zeros((m,k))
    Mp = np.zeros((m,k))
    
    for j in range(m):
      for i in range(k):
        if Yh[j,i] > Yp[j,i]:
          Mh[j,i] = 1
          Mp[j,i] = 0
        else:
          Mh[j,i] = 0
          Mp[j,i] = 1 

    # Apply Binary Masks
    Xh = X * Mh
    Xp = X * Mp

    # Get time domain signals
    xh = reconstruction(Xh, fft_size, hop_length)
    xp = reconstruction(Xp, fft_size, hop_length)

    return xh, xp, Xh, Xp

In [13]:
def tsm_hybrid(x, alpha, sr=22050, fft_size=2048, hop_length=512, lh=6, lp=6):
    """
    Returns time stretched signal y

    Inputs
     - x: input signal
     - alpha: time stretch factor
     - fft_size: size of FFT 
     - hop_length: hop size
     - lh: harmonic median filter half length
     - lp: percussive median filter half length
     

    Outputsi
      - y: time stretched signal
    """
    xh, xp, Xh, Xp = harmonic_percussive_separation(x, sr=sr, fft_size=fft_size, hop_length=hop_length, lh=lh, lp=lp)

    x_overlap = tsm_overlap_add(xp, alpha=alpha, L=fft_size)
    x_phase_vocoder = tsm_phase_vocoder(xh, alpha=alpha, L=fft_size, sr=sr)

    len_diff = np.abs(len(x_overlap) - len(x_phase_vocoder))

    if len(x_overlap) > len(x_phase_vocoder):
        x_phase_vocoder = np.pad(x_phase_vocoder, (0, len_diff), constant_values=0)
    elif len(x_overlap) < len(x_phase_vocoder):
        x_overlap = np.pad(x_overlap, (0, len_diff), constant_values=0)
            
    y = x_overlap + x_phase_vocoder
    return y

### Including dynamic variable

In [47]:
def tsm_hybrid_variable(x, sr, t_alpha, alpha):
    """
    x: Original time-domain signal
    sr: sampling rate
    t_alpha: 1-d array that specifies time in sec, in the timeline of the original recording (always has to start at 0)
    alpha: 1-d array of stretch factors
    """
    y = []
    for i in np.arange(len(t_alpha)):
        if i + 1 < len(t_alpha):
            diff = t_alpha[i + 1] - t_alpha[i]
        print(diff)
        idx_start = t_alpha[i] * sr
        idx_end = diff * sr
        x = x[int(idx_start): int(idx_end)]
        y.extend(tsm_hybrid(x, alpha[i]))
    return y
    

In [48]:
choir, c_sr = lb.load("choir.wav")

In [49]:
t_alpha = [0, 2.5, 7.5]
alpha = [1, 2, 0.5]
# for i in len(t_alpha):
#     diff = t

In [54]:
t_alpha.extend(alpha)

In [34]:
len(choir)

264600

In [None]:
2.5 * c_sr

110250.0

In [50]:
y = tsm_hybrid_variable(choir, c_sr, t_alpha, alpha)

2.5
5.0
5.0


  Yh = medfilt(Y, kernel_size=(1, window_h))


In [51]:
ipd.Audio(y, rate=c_sr)