In [27]:
from matplotlib import pyplot as plt
import soundfile as sf
import librosa.feature
import numpy as np
import scipy as sc
import torchaudio
import librosa
import torch
import math

In [28]:
# Resampling by linear interpolation: duration and pitch change together,
# because the result is written back at the original sample rate.
filepath = 'audio_16k/Basta_16k.wav'
data, samplerate = sf.read(filepath)

# interpolate() expects (batch, channels, time). sf.read yields (frames,) for
# mono and (frames, channels) otherwise, so add a channel axis when it is
# missing and move time to the last position.
samples = torch.as_tensor(data)
if samples.dim() == 1:
  samples = samples.unsqueeze(-1)
waveform = samples.T.unsqueeze(0)

# 'linear' rescales the time axis only. 'bilinear' on a (1, 1, frames, channels)
# tensor would also rescale the channel axis, dropping or blending channels.
interpolation_compressed = torch.nn.functional.interpolate(waveform, scale_factor=0.8, mode='linear')
interpolation_stretched = torch.nn.functional.interpolate(waveform, scale_factor=1.2, mode='linear')

# Back to the (frames, channels) layout that sf.write expects.
compressed_filepath = 'outputs/interpolation_0_8.wav'
compressed_squeezed = interpolation_compressed.squeeze(0).T.contiguous()
sf.write(compressed_filepath, compressed_squeezed.numpy(), samplerate)

stretched_filepath = 'outputs/interpolation_1_2.wav'
stretched_squeezed = interpolation_stretched.squeeze(0).T.contiguous()
sf.write(stretched_filepath, stretched_squeezed.numpy(), samplerate)


In [29]:
# Place your code for this part here
def naive_tempo_shift(wav, factor, n_fft=1024, hop=256):
  """Change tempo by analysing with one STFT hop and resynthesising with another.

  The file is analysed with hop ``int(hop * factor)`` and rebuilt with hop
  ``hop``; frame phases are left untouched (no phase correction), so the
  output length is roughly ``len(input) / factor``.

  Args:
    wav: path (or file object) accepted by ``sf.read``.
    factor: speed factor; > 1 shortens the audio, < 1 lengthens it.
    n_fft: FFT size, also used as the window length.
    hop: synthesis hop length in samples.

  Returns:
    ``(audio, samplerate)`` where ``audio`` is a tensor laid out like the
    array returned by ``sf.read`` (time first).

  Raises:
    ValueError: if ``int(hop * factor)`` is smaller than 1.
  """
  analysis_hop = int(hop * factor)
  if analysis_hop < 1:
    raise ValueError(
        f"factor={factor} gives analysis hop {analysis_hop}; it must be >= 1")

  data, samplerate = sf.read(wav)
  wave = torch.as_tensor(data)

  # An explicit Hann window replaces the implicit rectangular one, which
  # PyTorch warns about because of spectral leakage.
  window = torch.hann_window(n_fft, dtype=wave.dtype)

  # .t() is a no-op on 1-D (mono) input and swaps axes on (frames, channels).
  freq_domain = torch.stft(wave.t(), n_fft=n_fft, win_length=n_fft,
                           hop_length=analysis_hop, window=window,
                           return_complex=True)
  shifted = torch.istft(freq_domain, n_fft=n_fft, win_length=n_fft,
                        hop_length=hop, window=window)
  return shifted.t(), samplerate


In [30]:
# Render the naive tempo shift for each speed factor and save the result.
source_wav = 'audio_16k/Basta_16k.wav'
naive_jobs = (
    (0.8, 'outputs/naive_pitch_shift_0_8.wav'),
    (1.2, 'outputs/naive_pitch_shift_1_2.wav'),
)
for speed_factor, target_path in naive_jobs:
  output, samplerate = naive_tempo_shift(source_wav, speed_factor)
  sf.write(target_path, output, samplerate)

In [31]:
def get_acc_phase_delta(stft_left, stft_right):
  """Accumulate the frame-wise phase difference between two complex STFTs.

  Args:
    stft_left: complex STFT tensor with frames on the last axis.
    stft_right: complex STFT tensor of the same shape as ``stft_left``.

  Returns:
    Real tensor of the same shape holding the running sum (along the last
    axis) of ``angle(stft_right) - angle(stft_left)``, wrapped to [-pi, pi].
    The dtype follows the inputs, so float64 audio keeps float64 precision.
  """
  # angular distance between the two complex STFTs, per bin and frame
  phase_delta = torch.angle(stft_right) - torch.angle(stft_left)

  # phase[..., i] = phase_delta[..., i] + phase[..., i-1], done as one
  # cumulative sum; indexing from the end lets mono (2-D) and
  # multi-channel (3-D) STFTs share the same code path
  phase = torch.cumsum(phase_delta, dim=-1)

  # wrap to the principal range [-pi, pi]
  phase = phase - 2 * np.pi * torch.round(phase / (2 * np.pi))

  return phase

In [32]:
def time_stretch(signal, factor, win_size=1024, hop=1024//4):
  """Time-stretch audio with a phase vocoder.

  Two STFTs are taken with analysis hop ``int(hop * factor)``: one of the
  signal without its last ``hop`` samples and one of the signal delayed by
  ``hop`` samples. Their phase difference is accumulated by
  ``get_acc_phase_delta``, combined with the magnitudes of the delayed STFT,
  and resynthesised with hop ``hop``. Output length is roughly
  ``len(signal) / factor``.

  Args:
    signal: array or tensor shaped (frames,) or (frames, channels).
    factor: speed factor; > 1 shortens the audio, < 1 lengthens it.
    win_size: FFT size and Hann window length.
    hop: synthesis hop length in samples.

  Returns:
    Tensor with the same axis layout as ``signal`` (time first).

  Raises:
    ValueError: if ``int(hop * factor)`` is smaller than 1.
  """
  new_hop = int(hop * factor)
  if new_hop < 1:
    raise ValueError(
        f"factor={factor} gives analysis hop {new_hop}; it must be >= 1")

  wave = torch.as_tensor(signal)
  if not wave.is_floating_point():
    wave = wave.to(torch.get_default_dtype())

  # Give mono input an explicit channel axis so the phase accumulation always
  # sees (channels, bins, frames); the axis is removed again before returning.
  is_mono = wave.dim() == 1
  if is_mono:
    wave = wave.unsqueeze(-1)

  # create window in the signal's dtype to avoid mixing float32 with float64
  hann_window = torch.hann_window(win_size, dtype=wave.dtype)

  # draw two complex STFTs, offset from each other by one synthesis hop
  stft_left = torch.stft(wave[:-hop].T, n_fft=win_size, win_length=win_size,
                         hop_length=new_hop, window=hann_window, return_complex=True)
  stft_right = torch.stft(wave[hop:].T, n_fft=win_size, win_length=win_size,
                          hop_length=new_hop, window=hann_window, return_complex=True)

  # calculate accumulated phase delta, wrapped modulo 2 pi
  phase = get_acc_phase_delta(stft_left, stft_right)

  # rebuild the spectrum: unit phasor from the accumulated phase, scaled by
  # the magnitude of the delayed STFT
  unit_phasor = torch.complex(torch.cos(phase), torch.sin(phase))
  complex_new_stft = unit_phasor * stft_right.abs()
  output = torch.istft(complex_new_stft, n_fft=win_size, win_length=win_size,
                       hop_length=hop, window=hann_window).T

  if is_mono:
    output = output.squeeze(-1)
  return output

In [33]:
# Phase-vocoder renders: decode the source once, then stretch it at each factor.
data, samplerate = sf.read('audio_16k/Basta_16k.wav')

vocoder_jobs = (
    (0.8, 'outputs/phase_vocoder_0_8.wav'),
    (1.2, 'outputs/phase_vocoder_1_2.wav'),
)
for stretch_factor, target_path in vocoder_jobs:
  output = time_stretch(data, stretch_factor)
  sf.write(target_path, output, samplerate)