In [None]:
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torchaudio

In [None]:
# Download the LibriSpeech "test-clean" split into the current directory and open it as a dataset
test_dataset = torchaudio.datasets.LIBRISPEECH(root="./", url="test-clean", download=True)

In [None]:
# Fields of one dataset item, in order:
# 1 waveform
# 2 sample rate
# 3 transcript
# 4 speaker_id
# 5 chapter_id
# 6 utterance_id
test_dataset[0]

In [None]:
# Shape of the first item's waveform (field 1 of the item tuple)
test_dataset[0][0].shape

In [None]:
# Duration in seconds: number of samples in the first row of the waveform
# (presumably channel 0 - confirm) divided by the sample rate
len(test_dataset[0][0][0]) / test_dataset[0][1]

In [None]:
# Keep the first row of the first item's waveform and that item's sample rate for the rest of the notebook
audioData, sr = test_dataset[0][0][0], test_dataset[0][1]

In [None]:
import IPython.display as ipd

In [None]:
# Play back at the sample rate reported by the dataset; a mismatched rate
# (previously a hard-coded 17000) changes playback speed and pitch.
ipd.Audio(audioData, rate=sr)

In [None]:
# Downsample to half the original rate. orig_sr/target_sr are passed by keyword
# because librosa >= 0.10 no longer accepts them positionally.
y_8k = librosa.resample(audioData.numpy(), orig_sr=sr, target_sr=sr // 2)
len(y_8k)

In [None]:
# Play at the same rate the signal was resampled to (sr // 2) instead of a hard-coded 8000
ipd.Audio(y_8k, rate=sr // 2)

In [None]:
# Downsample to a quarter of the original rate. orig_sr/target_sr are passed by keyword
# because librosa >= 0.10 no longer accepts them positionally.
y_4k = librosa.resample(audioData.numpy(), orig_sr=sr, target_sr=sr // 4)
len(y_4k)

In [None]:
# Play at the same rate the signal was resampled to (sr // 4) instead of a hard-coded 4000
ipd.Audio(y_4k, rate=sr // 4)

In [None]:
# Inspect the raw sample values before normalization
audioData

In [None]:
audio_np = audioData.numpy()
# Peak-normalize so the largest absolute sample becomes 1.
# Use NumPy's vectorized max: the builtin max() walks the array one element at a time in Python.
norm_audio = audio_np / np.abs(audio_np).max()
ipd.Audio(norm_audio, rate=sr)

In [None]:
# Inspect the peak-normalized samples
norm_audio

In [None]:
# Quantize the normalized signal to signed integers of `Bit` bits.
Bit = 8
max_value = 2 ** (Bit - 1)

# Scale to the integer range, round to the nearest level, then clamp to
# [-max_value, max_value - 1] so that +1.0 does not overflow the top level.
quantized_audio = np.clip(
    np.round(norm_audio * max_value).astype(int),
    -max_value,
    max_value - 1,
)

ipd.Audio(quantized_audio, rate=sr)

In [None]:
# Peek at 100 consecutive quantized samples
quantized_audio[20000:20100]

In [None]:
import librosa.display

In [None]:
# Plot a 3000-sample excerpt of the waveform.
# librosa.display.waveplot was removed in librosa 0.10; waveshow is its replacement.
fig = plt.figure(figsize=(14,5))
librosa.display.waveshow(audio_np[10000:13000], sr=sr)

In [None]:
# Short-time Fourier transform: 1024-point FFT, 1024-sample window, hop of 512 samples
S = librosa.stft(y=audio_np, n_fft=1024, hop_length=512, win_length=1024)
(audio_np.shape, S.shape)

In [None]:
# Inspect the STFT values returned by librosa
S

In [None]:
# Power spectrogram: squared magnitude of every STFT bin
D = np.square(np.abs(S))
(D, D.shape)

In [None]:
# power_to_db expects a power spectrogram, so feed it D = |S|^2 rather than the complex STFT S
# (with complex input librosa discards the phase and treats the magnitude as if it were power).
log_S = librosa.power_to_db(D, ref=np.max)

plt.figure(figsize=(12,4))
# Use the signal's own sample rate and the STFT hop length so the time axis is scaled correctly
librosa.display.specshow(log_S, sr=sr, hop_length=512, x_axis='time')

In [None]:
def frame_audio(audio, FFT_size=1024, hop_size=20, sample_rate = 22050):
    """Cut a 1-D signal into overlapping frames of ``FFT_size`` samples.

    The signal is first reflect-padded by ``int(FFT_size / 2)`` samples at both
    ends. Consecutive frames start ``round(sample_rate * hop_size / 1000)``
    samples apart, i.e. ``hop_size`` is given in milliseconds.

    Parameters
    ----------
    audio : array-like
        One-dimensional signal.
    FFT_size : int
        Number of samples per frame.
    hop_size : float
        Hop between frame starts, in milliseconds.
    sample_rate : int
        Sample rate used to convert ``hop_size`` into samples.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(frame_num, FFT_size)``, one frame per row.
    """
    padded = np.pad(audio, int(FFT_size/2), mode='reflect')
    step = np.round(sample_rate*hop_size / 1000).astype(int)
    count = int((len(padded) - FFT_size) / step) + 1

    # Row n holds padded[n*step : n*step + FFT_size]; build all rows with one fancy-index lookup.
    starts = step * np.arange(count)
    sample_idx = starts[:, None] + np.arange(FFT_size)[None, :]
    return padded[sample_idx].astype(np.float64)

# Pass the signal's real sample rate: frame_audio defaults to 22050 Hz, which would turn the
# 20 ms hop into the wrong number of samples for this recording.
audio_framed = frame_audio(audio_np, sample_rate=sr)
print("Framed audio shape: {}".format(audio_framed.shape))

In [None]:
from scipy import signal

# Hann window with the same length as the frames (1024 samples); broadcasting applies it to every frame
window = signal.get_window('hann', 1024, fftbins=True)
audio_win = audio_framed * window
ind = 1

# Three stacked panels for frame `ind`: the window, the raw frame, and the windowed frame
plt.figure(figsize=(15,6))
for panel, trace in enumerate((window, audio_framed[ind], audio_win[ind]), start=1):
    plt.subplot(3, 1, panel)
    plt.plot(trace)
    plt.grid(True)

In [None]:
# No-op self-assignment: D is left unchanged (this cell can be deleted)
D = D

In [None]:
# 40-band mel filterbank for a 1024-point FFT. sr and n_fft are passed by keyword because
# librosa >= 0.10 no longer accepts them positionally; sr is the signal's own sample rate.
mel_basis = librosa.filters.mel(sr=sr, n_fft=1024, n_mels=40)
# Project the power spectrogram onto the mel bands: (n_mels, n_bins) @ (n_bins, n_frames)
mel_S = np.dot(mel_basis, D)
mel_S.shape

In [None]:
plt.figure(figsize=(12,4))
# Convert mel-band power to decibels and show it on a mel-scaled frequency axis
log_mel_S = librosa.power_to_db(mel_S)
librosa.display.specshow(log_mel_S, sr=sr, x_axis='time', y_axis='mel')
# Title typo fixed ("sepctrogram" -> "spectrogram")
plt.title('Mel power spectrogram')
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()

In [None]:
# 13 MFCCs computed from the log-mel spectrogram; cast to float32 to keep memory use down
mfcc = librosa.feature.mfcc(S=log_mel_S, n_mfcc=13).astype(np.float32)
print(mfcc.shape)
plt.figure(figsize=(12,4))
librosa.display.specshow(mfcc)