In [None]:
pip install b2aiprep

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

In [None]:
import os
from pathlib import Path

import IPython.display as Ipd

from b2aiprep import process as b2p
import torchaudio

In [None]:
import requests
def get_url(url, filename, timeout=30):
    """Download ``url`` with an HTTP GET and save the body to ``filename``.

    Parameters
    ----------
    url : str
        Address to fetch.
    filename : str or path-like
        Destination file. It is opened in binary write mode, so any existing
        content is replaced.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that an unresponsive server cannot
        block the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    """
    # Using the response as a context manager releases the streamed
    # connection even when an exception is raised part-way through.
    with requests.get(url, stream=True, timeout=timeout) as req:
        # Fail loudly rather than saving an error page under the target name.
        req.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in req.iter_content(chunk_size=1024):
                # Skip empty chunks.
                if chunk:
                    f.write(chunk)

In [None]:
# Sample recording used as `audio1` in the rest of the notebook.
# (A second sample is hosted on the same site at
# https://www.mq.edu.au/__data/assets/file/0016/912031/arthur01.wav)
url = "https://www.mq.edu.au/__data/assets/file/0010/911953/24.wav"
filename1 = "saved_file.wav"

# Save the recording locally, then load it from the saved file.
get_url(url, filename1)
audio1 = b2p.Audio.from_file(filename1)

In [None]:
from torchaudio.utils import download_asset

# Second sample: one of the torchaudio tutorial assets, loaded as `audio2`.
asset_key = "tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
filename2 = download_asset(asset_key)
audio2 = b2p.Audio.from_file(filename2)

# Show the value returned by download_asset.
filename2

In [None]:
# Embed an audio player for the first recording.
# squeeze() drops size-1 dimensions — presumably the channel axis of a mono signal; TODO confirm.
Ipd.display(Ipd.Audio(data=audio1.signal.squeeze(), rate=audio1.sample_rate))

In [None]:
# Embed an audio player for the second recording.
# squeeze() drops size-1 dimensions — presumably the channel axis of a mono signal; TODO confirm.
Ipd.display(Ipd.Audio(data=audio2.signal.squeeze(), rate=audio2.sample_rate))

#### Plot a spectrogram of the signal after resampling to 16 kHz

In [None]:
# Resample the first recording to 16 kHz and compute its spectrogram with
# toDb=True.
audio1_16 = b2p.resample_iir(audio1, lowcut=7000, new_sample_rate=16000, order=4)
specgram_log = b2p.specgram(audio1_16, toDb=True)

# Top panel: the resampled waveform; bottom panel: its spectrogram.
fig, axs = plt.subplots(2, 1)
b2p.plot_waveform(
    audio1_16.signal,
    sr=audio1_16.sample_rate,
    # The plotted signal is the resampled one, so label it as such.
    title="Resampled waveform (16 kHz)",
    ax=axs[0],
)
# The spectrogram is transposed before plotting.
b2p.plot_spectrogram(specgram_log.T, title="spectrogram", ax=axs[1])
fig.tight_layout()

#### Compute relevant features of the audio and write them out

In [None]:
# Extract features from the saved recording. With return_features=True the
# call yields the features alongside an output file reference; the third
# returned value is not used in this notebook.
features, outfile, _ = b2p.to_features(
    filename1,
    subject="s1",
    task="t1",
    return_features=True,
)

# Show the second returned value (presumably where the features were written; TODO confirm).
outfile

#### Plot the spectrogram from the features

In [None]:
# Top panel: waveform of the original recording.
# Bottom panel: the 'specgram' entry of the extracted features, drawn with a
# logarithmic colour normalisation.
fig, axs = plt.subplots(2, 1)
ax_wave, ax_spec = axs

# NOTE(review): vmin=0 has no finite value on a log scale — confirm the
# intended lower bound of the colour range.
color_norm = LogNorm(vmin=0, vmax=10)

b2p.plot_waveform(
    audio1.signal,
    sr=audio1.sample_rate,
    title="Original waveform",
    ax=ax_wave,
)
b2p.plot_spectrogram(
    features['specgram'].T,
    title="spectrogram",
    ax=ax_spec,
    norm=color_norm,
)
fig.tight_layout()

In [None]:
# Framing parameters for the Griffin-Lim transform: 20 ms windows and 10 ms
# hops, converted from milliseconds to samples at the resampled rate.
# NOTE(review): assumes these match the framing used by b2p.specgram — confirm.
n_fft = 512
sr_16k = audio1_16.sample_rate
win_length = int(sr_16k * 20 / 1000)
hop_length = int(sr_16k * 10 / 1000)

# Spectrogram of the resampled audio, computed with toDb=False.
specgram = b2p.specgram(audio1_16, n_fft=n_fft, toDb=False)

# Run the transposed spectrogram through Griffin-Lim and play the result.
griffin_lim = torchaudio.transforms.GriffinLim(
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
)
reconstructed_waveform = griffin_lim(specgram.T)
Ipd.display(Ipd.Audio(data=reconstructed_waveform, rate=audio1_16.sample_rate))

#### Plot the Mel filterbank

In [None]:
# Top panel: waveform of the original recording.
# Bottom panel: the 'melfilterbank' entry of the extracted features.
fig, axs = plt.subplots(2, 1)
ax_wave, ax_mel = axs

b2p.plot_waveform(
    audio1.signal,
    sr=audio1.sample_rate,
    title="Original waveform",
    ax=ax_wave,
)
b2p.plot_spectrogram(
    features['melfilterbank'].T,
    title="Mel spectrogram",
    ylabel='Mel filters',
    ax=ax_mel,
)
fig.tight_layout()

#### Plot the MFCC + delta coefficients

In [None]:
# Top panel: waveform of the original recording.
# Bottom panel: the 'mfcc' entry of the extracted features.
fig, axs = plt.subplots(2, 1)
ax_wave, ax_mfcc = axs

b2p.plot_waveform(
    audio1.signal,
    sr=audio1.sample_rate,
    title="Original waveform",
    ax=ax_wave,
)
b2p.plot_spectrogram(
    features['mfcc'].T,
    title="MFCC",
    ylabel='MFCC coeffs',
    ax=ax_mfcc,
)
fig.tight_layout()

#### Verify whether two recordings are from the same speaker

In [None]:
# Compare the two downloaded recordings using the ECAPA VoxCeleb
# speaker-recognition model identifier, with model_rate set to 16000.
score, prediction = b2p.verify_speaker(
    audio1,
    audio2,
    model='speechbrain/spkrec-ecapa-voxceleb',
    model_rate=16000,
)

# Report the score to two decimals and the prediction as a boolean.
print(f"Score: {float(score):.2f}, Prediction: {bool(prediction)}")

In [None]:
# Wrap the Griffin-Lim output in an Audio object at the resampled rate, then
# compare it against the original recording with the same model settings.
audio_reconstructed = b2p.Audio(
    signal=reconstructed_waveform,
    sample_rate=audio1_16.sample_rate,
)
score, prediction = b2p.verify_speaker(
    audio1,
    audio_reconstructed,
    model='speechbrain/spkrec-ecapa-voxceleb',
    model_rate=16000,
)

# Report the score to two decimals and the prediction as a boolean.
print(f"Score: {float(score):.2f}, Prediction: {bool(prediction)}")

### What did they say?

In [None]:
# Speech-to-text helper configured with the "openai/whisper-tiny" model id,
# running on the CPU, with timestamps requested.
stt = b2p.SpeechToText(
    model_id="openai/whisper-tiny",
    max_new_tokens=128,
    chunk_length_s=5,
    batch_size=16,
    return_timestamps=True,
    device="cpu",
)

In [None]:
# Transcribe the first recording and display the result.
# NOTE(review): language=None presumably lets the model pick the language — confirm.
transcription = stt.transcribe(audio1, language=None)
transcription

In [None]:
# Transcribe the second recording and display the result.
# This rebinds `transcription`, replacing the value from the first recording.
transcription = stt.transcribe(audio2, language=None)
transcription