In [3]:
!pip install -U torchaudio librosa jiwer datasets transformers huggingface_hub evaluate python-dotenv wandb speechbrain

Collecting torchaudio
  Using cached torchaudio-0.13.0-cp38-cp38-manylinux1_x86_64.whl (4.2 MB)
Collecting librosa
  Using cached librosa-0.9.2-py3-none-any.whl (214 kB)
Collecting jiwer
  Using cached jiwer-2.5.1-py3-none-any.whl (15 kB)
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m451.7/451.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hCollecting transformers
  Using cached transformers-4.24.0-py3-none-any.whl (5.5 MB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hCollecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [54]:
import os

import torch
import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement, SepformerSeparation, VAD

# Sampling rate (Hz) that the raw recording is resampled to before further processing.
SAMPLE_RATE = 16_000

In [22]:
# audio_path = "/home/vpetukhov/other/Consulting/SEADS/FEM/survey_data/test_audio/test_hausa.wav"

In [49]:
# Resample the raw recording to SAMPLE_RATE and save the result next to the original
# as "<name>_<rate in kHz>k.wav".
audio_path_raw = "/home/vpetukhov/other/Consulting/SEADS/FEM/survey_data/audio/1663777780248.wav"
# splitext strips only the trailing extension; split(".")[0] would cut the path at the
# first dot anywhere in it (e.g. inside a directory name).
audio_path = os.path.splitext(audio_path_raw)[0] + f"_{SAMPLE_RATE // 1000}k.wav"

# `arr` is kept in memory: later cells slice it to export individual segments.
arr, org_sr = torchaudio.load(audio_path_raw)
arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=SAMPLE_RATE)
torchaudio.save(audio_path, src=arr, sample_rate=SAMPLE_RATE)

## Voice Activity Detection

In [87]:
# Load the pretrained voice-activity detector, run it on the resampled recording,
# and print the segment boundaries it found.
vad = VAD.from_hparams(
    source="speechbrain/vad-crdnn-libriparty",
)
boundaries = vad.get_speech_segments(audio_path)
vad.save_boundaries(
    boundaries,
    print_boundaries=True,
)

segment_001  0.00  244000.00 SPEECH
segment_002  244000.00  259199.00 NON_SPEECH
segment_003  259199.00  954560.00 SPEECH
segment_004  954560.00  960000.00 NON_SPEECH
segment_005  960000.00  974560.00 SPEECH
segment_006  974560.00  1027199.00 NON_SPEECH
segment_007  1027199.00  2027519.00 SPEECH
segment_008  2027519.00  2080000.00 NON_SPEECH
segment_009  2080000.00  2155839.00 SPEECH
segment_010  2155839.00  2190079.00 NON_SPEECH
segment_011  2190079.00  3180800.00 SPEECH
segment_012  3180800.00  3200000.00 NON_SPEECH
segment_013  3200000.00  4704640.00 SPEECH
segment_014  4704640.00  5120000.00 NON_SPEECH
segment_015  5120000.00  5175520.00 SPEECH
segment_016  5175520.00  5280000.00 NON_SPEECH
segment_017  5280000.00  5356159.00 SPEECH
segment_018  5356159.00  5440000.00 NON_SPEECH
segment_019  5440000.00  5459679.00 SPEECH
segment_020  5459679.00  5800159.00 NON_SPEECH
segment_021  5800159.00  6244320.00 SPEECH
segment_022  6244320.00  6273440.00 NON_SPEECH
segment_023  6273440.00  6

In [60]:
# Convert the VAD boundaries (presumably in seconds, given the multiplication by the
# sampling rate) into integer sample indices usable for slicing the waveform.
# The tensor check makes the cell safe to re-run: once `boundaries` has been converted
# to a NumPy array, a second run is a no-op instead of an AttributeError on `.detach()`.
if torch.is_tensor(boundaries):
    # Round before casting: float32 products can land just below the intended integer
    # (e.g. 1027199.94 instead of 1027200), and a bare astype(int) truncates them to
    # one sample short.
    boundaries = torch.round(boundaries * SAMPLE_RATE).detach().numpy().astype(int)

In [77]:
# Export the selected VAD segments as individual WAV files under "<recording>_segments/".
# splitext strips only the trailing extension, so dots elsewhere in the path are preserved.
# The trailing slash is kept because later cells build paths by plain concatenation.
split_audio_path = os.path.splitext(audio_path_raw)[0] + "_segments/"
# Create the output directory up front; without it the first save fails on a fresh run.
os.makedirs(split_audio_path, exist_ok=True)

for i, (start, end) in enumerate(boundaries):
    # Only even-indexed rows are exported, numbered consecutively as seg_0, seg_1, ...
    # NOTE(review): if every row of `boundaries` is already a speech segment, this drops
    # every other one — confirm that rows really alternate SPEECH / NON_SPEECH.
    if i % 2 != 0:
        continue

    # os.path.join avoids the doubled slash produced by "<dir>/" + "/seg_N.wav".
    seg_file = os.path.join(split_audio_path, f"seg_{i // 2}.wav")
    torchaudio.save(seg_file, src=arr[:, start:end], sample_rate=SAMPLE_RATE)

In [86]:
# Display the directory that the exported segments were written to.
split_audio_path

'/home/vpetukhov/other/Consulting/SEADS/FEM/survey_data/audio/1663777780248_segments/'

## Enhance audio

In [82]:
# Segment chosen as input for the enhancement step.
sel_seg_path = f"{split_audio_path}seg_8.wav"

In [83]:
# Pretrained spectral-mask speech-enhancement model.
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
)

In [85]:
# Enhance the selected segment and write the result next to it as "<segment>_enhanced.wav".
# unsqueeze(0) adds a leading batch dimension for enhance_batch.
noisy = enhance_model.load_audio(path=sel_seg_path).unsqueeze(0)

# Add relative length tensor (a single batch item at full length).
enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))

# Saving enhanced signal on disk.
# splitext strips only the trailing extension; split(".")[0] would cut the path at the
# first dot anywhere in it.
enhanced_path = os.path.splitext(sel_seg_path)[0] + '_enhanced.wav'
# NOTE(review): 16000 presumably matches the rate the enhancement model works at — confirm.
torchaudio.save(enhanced_path, enhanced.cpu(), 16000)

## Split speakers

In [88]:
# Segment chosen as input for the speaker-separation step.
sel_seg_path = f"{split_audio_path}seg_5.wav"

In [89]:
# Pretrained SepFormer source-separation model; downloaded files are cached in `savedir`.
model = SepformerSeparation.from_hparams(
    source="speechbrain/sepformer-wham",
    savedir="pretrained_models/sepformer-wham",
)

In [90]:
# Separate the selected segment into individual sources and save each one as
# "<segment>_s1.wav", "<segment>_s2.wav", ...
est_sources = model.separate_file(path=sel_seg_path)

# splitext strips only the trailing extension; split(".")[0] would cut the path at the
# first dot anywhere in it.
sep_base = os.path.splitext(sel_seg_path)[0]
# The last axis of est_sources indexes the separated sources.
for src_idx in range(est_sources.shape[-1]):
    # Saved at 8000 Hz: the model resamples its input from 16000 Hz to 8000 Hz
    # (see the message this cell prints), so the estimates are at that rate.
    torchaudio.save(f"{sep_base}_s{src_idx + 1}.wav", est_sources[:, :, src_idx].detach().cpu(), 8000)

Resampling the audio from 16000 Hz to 8000 Hz
