In [115]:
import logging
import os
import shutil
import subprocess
import torch
import wave
from glob import glob
from pyannote.audio import Pipeline
from pydub import AudioSegment
from tqdm import tqdm

In [95]:
# Send INFO-and-above records to ./logs.log, each line prefixed with a
# timestamp and the level name. basicConfig configures the root logger.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='logs.log',
)
# Logger shared by all cells below.
logger = logging.getLogger(__name__)

In [9]:
# Load the pretrained speaker-diarization pipeline.
#
# SECURITY: the access token must never be written into the notebook. It is
# read from the HF_TOKEN environment variable; when that is unset, `True` is
# passed so the token cached by `huggingface-cli login` is used instead.
# Any token that was previously committed in this cell should be revoked.
hf_auth_token = os.environ.get("HF_TOKEN") or True
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=hf_auth_token)
# Prefer the GPU, but fall back to CPU so the cell still runs without CUDA.
pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

<pyannote.audio.pipelines.speaker_diarization.SpeakerDiarization at 0x7f5e0ced2b90>

In [96]:
# Convert every file in ./audio_raw to a 44.1 kHz, 16-bit, 2-channel PCM WAV
# in ./wav_raw (existing outputs are overwritten via ffmpeg's -y).
os.makedirs('./wav_raw', exist_ok=True)
audio_paths = sorted(glob('./audio_raw/*'))
for audio_path in tqdm(audio_paths):
    audio_name = os.path.basename(audio_path)
    # splitext strips only the final extension, so "a.1.mp3" and "a.2.mp3"
    # no longer collapse onto the same output name.
    audio_name_no_ext = os.path.splitext(audio_name)[0]
    output_path = f'./wav_raw/{audio_name_no_ext}.wav'
    # Argument list instead of a shell string: file names containing spaces,
    # quotes or shell metacharacters are passed to ffmpeg verbatim.
    ffmpeg_args = [
        'ffmpeg', '-y',
        '-i', audio_path,
        '-vn', '-acodec', 'pcm_s16le', '-ar', '44100', '-ac', '2',
        output_path,
    ]
    try:
        ffmpeg_output = subprocess.run(
            ffmpeg_args,
            check=True,
            capture_output=True,
            text=True)
    except subprocess.CalledProcessError as error:
        # stderr is captured, so record it before the exception propagates.
        logger.error('ffmpeg failed for %s: %s', audio_path, error.stderr)
        raise
    logger.info(ffmpeg_output)

100%|███████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.42s/it]


In [112]:
def get_wav_duration(file_path):
    """Return the length of a WAV file in seconds.

    The value is the file's frame count divided by its frame rate, as
    reported by the standard-library ``wave`` reader.

    Args:
        file_path: path (or file object) accepted by ``wave.open``.

    Returns:
        Duration in seconds as a float.
    """
    reader = wave.open(file_path, 'rb')
    try:
        total_frames = reader.getnframes()
        frames_per_second = float(reader.getframerate())
        return total_frames / frames_per_second
    finally:
        # Always release the file handle, even if reading the header fails.
        reader.close()


# Copy every WAV from ./wav_raw into ./wav_10_minutes, cutting files longer
# than SEGMENT_SECONDS into consecutive chunks named <stem>_000000.wav,
# <stem>_000001.wav, ... Shorter files are copied unchanged.
SEGMENT_SECONDS = 600
os.makedirs('./wav_10_minutes', exist_ok=True)
wav_paths = sorted(glob('./wav_raw/*.wav'))
for wav_path in tqdm(wav_paths):
    duration = get_wav_duration(wav_path)
    wav_name = os.path.basename(wav_path)
    # splitext strips only ".wav", keeping any other dots in the stem.
    wav_name_no_ext = os.path.splitext(wav_name)[0]
    if duration > SEGMENT_SECONDS:
        # The output name is a printf-style pattern for ffmpeg's segment
        # muxer, so a literal '%' in the stem has to be doubled.
        escaped_stem = wav_name_no_ext.replace('%', '%%')
        segment_wav_path = f'./wav_10_minutes/{escaped_stem}_%06d.wav'
        # Argument list instead of a shell string so unusual file names are
        # handed to ffmpeg verbatim.
        ffmpeg_args = [
            'ffmpeg', '-y',
            '-i', wav_path,
            '-f', 'segment', '-segment_time', str(SEGMENT_SECONDS),
            '-c', 'copy',
            segment_wav_path,
        ]
        try:
            ffmpeg_output = subprocess.run(
                ffmpeg_args,
                check=True,
                capture_output=True,
                text=True)
        except subprocess.CalledProcessError as error:
            # stderr is captured, so record it before the exception propagates.
            logger.error('ffmpeg failed for %s: %s', wav_path, error.stderr)
            raise
        logger.info(ffmpeg_output)
    else:
        shutil.copy(wav_path, f'./wav_10_minutes/{wav_name_no_ext}.wav')

100%|███████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.03it/s]


In [111]:
def diarize_and_remove_overlap(segmented_wav_path):
    """Diarize one WAV file and keep only speaker turns that do not overlap.

    The module-level ``pipeline`` is run on ``segmented_wav_path``. Its turns
    are sorted by start time and walked in order: a turn is kept when it
    starts at or after the end of the most recently kept turn, and dropped
    otherwise. The earliest turn is therefore always kept.

    Args:
        segmented_wav_path: value passed to ``pipeline``; it is also used as
            the single key of the returned dictionary.

    Returns:
        ``{segmented_wav_path: {speaker: [[start, end], ...]}}`` holding the
        kept turns of each speaker in chronological order. The inner dict is
        empty when the pipeline yields no turns.
    """
    diarization = pipeline(segmented_wav_path)
    start_end_speakers_list = sorted(
        (
            [[turn.start, turn.end], speaker]
            for turn, _, speaker in diarization.itertracks(yield_label=True)
        ),
        key=lambda sublist: sublist[0][0],
    )

    start_end_speakers_list_no_overlap = []
    for current_sublist in start_end_speakers_list:
        # Drop a turn that begins before the last kept turn has ended.
        if (start_end_speakers_list_no_overlap
                and start_end_speakers_list_no_overlap[-1][0][1] > current_sublist[0][0]):
            continue
        start_end_speakers_list_no_overlap.append(current_sublist)

    total_count = len(start_end_speakers_list)
    kept_count = len(start_end_speakers_list_no_overlap)
    logger.info(
        f'Original diarized segments: {total_count}, '
        f'after overlap removal: {kept_count}'
    )
    # Guard against division by zero when nothing was diarized.
    lost_ratio = 1 - kept_count / total_count if total_count else 0.0
    # The '%' format spec multiplies by 100, so this is a true percentage.
    logger.info(f'Lost {lost_ratio:.2%}')

    speaker_dict = {segmented_wav_path: {}}
    for start_end, speaker in start_end_speakers_list_no_overlap:
        speaker_dict[segmented_wav_path].setdefault(speaker, []).append(start_end)
    return speaker_dict

In [113]:
# Diarize every file in ./wav_10_minutes and collect one
# {wav_path: {speaker: [[start, end], ...]}} dictionary per file.
speaker_dict_list = []
# Sorted so the processing order (and the log) is the same on every run.
segmented_wav_paths = sorted(glob('./wav_10_minutes/*.wav'))
for segmented_wav_path in tqdm(segmented_wav_paths):
    speaker_dict_list.append(diarize_and_remove_overlap(segmented_wav_path))

100%|███████████████████████████████████████████████████████████████████████████| 2/2 [00:44<00:00, 22.42s/it]


In [131]:
# Cut each diarized WAV into per-speaker snippets, written to
# ./diarized_results/<wav stem>/<speaker>/<index>.wav
for speaker_dict in tqdm(speaker_dict_list):
    # Each entry built by diarize_and_remove_overlap has exactly one key (the
    # source WAV path); look it up once instead of on every access.
    segmented_wav_path, segments_by_speaker = next(iter(speaker_dict.items()))
    segmented_wav_name = os.path.basename(segmented_wav_path)
    # splitext strips only ".wav", keeping any other dots in the stem.
    segmented_wav_name_no_ext = os.path.splitext(segmented_wav_name)[0]
    diarized_sub_dir_path = f'./diarized_results/{segmented_wav_name_no_ext}'
    # makedirs also creates ./diarized_results itself on a fresh checkout and
    # is a no-op when the directory already exists.
    os.makedirs(diarized_sub_dir_path, exist_ok=True)
    wav_source = AudioSegment.from_wav(segmented_wav_path)
    for speaker, start_end_list in segments_by_speaker.items():
        speaker_path = f'{diarized_sub_dir_path}/{speaker}'
        os.makedirs(speaker_path, exist_ok=True)
        for index, start_end in enumerate(start_end_list):
            # Slice bounds are scaled by 1000 before indexing the pydub
            # segment (presumably seconds -> milliseconds; confirm units).
            t1 = start_end[0] * 1000
            t2 = start_end[1] * 1000
            wav_snippet = wav_source[t1: t2]
            wav_snippet_path = f'{speaker_path}/{index}.wav'
            wav_snippet.export(wav_snippet_path, format="wav")

100%|███████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 35.44it/s]


In [132]:
# TODO: remove all snippets shorter than 3 seconds

In [133]:
# remove noise from main speaker

In [134]:
# check required properties to send to trainer