In [28]:
%%bash
# Development-only setup: rebuild a small test corpus by copying the first
# ${dur} seconds of selected stereo recordings into per-speaker subdirectories.
# Cell to run prior to running following cells in notebook during development.
# Delete this cell prior to publishing.

# Stop at the first failing command or unset variable, so that a failed `sox`
# call surfaces as a cell error instead of being silently ignored.
set -euo pipefail

stereodir=/global/scratch/users/rsprouse/yidcorp/audio/stereo
tgtdir=/global/home/groups/fc_phonlab/spkrcorpus
dur=45   # passed to `sox ... trim 0 ${dur}`

# Clear out existing files. `${tgtdir:?}` aborts the script if `tgtdir` is
# empty or unset, so this can never expand to `rm -rf /audio`.
rm -rf -- "${tgtdir:?}/audio" "${tgtdir:?}/diarized"

# trim_speaker SPEAKER FILE...
# Create ${tgtdir}/audio/stereo/SPEAKER and write a trimmed copy of each FILE
# from ${stereodir} into it.
trim_speaker() {
    local spkrdir="${tgtdir}/audio/stereo/$1"
    shift
    mkdir -p -- "${spkrdir}"
    local name
    for name in "$@"; do
        sox "${stereodir}/${name}" "${spkrdir}/${name}" trim 0 "${dur}"
    done
}

# Speaker 1
trim_speaker speaker_1 'Moishe_Gorelik_Tape4.wav' 'Moishe_Gorelik_Tape5.wav'

# Speaker 2
trim_speaker speaker_2 'Zhenya_Raykhman_Tape2.wav' 'Zhenya_Raykhman_Tape3.wav'

# Prepare audio files for diarization

Our source audio files are stereo files that need to be resampled and the left and right channels separated prior to diarization.

In [10]:
from pathlib import Path
import diarize_utils as utils
from pyannote.audio import Pipeline

## Define the project

The source audio files are stored in a subdirectory named `audio/stereo` in the project root. The left and right channel outputs will be stashed in `left` and `right` subdirectories of `audio`.

In [11]:
# Root directory of the project; the other paths in this notebook are built from it.
projroot = Path('/global/home/groups/fc_phonlab/spkrcorpus')

# Directory holding the stereo source recordings (`audio/stereo` under the
# project root). The channel-extraction cell reads its input files from here;
# without this definition that cell fails with a NameError on a fresh kernel.
srcdir = projroot/'audio'/'stereo'

In [12]:
# Desired output type for the diarization results: 'eaf' or 'TextGrid'.
# The value is used both as a subdirectory name under `diarized` and as the
# extension of the output files.
output_type = "TextGrid"

In [13]:
# `channelmap` pairs a channel number with the name of the subdirectory used
# for that channel. Every channel listed here is extracted from a source audio
# file and downsampled, and each channel is then diarized separately under its
# own subdirectory ('left' and 'right' for stereo input).
channelmap = dict([
    (1, 'left'),
    (2, 'right'),
])
# For mono input audio files, a mapping such as the following can be used instead.
#channelmap = {1: 'ds'}

TODO: more on auth tokens

In [14]:
# The auth token is kept in a file in the project root rather than in the
# notebook itself. Only the first line of the file is read, and surrounding
# whitespace (including the trailing newline) is stripped.
tokenfile = projroot / 'pyannote-auth-token'
with tokenfile.open('r') as tf:
    auth_token = next(tf, '').strip()

## Extract the channels and downsample audio

The output audio will consist of a single channel from an input audio file that has been downsampled to 16000 Hz.

The `compare_dirs` function finds `stereo` files that do not yet have corresponding `left` and `right` output files. The `ext1` and `ext2` values ensure that `compare_dirs` only looks for `.wav` files in the corresponding directories. `compare_dirs` returns a dataframe in which each row identifies a file that requires processing.

We iterate over the rows of the `todo` dataframe and use `prep_audio` to extract one channel of audio and downsample. The resulting `.wav` file is stored in a `left` or `right` subdirectory.

In [30]:
verbose = True   # Set to False to suppress progress messages

# For every channel in `channelmap`, ask `compare_dirs` which source `.wav`
# files have no counterpart yet in that channel's directory, then run
# `prep_audio` on each of them.
for chan_num, chan_name in channelmap.items():
    chandir = projroot / 'audio' / chan_name
    todo = utils.compare_dirs(dir1=srcdir, ext1='.wav', dir2=chandir, ext2='.wav')
    for row in todo.itertuples():
        # The output keeps the same relative location and filename as the input.
        relfile = Path(row.relpath) / row.fname
        infile, outfile = srcdir / relfile, chandir / relfile
        if verbose:
            print(f'prep_audio: {outfile}')
        utils.prep_audio(infile, outfile, chan_num)

     relpath                      fname               barename
0  speaker_1   Moishe_Gorelik_Tape4.wav   Moishe_Gorelik_Tape4
1  speaker_1   Moishe_Gorelik_Tape5.wav   Moishe_Gorelik_Tape5
2  speaker_2  Zhenya_Raykhman_Tape2.wav  Zhenya_Raykhman_Tape2
3  speaker_2  Zhenya_Raykhman_Tape3.wav  Zhenya_Raykhman_Tape3
prep_audio: /global/home/groups/fc_phonlab/spkrcorpus/audio/left/speaker_1/Moishe_Gorelik_Tape4.wav
prep_audio: /global/home/groups/fc_phonlab/spkrcorpus/audio/left/speaker_1/Moishe_Gorelik_Tape5.wav
prep_audio: /global/home/groups/fc_phonlab/spkrcorpus/audio/left/speaker_2/Zhenya_Raykhman_Tape2.wav
prep_audio: /global/home/groups/fc_phonlab/spkrcorpus/audio/left/speaker_2/Zhenya_Raykhman_Tape3.wav
     relpath                      fname               barename
0  speaker_1   Moishe_Gorelik_Tape4.wav   Moishe_Gorelik_Tape4
1  speaker_1   Moishe_Gorelik_Tape5.wav   Moishe_Gorelik_Tape5
2  speaker_2  Zhenya_Raykhman_Tape2.wav  Zhenya_Raykhman_Tape2
3  speaker_2  Zhenya_Raykhman_T

## Instantiate the pipeline

TODO: more on setting params.

In [31]:
# Parameter values handed to `pipeline.instantiate` below.
# NOTE(review): `min_duration_off` is presumably in seconds -- confirm against
# the pyannote documentation.
parameters = dict(
    segmentation=dict(min_duration_off=0.3),
)

# Load the pretrained diarization pipeline, passing the token read from the
# token file.
pipeline = Pipeline.from_pretrained(
    'pyannote/speaker-diarization', use_auth_token=auth_token
)

pipeline.instantiate(parameters)

<pyannote.audio.pipelines.speaker_diarization.SpeakerDiarization at 0x2afef1480e20>

## Diarize the channels

The `compare_dirs` function finds downsampled `.wav` files that do not have a corresponding output file of the type identified by `output_type`. The `ext1` and `ext2` values ensure that `compare_dirs` only looks for `.wav` and `output_type` files in their corresponding directories.

In [None]:
# For every channel in `channelmap`, ask `compare_dirs` which downsampled
# `.wav` files have no output file of type `output_type` yet, then diarize
# each of them. `verbose` is set in the channel-extraction cell.
for chan_num, chan_name in channelmap.items():
    wavdir = projroot / 'audio' / chan_name
    outdir = projroot / 'diarized' / output_type / chan_name
    todo = utils.compare_dirs(
        dir1=wavdir, ext1='.wav', dir2=outdir, ext2='.' + output_type
    )
    for row in todo.itertuples():
        wavfile = wavdir / row.relpath / row.fname
        # Same relative location as the input; the bare filename gets the
        # `output_type` extension.
        outname = '{}.{}'.format(row.barename, output_type)
        outfile = outdir / row.relpath / outname
        if verbose:
            print(f'diarize: {outfile}')
        utils.diarize(wavfile, pipeline, chan_num, outfile)

diarize: /global/home/groups/fc_phonlab/spkrcorpus/diarized/TextGrid/left/speaker_1/Moishe_Gorelik_Tape4.TextGrid


[W NNPACK.cpp:51] Could not initialize NNPACK! Reason: Unsupported hardware.
