In [4]:
from pathlib import Path
import diarize_utils as utils
from phonlab.utils import dir2df

## Define project



In [5]:
# Root of the corpus project; every other path in this notebook hangs off it.
projroot = Path('/global/scratch/users/rsprouse/yidcorp/')

# Forwarded as `buffer_ms` to rttm2eaf() when the .eaf files are generated.
buffer_ms = 250

# Per-channel directories holding the diarization (.rttm) results.
rttmleft, rttmright = (
    projroot.joinpath('diarized', 'rttm', side) for side in ('left', 'right')
)

## Find existing .rttm files and merge left/right channels

In [6]:
# Parent directory of the per-channel .rttm directories.
rttmdir = projroot / 'diarized' / 'rttm'

# Index the .rttm files of each channel.  The filename pattern is a regular
# expression, so it is written as a raw string: '\.' in a plain string literal
# is an invalid escape sequence (SyntaxWarning on recent Python versions).
rttm_pattern = r'\.rttm$'
rdfleft = dir2df(rttmleft, fnpat=rttm_pattern, addcols=['barename'])
rdfright = dir2df(rttmright, fnpat=rttm_pattern, addcols=['barename'])

In [7]:
# Pair up the left- and right-channel listings by file name.  An outer join
# keeps files that exist for only one channel; the missing side's relpath
# column is then NaN.
merge_keys = ['fname', 'barename']
channel_suffixes = ('_left', '_right')
rdf = rdfleft.merge(rdfright, how='outer', on=merge_keys, suffixes=channel_suffixes)
rdf

Unnamed: 0,relpath_left,fname,barename,relpath_right
0,.,Abraham_Bursztajn_Tape1.rttm,Abraham_Bursztajn_Tape1,.
1,.,Abraham_Bursztajn_Tape2.rttm,Abraham_Bursztajn_Tape2,.
2,.,Abraham_Bursztajn_Tape3.rttm,Abraham_Bursztajn_Tape3,.
3,.,Abraham_Bursztajn_Tape4.rttm,Abraham_Bursztajn_Tape4,.
4,.,Abraham_Bursztajn_Tape5.rttm,Abraham_Bursztajn_Tape5,.
...,...,...,...,...
935,.,Zwi_Miller_Tape1.rttm,Zwi_Miller_Tape1,.
936,.,Zwi_Miller_Tape2.rttm,Zwi_Miller_Tape2,.
937,.,Zwi_Miller_Tape3.rttm,Zwi_Miller_Tape3,.
938,.,Zwi_Miller_Tape4.rttm,Zwi_Miller_Tape4,.


## Find existing .eaf files and merge with .rttm files to discover which need processing

In [11]:
# Directory that holds the generated ELAN (.eaf) files.
eafdir = projroot.joinpath('diarized', 'eaf')

# NOTE(review): presumably compare_dirs() reports the left-channel .rttm files
# that have no same-named .eaf in `eafdir` -- confirm against diarize_utils.
todo = utils.compare_dirs(dir1=rttmleft, ext1='.rttm', dir2=eafdir, ext2='.eaf')
todo

Unnamed: 0,relpath,fname,barename


In [None]:
# Index the .eaf files that already exist, the same way the .rttm files were
# indexed.  (`eafdf` was previously used here without ever being defined, so
# the cell failed with NameError on a fresh kernel.)
eafdf = dir2df(eafdir, fnpat=r'\.eaf$', addcols=['barename'])

# Left-join the merged .rttm listing against the .eaf listing.  `fname` exists
# on both sides and so becomes `fname_rttm` / `fname_eaf`; `relpath` exists
# only on the .eaf side, so a NaN `relpath` marks a recording without an .eaf
# file, i.e. one that still needs processing.
todo_df = (
    rdf
    .merge(eafdf, how='left', on='barename', suffixes=('_rttm', '_eaf'))
    .loc[lambda merged: merged['relpath'].isna()]
)
# Optional: restrict the run to a single speaker while debugging.
#todo_df = todo_df[todo_df['barename'].str.startswith('Lola')]
todo_df

In [None]:
# Build one .eaf file per recording in `todo_df` from its two per-channel
# .rttm diarizations, naming the tiers by a duration/magnitude heuristic.
#
# NOTE(review): this cell uses `librosa`, `np`, `pd`, `rttm2df` and `rttm2eaf`,
# none of which are imported in the notebook's visible import cell -- confirm
# they are in scope before a Restart & Run All.

# (tier/channel label, row of the stereo sample array returned by librosa)
CHANNELS = (('LEFT', 0), ('RIGHT', 1))
# Labels for the four (channel, speaker) rows once sorted by ascending
# total duration, then average magnitude.
TIER_LABELS = ['Interviewer unlikely', 'Interviewer probable', 'Survivor unlikely', 'Survivor probable']

for row in todo_df.itertuples():
    if row.Index % 50 == 0:
        print(f'Working on row index {row.Index}.')
    try:
        # relpath_left / relpath_right are relative to the per-channel
        # directories that were scanned (rttmleft / rttmright), so the paths
        # must be built from those directories.  Joining onto their parent
        # `rttmdir` dropped the 'left'/'right' component and made both
        # channels point at the same file.
        rttms = {
            'LEFT': rttmleft / row.relpath_left / row.fname_rttm,
            'RIGHT': rttmright / row.relpath_right / row.fname_rttm,
        }
    except TypeError:
        # A recording missing one channel has NaN for that relpath after the
        # outer merge, and `Path / float` raises TypeError.
        print(f'ERROR: Skipping {row.fname_rttm}.')
        continue

    stereowav = projroot / 'audio' / 'stereo' / f'{row.barename}.wav'
    # sr=None keeps the file's native sample rate; mono=False keeps the
    # channels separate so they can be indexed as data[channel, sample].
    data, rate = librosa.load(stereowav, sr=None, mono=False)

    # One record per (channel, speaker): total duration and average magnitude.
    spkrstats = []
    for chan_s, chan_i in CHANNELS:
        df = rttm2df(rttms[chan_s])
        # t1/t2 are divided by 1000 before scaling by the sample rate, i.e.
        # they are treated as milliseconds and converted to sample indexes.
        df['t1idx'] = (df['t1'] / 1000 * rate).astype(int)
        df['t2idx'] = (df['t2'] / 1000 * rate).astype(int)
        for spkr in df['spkr'].cat.categories.sort_values():
            segments = df[df['spkr'] == spkr]
            spkrdur = segments['dur'].sum()
            # Sample indexes of every segment attributed to this speaker.
            uttidx = np.hstack([np.arange(r.t1idx, r.t2idx) for r in segments.itertuples()])
            spkravgmag = np.abs(data[chan_i, uttidx]).sum() / spkrdur
            spkrstats.append(
                {'spkr': spkr, 'totdur': spkrdur, 'avgmag': spkravgmag, 'chan': chan_s}
            )

    # Heuristic tier naming: with exactly two speakers on each of the two
    # channels, the two rows with the smallest totals are labelled as the
    # interviewer and the other two as the survivor, provided each pair covers
    # both channels.  Otherwise no names are passed (empty dict).
    tiernames = {}
    if len(spkrstats) == 4:  # two channels, two speakers
        ranked = pd.DataFrame(spkrstats).sort_values(['totdur', 'avgmag'])
        interviewer, survivor = ranked[0:2], ranked[2:4]
        if all(sorted(pair['chan']) == ['LEFT', 'RIGHT'] for pair in (interviewer, survivor)):
            ranked['tiername'] = TIER_LABELS
            tiernames = ranked.set_index(['chan', 'spkr'])['tiername'].to_dict()

    eaf = rttm2eaf(rttms, buffer_ms=buffer_ms, names=tiernames)
    # NOTE(review): the video is linked as file:///<name>.mp4 but the wav's
    # ex_from points at file:///video/<name>.mp4 -- confirm which URL is intended.
    eaf.add_linked_file(f"file:///{row.barename}.mp4", relpath=f"./{row.barename}.mp4", mimetype="video/mp4")
    eaf.add_linked_file(f"file:///wav/{row.barename}.wav", relpath=f"./{row.barename}.wav", ex_from=f"file:///video/{row.barename}.mp4")
    eaf.to_file(eafdir / f'{row.barename}.eaf')