# Making a dataset for training a pyannote.audio diarization model

In this notebook, we convert elicitation-session recordings and their .eaf annotations (speaker turns and language; only the speaker-turn tiers are used for diarization) into the file layout expected by pyannote.database. The result is a set of .lst, .uem and .rttm files for the train, development and test subsets.


In [None]:
# Mount Google Drive at /content/gdrive (this import is only available on Google Colab).
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
#Installing libraries (pympi-ling is a library that has tools for working with .eaf files)
!pip install pympi-ling

Collecting pympi-ling
  Downloading pympi_ling-1.70.2-py2.py3-none-any.whl (24 kB)
Installing collected packages: pympi-ling
Successfully installed pympi-ling-1.70.2


In [None]:
#Importing libraries
import numpy as np
import pandas as pd
import pympi
import os

In [None]:
# Session table read from Drive; the SessionID and Length columns are used by the cells below.
df = pd.read_csv('/content/gdrive/MyDrive/diarizationcorpora/pyannote.csv')

In [None]:
# Display the session table to check that it loaded as expected.
df

Unnamed: 0,SessionID,Length
0,trg_as_1,5143
1,ksm_as_1,3457
2,atl_as_1,453
3,topm_as_1,324
4,rai_as_1,814
5,gwd_as_1,2324
6,sed_as_1,4303
7,gvd_sds_1,5451
8,tes_as_1,242
9,rai_vm_1,6415


In [None]:
#Making train, development and test sets out of filenames

# Fixed seed so that "Restart & Run All" always reproduces the same split.
SPLIT_SEED = 42

list_of_wavs = df['SessionID'].tolist()

# Shuffle the sessions once, then cut at 60 % and 80 % of the rows:
# the first 60 % are train, the next 20 % development, the rest test.
# Plain .iloc slicing is used instead of np.split, which goes through
# DataFrame.swapaxes (deprecated in recent pandas releases).
shuffled_df = df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(.6 * len(shuffled_df))
dev_end = int(.8 * len(shuffled_df))

train_df = shuffled_df.iloc[:train_end]
validate_df = shuffled_df.iloc[train_end:dev_end]
test_df = shuffled_df.iloc[dev_end:]

train_filenames = train_df['SessionID'].tolist()
dev_filenames = validate_df['SessionID'].tolist()
test_filenames = test_df['SessionID'].tolist()


In [None]:
#Making .lst files, which consist of filenames for each set (train, dev, test)
# Each file holds one session ID per line. The encoding is set explicitly to UTF-8
# so the lists use the same encoding as the .rttm and .uem files written further down.
for lst_name, session_ids in (
    ('train.lst', train_filenames),
    ('development.lst', dev_filenames),
    ('test.lst', test_filenames),
):
    with open(lst_name, 'w', encoding='utf-8') as lst_file:
        lst_file.writelines("%s\n" % item for item in session_ids)

In [None]:
def annotated_list(list_for_annotation,
                   eaf_dir='/content/gdrive/MyDrive/diarizationcorpora/eafs/'):
    '''Collect the time-sorted annotations of several ELAN (.eaf) files.

    For every session name the file ``<eaf_dir>/<name>.eaf`` is opened with pympi.
    Tiers whose name contains 'language' are skipped (language tiers are not
    needed for diarization); the annotations of all remaining tiers are gathered
    and sorted by their (start, end) times.

    Parameters
    ----------
    list_for_annotation : iterable of str
        Session names, i.e. .eaf file names without the extension.
    eaf_dir : str, optional
        Directory that contains the .eaf files. Defaults to the corpus folder
        on the mounted Google Drive.

    Returns
    -------
    list of list of tuple
        One inner list per input session, in input order. Each element is
        ``((start, end), tier_name, session_name)``; start and end are the time
        values reported by pympi (presumably milliseconds - TODO confirm).
    '''
    full_annot_list = []
    for filename in list_for_annotation:
        eaf = pympi.Elan.Eaf(os.path.join(eaf_dir, filename + '.eaf'))
        speaker_tiers = [tier for tier in eaf.tiers if 'language' not in tier]
        list_of_annot = [
            (annotation[0:2], tier, filename)
            for tier in speaker_tiers
            for annotation in eaf.get_annotation_data_for_tier(tier)
        ]
        # Built-in sort on the (start, end) pair replaces the manual selection sort.
        list_of_annot.sort(key=lambda item: item[0])
        full_annot_list.append(list_of_annot)
    return full_annot_list

In [None]:
def write_rttm(full_annot_list, rttm_file=None):
    '''Write annotations as RTTM lines, one line per speaking segment.

    Each line has the form
    ``SPEAKER <session> 1 <start> <duration> <NA> <NA> <speaker> <NA> <NA>``.
    Start and duration are the annotation times divided by 1000
    (this assumes the input times are in milliseconds - TODO confirm).

    Parameters
    ----------
    full_annot_list : list of list of tuple
        One inner list per session; each element is
        ``((start, end), speaker_name, session_name)``.
    rttm_file : binary file object, optional
        Destination opened in binary mode. When omitted, the module-level file
        handle named ``f`` is used, which is how the existing cells call this
        function (inside ``with open(..., 'wb') as f:``).
    '''
    if rttm_file is None:
        # Backward compatibility with callers that rely on the global handle.
        rttm_file = f
    for file_annotations in full_annot_list:
        for times, speaker, session in file_annotations:
            start, end = times[0], times[1]
            fields = ['SPEAKER', session, '1',
                      str(start / 1000.0), str((end - start) / 1000.0),
                      '<NA>', '<NA>', speaker, '<NA>', '<NA>']
            rttm_file.write(' '.join(fields).encode('utf-8'))
            rttm_file.write(b'\n')

In [None]:
#Transforming lengths of audio into an appropriate format
def _lengths_to_float(lengths):
    '''Convert Length values that may use a decimal comma (e.g. "12,5") into floats.

    str() is applied first so that values pandas has already parsed as numbers
    are accepted as well as strings.
    '''
    return [float(str(value).replace(',', '.')) for value in lengths]


train_uem = _lengths_to_float(train_df['Length'].tolist())
dev_uem = _lengths_to_float(validate_df['Length'].tolist())
test_uem = _lengths_to_float(test_df['Length'].tolist())


In [None]:
# Shorten every length by one unit before it is used as the end of the .uem region
# (presumably a safety margin so the region never runs past the audio - TODO confirm).
uem_train = [length - 1 for length in train_uem]
uem_dev = [length - 1 for length in dev_uem]
uem_test = [length - 1 for length in test_uem]


In [None]:
#Creating train .rttm and .uem (an .uem file consists of filename, its start and end time) files
annotaited_train = annotated_list(train_filenames)

# write_rttm is called without a file argument and writes to the global handle
# named f, so the handle has to keep that name here.
with open('train.rttm', 'wb') as f:
    write_rttm(annotaited_train)

with open('train.uem', 'wb') as f:
    for session_id, length in zip(train_filenames, uem_train):
        # The length is multiplied by 60 and written with three decimals.
        # '%.3f' produces a single well-formed number; concatenating '.000' to
        # str(float) produced values with two decimal points such as '48780.0.000'.
        fields = [session_id, 'NA', '0.000', '%.3f' % (length * 60)]
        f.write((' '.join(fields) + '\n').encode('utf-8'))

In [None]:
#Creating test .rttm and .uem files
annotaited_test = annotated_list(test_filenames)

# write_rttm is called without a file argument and writes to the global handle
# named f, so the handle has to keep that name here.
with open('test.rttm', 'wb') as f:
    write_rttm(annotaited_test)

with open('test.uem', 'wb') as f:
    for session_id, length in zip(test_filenames, uem_test):
        # The length is multiplied by 60 and written with three decimals.
        # '%.3f' produces a single well-formed number; concatenating '.000' to
        # str(float) produced values with two decimal points such as '48780.0.000'.
        fields = [session_id, 'NA', '0.000', '%.3f' % (length * 60)]
        f.write((' '.join(fields) + '\n').encode('utf-8'))

In [None]:
#Creating dev .rttm and .uem files
annotaited_dev = annotated_list(dev_filenames)

# write_rttm is called without a file argument and writes to the global handle
# named f, so the handle has to keep that name here.
with open('dev.rttm', 'wb') as f:
    write_rttm(annotaited_dev)

with open('dev.uem', 'wb') as f:
    for session_id, length in zip(dev_filenames, uem_dev):
        # The length is multiplied by 60 and written with three decimals.
        # '%.3f' produces a single well-formed number; concatenating '.000' to
        # str(float) produced values with two decimal points such as '48780.0.000'.
        fields = [session_id, 'NA', '0.000', '%.3f' % (length * 60)]
        f.write((' '.join(fields) + '\n').encode('utf-8'))