In [131]:
from pandas import read_table, DataFrame, concat
from glob import glob
from tqdm import tqdm

In [132]:
# Root directory of the local VoxCeleb1 copy; the glob over the per-video
# metadata files ('voxceleb1_txt/*/*.txt') is built from this value.
voxceleb_dir = 'data/voxceleb1'

## Parse verification protocol 

In [133]:
!cat data/voxceleb1/voxceleb1_txt/A.J._Buckley/1zcIwhmdeo4.txt

POI: 			A.J._Buckley 
Youtube ID: 	1zcIwhmdeo4 
Video length: 	264.3 sec 
Set (V): 		dev 
 
A.J._Buckley/1zcIwhmdeo4_0000001 14.7 22.8 
A.J._Buckley/1zcIwhmdeo4_0000002 90.4 99.0 
A.J._Buckley/1zcIwhmdeo4_0000003 102.4 108.0 


In [134]:
def parse_txt(txt):
    """Yield one dict per segment listed in a VoxCeleb1 per-video text file.

    The first, second and fourth lines are header lines whose value is taken
    from after the last tab (speaker, YouTube ID and verification subset).
    The third and fifth lines are ignored, and every following non-blank
    line is read as ``segment start end``.

    Parameters
    ----------
    txt : str
        Path to the text file to parse.

    Yields
    ------
    dict
        Keys ``speaker``, ``uri`` (``speaker + '/' + youtube_id``), ``start``
        and ``end`` (floats), ``segment``, and ``verification`` (the subset
        read from the header, with ``'test'`` renamed to ``'tst'``).

    Raises
    ------
    IndexError
        If the file has fewer than four lines.
    ValueError
        If a non-blank segment line does not hold exactly three fields, or
        if its start/end fields are not numbers.
    """
    # Read everything inside a context manager so the handle is closed right
    # away instead of lingering until garbage collection (thousands of files
    # are parsed in a row).
    with open(txt, 'r') as fp:
        lines = [line.strip() for line in fp]

    # Header values sit after the last tab of their line.
    speaker = lines[0].split('\t')[-1]
    uri = lines[1].split('\t')[-1]
    subset = lines[3].split('\t')[-1]
    if subset == 'test':
        subset = 'tst'

    for line in lines[5:]:
        if not line:
            # Tolerate blank (e.g. trailing) lines rather than failing on the
            # three-way unpack below.
            continue
        segment, start, end = line.split()
        yield {'speaker': speaker,
               'uri': speaker + '/' + uri,
               'start': float(start),
               'end': float(end),
               'segment': segment,
               'verification': subset}

In [135]:
# Pattern matching every per-video metadata file under the dataset root.
glob_exp = '{voxceleb_dir}/voxceleb1_txt/*/*.txt'.format(voxceleb_dir=voxceleb_dir)
# Flatten the records yielded by parse_txt for each matched file into one list.
segments = [
    record
    for path_txt in tqdm(glob(glob_exp))
    for record in parse_txt(path_txt)
]
# One row per segment, indexed by the segment identifier.
verification_split = DataFrame(segments).set_index('segment')

100%|██████████| 22496/22496 [00:01<00:00, 13415.71it/s]


## Parse identification protocol

In [136]:
def map_set(x):
    """Map a subset code ('1', '2' or '3') to 'trn', 'dev' or 'tst'.

    Raises KeyError for any other value.
    """
    return {'1': 'trn', '2': 'dev', '3': 'tst'}[x]


# Two whitespace-separated columns: subset code, then segment entry.
# sep=r'\s+' is the documented equivalent of the deprecated
# delim_whitespace=True. The 'segment' converter drops the last four
# characters of each entry (presumably a '.wav' extension -- confirm against
# the file) so the index matches the segment names used elsewhere.
identification_split = read_table(
    'data/Identification_split.txt', sep=r'\s+',
    header=None, names=['identification', 'segment'], index_col=['segment'],
    converters={'identification': map_set, 'segment': lambda x: x[:-4]})

In [137]:
# Put the two protocols side by side: both frames are indexed by segment, and
# concat(axis=1) aligns rows on that index (labels missing from one frame get
# NaN in that frame's columns under the default outer join).
data = concat([identification_split, verification_split], axis=1)

## Remove duplicates

In [138]:
# Load data/duplicates.txt as a single 'uri' column and keep its values as a
# plain list for the membership filter applied to `data`.
duplicates_table = read_table('data/duplicates.txt', header=None, names=['uri'])
duplicates = list(duplicates_table['uri'])

In [139]:
# Output columns, in the order they are written to the CSV.
columns = ['uri', 'start', 'end', 'speaker', 'verification', 'identification']
# Vectorised membership test instead of a Python-level row-by-row apply:
# same rows are kept (rows whose uri is NaN are not in `duplicates`, so they
# stay), but it runs in one pass and also works on an empty frame.
is_duplicate = data['uri'].isin(duplicates)
voxceleb1 = data.loc[~is_duplicate, columns]

## Dump to "voxceleb1.csv"

In [140]:
# newline='' disables newline translation on the handle, as pandas asks for
# when a file object is passed to to_csv; without it each row would end in
# '\r\r\n' on platforms where os.linesep is '\r\n'.
with open('../VoxCeleb/data/voxceleb1.csv', 'w', newline='') as fp:
    # The index holds the segment names, so label that column 'segment'.
    voxceleb1.to_csv(fp, index_label='segment')

In [141]:
!head ../VoxCeleb/data/voxceleb1.csv

segment,uri,start,end,speaker,verification,identification
A.J._Buckley/1zcIwhmdeo4_0000001,A.J._Buckley/1zcIwhmdeo4,14.7,22.8,A.J._Buckley,dev,trn
A.J._Buckley/1zcIwhmdeo4_0000002,A.J._Buckley/1zcIwhmdeo4,90.4,99.0,A.J._Buckley,dev,trn
A.J._Buckley/1zcIwhmdeo4_0000003,A.J._Buckley/1zcIwhmdeo4,102.4,108.0,A.J._Buckley,dev,trn
A.J._Buckley/7gWzIy6yIIk_0000001,A.J._Buckley/7gWzIy6yIIk,206.8,215.4,A.J._Buckley,dev,trn
A.J._Buckley/7gWzIy6yIIk_0000002,A.J._Buckley/7gWzIy6yIIk,269.1,275.0,A.J._Buckley,dev,trn
A.J._Buckley/7gWzIy6yIIk_0000003,A.J._Buckley/7gWzIy6yIIk,283.2,289.8,A.J._Buckley,dev,trn
A.J._Buckley/7gWzIy6yIIk_0000004,A.J._Buckley/7gWzIy6yIIk,297.3,305.4,A.J._Buckley,dev,trn
A.J._Buckley/7w0IBEWc9Qw_0000001,A.J._Buckley/7w0IBEWc9Qw,8.0,36.0,A.J._Buckley,dev,dev
A.J._Buckley/7w0IBEWc9Qw_0000002,A.J._Buckley/7w0IBEWc9Qw,39.7,45.7,A.J._Buckley,dev,dev


## Parse verification trials

In [142]:
!head data/voxceleb1/voxceleb1_test.txt

1 Eartha_Kitt/x6uYqmx31kE_0000001.wav Eartha_Kitt/8jEAjG6SegY_0000008.wav
0 Eartha_Kitt/x6uYqmx31kE_0000001.wav Ernest_Borgnine/ize_eiCFEg0_0000003.wav
1 Eartha_Kitt/x6uYqmx31kE_0000001.wav Eartha_Kitt/GWXujl-xAVM_0000017.wav
0 Eartha_Kitt/x6uYqmx31kE_0000001.wav Eddie_Izzard/0OCW1HUxZyg_0000001.wav
1 Eartha_Kitt/x6uYqmx31kE_0000001.wav Eartha_Kitt/8jEAjG6SegY_0000022.wav
0 Eartha_Kitt/x6uYqmx31kE_0000001.wav Eli_Wallach/Uzxv7Axh3Z8_0000001.wav
1 Eartha_Kitt/x6uYqmx31kE_0000001.wav Eartha_Kitt/GWXujl-xAVM_0000033.wav
0 Eartha_Kitt/x6uYqmx31kE_0000001.wav Eli_Wallach/7yx9A0yzLYk_0000029.wav
1 Eartha_Kitt/x6uYqmx31kE_0000002.wav Eartha_Kitt/5r0dWxy17C8_0000026.wav
0 Eartha_Kitt/x6uYqmx31kE_0000002.wav Elisabeth_Moss/m-uILToQ9ss_0000009.wav


In [143]:
def to_segment(x):
    """Drop the last four characters of a trial entry (its '.wav' suffix)."""
    return x[:-4]


# Each line of the trial list is '<label> <enrolment wav> <test wav>'.
# sep=r'\s+' is the documented equivalent of the deprecated
# delim_whitespace=True; the converters strip '.wav' from both file columns.
trials = read_table('data/voxceleb1/voxceleb1_test.txt', sep=r'\s+',
                    names=['trial', 'enrolment', 'test'],
                    converters={'enrolment': to_segment, 'test': to_segment})

## Dump to voxceleb1.verification.test.csv

In [144]:
# newline='' disables newline translation on the handle, as pandas asks for
# when a file object is passed to to_csv.
with open('../VoxCeleb/data/voxceleb1.verification.test.csv', 'w', newline='') as fp:
    # The index is not written, so no index_label is given: the file holds
    # only the trial, enrolment and test columns.
    trials.to_csv(fp, index=False)

In [145]:
!head ../VoxCeleb/data/voxceleb1.verification.test.csv

trial,enrolment,test
1,Eartha_Kitt/x6uYqmx31kE_0000001,Eartha_Kitt/8jEAjG6SegY_0000008
0,Eartha_Kitt/x6uYqmx31kE_0000001,Ernest_Borgnine/ize_eiCFEg0_0000003
1,Eartha_Kitt/x6uYqmx31kE_0000001,Eartha_Kitt/GWXujl-xAVM_0000017
0,Eartha_Kitt/x6uYqmx31kE_0000001,Eddie_Izzard/0OCW1HUxZyg_0000001
1,Eartha_Kitt/x6uYqmx31kE_0000001,Eartha_Kitt/8jEAjG6SegY_0000022
0,Eartha_Kitt/x6uYqmx31kE_0000001,Eli_Wallach/Uzxv7Axh3Z8_0000001
1,Eartha_Kitt/x6uYqmx31kE_0000001,Eartha_Kitt/GWXujl-xAVM_0000033
0,Eartha_Kitt/x6uYqmx31kE_0000001,Eli_Wallach/7yx9A0yzLYk_0000029
1,Eartha_Kitt/x6uYqmx31kE_0000002,Eartha_Kitt/5r0dWxy17C8_0000026
