In [16]:
import pandas as pd
import os, glob

root = '/mnt/lynx1/datasets/CVSS-C/es-en'

In [17]:
df_tgt = pd.read_csv(os.path.join(root, 'es', 'test.quant.tsv'), sep='|', header=None)
df_tgt.columns = ['audio_id', 'tgt_unit']
df_tgt['audio_id'] = df_tgt.audio_id.apply(lambda x: x.split('.')[0])
df_tgt.head()

Unnamed: 0,audio_id,tgt_unit
0,common_voice_es_19696046,39 39 951 257 374 772 772 344 6 681 681 133 10...
1,common_voice_es_19615622,341 341 520 133 133 257 257 133 133 133 133 13...
2,common_voice_es_19960523,39 772 257 497 497 497 497 497 497 497 497 497...
3,common_voice_es_19663679,661 661 661 661 661 661 661 661 661 661 661 66...
4,common_voice_es_19678892,497 520 520 497 497 497 497 497 497 497 497 49...


In [18]:
def reduce_tokens(tokens):
    """
    input: a list of (unreduced) speech unit tokens
    output: reduced speech units, the duration label for each unit, as well as the indices that are not reduced.
    """
    dedup_tokens = []
    duration_label = []
    index_to_keep = []
    accu_duration = 1
    for i, token in enumerate(tokens):
        if i == 0:
            dedup_tokens.append(token)
            index_to_keep.append(i)
        else:
            if token != tokens[i - 1]:
                duration_label.append(accu_duration)
                dedup_tokens.append(token)
                index_to_keep.append(i)
                accu_duration = 1
            else:
                accu_duration += 1
    duration_label.append(accu_duration)
    return dedup_tokens, duration_label, index_to_keep

df_tgt['tgt_unit_reduced'] = df_tgt['tgt_unit'].apply(lambda x: ' '.join(reduce_tokens(x.split())[0]))
df_tgt.head()

Unnamed: 0,audio_id,tgt_unit,tgt_unit_reduced
0,common_voice_es_19696046,39 39 951 257 374 772 772 344 6 681 681 133 10...,39 951 257 374 772 344 6 681 133 102 681 216 6...
1,common_voice_es_19615622,341 341 520 133 133 257 257 133 133 133 133 13...,341 520 133 257 133 9 133 681 133 681 216 895 ...
2,common_voice_es_19960523,39 772 257 497 497 497 497 497 497 497 497 497...,39 772 257 497 987 497 63 665 991 162 73 338 3...
3,common_voice_es_19663679,661 661 661 661 661 661 661 661 661 661 661 66...,661 63 922 991 35 421 5 844 316 810 480 243 61...
4,common_voice_es_19678892,497 520 520 497 497 497 497 497 497 497 497 49...,497 520 497 184 520 738 681 374 497 63 991 162...


In [20]:
df_src = pd.read_csv(os.path.join(root, 'en', 'test.quant.tsv'), sep='|', header=None)
df_src.columns = ['audio_id', 'src_unit']
df_src['audio_id'] = df_src.audio_id.apply(lambda x: x.split('.')[0])
df_src['src_unit_reduced'] = df_src['src_unit'].apply(lambda x: ' '.join(reduce_tokens(x.split())[0]))
df_src.head()

Unnamed: 0,audio_id,src_unit,src_unit_reduced
0,common_voice_es_19724164,63 665 780 531 531 531 534 534 948 86 470 152 ...,63 665 780 531 534 948 86 470 152 784 173 641 ...
1,common_voice_es_19645907,63 63 991 821 821 534 485 86 319 416 416 416 4...,63 991 821 534 485 86 319 416 426 647 167 761 ...
2,common_voice_es_19744104,63 63 662 662 244 583 15 333 333 212 455 409 5...,63 662 244 583 15 333 212 455 409 501 137 74 6...
3,common_voice_es_19118696,63 665 202 393 946 734 734 259 781 781 303 303...,63 665 202 393 946 734 259 781 303 485 948 813...
4,common_voice_es_19638183,63 63 991 162 116 116 281 281 384 384 384 879 ...,63 991 162 116 281 384 879 70 219 522 67 940 1...


In [21]:
df = pd.merge(df_src, df_tgt, on='audio_id')
df.head()

Unnamed: 0,audio_id,src_unit,src_unit_reduced,tgt_unit,tgt_unit_reduced
0,common_voice_es_19724164,63 665 780 531 531 531 534 534 948 86 470 152 ...,63 665 780 531 534 948 86 470 152 784 173 641 ...,102 520 520 520 588 360 681 681 681 681 497 17...,102 520 588 360 681 497 17 681 497 681 520 681...
1,common_voice_es_19645907,63 63 991 821 821 534 485 86 319 416 416 416 4...,63 991 821 534 485 86 319 416 426 647 167 761 ...,102 158 158 360 497 497 497 497 497 497 497 49...,102 158 360 497 264 63 922 780 294 314 853 319...
2,common_voice_es_19744104,63 63 662 662 244 583 15 333 333 212 455 409 5...,63 662 244 583 15 333 212 455 409 501 137 74 6...,39 497 497 497 497 497 497 681 997 997 682 371...,39 497 681 997 682 371 111 666 371 786 132 644...
3,common_voice_es_19118696,63 665 202 393 946 734 734 259 781 781 303 303...,63 665 202 393 946 734 259 781 303 485 948 813...,661 28 951 951 6 300 63 63 63 63 63 63 63 63 6...,661 28 951 6 300 63 644 991 35 421 190 843 100...
4,common_voice_es_19638183,63 63 991 162 116 116 281 281 384 384 384 879 ...,63 991 162 116 281 384 879 70 219 522 67 940 1...,497 588 588 588 588 588 520 520 520 520 386 59...,497 588 520 386 596 704 520 499 804 604 804 70...


In [29]:
df_orig = df[['audio_id', 'src_unit', 'tgt_unit']]
df_orig['src_n_frames'] = df_orig.src_unit.apply(lambda x: len(x.split()))
df_orig['tgt_n_frames'] = df_orig.tgt_unit.apply(lambda x: len(x.split()))
# reorder columns
df_orig = df_orig[['audio_id', 'src_unit', 'src_n_frames', 'tgt_unit', 'tgt_n_frames']]
df_orig.to_csv(os.path.join(root, 'es/orig_unit/test.tsv'), sep='\t', header=False, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_orig['src_n_frames'] = df_orig.src_unit.apply(lambda x: len(x.split()))


In [30]:
df_reduced = df[['audio_id', 'src_unit_reduced', 'tgt_unit_reduced']]
df_reduced['src_n_frames'] = df_reduced.src_unit_reduced.apply(lambda x: len(x.split()))
df_reduced['tgt_n_frames'] = df_reduced.tgt_unit_reduced.apply(lambda x: len(x.split()))
# reorder columns
df_reduced = df_reduced[['audio_id', 'src_unit_reduced', 'src_n_frames', 'tgt_unit_reduced', 'tgt_n_frames']]
df_reduced.to_csv(os.path.join(root, 'es/reduce_unit/test.tsv'), sep='\t', header=False, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced['src_n_frames'] = df_reduced.src_unit_reduced.apply(lambda x: len(x.split()))


In [31]:
for split in ['train', 'dev']:
    df_tgt = pd.read_csv(os.path.join(root, 'es', f'{split}.quant.tsv'), sep='|', header=None)
    df_tgt.columns = ['audio_id', 'tgt_unit']
    df_tgt['audio_id'] = df_tgt.audio_id.apply(lambda x: x.split('.')[0])
    df_tgt['tgt_unit_reduced'] = df_tgt['tgt_unit'].apply(lambda x: ' '.join(reduce_tokens(x.split())[0]))

    df_src = pd.read_csv(os.path.join(root, 'en', f'{split}.quant.tsv'), sep='|', header=None)
    df_src.columns = ['audio_id', 'src_unit']
    df_src['audio_id'] = df_src.audio_id.apply(lambda x: x.split('.')[0])
    df_src['src_unit_reduced'] = df_src['src_unit'].apply(lambda x: ' '.join(reduce_tokens(x.split())[0]))

    df = pd.merge(df_src, df_tgt, on='audio_id')

    df_orig = df[['audio_id', 'src_unit', 'tgt_unit']]
    df_orig['src_n_frames'] = df_orig.src_unit.apply(lambda x: len(x.split()))
    df_orig['tgt_n_frames'] = df_orig.tgt_unit.apply(lambda x: len(x.split()))
    # reorder columns
    df_orig = df_orig[['audio_id', 'src_unit', 'src_n_frames', 'tgt_unit', 'tgt_n_frames']]
    df_orig.to_csv(os.path.join(root, 'es/orig_unit', f'{split}.tsv'), sep='\t', header=False, index=False)

    df_reduced = df[['audio_id', 'src_unit_reduced', 'tgt_unit_reduced']]
    df_reduced['src_n_frames'] = df_reduced.src_unit_reduced.apply(lambda x: len(x.split()))
    df_reduced['tgt_n_frames'] = df_reduced.tgt_unit_reduced.apply(lambda x: len(x.split()))
    # reorder columns
    df_reduced = df_reduced[['audio_id', 'src_unit_reduced', 'src_n_frames', 'tgt_unit_reduced', 'tgt_n_frames']]
    df_reduced.to_csv(os.path.join(root, 'es/reduce_unit', f'{split}.tsv'), sep='\t', header=False, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_orig['src_n_frames'] = df_orig.src_unit.apply(lambda x: len(x.split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced['src_n_frames'] = df_reduced.src_unit_reduced.apply(lambda x: len(x.split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_orig['src_n_frames'] = df_orig.src_uni

In [31]:
filepath = '../exps/exp0/cvss/es-en/test.en.tsv'

with open(filepath, 'r') as f:
    lines = f.readlines()[1:]
lines = [line.strip().split('\t') for line in lines]
df = pd.DataFrame(lines, columns=['src_audio', 'src_n_frames'])

filepath_tgt = '../exps/exp0/cvss/es-en/test.es.tsv'
with open(filepath_tgt, 'r') as f:
    lines_tgt = f.readlines()[1:]
lines_tgt = [line.strip().split('\t') for line in lines_tgt]
df_tgt = pd.DataFrame(lines_tgt, columns=['tgt_audio', 'tgt_n_frames'])


df_tgt.head()

Unnamed: 0,tgt_audio,tgt_n_frames
0,common_voice_es_19696046.mp3,360846
1,common_voice_es_19615622.mp3,235278
2,common_voice_es_19960523.mp3,333198
3,common_voice_es_19663679.mp3,307854
4,common_voice_es_19678892.mp3,391950


In [35]:
df['audio_id'] = df['src_audio'].apply(lambda x: x.split('.')[0])
df['merge_id'] = df['audio_id'].apply(lambda x: x.split('.')[0])
df_tgt['merge_id'] = df_tgt['tgt_audio'].apply(lambda x: x.split('.')[0])

df2 = pd.merge(df, df_tgt, on='merge_id', how='left')
# drop rows with NaN
df2 = df2.dropna()
df2 = df2.drop(columns=['merge_id'])
# rearrange columns
df2 = df2[['audio_id', 'src_audio', 'src_n_frames', 'tgt_audio', 'tgt_n_frames']]
df2.head()

Unnamed: 0,audio_id,src_audio,src_n_frames,tgt_audio,tgt_n_frames
0,common_voice_es_19599731,common_voice_es_19599731.mp3.wav,96300,common_voice_es_19599731.mp3,383886
1,common_voice_es_19762450,common_voice_es_19762450.mp3.wav,70200,common_voice_es_19762450.mp3,291726
2,common_voice_es_19942021,common_voice_es_19942021.mp3.wav,88200,common_voice_es_19942021.mp3,237582
3,common_voice_es_19969702,common_voice_es_19969702.mp3.wav,101100,common_voice_es_19969702.mp3,370062
4,common_voice_es_18758543,common_voice_es_18758543.mp3.wav,65700,common_voice_es_18758543.mp3,230670


In [36]:
df2.to_csv('/mnt/lynx1/datasets/CVSS-C/es-en/es/reduce_unit/test.tsv', sep='\t', header=None, index=None)

In [38]:
# do the same for test -> train, dev

for split in ['train', 'dev']:
    filepath = f'../exps/exp0/cvss/es-en/{split}.en.tsv'
    with open(filepath, 'r') as f:
        lines = f.readlines()[1:]
    lines = [line.strip().split('\t') for line in lines]
    df = pd.DataFrame(lines, columns=['src_audio', 'src_n_frames'])
    
    filepath_tgt = f'../exps/exp0/cvss/es-en/{split}.es.tsv'
    with open(filepath_tgt, 'r') as f:
        lines_tgt = f.readlines()[1:]
    
    lines_tgt = [line.strip().split('\t') for line in lines_tgt]
    df_tgt = pd.DataFrame(lines_tgt, columns=['tgt_audio', 'tgt_n_frames'])
    
    df['audio_id'] = df['src_audio'].apply(lambda x: x.split('.')[0])
    df['merge_id'] = df['audio_id'].apply(lambda x: x.split('.')[0])
    df_tgt['merge_id'] = df_tgt['tgt_audio'].apply(lambda x: x.split('.')[0])
    
    df2 = pd.merge(df, df_tgt, on='merge_id', how='left')
    # drop rows with NaN
    df2 = df2.dropna()
    df2 = df2.drop(columns=['merge_id'])
    # rearrange columns
    df2 = df2[['audio_id', 'src_audio', 'src_n_frames', 'tgt_audio', 'tgt_n_frames']]
    df2.to_csv(f'/mnt/lynx1/datasets/CVSS-C/es-en/es/reduce_unit/{split}.tsv', sep='\t', header=None, index=None)
    

In [4]:
import os, glob
import pandas as pd

root = '/mnt/lynx1/datasets/CVSS-C/es-en'

df = pd.read_csv(
    os.path.join(root, "en2es/diff_unit_vae_50/test.tsv"), sep="\t"
)

df.head()

Unnamed: 0,id,src_audio,src_n_frames,tgt_audio,tgt_n_frames
0,common_voice_es_19645907,63 991 821 534 485 86 319 416 426 647 167 761 ...,143,102 158 360 497 264 63 922 780 294 314 853 319...,165
1,common_voice_es_19744104,63 662 244 583 15 333 212 455 409 501 137 74 6...,177,39 497 681 386 682 371 111 666 735 786 132 644...,232
2,common_voice_es_19118696,63 665 202 393 946 734 259 781 303 485 948 813...,74,661 28 951 6 63 644 991 35 421 190 843 100 629...,125
3,common_voice_es_19638183,63 991 162 116 281 384 879 70 219 522 67 940 1...,176,497 588 520 386 804 704 520 499 804 604 804 70...,255
4,common_voice_es_18376911,63 991 479 330 647 167 173 896 627 470 821 677...,71,497 17 264 63 644 254 504 773 8 778 311 182 70...,103


In [7]:
df_trans = pd.read_csv(os.path.join(root, "es/test.tsv"), sep="\t")
df_trans['id'] = df_trans['path'].apply(lambda x: x.split('.')[0])
df_trans.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,id
0,0003b969350f5308dc7347c574bc291834f38fdd92a286...,common_voice_es_19698530.mp3,Habita en aguas poco profundas y rocosas.,2,1,thirties,male,mexicano,common_voice_es_19698530
1,009891ef9a0c11e1b21a9d8a8318c493382963c9eac3ab...,common_voice_es_19987333.mp3,Opera principalmente vuelos de cabotaje y regi...,2,1,,,,common_voice_es_19987333
2,00b0a5cf01f152b2b6771f16e34118a1d95e29716e4c86...,common_voice_es_19691402.mp3,Para visitar contactar primero con la dirección.,2,0,,,,common_voice_es_19691402
3,00de0234aec2cb4d2268ef5b5ba1d72d84ce36fa032096...,common_voice_es_19966634.mp3,"Realizó los estudios primarios en Francia, par...",2,1,,,,common_voice_es_19966634
4,00f3644640cf32c29fda40403d82ca131e97d93e92cb79...,common_voice_es_19600147.mp3,"En los años que siguieron, este trabajo Espart...",2,0,,,,common_voice_es_19600147


In [13]:
df = pd.merge(df, df_trans[['id', 'sentence']], on='id', how='left')

# place 'sentence' column 3rd from the beginning
cols = df.columns.tolist()
cols = cols[:1] + [cols[-1]] + cols[1:-1]
df = df[cols]
df.head()

Unnamed: 0,id,sentence,src_audio,sentence_y,src_n_frames,tgt_audio,tgt_n_frames,sentence_x
0,common_voice_es_19645907,Se graduó como ingeniero civil en la Universid...,63 991 821 534 485 86 319 416 426 647 167 761 ...,Se graduó como ingeniero civil en la Universid...,143,102 158 360 497 264 63 922 780 294 314 853 319...,165,Se graduó como ingeniero civil en la Universid...
1,common_voice_es_19744104,"En ellos se solía citar a pensadores, filósofo...",63 662 244 583 15 333 212 455 409 501 137 74 6...,"En ellos se solía citar a pensadores, filósofo...",177,39 497 681 386 682 371 111 666 735 786 132 644...,232,"En ellos se solía citar a pensadores, filósofo..."
2,common_voice_es_19118696,Las hojas tienen dos foliolos y un zarcillo.,63 665 202 393 946 734 259 781 303 485 948 813...,Las hojas tienen dos foliolos y un zarcillo.,74,661 28 951 6 63 644 991 35 421 190 843 100 629...,125,Las hojas tienen dos foliolos y un zarcillo.
3,common_voice_es_19638183,"A continuación, se hizo amigo de Johannes Kepl...",63 991 162 116 281 384 879 70 219 522 67 940 1...,"A continuación, se hizo amigo de Johannes Kepl...",176,497 588 520 386 804 704 520 499 804 604 804 70...,255,"A continuación, se hizo amigo de Johannes Kepl..."
4,common_voice_es_18376911,"A quien mucho miente, le huye la gente.",63 991 479 330 647 167 173 896 627 470 821 677...,"A quien mucho miente, le huye la gente.",71,497 17 264 63 644 254 504 773 8 778 311 182 70...,103,"A quien mucho miente, le huye la gente."


In [14]:
df.to_csv('../exps/exp0/cvss/es-en/eval.tsv', sep='\t', index=None)