In [60]:
import pandas as pd
import os
import numpy as np

In [61]:
# Index files listing the recordings for the two training sources.
TRAIN_INDEX_1 = '../wsj/13-34.1/wsj1/doc/indices/si_tr_s.ndx'
TRAIN_INDEX_2 = '../wsj/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx'

# Index file listing the validation recordings.
VAL_INDEX = '../wsj/13-34.1/wsj1/doc/indices/h1_p0.ndx'

# Prefixes prepended to each formatted index entry to build the input file path.
TRAIN_1_BASE_DIR = '/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/'
VAL_BASE_DIR = TRAIN_1_BASE_DIR  # validation reuses the first training base directory
TRAIN_2_BASE_DIR = '/u/scr/corpora/ldc/1993/LDC93S6A/csr_1/'

# Prefixes under which the generated ".wav" output paths are placed.
TRAIN_OUTPUT_DIR = '/juice/scr/aharris6/wsj/train/wav/'
VAL_OUTPUT_DIR = '/juice/scr/aharris6/wsj/val/wav/'

In [62]:
def get_all_wav_files_in_train(index_path, wsj0 = False):
    """Parse an index file into a DataFrame of recording paths.

    Every non-blank, non-comment line is expected to have the form
    ``<disc_id>:<filepath>``. Lines whose first non-whitespace character is
    ``;`` are comments and are skipped, as are blank lines.

    Args:
        index_path: Path of the index file to read.
        wsj0: When True, a "/" is inserted between the disc id and the file
            path when building ``raw_filepath``; when False the two are
            concatenated directly (e.g. when the listed paths already begin
            with "/").

    Returns:
        pandas.DataFrame with one row per index entry and the columns
        ``wav_file`` (the path as listed in the index), ``id`` (the file name
        up to its first ".") and ``raw_filepath`` (disc id joined to the path).

    Raises:
        ValueError: If a non-comment line contains no ":" separator.
    """
    records = {'wav_file': [], 'id': [], 'raw_filepath': []}
    separator = "/" if wsj0 else ""
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(index_path, 'r') as index_file:
        for line in index_file:
            entry = line.strip()
            # Skip blank lines and comments, which are denoted using ';'.
            if not entry or entry.startswith(";"):
                continue
            # Split on the first ':' only, so a ':' inside the path is kept.
            disc_id, filepath = [item.strip() for item in entry.split(":", 1)]
            identifier = filepath.split("/")[-1].split(".")[0]
            records['wav_file'].append(filepath)
            records['id'].append(identifier)
            records['raw_filepath'].append(disc_id + separator + filepath)
    return pd.DataFrame(records)

In [63]:
def format_disc_name(filepath):
    """Rewrite the leading disc component of a path from "A_B_C" to "A-B.C".

    Only the first "/"-separated component is changed; the rest of the path
    is returned untouched. Only the first three "_"-separated pieces of the
    disc name are used, and an IndexError is raised when there are fewer.
    """
    disc_name, *remainder = filepath.split("/")
    pieces = disc_name.split("_")
    renamed = "{}-{}.{}".format(pieces[0], pieces[1], pieces[2])
    return "/".join([renamed, *remainder])

def format_filepath(df,  base_dir, output_dir, col_name = 'raw_filepath'):
    """Add input and output path columns to ``df`` and return it.

    ``df`` is modified in place: the ``col_name`` column is overwritten with
    its ``format_disc_name``-formatted value, and two columns are added:

    * ``input_filepath``: ``base_dir`` followed by the formatted path.
    * ``output_filepath``: ``output_dir`` followed by the last two path
      components, with the final four characters replaced by ".wav".
    """
    def _input_path(path):
        return base_dir + path

    def _output_path(path):
        # Keep only "<parent dir>/<file name>" under the output directory.
        tail = "/".join(path.split("/")[-2:])
        # Swap the trailing four characters (e.g. ".wv1") for ".wav".
        return (output_dir + tail)[:-4] + ".wav"

    df[col_name] = df[col_name].apply(format_disc_name)
    df['input_filepath'] = df[col_name].apply(_input_path)
    df['output_filepath'] = df[col_name].apply(_output_path)
    return df
    

In [64]:
# Parse the first training index and derive its input/output paths in one step.
train1 = format_filepath(
    get_all_wav_files_in_train(TRAIN_INDEX_1),
    TRAIN_1_BASE_DIR,
    TRAIN_OUTPUT_DIR,
)
train1.head()

Unnamed: 0,wav_file,id,raw_filepath,input_filepath,output_filepath
0,/wsj1/si_tr_s/4a8/4a8c0201.wv1,4a8c0201,13-11.1/wsj1/si_tr_s/4a8/4a8c0201.wv1,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/train/wav/4a8/4a8c0201...
1,/wsj1/si_tr_s/4a8/4a8c0202.wv1,4a8c0202,13-11.1/wsj1/si_tr_s/4a8/4a8c0202.wv1,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/train/wav/4a8/4a8c0202...
2,/wsj1/si_tr_s/4a8/4a8c0203.wv1,4a8c0203,13-11.1/wsj1/si_tr_s/4a8/4a8c0203.wv1,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/train/wav/4a8/4a8c0203...
3,/wsj1/si_tr_s/4a8/4a8c0204.wv1,4a8c0204,13-11.1/wsj1/si_tr_s/4a8/4a8c0204.wv1,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/train/wav/4a8/4a8c0204...
4,/wsj1/si_tr_s/4a8/4a8c0205.wv1,4a8c0205,13-11.1/wsj1/si_tr_s/4a8/4a8c0205.wv1,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/train/wav/4a8/4a8c0205...


In [65]:
# Spot-check one complete output path (the head() preview truncates long strings).
train1.output_filepath[0]

'/juice/scr/aharris6/wsj/train/wav/4a8/4a8c0201.wav'

In [66]:
# Build the manifest for the second training index. wsj0=True makes the parser
# insert a "/" between the disc id and the listed path.
train2 = get_all_wav_files_in_train(TRAIN_INDEX_2, wsj0 = True)
train2 = format_filepath(train2, TRAIN_2_BASE_DIR, TRAIN_OUTPUT_DIR)
# Only the last expression of a cell is displayed, so a single preview suffices.
train2.head()

Unnamed: 0,wav_file,id,raw_filepath,input_filepath,output_filepath
0,wsj0/si_tr_s/01i/01ic0201.wv1,01ic0201,11-1.1/wsj0/si_tr_s/01i/01ic0201.wv1,/u/scr/corpora/ldc/1993/LDC93S6A/csr_1/11-1.1/...,/juice/scr/aharris6/wsj/train/wav/01i/01ic0201...
1,wsj0/si_tr_s/01i/01ic0202.wv1,01ic0202,11-1.1/wsj0/si_tr_s/01i/01ic0202.wv1,/u/scr/corpora/ldc/1993/LDC93S6A/csr_1/11-1.1/...,/juice/scr/aharris6/wsj/train/wav/01i/01ic0202...
2,wsj0/si_tr_s/01i/01ic0203.wv1,01ic0203,11-1.1/wsj0/si_tr_s/01i/01ic0203.wv1,/u/scr/corpora/ldc/1993/LDC93S6A/csr_1/11-1.1/...,/juice/scr/aharris6/wsj/train/wav/01i/01ic0203...
3,wsj0/si_tr_s/01i/01ic0204.wv1,01ic0204,11-1.1/wsj0/si_tr_s/01i/01ic0204.wv1,/u/scr/corpora/ldc/1993/LDC93S6A/csr_1/11-1.1/...,/juice/scr/aharris6/wsj/train/wav/01i/01ic0204...
4,wsj0/si_tr_s/01i/01ic0205.wv1,01ic0205,11-1.1/wsj0/si_tr_s/01i/01ic0205.wv1,/u/scr/corpora/ldc/1993/LDC93S6A/csr_1/11-1.1/...,/juice/scr/aharris6/wsj/train/wav/01i/01ic0205...


In [67]:
# Parse the validation index (with the "/" separator between disc id and path)
# and derive its input/output paths in one step.
val = format_filepath(
    get_all_wav_files_in_train(VAL_INDEX, wsj0 = True),
    VAL_BASE_DIR,
    VAL_OUTPUT_DIR,
)
val.head()

Unnamed: 0,wav_file,id,raw_filepath,input_filepath,output_filepath
0,wsj1/si_dt_20/4k0/4k0c0301.wv1,4k0c0301,13-16.1/wsj1/si_dt_20/4k0/4k0c0301.wv1,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/val/wav/4k0/4k0c0301.wav
1,wsj1/si_dt_20/4k0/4k0c0302.wv1,4k0c0302,13-16.1/wsj1/si_dt_20/4k0/4k0c0302.wv1,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/val/wav/4k0/4k0c0302.wav
2,wsj1/si_dt_20/4k0/4k0c0303.wv1,4k0c0303,13-16.1/wsj1/si_dt_20/4k0/4k0c0303.wv1,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/val/wav/4k0/4k0c0303.wav
3,wsj1/si_dt_20/4k0/4k0c0304.wv1,4k0c0304,13-16.1/wsj1/si_dt_20/4k0/4k0c0304.wv1,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/val/wav/4k0/4k0c0304.wav
4,wsj1/si_dt_20/4k0/4k0c0305.wv1,4k0c0305,13-16.1/wsj1/si_dt_20/4k0/4k0c0305.wv1,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/val/wav/4k0/4k0c0305.wav


In [68]:
# Spot-check one complete output path from the second training manifest.
train2.output_filepath[100]

'/juice/scr/aharris6/wsj/train/wav/01i/01io031e.wav'

In [69]:
# Keep only the manifest columns, in the order they are written out.
# Selecting by a column list already discards every other column, so no
# separate in-place drop is needed and the cell can be re-run safely.
new_column_order = ['input_filepath', 'output_filepath', 'id']
train1 = train1[new_column_order]
train2 = train2[new_column_order]
val = val[new_column_order]
train1.head()

Unnamed: 0,input_filepath,output_filepath,id
0,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/train/wav/4a8/4a8c0201...,4a8c0201
1,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/train/wav/4a8/4a8c0202...,4a8c0202
2,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/train/wav/4a8/4a8c0203...,4a8c0203
3,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/train/wav/4a8/4a8c0204...,4a8c0204
4,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/train/wav/4a8/4a8c0205...,4a8c0205


In [70]:
# Preview the first rows of the second training manifest.
train2.head()

Unnamed: 0,input_filepath,output_filepath,id
0,/u/scr/corpora/ldc/1993/LDC93S6A/csr_1/11-1.1/...,/juice/scr/aharris6/wsj/train/wav/01i/01ic0201...,01ic0201
1,/u/scr/corpora/ldc/1993/LDC93S6A/csr_1/11-1.1/...,/juice/scr/aharris6/wsj/train/wav/01i/01ic0202...,01ic0202
2,/u/scr/corpora/ldc/1993/LDC93S6A/csr_1/11-1.1/...,/juice/scr/aharris6/wsj/train/wav/01i/01ic0203...,01ic0203
3,/u/scr/corpora/ldc/1993/LDC93S6A/csr_1/11-1.1/...,/juice/scr/aharris6/wsj/train/wav/01i/01ic0204...,01ic0204
4,/u/scr/corpora/ldc/1993/LDC93S6A/csr_1/11-1.1/...,/juice/scr/aharris6/wsj/train/wav/01i/01ic0205...,01ic0205


In [71]:
# Report the number of validation rows, then preview the first few.
print(len(val))
val.head()

503


Unnamed: 0,input_filepath,output_filepath,id
0,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/val/wav/4k0/4k0c0301.wav,4k0c0301
1,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/val/wav/4k0/4k0c0302.wav,4k0c0302
2,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/val/wav/4k0/4k0c0303.wav,4k0c0303
3,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/val/wav/4k0/4k0c0304.wav,4k0c0304
4,/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/1...,/juice/scr/aharris6/wsj/val/wav/4k0/4k0c0305.wav,4k0c0305


In [72]:
# Spot-check one complete validation output path.
val.output_filepath[100]

'/juice/scr/aharris6/wsj/val/wav/4k2/4k2c0301.wav'

In [74]:
# Stack both training manifests into one frame and report its size.
train_all = pd.concat([train1, train2])
print(len(train_all))
# The first 100 rows of the combined manifest.
train_mini = train_all.iloc[:100]

# to_csv does not create missing parent directories, so make sure the target
# directory exists before writing.
manifest_dir = "../manifests_wsj/temp/"
os.makedirs(manifest_dir, exist_ok=True)

# The manifests are written without an index column or a header row.
train_all.to_csv(manifest_dir + "train_all.csv", index = False, header = False)
train_mini.to_csv(manifest_dir + "train_mini.csv", index = False, header = False)
val.to_csv(manifest_dir + "val.csv", index = False, header = False)

37514
