In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
# --- Index (.ndx) files that get_all_wav_files_in_train parses -------------
TRAIN_INDEX_2 = '../wsj/11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx'
TRAIN_INDEX_1 = '../wsj/13-34.1/wsj1/doc/indices/si_tr_s.ndx'
VAL_INDEX = '../wsj/13-34.1/wsj1/doc/indices/h1_p0.ndx'

# --- Prefixes prepended to each indexed path to form the input file path ---
TRAIN_2_BASE_DIR = '/u/scr/corpora/ldc/1993/LDC93S6A/csr_1/'
TRAIN_1_BASE_DIR = '/u/scr/corpora/ldc/1994/LDC94S13A/csr_2_comp/'
# The validation entries are resolved against the same tree as TRAIN_INDEX_1.
VAL_BASE_DIR = TRAIN_1_BASE_DIR

# --- Prefixes for the generated .wav output paths --------------------------
VAL_OUTPUT_DIR = '/juice/scr/aharris6/wsj/val/wav/'
TRAIN_OUTPUT_DIR = '/juice/scr/aharris6/wsj/train/wav/'

# Rows whose input path starts with this prefix are removed by drop_rows.
EXCLUDE_PATH = '/u/scr/corpora/ldc/1993/LDC93S6A/csr_1/11-2.1/wsj0/si_tr_s/401'

In [3]:
def get_all_wav_files_in_train(index_path, wsj0 = False):
    """Parse an index file into a DataFrame with one row per listed file.

    Every line that is neither blank nor a comment (first non-blank
    character ';') must have the form ``<disc_id>:<filepath>``.

    Parameters
    ----------
    index_path : str
        Path of the index file to read.
    wsj0 : bool, default False
        If True, ``disc_id`` and ``filepath`` are joined with "/".
        If False, they are concatenated directly (presumably because the
        listed path already starts with "/" -- confirm against the index
        files).

    Returns
    -------
    pandas.DataFrame
        Columns ``wav_file`` (the path as listed), ``id`` (last path
        component up to its first ".") and ``raw_filepath`` (disc id
        combined with the path).

    Raises
    ------
    ValueError
        If a non-blank, non-comment line contains no ":" separator.
    """
    df_dict = {'wav_file': [], 'id': [], 'raw_filepath': []}
    # The context manager closes the file even if a malformed line raises.
    with open(index_path, 'r') as index_file:
        for line in index_file:
            entry = line.strip()
            # Skip blank lines and comments, which are denoted by ';'.
            if not entry or entry.startswith(";"):
                continue
            # Split on the first ':' only, so a ':' inside the path is kept.
            disc_id, filepath = [item.strip() for item in entry.split(":", 1)]
            identifier = filepath.split("/")[-1].split(".")[0]
            df_dict['wav_file'].append(filepath)
            df_dict['id'].append(identifier)
            separator = "/" if wsj0 else ""
            df_dict['raw_filepath'].append(disc_id + separator + filepath)
    return pd.DataFrame(df_dict)

In [4]:
def format_disc_name(filepath):
    """Rewrite the leading disc component of a path from ``A_B_C`` to ``A-B.C``.

    Only the first "/"-separated component is changed; the rest of the path
    is returned as-is. Underscore-separated fields beyond the third are
    discarded. Raises IndexError if the first component has fewer than three
    underscore-separated fields.
    """
    disc, *remainder = filepath.split("/")
    fields = disc.split("_")
    renamed_disc = "{}-{}.{}".format(fields[0], fields[1], fields[2])
    return "/".join([renamed_disc, *remainder])

def format_filepath(df,  base_dir, output_dir, col_name = 'raw_filepath'):
    """Add ``input_filepath`` and ``output_filepath`` columns to ``df``.

    The frame is modified in place and also returned. Because ``col_name``
    itself is overwritten with the reformatted path, calling this twice on
    the same frame is not supported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw paths in ``col_name``.
    base_dir : str
        Prefix concatenated in front of each reformatted path to build
        ``input_filepath`` (no separator is inserted).
    output_dir : str
        Prefix concatenated in front of the last two path components to
        build ``output_filepath`` (no separator is inserted).
    col_name : str, default 'raw_filepath'
        Column containing the raw paths; rewritten via ``format_disc_name``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the added columns.
    """
    df[col_name] = df[col_name].apply(format_disc_name)
    df['input_filepath'] = df[col_name].apply(lambda x: base_dir + x)

    def _wav_output_path(path):
        # Keep only "<parent_dir>/<file_name>" from the reformatted path.
        tail = "/".join(path.split("/")[-2:])
        # splitext removes the real extension whatever its length, instead
        # of assuming it is exactly four characters long.
        stem, _extension = os.path.splitext(tail)
        return output_dir + stem + ".wav"

    df['output_filepath'] = df[col_name].apply(_wav_output_path)
    return df

def drop_rows(df):
    """Delete, in place, each row whose ``input_filepath`` starts with EXCLUDE_PATH.

    ``df`` is mutated and nothing is returned. The remaining rows keep their
    original index labels.
    """
    excluded_labels = [
        record.Index
        for record in df.itertuples()
        if record.input_filepath.startswith(EXCLUDE_PATH)
    ]
    df.drop(excluded_labels, inplace = True)
    
    

In [5]:
# Parse the first training index, derive the input/output paths, then remove
# any rows that fall under EXCLUDE_PATH.
train1 = format_filepath(
    get_all_wav_files_in_train(TRAIN_INDEX_1),
    TRAIN_1_BASE_DIR,
    TRAIN_OUTPUT_DIR,
)
drop_rows(train1)  # mutates train1 in place and returns None
train1.head()

FileNotFoundError: [Errno 2] No such file or directory: '../wsj/13-34.1/wsj1/doc/indices/si_tr_s.ndx'

In [None]:
# Spot-check one generated output path (label-based lookup of index label 0).
train1['output_filepath'].loc[0]

In [None]:
# Parse the second training index (disc id and path are joined with "/") and
# derive the input/output paths.
train2 = format_filepath(
    get_all_wav_files_in_train(TRAIN_INDEX_2, wsj0 = True),
    TRAIN_2_BASE_DIR,
    TRAIN_OUTPUT_DIR,
)
# Print the row count before and after removing the rows under EXCLUDE_PATH.
rows_before_drop = len(train2)
print(rows_before_drop)
drop_rows(train2)
rows_after_drop = len(train2)
print(rows_after_drop)
train2.head()

In [None]:
# Build the validation table. No rows are dropped for this split.
# NOTE(review): VAL_INDEX sits in the same wsj1 tree as TRAIN_INDEX_1, which
# is parsed with the default wsj0=False -- confirm that wsj0=True matches the
# line format of this index file.
val = format_filepath(
    get_all_wav_files_in_train(VAL_INDEX, wsj0 = True),
    VAL_BASE_DIR,
    VAL_OUTPUT_DIR,
)
val.head()

In [None]:
# Spot-check one generated output path (label-based lookup of index label 100).
train2['output_filepath'].loc[100]

In [None]:
# Keep only the manifest columns, in manifest order. Selecting by column list
# already discards 'wav_file' and 'raw_filepath', so no separate in-place drop
# is needed and the cell can be re-run without a KeyError.
new_column_order = ['input_filepath', 'output_filepath', 'id']
train1 = train1[new_column_order].copy()
train2 = train2[new_column_order].copy()
val = val[new_column_order].copy()
train1.head()

In [None]:
# Preview the first five rows of the second training table.
train2.head(n = 5)

In [None]:
# Print the number of validation rows, then preview the first five.
print(val.shape[0])
val.head(n = 5)

In [None]:
# Spot-check one generated output path (label-based lookup of index label 100).
val['output_filepath'].loc[100]

In [None]:
# Stack both training tables. ignore_index gives the combined frame unique
# 0..N-1 labels instead of repeating the labels of train1 and train2.
train_all = pd.concat([train1, train2], ignore_index = True)
print(len(train_all))
# First 100 rows, selected by position, as a small subset.
train_mini = train_all.iloc[:100]

# Create the target directory first so to_csv does not fail when it is missing.
temp_manifest_dir = "../manifests_wsj/temp/"
os.makedirs(temp_manifest_dir, exist_ok = True)
# Temporary manifests are written without a header row or index column.
train_all.to_csv(temp_manifest_dir + "train_all.csv", index = False, header = False)
train_mini.to_csv(temp_manifest_dir + "train_mini.csv", index = False, header = False)
val.to_csv(temp_manifest_dir + "val.csv", index = False, header = False)

# Form final manifests w/ wav file paths
TXT file paths will be added once the txt files have been generated separately.

In [None]:
# The final manifests keep only the generated wav path (renamed to 'wav_file')
# and the id. Chained, non-in-place calls rebind the names rather than
# mutating the existing frames.
val = (
    val
    .drop(columns = ['input_filepath'])
    .rename(columns = {"output_filepath": "wav_file"})
)
train_all = (
    train_all
    .drop(columns = ['input_filepath'])
    .rename(columns = {"output_filepath": "wav_file"})
)
val.head()

In [None]:
# Preview the first five rows of the combined training manifest.
train_all.head(n = 5)

In [None]:
# Re-slice the mini subset so it has the final manifest columns.
train_mini = train_all.iloc[:100]

# Create the target directory first so to_csv does not fail when it is missing.
wav_only_manifest_dir = "../manifests_wsj/wav_only/"
os.makedirs(wav_only_manifest_dir, exist_ok = True)
train_all.to_csv(wav_only_manifest_dir + "train_manifest_wsj.csv", index = False)
# The mini subset gets its own file. It was previously written to
# train_manifest_wsj.csv, overwriting the full training manifest.
train_mini.to_csv(wav_only_manifest_dir + "train_mini_manifest_wsj.csv", index = False)
val.to_csv(wav_only_manifest_dir + "val_manifest_wsj.csv", index = False)