In [None]:
import zipfile
from pathlib import Path

import pandas as pd
from pydub import AudioSegment
from tqdm.auto import tqdm

In [None]:
# Input data location and the workspace that receives everything derived from it.
data_root = Path('../../Input/DAIC-WOZ')
workspace = Path('../../Working/DAIC-WOZ')

# The workspace is divided into an audio area and a CSV area.
audio_root = workspace.joinpath('audio')
csv_root = workspace.joinpath('csv')

# Audio output folder for each dataset split.
audio_roots = {
    'train': audio_root.joinpath('train'),
    'dev': audio_root.joinpath('dev'),
    'test': audio_root.joinpath('test'),
}
train_audio_root, dev_audio_root, test_audio_root = audio_roots.values()

# Split definition CSV for each dataset split.
csvs = {
    'train': data_root.joinpath('train_split_Depression_AVEC2017.csv'),
    'dev': data_root.joinpath('dev_split_Depression_AVEC2017.csv'),
    'test': data_root.joinpath('test_split_Depression_AVEC2017.csv'),
}
train_csv, dev_csv, test_csv = csvs.values()

# Name of the participant-id column in each split CSV
# (the test entry is spelled with a lowercase "p").
id_cols = {
    'train': 'Participant_ID',
    'dev': 'Participant_ID',
    'test': 'participant_ID',
}

In [None]:
# Cut every participant's recording into one WAV file per participant
# utterance and write an index CSV for each dataset split.
#
# For each split in `csvs` this cell:
#   1. reads the split CSV and, for every listed participant, unpacks
#      `<id>_P.zip` from `data_root` into `<split audio root>/<id>/`;
#   2. deletes the `*_CLNF_*` files from the unpacked folder;
#   3. slices `<id>_AUDIO.wav` at the 'Participant' rows of
#      `<id>_TRANSCRIPT.csv` and exports each slice to `segment/<i>.wav`;
#   4. writes `<csv_root>/<split>.csv` with one row per exported segment
#      (the participant's split-CSV columns plus `response_id`,
#      `audio_file` and `script`).

# The index CSVs are written into this folder, so it has to exist first.
csv_root.mkdir(parents=True, exist_ok=True)

for dataset, csv in csvs.items():

    # A dedicated name keeps the config cell's `audio_root` intact, so this
    # cell can be re-run without depending on leftover kernel state.
    split_audio_root = audio_roots[dataset]
    split_audio_root.mkdir(parents=True, exist_ok=True)

    df_dataset = pd.read_csv(csv)

    list_out = []

    for _, sr in tqdm(df_dataset.iterrows(), total=len(df_dataset), desc=dataset):

        # `int(...)` handles both integer and float scalars coming out of the row.
        participant_id = int(sr[id_cols[dataset]])

        audio_dir = split_audio_root/f'{participant_id}'
        segment_dir = audio_dir/'segment'
        zip_file = data_root/f'{participant_id}_P.zip'

        # parents=True also creates `audio_dir`.
        segment_dir.mkdir(parents=True, exist_ok=True)

        # Pure-Python extraction: no shell quoting issues, and files left by a
        # previous run are overwritten instead of triggering a prompt.
        with zipfile.ZipFile(zip_file) as archive:
            archive.extractall(audio_dir)

        # Drop the `*_CLNF_*` files that came with the archive.
        for clnf_file in audio_dir.glob('*_CLNF_*'):
            if clnf_file.is_file():
                clnf_file.unlink()

        # Keep only the participant's utterances, then convert the boundaries
        # from seconds to whole milliseconds. Rounding (rather than truncating)
        # avoids losing 1 ms when the float product is e.g. 62327.999...
        df = pd.read_csv(audio_dir/f'{participant_id}_TRANSCRIPT.csv', sep='\t')
        df = df[df['speaker'] == 'Participant'].reset_index(drop=True)
        df[['start_time_ms', 'stop_time_ms']] = (
            df[['start_time', 'stop_time']].mul(1000).round().astype(int)
        )

        audio = AudioSegment.from_file(audio_dir/f'{participant_id}_AUDIO.wav')

        # The participant's split-CSV columns are the same for all segments.
        participant_info = sr.to_dict()

        segment_rows = df[['start_time_ms', 'stop_time_ms', 'value']].itertuples(index=False, name=None)
        for i, (start, stop, script) in enumerate(segment_rows):

            audio_file = (segment_dir/f'{i}.wav').resolve()

            # AudioSegment slices are indexed in milliseconds.
            segment = audio[int(start): int(stop)]
            segment.export(audio_file, format='wav')

            list_out.append({
                **participant_info,
                'response_id': i + 1,  # 1-based position among the participant's utterances
                'audio_file': audio_file,
                'script': script,
            })

    df_out = pd.DataFrame(list_out)
    df_out.to_csv(csv_root/f'{dataset}.csv', index=False)