In [1]:
# CONFIG: user-tunable settings for this notebook.
# Dataset folder name under ../data/; also passed to get_participant to select dataset-specific logic.
dataset_name = "paido_dataset" # https://phon.talkbank.org/access/Eng-NA/PaidoEnglish.html
# Sub-folder of the dataset directory that is walked for .cha transcripts and their .wav recordings.
long_audio_sub_folder = "original_long_audio_recordings"
# Sub-folder of the dataset directory that receives the cut audio segments and the spreadsheet.
output_data_directory_name = "original_audio_segments"
# File name of the Excel sheet written into the output folder.
output_xlsx_file_name = "segments.xlsx"
# Target sample rate (Hz) the recordings are resampled to before segments are cut and saved.
resample_rate = 16000

In [2]:
import os
import pylangacq
import pandas as pd
import pathlib
import torchaudio
import torchaudio.transforms as T

In [3]:
# Folder holding the long recordings and their CHAT transcripts.
input_data_directory = "../data/{}/{}".format(dataset_name, long_audio_sub_folder)
# Stop right away when there is nothing to read from.
input_directory_found = os.path.exists(input_data_directory)
if not input_directory_found:
    raise Exception("Input directory doesn't exist.")

# Folder for the cut segments; created (with any missing parents) if absent.
output_data_directory = "../data/{}/{}".format(dataset_name, output_data_directory_name)
os.makedirs(output_data_directory, exist_ok=True)

# Full path of the spreadsheet that lists every segment.
output_xlsx_file_name_path = "{}/{}".format(output_data_directory, output_xlsx_file_name)

In [4]:
def get_participant(dataset_name, path_to_file):
    """Derive the participant identifier from the path of a CHAT transcript.

    Args:
        dataset_name: Name of the dataset the transcript belongs to. Only
            "paido_dataset" is supported.
        path_to_file: Path to the ``.cha`` transcript file.

    Returns:
        The transcript's base file name with a trailing ``.cha`` removed.

    Raises:
        ValueError: If ``dataset_name`` is not a supported dataset. Previously
            the function fell through and returned ``None``, which callers
            then stringified into file names.
    """
    if dataset_name == "paido_dataset":
        file_name = os.path.basename(path_to_file)
        # Strip only the trailing extension; str.replace would also delete any
        # ".cha" occurring earlier in the name (e.g. "my.chat.cha" -> "myt").
        if file_name.endswith(".cha"):
            file_name = file_name[: -len(".cha")]
        return file_name
    raise ValueError("Unsupported dataset for participant extraction: " + repr(dataset_name))

In [5]:
# Cut every time-aligned child ("CHI") utterance out of its long recording,
# save it as a 16-bit PCM wav file, and collect one metadata row per segment.
segments = []
for root, _dirs, files in os.walk(input_data_directory):
    for file in files:
        if not file.endswith(".cha"):
            continue

        path_to_file = os.path.join(root, file)
        # Swap only the trailing extension; str.replace would also rewrite a
        # ".cha" that happens to occur earlier in the path (e.g. a folder name).
        path_to_audio_file = os.path.splitext(path_to_file)[0] + ".wav"

        if not os.path.isfile(path_to_audio_file):
            # Transcript without a matching recording: report it and move on.
            print("ERROR!!!")
            print(path_to_audio_file)
            continue

        participant = get_participant(dataset_name, path_to_file)
        waveform, sample_rate = torchaudio.load(path_to_audio_file)
        resampler = T.Resample(sample_rate, resample_rate, dtype=waveform.dtype)
        # Keep only the first channel, re-wrapped with a leading dimension of size 1.
        resampled_waveform = resampler(waveform)[0].unsqueeze(0)

        chat = pylangacq.read_chat(path_to_file)
        participant_details = chat.headers()[0]['Participants']['CHI']
        age = participant_details['age']
        sex = participant_details['sex']

        skipped_without_time_marks = 0
        for utterance in chat.utterances(participants="CHI"):
            time_marks = utterance.time_marks
            if time_marks is None:
                # Utterance is not time-aligned, so there is nothing to cut;
                # indexing None here used to abort the whole run.
                skipped_without_time_marks += 1
                continue
            start = time_marks[0]  # in milliseconds
            end = time_marks[1]  # in milliseconds
            tokens = utterance.tokens
            tiers = utterance.tiers

            path_to_audio_segment_file = f"{output_data_directory}/{participant}_{start}_{end}.wav"
            # Integer arithmetic keeps the ms -> sample conversion exact; the
            # float form (start/1000*rate) can land just below the true index
            # and be truncated one sample short.
            start_sample = start * resample_rate // 1000
            end_sample = end * resample_rate // 1000
            audio_segment_waveform = resampled_waveform[:, start_sample:end_sample]
            torchaudio.save(path_to_audio_segment_file, audio_segment_waveform, resample_rate, encoding="PCM_S", bits_per_sample=16, format="wav")

            # NOTE(review): token words are concatenated with no separator, so a
            # multi-word utterance becomes one run-together string - confirm
            # this is intended.
            text = "".join(token.word for token in tokens)

            segments.append([path_to_file, path_to_audio_file, path_to_audio_segment_file, start, end, end-start, text, participant, participant_details, age, sex, tokens, tiers])

        if skipped_without_time_marks:
            print(f"Skipped {skipped_without_time_marks} CHI utterance(s) without time marks in {path_to_file}")

In [6]:
# Column labels, in the same order as the values appended to each row of `segments`.
segment_columns = [
    "path_to_chat_file",
    "path_to_audio_file",
    "path_to_audio_segment_file",
    "start_in_ms",
    "end_in_ms",
    "duration_in_ms",
    "text",
    "participant",
    "participant_details",
    "age",
    "sex",
    "tokens",
    "tiers",
]
# One row per saved audio segment.
df = pd.DataFrame(data=segments, columns=segment_columns)

In [7]:
# Bare expression on the last line of the cell: renders the segment table as the cell output.
df

Unnamed: 0,path_to_chat_file,path_to_audio_file,path_to_audio_segment_file,start_in_ms,end_in_ms,duration_in_ms,text,participant,participant_details,age,sex,tokens,tiers
0,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_audio_segments/...,15042,15751,709,sheep.,e3bt16f212,"{'name': 'Target_Child', 'language': 'eng', 'c...",3;09.,female,"[Token(word='sheep', pos=None, mor=None, gra=N...","{'CHI': 'sheep . 15042_15751', '%xwb': 'ʃip ..."
1,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_audio_segments/...,17376,17962,586,kicking.,e3bt16f212,"{'name': 'Target_Child', 'language': 'eng', 'c...",3;09.,female,"[Token(word='kicking', pos=None, mor=None, gra...","{'CHI': 'kicking . 17376_17962', '%xwb': 'kʰ..."
2,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_audio_segments/...,19809,20640,831,toast.,e3bt16f212,"{'name': 'Target_Child', 'language': 'eng', 'c...",3;09.,female,"[Token(word='toast', pos=None, mor=None, gra=N...","{'CHI': 'toast . 19809_20640', '%xwb': 'tʰos..."
3,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_audio_segments/...,22181,23067,886,soldier.,e3bt16f212,"{'name': 'Target_Child', 'language': 'eng', 'c...",3;09.,female,"[Token(word='soldier', pos=None, mor=None, gra...","{'CHI': 'soldier . 22181_23067', '%xwb': 'so..."
4,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_audio_segments/...,24315,25198,883,dude.,e3bt16f212,"{'name': 'Target_Child', 'language': 'eng', 'c...",3;09.,female,"[Token(word='dude', pos=None, mor=None, gra=No...","{'CHI': 'dude . 24315_25198', '%xwb': 'dud (..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10164,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_audio_segments/...,376745,377382,637,door.,e2bt16m322,"{'name': 'Target_Child', 'language': 'eng', 'c...",2;09.,male,"[Token(word='door', pos=None, mor=None, gra=No...","{'CHI': 'door . 376745_377382', '%xwb': 'doɹ..."
10165,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_audio_segments/...,380499,381903,1404,door.,e2bt16m322,"{'name': 'Target_Child', 'language': 'eng', 'c...",2;09.,male,"[Token(word='door', pos=None, mor=None, gra=No...","{'CHI': 'door . 380499_381903', '%xwb': 'doɹ..."
10166,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_audio_segments/...,385371,386526,1155,chute.,e2bt16m322,"{'name': 'Target_Child', 'language': 'eng', 'c...",2;09.,male,"[Token(word='chute', pos=None, mor=None, gra=N...","{'CHI': 'chute . 385371_386526', '%xwb': 'ʃu..."
10167,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_long_audio_reco...,../data/paido_dataset/original_audio_segments/...,396869,398095,1226,cave.,e2bt16m322,"{'name': 'Target_Child', 'language': 'eng', 'c...",2;09.,male,"[Token(word='cave', pos=None, mor=None, gra=No...","{'CHI': 'cave . 396869_398095', '%xwb': 'kʰe..."


In [8]:
# Write the segment table to the configured .xlsx path; index=False leaves the row index out of the sheet.
df.to_excel(output_xlsx_file_name_path, index=False)