In [1]:
# Speaker group to process. It is baked into the dataset folder name, and the
# value "child" selects the 'CHI' participant label later on (anything else
# selects 'ADT').
subsection = "men"
dataset_name = f"English_ECSC_{subsection}_dataset" # https://childes.talkbank.org/access/Frogs/English-ECSC.html
# Sub-folder of the dataset that holds the full-length recordings and transcripts.
long_audio_sub_folder = "original"
# Sub-folder of the dataset that receives the fixed-length audio chunks.
output_data_directory_name = "original_audio_segments"
# File name of the metadata spreadsheet written into the output folder.
output_xlsx_file_name = "segments.xlsx"
# Target sample rate (Hz) that every recording is resampled to before chunking.
resample_rate = 16000
# Maximum length of one saved audio chunk, in seconds.
max_segment_duration = 30

In [2]:
import os
import pylangacq
import pandas as pd
import pathlib
import torchaudio
import torchaudio.transforms as T
import re
import contractions
import string
import uuid

In [3]:
# Resolve the input and output locations for the selected dataset.
input_data_directory = f"../data/{dataset_name}/{long_audio_sub_folder}"
# Check for a directory rather than mere existence: a regular file at this path
# would make os.walk yield nothing, and the notebook would silently produce an
# empty spreadsheet. The path is included in the message so the failure is
# actionable. FileNotFoundError is still an Exception subclass.
if not os.path.isdir(input_data_directory):
    raise FileNotFoundError(f"Input directory doesn't exist: {input_data_directory}")
output_data_directory = f"../data/{dataset_name}/{output_data_directory_name}"
# Create the output folder (and any missing parents); no error if it already exists.
pathlib.Path(output_data_directory).mkdir(parents=True, exist_ok=True)
output_xlsx_file_name_path = f"{output_data_directory}/{output_xlsx_file_name}"

# Participant code used to look up header details and utterances in each
# transcript: 'CHI' for the "child" subsection, 'ADT' for every other value.
label = 'CHI' if subsection == "child" else 'ADT'

In [4]:
def replace_numbers_with_words(input_str):
    """Replace every run of digits in ``input_str`` with its spelled-out form.

    Args:
        input_str: Text that may contain digit sequences.

    Returns:
        The text with each ``\\d+`` match replaced by ``num2words(int(match))``.
        Text without digits is returned unchanged.

    Raises:
        ImportError: If a digit run is found and the ``num2words`` package is
            not installed.
    """
    def replace_number(match):
        # `num2words` was referenced here without ever being imported, so the
        # first digit run raised NameError. Importing locally keeps this
        # function self-contained; the import only runs when a number is
        # actually present, so digit-free text still works without the package.
        from num2words import num2words

        return num2words(int(match.group()))

    # Substitute each digit run via the callback above.
    return re.sub(r'\d+', replace_number, input_str)
    
def remove_multiple_spaces(input_string):
    """Collapse every whitespace run to a single space and trim both ends."""
    # str.split() with no argument splits on any whitespace run and drops
    # empty pieces, so re-joining with one space normalises the spacing.
    return " ".join(input_string.split())

def remove_punctuation_and_lower(text):
    """Normalise a transcript string to lowercase words without punctuation.

    Steps, in order: underscores become spaces, digit runs are spelled out via
    ``replace_numbers_with_words``, the text is lower-cased, contractions are
    processed by ``contractions.fix``, apostrophes and every character in
    ``string.punctuation`` are dropped, and whitespace runs are collapsed.
    """
    spelled_out = replace_numbers_with_words(text.replace("_", " "))
    expanded = contractions.fix(spelled_out.lower())
    # Any apostrophe that survives contraction handling is removed outright.
    trimmed = expanded.replace("'", "").strip()
    kept_chars = [ch for ch in trimmed if ch not in string.punctuation]
    return remove_multiple_spaces("".join(kept_chars))

def get_participant(path_to_file):
    """Build the speaker id from a transcript path.

    Takes the file name, removes every ".cha" occurrence, drops the final
    character of what remains, and then removes every "_YR" occurrence.
    """
    file_name = os.path.basename(path_to_file)
    stem = file_name.replace(".cha", "")
    stem_without_last_char = stem[:-1]
    return stem_without_last_char.replace("_YR", "")

def get_sex(path_to_file):
    """Return the last character of the file name's first "_"-separated part.

    The file name has every ".cha" occurrence removed before it is examined.
    """
    stem = os.path.basename(path_to_file).replace(".cha", "")
    # Everything before the first underscore (or the whole stem if there is none).
    leading_part = stem.partition("_")[0]
    return leading_part[-1]

def get_age(path_to_file):
    """Derive an age value from the transcript file name.

    The file name (with ".cha" removed) is split on "_".

    * When ``subsection`` is "child": returns an int computed as the first
      part without its last character, plus 12 for each unit by which the last
      digit of the final part exceeds 1. Presumably this is months of age plus
      a study-year offset; confirm against the dataset naming scheme.
    * Otherwise: returns the first part as a string, with every occurrence of
      the ``get_sex`` character removed.
    """
    parts = os.path.basename(path_to_file).replace(".cha", "").split("_")
    first_part = parts[0]

    if subsection != "child":
        return first_part.replace(get_sex(path_to_file), "")

    base_months = int(first_part[:-1])
    year_index = int(parts[-1][-1])
    return base_months + (year_index - 1) * 12

In [5]:
# Walk the input tree. For every CHAT transcript (.cha) that has a matching
# .wav next to it: resample the audio, cut it into fixed-length chunks that are
# written to the output folder, collect the transcript text of the selected
# participant, and record one metadata row in `segments`.
segments = []
i = 0  # running index over processed recordings; embedded in every output file name
segment_length = max_segment_duration * resample_rate  # chunk size in samples
for root, dirs, files in os.walk(input_data_directory):
    # os.walk yields entries in arbitrary order. Sorting `dirs` in place and
    # iterating over sorted file names makes both the traversal and the `i`
    # index in the output file names reproducible between runs.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith(".cha"):
            continue

        path_to_file = os.path.join(root, file)
        # Swap only the trailing extension; str.replace would also rewrite any
        # ".cha" that happens to occur inside a directory name.
        path_to_audio_file = path_to_file[:-len(".cha")] + ".wav"

        if not os.path.isfile(path_to_audio_file):
            # Transcript without a matching recording: report it and move on.
            print("ERROR!!!")
            print(path_to_audio_file)
            continue

        participant = get_participant(path_to_file)
        waveform, sample_rate = torchaudio.load(path_to_audio_file)
        resampler = T.Resample(sample_rate, resample_rate, dtype=waveform.dtype)
        # Keep only index 0 along the first axis and restore that axis, so the
        # tensor stays two-dimensional with the samples along dimension 1.
        resampled_waveform = resampler(waveform)[0].unsqueeze(0)

        chat = pylangacq.read_chat(path_to_file)
        participant_details = chat.headers()[0]['Participants'][label]
        age = participant_details['age']
        sex = participant_details['sex']

        # Un-suffixed path recorded in the metadata row only; no audio is
        # written to this location by this cell.
        path_to_audio_segment_f_utility = os.path.join(output_data_directory, f"{participant}___{i}.wav")

        # Save consecutive chunks of at most `segment_length` samples. The last
        # chunk may be shorter and is saved as-is (no padding).
        path_to_audio_segment_file = []
        for j, start_sample in enumerate(range(0, resampled_waveform.size(1), segment_length)):
            segment_waveform = resampled_waveform[:, start_sample:start_sample + segment_length]
            path_to_audio_segment_f = os.path.join(output_data_directory, f"{participant}___{i}##{j}.wav")
            path_to_audio_segment_file.append(path_to_audio_segment_f)
            torchaudio.save(path_to_audio_segment_f, segment_waveform, resample_rate, encoding="PCM_S", bits_per_sample=16, format="wav")
        max_j = len(path_to_audio_segment_file)  # number of chunks written for this recording
        i += 1

        # The row describes the whole recording: 0 ms up to its full duration in ms.
        start = 0
        end = resampled_waveform.shape[1] / resample_rate * 1000

        # Concatenate all token words of the selected participant. Each word and
        # each utterance is prefixed with a single space, exactly as before, but
        # built with join instead of repeated (quadratic) string concatenation.
        texts = "".join(
            " " + "".join(" " + token.word for token in utterance.tokens)
            for utterance in chat.utterances(participants=label)
        )

        # Note: the raw `texts` string is stored twice, in the last two fields.
        segments.append([path_to_file, path_to_audio_file, path_to_audio_segment_f_utility, path_to_audio_segment_file, max_j, start, end, end-start, remove_punctuation_and_lower(texts), participant, participant_details, age, sex, texts, texts])

In [6]:
# Column names, in the same order as the fields of each row in `segments`.
segment_columns = [
    "path_to_chat_file",
    "path_to_audio_file",
    "path_to_audio_segment_file",
    "path_to_audio_segment_files",
    "num_segments",
    "start_in_ms",
    "end_in_ms",
    "duration_in_ms",
    "text",
    "participant",
    "participant_details",
    "age",
    "sex",
    "tokens",
    "tiers",
]
df = pd.DataFrame(segments, columns=segment_columns)


In [7]:
# Display the assembled table (one row per processed recording) for a visual check.
df

Unnamed: 0,path_to_chat_file,path_to_audio_file,path_to_audio_segment_file,path_to_audio_segment_files,num_segments,start_in_ms,end_in_ms,duration_in_ms,text,participant,participant_details,age,sex,tokens,tiers
0,../data/English_ECSC_men_dataset/original/dad_...,../data/English_ECSC_men_dataset/original/dad_...,../data/English_ECSC_men_dataset/original_audi...,[../data/English_ECSC_men_dataset/original_aud...,7,0,182006.3125,182006.3125,frog on his own one day timmy his dog his frog...,dad_122F_4040,"{'name': 'Target_Adult', 'language': 'eng', 'c...",18;00.,male,frog on his own . one day . timmy . his d...,frog on his own . one day . timmy . his d...
1,../data/English_ECSC_men_dataset/original/dad_...,../data/English_ECSC_men_dataset/original/dad_...,../data/English_ECSC_men_dataset/original_audi...,[../data/English_ECSC_men_dataset/original_aud...,8,0,234947.125,234947.125,frog on his own if i remember the names of the...,dad_120M_1056,"{'name': 'Target_Adult', 'language': 'eng', 'c...",18;00.,male,frog on his . own . if i remember the name...,frog on his . own . if i remember the name...
2,../data/English_ECSC_men_dataset/original/dad_...,../data/English_ECSC_men_dataset/original/dad_...,../data/English_ECSC_men_dataset/original_audi...,[../data/English_ECSC_men_dataset/original_aud...,6,0,154362.125,154362.125,alright where are we here alright look it is l...,dad_65M_1053,"{'name': 'Target_Adult', 'language': 'eng', 'c...",18;00.,male,alright . where are we here . alright . l...,alright . where are we here . alright . l...
3,../data/English_ECSC_men_dataset/original/dad_...,../data/English_ECSC_men_dataset/original/dad_...,../data/English_ECSC_men_dataset/original_audi...,[../data/English_ECSC_men_dataset/original_aud...,6,0,169180.0,169180.0,there is a little boy and the frog doggie walk...,dad_87F_1066,"{'name': 'Target_Adult', 'language': 'eng', 'c...",18;00.,male,there's a little boy . and the frog . dogg...,there's a little boy . and the frog . dogg...


In [8]:
# Write the table into the output folder alongside the audio chunks;
# index=False leaves the DataFrame index out of the spreadsheet.
df.to_excel(output_xlsx_file_name_path, index=False)