In [8]:
main_folder = 'kennedy_james_sentencerepetition_dataset'

In [9]:
import os
import pandas as pd
import pathlib
import shutil
import re
import string
import contractions
import torchaudio

def find_folder_with_structure(file_path):
    pattern = r"\b\d{2}_[MF]_\w+\b"
    match = re.search(pattern, file_path)

    if match:
        return match.group()
    else:
        return None

def remove_multiple_spaces(input_string):
    words = input_string.split()

    # Join the words back together with a single space between them
    cleaned_string = " ".join(words)
    return cleaned_string
    
def replace_numbers_with_words(input_str):
    def replace_number(match):
        numeric_part = match.group()
        return num2words(int(numeric_part))

    # Use regular expression to find the numeric part of the input string and replace with words
    result = re.sub(r'\d+', replace_number, input_str)
    return result
    
def remove_punctuation_and_lower(text):
    # Convert the text to lowercase
    text = text.replace("_", " ")
    text = replace_numbers_with_words(text)
    text = text.lower()    
    text = contractions.fix(text)
    text = text.replace("'", "").strip()
    text = ''.join(char for char in text if char not in string.punctuation)
    text = remove_multiple_spaces(text)
    return text
    
def extract_info_from_filename(file_path):
    # Split the filename by underscores to extract information
    filename = os.path.basename(file_path)
    #print(filename)
    human_transcript = filename[:-4].strip()
    human_transcript = remove_punctuation_and_lower(human_transcript)

    # Determine the task based on the folder path
    if "english_free_speech" in file_path:
        task = "english_free_speech"
    elif "english_words_sentences" in file_path:
        task = "english_words_sentences"
    else:
        task = None
        
    if "numbers" in file_path:
        subtask = "numbers"
    elif "sentences" in file_path:
        subtask = "sentences"
    else:
        subtask = None
        
    parts = find_folder_with_structure(file_path).split("_")
    speaker_id = parts[0]
    sex = parts[1]
    native_status = parts[2]
    return speaker_id, sex, native_status, task, subtask, human_transcript
    
def explore_folder(main_folder):
    data = []
    ignored_folders = ["files_in_one_part", "port_mic", "nao_mic"]

    for root, _, files in os.walk(main_folder):
        for file in files:
            if file.endswith(".wav"):
                path_to_file = os.path.join(root, file)
                if any(ignored_folder in path_to_file for ignored_folder in ignored_folders):
                    continue
                #print(path_to_file)
                file_basename = os.path.basename(file)
                speaker_id, sex, native_status, task, subtask, human_transcript = extract_info_from_filename(path_to_file)
                
                waveform, sample_rate = torchaudio.load(path_to_file)
                
                # Append the information to the data list
                data.append([path_to_file, speaker_id, sex, native_status, task, subtask, human_transcript, 0, waveform.shape[1]/sample_rate *1000, waveform.shape[1]/sample_rate *1000])

    # Create a DataFrame with the collected information
    df = pd.DataFrame(data, columns=["path_to_original_file", "speaker_id", "sex", "native_status", "task", "subtask", "text", "start_in_ms", "end_in_ms", "duration_in_ms"])
    df['new_name'] = df.index.astype(str) + ".wav"
    return df


In [10]:
# Replace 'kennedy_james_dataset' with the path to your actual main folder
result_df = explore_folder(f"../data/{main_folder}/original")

In [11]:
# Display the resulting DataFrame
result_df

Unnamed: 0,path_to_original_file,speaker_id,sex,native_status,task,subtask,text,start_in_ms,end_in_ms,duration_in_ms,new_name
0,../data/kennedy_james_sentencerepetition_datas...,09,F,nonNative,english_words_sentences,numbers,four,0,878.276644,878.276644,0.wav
1,../data/kennedy_james_sentencerepetition_datas...,09,F,nonNative,english_words_sentences,numbers,six,0,902.675737,902.675737,1.wav
2,../data/kennedy_james_sentencerepetition_datas...,09,F,nonNative,english_words_sentences,numbers,two,0,1122.244898,1122.244898,2.wav
3,../data/kennedy_james_sentencerepetition_datas...,09,F,nonNative,english_words_sentences,numbers,ten,0,805.079365,805.079365,3.wav
4,../data/kennedy_james_sentencerepetition_datas...,09,F,nonNative,english_words_sentences,numbers,three,0,1073.446712,1073.446712,4.wav
...,...,...,...,...,...,...,...,...,...,...,...
145,../data/kennedy_james_sentencerepetition_datas...,05,F,native,english_words_sentences,sentences,the dog is on top of the she would,0,2796.122449,2796.122449,145.wav
146,../data/kennedy_james_sentencerepetition_datas...,05,F,native,english_words_sentences,sentences,the dog is in front of the horse,0,2241.519274,2241.519274,146.wav
147,../data/kennedy_james_sentencerepetition_datas...,05,F,native,english_words_sentences,sentences,the horse is behind the car,0,2010.453515,2010.453515,147.wav
148,../data/kennedy_james_sentencerepetition_datas...,05,F,native,english_words_sentences,sentences,the fish is in the pond,0,2079.750567,2079.750567,148.wav


In [12]:
def update_path(row):
    new_name = row['new_name']
    new_path = f"{new_folder}/{new_name}"
    return new_path

In [13]:
new_folder = f"../data/{main_folder}/original_audio_segments"
pathlib.Path(new_folder).mkdir(parents=True, exist_ok=True)
for i, row in result_df.iterrows():
    # Replace 'source_file_path' with the path of the file you want to copy
    source_file_path = row['path_to_original_file']

    # Replace 'destination_folder' with the path of the folder where you want to copy the file
    destination_folder = f"{new_folder}/{row['new_name']}"

    # Use shutil.copy() to copy the file
    shutil.copy(source_file_path, destination_folder)

result_df['path_to_audio_segment_file'] = result_df.apply(update_path, axis=1)
result_df.to_excel(f"{new_folder}/segments.xlsx", index=False)


In [14]:
def process_audio_file(file_path):
    # Load the audio file
    waveform, sample_rate = torchaudio.load(file_path)

    # Check if the audio is stereo, and convert it to mono by taking the left channel
    if waveform.size(0) == 2:
        waveform = waveform[0:1, :]  # Take the left channel

    # Check the sample rate and resample to 16kHz if needed
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Save the processed waveform as a WAV file with the same file path
    torchaudio.save(file_path, waveform, 16000, encoding="PCM_S", bits_per_sample=16)

    #print(f"Processed audio saved at: {save_path}")

def get_wav_files_in_folder(folder_path):
    wav_files = []
    for dirpath, _, filenames in os.walk(folder_path):
        for filename in filenames:
            if filename.lower().endswith('.wav'):
                file_path = os.path.join(dirpath, filename)
                wav_files.append(file_path)
    return wav_files

wav_files = get_wav_files_in_folder(new_folder)

for file in wav_files:
    process_audio_file(file)