In [None]:
import os, random, shutil, pandas as pd, datasets
from datasets import Dataset, DatasetDict
from huggingface_hub import login
from collections import defaultdict

# Log in to Hugging Face.
# The token is read from the HUGGINGFACE_HUB_TOKEN environment variable instead of being
# written into the notebook, so the secret never ends up in a saved or shared copy.
# An unset or empty variable becomes None, which makes `login` ask for the token
# interactively.
login(token=os.environ.get('HUGGINGFACE_HUB_TOKEN') or None)

In [None]:
# Path to the main directory
main_dir = '/UME-ERJ/JE'


def flatten_wav_tree(base_dir):
    """Move every nested ``.wav`` file directly into ``base_dir`` and prune empty folders.

    A file found in ``base_dir/<a>/<b>/name.wav`` becomes ``base_dir/name_<b>_<a>.wav``
    (only the two innermost folder names are used); a file found one level deep in
    ``base_dir/<a>/name.wav`` becomes ``base_dir/name_<a>.wav``.

    Files that already sit directly in ``base_dir`` are left untouched, so running the
    function again after a completed run changes nothing.

    Args:
        base_dir: Root folder to flatten.

    Returns:
        The number of files that were moved.

    Raises:
        FileExistsError: If the target name is already taken in ``base_dir``; the move
            would otherwise overwrite an existing recording.
    """
    moved = 0
    for root, _, files in os.walk(base_dir, topdown=False):
        rel_path = os.path.relpath(root, base_dir)
        if rel_path == os.curdir:
            # Top-level files are already flattened; renaming them would append "_."
            # to the name on every run.
            continue
        path_parts = rel_path.split(os.sep)
        if len(path_parts) >= 2:
            suffix = f"{path_parts[-1]}_{path_parts[-2]}"
        else:
            suffix = path_parts[-1]
        for file in files:
            if not file.endswith('.wav'):
                continue
            new_name = f"{os.path.splitext(file)[0]}_{suffix}.wav"
            src = os.path.join(root, file)
            dest = os.path.join(base_dir, new_name)
            if os.path.exists(dest):
                raise FileExistsError(f"Refusing to overwrite {dest} with {src}")
            shutil.move(src, dest)
            moved += 1

    # Remove empty directories. Walking bottom-up means a parent is checked only after
    # its children have had the chance to be removed.
    for root, dirs, _ in os.walk(base_dir, topdown=False):
        for dir_name in dirs:
            dir_path = os.path.join(root, dir_name)
            if not os.listdir(dir_path):
                os.rmdir(dir_path)
    return moved


flatten_wav_tree(main_dir)

In [None]:
# Path to the directory containing the .tab files
tab_dir = '/UME-ERJ/tab'
# Path to the output file
output_file = '/UME-ERJ/tab/combined.tab'


def combine_tab_files(source_dir, combined_path):
    """Merge every ``.tab`` file under ``source_dir`` into ``combined_path``.

    Each input line is split on whitespace. Lines with more than two fields are written
    as ``<first field> <third field onwards joined by single spaces>``; lines with two
    fields or fewer are dropped. The source files are deleted once the combined file
    has been written and closed.

    ``combined_path`` itself is never treated as a source, even when it lives inside
    ``source_dir`` and ends in ``.tab``; otherwise it would be read while being written
    and then deleted. When no source file is found nothing is written or removed, so a
    combined file produced by an earlier run is preserved.

    Args:
        source_dir: Folder that is searched recursively for ``.tab`` files.
        combined_path: Path of the merged output file.

    Returns:
        The number of lines written to ``combined_path``.
    """
    combined_abs = os.path.abspath(combined_path)
    sources = []
    for root, _, files in os.walk(source_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if file.endswith('.tab') and os.path.abspath(file_path) != combined_abs:
                sources.append(file_path)
    # Sorted so the line order of the combined file does not depend on directory
    # listing order.
    sources.sort()

    if not sources:
        return 0

    written = 0
    with open(combined_path, 'w') as outfile:
        for file_path in sources:
            with open(file_path, 'r') as infile:
                for line in infile:
                    parts = line.split()
                    if len(parts) > 2:
                        # Remove the second file name
                        outfile.write(f"{parts[0]} {' '.join(parts[2:])}\n")
                        written += 1

    # Remove the originals only after the combined file is safely on disk
    for file_path in sources:
        os.remove(file_path)
    return written


combine_tab_files(tab_dir, output_file)

In [None]:
# Gather the path of every .wav file found anywhere below main_dir
audio_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk(main_dir)
    for name in names
    if name.endswith('.wav')
]

# The code of a recording is its file name without the extension; the audio column
# stores the absolute path of the file
codes = []
for wav_path in audio_files:
    stem, _ext = os.path.splitext(os.path.basename(wav_path))
    codes.append(stem)
audio_paths = list(map(os.path.abspath, audio_files))

# Build a lookup from the first six characters of a line's first field to its text.
# NOTE(review): the second whitespace-separated field of each line is discarded and
# lines with fewer than three fields are skipped. If the combined file holds
# "<id> <transcript>" lines, this drops the first transcript word and every one-word
# transcript; confirm against the .tab format.
transcripts = {}
with open(output_file, 'r') as tab_file:
    for row in tab_file:
        fields = row.split(maxsplit=2)
        if len(fields) <= 2:
            continue
        transcripts[fields[0][:6]] = fields[2].strip()

# Look up each recording by the first six characters of its code; recordings without a
# matching line get an empty transcript
trans_list = []
for code in codes:
    trans_list.append(transcripts.get(code[:6], ''))

# Column-oriented data that is later turned into a DataFrame
data_lists = {'code': codes, 'audio': audio_paths, 'transcript': trans_list}

In [None]:
# Tabulate the collected columns, then wrap the table as a Hugging Face dataset.
# preserve_index=False keeps the pandas row index out of the dataset columns.
df = pd.DataFrame(data=data_lists)
dataset = Dataset.from_pandas(df=df, preserve_index=False)

# Extract user IDs from the 'code' field
def extract_user_id(code):
    """Return the last seven characters of ``code`` as the user ID.

    Strings shorter than seven characters are returned unchanged.
    NOTE(review): presumably the seven characters are the folder-name suffix appended
    to the file name when the recordings were flattened; confirm that this suffix is
    always exactly seven characters long.
    """
    id_width = 7
    return code[-id_width:]

# Group the rows by user ID so that all recordings of one user end up in the same split
user_id_to_entries = defaultdict(list)
for entry in dataset:
    user_id_to_entries[extract_user_id(entry['code'])].append(entry)

# Seed for the user shuffle. Without a fixed seed every run produces a different
# train/validation/test partition, so users can drift between splits across re-runs.
SPLIT_SEED = 42

# Sort before shuffling: the grouping dict is ordered by first appearance in `dataset`,
# and the shuffle result depends on the starting order. A private Random instance keeps
# the global random state untouched.
user_ids = sorted(user_id_to_entries)
random.Random(SPLIT_SEED).shuffle(user_ids)

# Split user IDs into train/val/test (80% / 10% / 10% of the users, rounded down)
train_split_index = int(0.8 * len(user_ids))
val_split_index = int(0.9 * len(user_ids))

train_users = user_ids[:train_split_index]
val_users = user_ids[train_split_index:val_split_index]
test_users = user_ids[val_split_index:]

# Set views of the three groups for membership checks
train_user_ids = set(train_users)
val_user_ids = set(val_users)
test_user_ids = set(test_users)

# Assign entries to train/val/test based on the user ID. The ordered lists are used
# instead of the sets because set iteration order for strings varies between
# interpreter runs, which would reorder the rows of each split.
train_entries = [row for uid in train_users for row in user_id_to_entries[uid]]
val_entries = [row for uid in val_users for row in user_id_to_entries[uid]]
test_entries = [row for uid in test_users for row in user_id_to_entries[uid]]

# Convert lists of dictionaries into dictionary of lists
def convert_entries_to_dict(entries):
    """Transpose a sequence of row dicts into a single dict of column lists.

    Args:
        entries: Iterable of mappings, one per row.

    Returns:
        A plain ``dict`` mapping each key seen in ``entries`` to the list of its values
        in row order. Keys appear in the order they were first encountered; an empty
        input yields an empty dict.
    """
    columns = {}
    for record in entries:
        for field, item in record.items():
            columns.setdefault(field, []).append(item)
    return columns

# Transpose each split from a list of rows into a dict of columns; values are passed
# through unchanged.
# NOTE(review): the 'audio' column appears to hold file-path strings rather than decoded
# audio, so only the paths would be uploaded. Confirm this is intended.
train_dict, val_dict, test_dict = (
    convert_entries_to_dict(split_rows)
    for split_rows in (train_entries, val_entries, test_entries)
)

# Build one dataset per split
new_train_set, new_val_set, new_test_set = (
    datasets.Dataset.from_dict(split_columns)
    for split_columns in (train_dict, val_dict, test_dict)
)

# Bundle the splits under their names and publish them to the Hugging Face Hub
dataset_dict = datasets.DatasetDict(
    {'train': new_train_set, 'validation': new_val_set, 'test': new_test_set}
)

dataset_dict.push_to_hub("sage-bergerson/ume_erj_whisper")