In [2]:
# Mount Google Drive at /content/drive so files stored in Drive are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os
# Use an absolute path rooted at the Drive mount point (/content/drive) so this
# cell can be re-run safely: a relative chdir fails on the second run because
# the working directory has already moved into the data folder.
# NOTE(review): assumes the notebook originally started in Colab's default
# working directory (/content) - confirm if the runtime is configured differently.
os.chdir('/content/drive/MyDrive/CS224S_Final_Project/data')

In [36]:
import os

# Function to derive the annotated extent (latest segment end) from an RTTM file
def get_duration_from_rttm(rttm_file):
    """Return the latest segment end time found in an RTTM file.

    Each usable line contributes ``start + duration`` (whitespace-separated
    fields 3 and 4, zero-based). The maximum over all lines is returned rather
    than the value from the last line, because with overlapping segments the
    last line is not necessarily the one that ends latest.

    Args:
        rttm_file: Path to the RTTM file to read.

    Returns:
        The largest ``start + duration`` as a float, or ``None`` (after
        printing a message) when the file contains no usable line.
    """
    latest_end = None
    with open(rttm_file, 'r') as f:
        for line in f:
            fields = line.split()
            # Blank or truncated lines carry no timing information; skipping
            # them avoids an IndexError on files that end with an empty line.
            if len(fields) < 5:
                continue
            try:
                segment_end = float(fields[3]) + float(fields[4])
            except ValueError:
                # Non-numeric timing fields (e.g. "<NA>") cannot define an extent.
                continue
            if latest_end is None or segment_end > latest_end:
                latest_end = segment_end
    if latest_end is None:
        print(f"No data found in {rttm_file}. Skipping...")
        return None
    return latest_end  # latest (start time + duration) across all segments

# Function to generate UEM entries for a list of audio files
def generate_uem_entries(file_list, rttm_dir="all_annotations/rttm_dir_fixed"):
    """Build one UEM line per audio file that has a usable RTTM annotation.

    Args:
        file_list: Iterable of audio file names. The extension is stripped to
            obtain both the RTTM file name and the identifier written at the
            start of the UEM line.
        rttm_dir: Directory that holds the ``<name>.rttm`` files. Defaults to
            the directory this notebook has always read from.

    Returns:
        A list of newline-terminated strings of the form
        ``<name> NA 0.000 <end>``, in input order, where ``<end>`` is the
        value returned by ``get_duration_from_rttm`` formatted to three
        decimals. Files whose RTTM is missing, or for which
        ``get_duration_from_rttm`` returns ``None``, are skipped.
    """
    uem_entries = []
    for audio_file in file_list:
        # The same extension-less stem names the RTTM file and the UEM entry.
        recording_id = os.path.splitext(audio_file)[0]
        rttm_file = os.path.join(rttm_dir, recording_id + ".rttm")
        if not os.path.exists(rttm_file):
            print(f"RTTM file not found for {audio_file}. Skipping...")
            continue

        # Get the duration from the RTTM file
        duration = get_duration_from_rttm(rttm_file)
        if duration is None:
            continue

        # Round the duration to the nearest 0.001
        duration = round(duration, 3)

        # Create the UEM entry: the extent always starts at 0.000
        uem_entries.append(f"{recording_id} NA 0.000 {duration:.3f}\n")

    return uem_entries

# Read audio file lists
def read_file_list(file_list_path):
    """Return the non-empty lines of a list file, stripped of whitespace.

    Args:
        file_list_path: Path to a text file with one entry per line.

    Returns:
        A list of the stripped lines in file order. Lines that are empty
        after stripping are dropped so they do not turn into empty file names
        downstream.
    """
    with open(file_list_path, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [entry for entry in stripped_lines if entry]

In [37]:
# Load the per-split audio file lists.
train_files = read_file_list("lst_files/all_wav_files_train.lst")
test_files = read_file_list("lst_files/all_wav_files_test.lst")
dev_files = read_file_list("lst_files/all_wav_files_dev.lst")

# Combine the splits in train -> test -> dev order and build the UEM lines
# for every file in the combined list.
all_files = [*train_files, *test_files, *dev_files]
uem_entries = generate_uem_entries(all_files)
print(uem_entries)

No data found in all_annotations/rttm_dir_fixed/7004jp403.rttm. Skipping...
No data found in all_annotations/rttm_dir_fixed/2004cz206.rttm. Skipping...
No data found in all_annotations/rttm_dir_fixed/7004jp404.rttm. Skipping...
No data found in all_annotations/rttm_dir_fixed/7003jp301.rttm. Skipping...
No data found in all_annotations/rttm_dir_fixed/2004cz203.rttm. Skipping...
No data found in all_annotations/rttm_dir_fixed/4012nl406.rttm. Skipping...
No data found in all_annotations/rttm_dir_fixed/2001cz106.rttm. Skipping...
No data found in all_annotations/rttm_dir_fixed/2004cz201.rttm. Skipping...
No data found in all_annotations/rttm_dir_fixed/7002jp201.rttm. Skipping...
No data found in all_annotations/rttm_dir_fixed/2010cz306.rttm. Skipping...
No data found in all_annotations/rttm_dir_fixed/2004cz202.rttm. Skipping...
No data found in all_annotations/rttm_dir_fixed/7004jp402.rttm. Skipping...
No data found in all_annotations/rttm_dir_fixed/7004jp405.rttm. Skipping...
No data foun

In [38]:
# Sanity check: number of UEM lines that were generated.
print(len(uem_entries))

140


In [39]:
# Write all UEM entries to a single file
output_uem_file = "pyannote_uem_file/all_files.uem"
# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_uem_file), exist_ok=True)
with open(output_uem_file, 'w') as f:
    # Each entry already ends with a newline, so writelines emits one line per entry.
    f.writelines(uem_entries)