This notebook provides code to create train, test, and dev .lst files (plain-text files that list one filename per line). We need the .lst files so that we can add them to the YAML file that is passed as input to the model.

Make sure you swap in the right folders/files depending on which languages you want data for and whether it's audio or transcription labels.

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
# Work from the dataset folder: later cells rely on os.getcwd() and on "../..." relative paths.
# NOTE: the path given here is itself relative, so it only resolves while the current working
# directory is the one that contains 'drive'.
os.chdir('drive/MyDrive/CS224S_Final_Project/data/old_organization')

In [3]:
def get_wav_files_by_country(countries, root_directory=None):
    """Collect the base names of "Copy of *.wav" files stored in country folders.

    The directory tree is walked a single time. Every directory whose own name
    matches one of ``countries`` (case-insensitively) is scanned for files that
    start with the exact prefix "Copy of " and end with ".wav" (extension
    matched case-insensitively). Each match is returned with the prefix and
    the four-character extension stripped.

    Args:
        countries: Iterable of country folder names to look for.
        root_directory: Directory to search under. Defaults to the current
            working directory.

    Returns:
        A list of stripped file names, grouped in the order the countries were
        given. A country listed more than once contributes its files once per
        occurrence; a country with no matching folder contributes nothing.
    """
    if root_directory is None:
        root_directory = os.getcwd()

    # Materialise the input: it is consumed twice (lookup set + ordered output).
    countries = list(countries)
    wanted = {country.lower() for country in countries}
    prefix = 'Copy of '

    # One pass over the tree instead of one full walk per requested country.
    files_by_country = {}
    for root, _dirs, files in os.walk(root_directory):
        folder = os.path.basename(root).lower()
        if folder not in wanted:
            continue
        bucket = files_by_country.setdefault(folder, [])
        for file in files:
            if file.lower().endswith('.wav') and file.startswith(prefix):
                # Drop the "Copy of " prefix and the ".wav" extension.
                bucket.append(file[len(prefix):-4])

    # Assemble the result country by country so the caller's ordering is kept.
    all_wav_files = []
    for country in countries:
        all_wav_files.extend(files_by_country.get(country.lower(), []))

    return all_wav_files

import os

def filter_bad_files(all_files, bad_wav_dir="../audio_files_with_empty_rttms"):
    """
    Return the entries of all_files that have no same-named file in bad_wav_dir.

    Every entry of bad_wav_dir is reduced to its name without the final
    extension; an item of all_files is dropped when it equals one of those
    names exactly.

    Args:
    all_files (list of str): Candidate filenames (no extension).
    bad_wav_dir (str): Directory whose entries mark filenames to exclude.

    Returns:
    list of str: The surviving filenames, in their original order.
    """
    # Names to exclude: directory entries with their last extension removed.
    excluded_stems = {stem for stem, _ext in map(os.path.splitext, os.listdir(bad_wav_dir))}

    kept = []
    for name in all_files:
        if name in excluded_stems:
            continue
        kept.append(name)
    return kept

In [None]:
countries_you_want_files_of = ['australia', 'usa'] # Edit this to specify the countries you want in the data subset

# Gather the recording names found in folders named after the selected countries.
files = get_wav_files_by_country(countries_you_want_files_of)
print("All WAV files:", files)
print(len(files))
# Drop every name that also appears in the bad-audio directory, then show what is left.
files = filter_bad_files(files)
print("Filtered WAV files:", files)
print(len(files))

All WAV files: ['1002lv02', '1002lv05', '1001lv05', '1001lv04', '1001lv01', '1002lv03', '1001lv02', '1001lv03', '1002lv01', '1002lv04', '1003lv05', '1004lv102', '1003lv04', '1003lv01', '1003lv03', '1004lv104', '1004lv105', '1004lv103', '1003lv02', '1004lv101', '1004lv203', '1004lv202', '1004lv106', '1004lv201', 'TK09091301', 'CC06301743', 'TK09101843', 'TK09101800', 'CC06301748', '6008us306', '6013us101', '6008us303', '6008us305', '6013us103', '6013us102', '6008us304', '6013us104', '6008us301', '6008us302', '6019us202', '6019us205', '6024us402', '6019us201', '6024us403', '6013us105', '6024us401', '6019us203', '6019us204', '6019us206', '6024us405', '6024us404', 'CC11241849', 'CC11241853', 'CC12021339', 'CC11241819', 'CC11241837']
56
Filtered WAV files: ['1002lv02', '1002lv05', '1001lv05', '1001lv04', '1001lv01', '1002lv03', '1001lv02', '1001lv03', '1002lv01', '1002lv04', '1003lv05', '1004lv102', '1003lv04', '1003lv01', '1003lv03', '1004lv104', '1004lv105', '1004lv103', '1003lv02', '1004

In [None]:
import random

def split_list(data, train_ratio=0.7, test_ratio=0.2):
    """Randomly split ``data`` into train, test and dev lists.

    With the default ratios this yields roughly 70% train, 20% test and 10%
    dev. The items are shuffled with the module-level ``random`` generator and
    cut into three consecutive slices, so every item lands in exactly one list.

    Args:
        data: Iterable of items to split. It is copied before shuffling, so
            the caller's object is never reordered.
        train_ratio: Fraction of the items assigned to the training list.
        test_ratio: Fraction of the items assigned to the test list.

    Returns:
        A ``(train_list, test_list, dev_list)`` tuple; the dev list receives
        whatever remains after the train and test slices are taken.

    Raises:
        AssertionError: If ``train_ratio + test_ratio`` is greater than 1.0.
    """
    assert train_ratio + test_ratio <= 1.0, "Train and test ratios should sum up to 1.0 or less."

    # Shuffle a private copy: shuffling ``data`` itself would silently reorder
    # the caller's list (and fail outright for tuples or other iterables).
    shuffled = list(data)
    random.shuffle(shuffled)

    # Slice sizes are rounded to the nearest integer.
    train_length = round(len(shuffled) * train_ratio)
    test_length = round(len(shuffled) * test_ratio)
    test_end = train_length + test_length

    train_list = shuffled[:train_length]
    test_list = shuffled[train_length:test_end]
    dev_list = shuffled[test_end:]

    return train_list, test_list, dev_list

In [None]:
# Uses split_list's default ratios (0.7 train, 0.2 test, remainder dev). The split is random and
# no seed is set in the visible cells, so re-running this cell gives a different partition.
train, test, dev = split_list(files) # extracting the lists

In [None]:
# Verifying proportions: prints the number of files in the train, test, and dev sets, in that order.
# Note that we might need to tweak the ratios depending on the actual length of the audio that ends up in each set.
print(len(train))
print(len(test))
print(len(dev))

38
11
5


In [None]:
def write_list_to_file(data, filename):
    """Write the items of ``data`` to ``filename``, one item per line.

    The file is opened in write mode, so any existing content is replaced.
    Each item is formatted with an f-string and followed by a newline.
    """
    lines = (f"{entry}\n" for entry in data)
    with open(filename, 'w') as handle:
        handle.writelines(lines)

In [None]:
# Executing this cell creates 3 .lst files in the data/lst_files folder, each listing the names of the files in the train/test/dev set.
# The "english_" prefix is hard-coded here; rename the outputs if you select a different set of countries above.
write_list_to_file(train, "../lst_files/english_train.lst")
write_list_to_file(test, "../lst_files/english_test.lst")
write_list_to_file(dev, "../lst_files/english_dev.lst")