# Python: Regular Expression

### Filtering

In [10]:
# Splits each dictionary line with a regular expression (RegEx) and prints the labeled English and Phonetic words; lines that do not split into two columns are printed as "No match for:"
import re

# Each dictionary entry is expected to look like "<english><gap><phonetic>",
# where the gap is a tab or a run of at least two whitespace characters.
with open("Chipewyan-Dictionary.txt", "r", encoding="utf-8") as dictionary_file:
    for raw_line in dictionary_file:
        entry = raw_line.strip()
        if entry == "":
            # Nothing to report for blank lines
            continue

        columns = re.split(r"\s{2,}|\t+", entry)
        if len(columns) < 2:
            # Echo lines without a column gap so they can be inspected
            print("No match for:", entry)
            continue

        # Only the first two columns are reported; any extras are ignored
        print("English:", columns[0].strip())
        print("Phonetic:", columns[1].strip())

No match for: Dÿne Dédliné Yatié
No match for: Æerehtå’íscho
No match for: Denínu Kuç Yatié
No match for: Chipewyan Dictionary
No match for: Published by
No match for: South Slave Divisional Education Council
No match for: 202 McDougal Road, PO Box 510,
No match for: Fort Smith, Northwest Territories, Canada X0E 0P0
No match for: Copyright © South Slave Divisional Education Council 2012
No match for: Printed in Canada.
No match for: National Library of Canada Publication Data
No match for: ISBN 978-0-9878616-0-3
English: Project Coordinator:
Phonetic: Brent Kaulback
English: Linguist:
Phonetic: Betty Harnum
English: Advisor:
Phonetic: Dr. Eung-Do Cook
English: Elders Committee:
Phonetic: Lawrence Fabien
English: Tommy Unka
Phonetic: Freddie King
English: Christine Fabien
Phonetic: Henry Calmut
English: Harvey Mandeville
Phonetic: Mary Jane Beaulieu
No match for: Layout and Design: Brent Kaulback
English: Sound Editor:
Phonetic: Tristan Kaulback
English: Photo Credits:
Phonetic: Jayne M

In [None]:
# Filters the dictionary document using RegEx and writes each matched English/Phonetic pair as one row of a CSV file; lines that do not match are only printed as "No match for:" and are not written to the CSV
import re
import csv

# output.csv is opened (and its header written) before the dictionary is read,
# so the CSV exists even if reading the dictionary fails afterwards.
with open("output.csv", "w", newline="", encoding="utf-8") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["English", "Phonetic"])  # header row

    with open("Chipewyan-Dictionary.txt", "r", encoding="utf-8") as f:
        for entry in map(str.strip, f):
            if not entry:
                continue  # blank line

            # A tab or a gap of two or more whitespace characters separates columns
            columns = re.split(r"\s{2,}|\t+", entry)
            if len(columns) < 2:
                print("No match for:", entry)
            else:
                # One CSV row per entry: first column, second column
                csv_writer.writerow([columns[0].strip(), columns[1].strip()])

In [16]:
# Prints the unmatched lines and creates a CSV file in which each English and Phonetic word is written as its own labeled row.
import re
import csv

# Writes two rows per dictionary entry: ("English", word) followed by
# ("Phonetic", word). Lines without a column gap are printed instead.
with open("output.csv", "w", newline="", encoding="utf-8") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["Word Type", "Word"])  # header row

    with open("Chipewyan-Dictionary.txt", "r", encoding="utf-8") as f:
        for raw_line in f:
            entry = raw_line.strip()
            if entry == "":
                continue  # blank line

            # Columns are separated by a tab or by two or more whitespace characters
            columns = re.split(r"\s{2,}|\t+", entry)
            if len(columns) < 2:
                print("No match for:", entry)
                continue

            # zip stops after the two labels, so extra columns are ignored
            labeled_words = zip(("English", "Phonetic"), columns)
            csv_writer.writerows([label, word.strip()] for label, word in labeled_words)

No match for: Dÿne Dédliné Yatié
No match for: Æerehtå’íscho
No match for: Denínu Kuç Yatié
No match for: Chipewyan Dictionary
No match for: Published by
No match for: South Slave Divisional Education Council
No match for: 202 McDougal Road, PO Box 510,
No match for: Fort Smith, Northwest Territories, Canada X0E 0P0
No match for: Copyright © South Slave Divisional Education Council 2012
No match for: Printed in Canada.
No match for: National Library of Canada Publication Data
No match for: ISBN 978-0-9878616-0-3
No match for: Layout and Design: Brent Kaulback
No match for: 180, 247, 254, 265, 277, 329, 330)
No match for: 115 (G-1995-001: 0772) 172 (G-1979-001: 0148)
No match for: 225 (N-1979-003: 0065) 270 (N-1979-004: 0022)
No match for: 271 (G-1995-001: 5497)
No match for: All other photos were provided by Shutterstock Royalty Free
No match for: Subscription Stock Photography or through other royalty
No match for: free internet sites.
No match for: The South Slave Divisional Educatio

In [17]:
# Filters the document using RegEx and sends all English word results to a new CSV file
import re
import csv

# Writes the English (first) column of every dictionary entry to output.csv.
#
# A line counts as an entry only when it splits into at least two columns.
# The previous check (`if parts:`) was always true, because re.split returns a
# list with at least one element; the "No match for:" branch could never run
# and front-matter lines were written to the CSV as if they were English words.
with open("output.csv", "w", newline="", encoding="utf-8") as csvfile:
    csv_writer = csv.writer(csvfile)
    # Write CSV header
    csv_writer.writerow(["English Word"])

    with open("Chipewyan-Dictionary.txt", "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip empty lines
            # Split on two or more whitespace characters or on tabs
            parts = re.split(r"\s{2,}|\t+", line)
            if len(parts) >= 2:  # same entry test as the other filter cells
                english = parts[0].strip()
                csv_writer.writerow([english])
            else:
                print("No match for:", line)

In [19]:
import re
import csv

# Writes the English (first) column of every two-column dictionary line to
# output.csv; any other non-blank line is printed as "No match for:".
with open("output.csv", "w", newline="", encoding="utf-8") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["English Word"])  # header row

    with open("Chipewyan-Dictionary.txt", "r", encoding="utf-8") as f:
        # Lazily strip every line and drop the ones that end up empty
        stripped_lines = (raw_line.strip() for raw_line in f)
        for entry in (text for text in stripped_lines if text):
            # Columns are separated by a tab or by two or more whitespace characters
            columns = re.split(r"\s{2,}|\t+", entry)
            if len(columns) < 2:
                # No second (phonetic) column, so this is not a dictionary entry
                print("No match for:", entry)
            else:
                csv_writer.writerow([columns[0].strip()])

No match for: Dÿne Dédliné Yatié
No match for: Æerehtå’íscho
No match for: Denínu Kuç Yatié
No match for: Chipewyan Dictionary
No match for: Published by
No match for: South Slave Divisional Education Council
No match for: 202 McDougal Road, PO Box 510,
No match for: Fort Smith, Northwest Territories, Canada X0E 0P0
No match for: Copyright © South Slave Divisional Education Council 2012
No match for: Printed in Canada.
No match for: National Library of Canada Publication Data
No match for: ISBN 978-0-9878616-0-3
No match for: Layout and Design: Brent Kaulback
No match for: 180, 247, 254, 265, 277, 329, 330)
No match for: 115 (G-1995-001: 0772) 172 (G-1979-001: 0148)
No match for: 225 (N-1979-003: 0065) 270 (N-1979-004: 0022)
No match for: 271 (G-1995-001: 5497)
No match for: All other photos were provided by Shutterstock Royalty Free
No match for: Subscription Stock Photography or through other royalty
No match for: free internet sites.
No match for: The South Slave Divisional Educatio

In [20]:
import re
import csv

# Writes the phonetic (second) column of every two-column dictionary line to
# output.csv; any other non-blank line is printed as "No match for:".
with open("output.csv", "w", newline="", encoding="utf-8") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["Phonetic Word"])  # header row

    with open("Chipewyan-Dictionary.txt", "r", encoding="utf-8") as f:
        for raw_line in f:
            entry = raw_line.strip()
            if entry == "":
                continue  # blank line

            # Columns are separated by a tab or by two or more whitespace characters
            columns = re.split(r"\s{2,}|\t+", entry)
            if len(columns) < 2:
                # Without a second column there is no phonetic word to record
                print("No match for:", entry)
                continue

            csv_writer.writerow([columns[1].strip()])

No match for: Dÿne Dédliné Yatié
No match for: Æerehtå’íscho
No match for: Denínu Kuç Yatié
No match for: Chipewyan Dictionary
No match for: Published by
No match for: South Slave Divisional Education Council
No match for: 202 McDougal Road, PO Box 510,
No match for: Fort Smith, Northwest Territories, Canada X0E 0P0
No match for: Copyright © South Slave Divisional Education Council 2012
No match for: Printed in Canada.
No match for: National Library of Canada Publication Data
No match for: ISBN 978-0-9878616-0-3
No match for: Layout and Design: Brent Kaulback
No match for: 180, 247, 254, 265, 277, 329, 330)
No match for: 115 (G-1995-001: 0772) 172 (G-1979-001: 0148)
No match for: 225 (N-1979-003: 0065) 270 (N-1979-004: 0022)
No match for: 271 (G-1995-001: 5497)
No match for: All other photos were provided by Shutterstock Royalty Free
No match for: Subscription Stock Photography or through other royalty
No match for: free internet sites.
No match for: The South Slave Divisional Educatio

### Regex Audio Filters

In [3]:
import os


def extract_labels(directory):
    """
    Collect the label of every .mp3 entry found directly inside a directory.

    A label is the entry name with its final extension removed. The ".mp3"
    suffix is matched case-insensitively, and labels are returned in whatever
    order os.listdir reports the entries.

    Args:
        directory (str): Path to the folder containing the sound files.

    Returns:
        list: The labels, one per matching entry.
    """
    return [
        os.path.splitext(entry)[0]
        for entry in os.listdir(directory)
        if entry.lower().endswith(".mp3")
    ]


# Define the path to your Chipewyan sound files folder
sound_files_folder = "/home/linux/dict_filter/Chipewyan-sound-files"

# Create the folder when it is missing so the listing below cannot fail on a
# non-existent path (a freshly created folder simply yields no labels).
if not os.path.exists(sound_files_folder):
    os.makedirs(sound_files_folder)
    print(f"Created directory: {sound_files_folder}")

# At this point the folder exists: os.makedirs either succeeded or raised.
# The former second existence check and its "Directory does not exist"
# fallback could never run, so the labels are listed unconditionally.
labels = extract_labels(sound_files_folder)
for label in labels:
    print(label)

Created directory: /home/linux/dict_filter/Chipewyan-sound-files


### Extract audio titles from folder

In [None]:
# Extract all titles from audio files within folder
import os

# Define the path to your Chipewyan sound files folder
sound_files_folder = "/home/linux/dict_filter/Chipewyan-sound-files"

# Print the title (entry name without its extension) of every .mp3 entry
for entry in os.listdir(sound_files_folder):
    if not entry.lower().endswith(".mp3"):
        continue  # not a sound file
    title = os.path.splitext(entry)[0]
    print(title)

### Filtering multiple files

In [None]:
import csv


def _read_data_lines(path):
    """Return the stripped lines of a UTF-8 text file, skipping blank and "//" comment lines."""
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (line.strip() for line in handle)
        # The comment test runs on the stripped text so that indented
        # "//" lines are skipped as well.
        return [text for text in stripped_lines if text and not text.startswith("//")]


def match_audio_and_phonetic(audio_csv, phonetic_csv, output_csv):
    """
    Pair audio titles with phonetic words by position and write them to a CSV.

    Both inputs are read as plain text, one value per line. Blank lines and
    lines whose first non-whitespace characters are "//" are ignored. The
    n-th remaining audio title is paired with the n-th remaining phonetic
    word; when the files differ in length a warning is printed and the
    surplus entries of the longer file are dropped.

    Args:
        audio_csv (str): Path to the file holding one audio title per line.
        phonetic_csv (str): Path to the file holding one phonetic word per line.
        output_csv (str): Path of the CSV to write, with the header
            "Audio Title", "Phonetic".

    Returns:
        None
    """
    audio_lines = _read_data_lines(audio_csv)
    phonetic_lines = _read_data_lines(phonetic_csv)

    # Only as many pairs as the shorter input can supply
    pair_count = min(len(audio_lines), len(phonetic_lines))
    if len(audio_lines) != len(phonetic_lines):
        print(
            f"Warning: audio titles ({len(audio_lines)}) and phonetic words ({len(phonetic_lines)}) differ in number. Matching first {pair_count} pairs."
        )

    with open(output_csv, "w", newline="", encoding="utf-8") as out_file:
        writer = csv.writer(out_file)
        # Write CSV header
        writer.writerow(["Audio Title", "Phonetic"])
        # zip stops at the shorter list, i.e. after pair_count rows
        writer.writerows(zip(audio_lines, phonetic_lines))


# Entry point for this cell: pairs the lines of the audio-title file with the
# lines of the phonetic file by position and writes the pairs to matched.csv.
if __name__ == "__main__":
    # NOTE(review): audio-titles.csv is read from the project root here, while
    # some other cells read it from the .vscode/ folder — confirm which copy
    # is the intended input.
    audio_csv = "/home/linux/dict_filter/audio-titles.csv"
    phonetic_csv = "/home/linux/dict_filter/.vscode/phonetic.csv"
    output_csv = "/home/linux/dict_filter/matched.csv"
    match_audio_and_phonetic(audio_csv, phonetic_csv, output_csv)
    print("Matching complete! See output in:", output_csv)

##### GitHub Copilot
Below is one way to do it. In this example, we assume that output.csv is written as pairs of rows (an "English" row followed by the matching "Phonetic" row) and that audio-titles.csv holds one title per line. The function reads audio titles, builds a dictionary from output.csv (mapping the English word to its phonetic counterpart), and then—for each audio title—it attempts to find a matching English word (using an exact or substring match). Finally, it writes the three columns (audio title, matched English word, and phonetic) into a result CSV.

In [14]:
import csv


def build_mapping_from_output(output_csv):
    """
    Build a dictionary mapping English words (lowercased) to phonetic words.

    The CSV is expected to hold labeled rows of the form ("English", word)
    and ("Phonetic", word). The first row is always skipped as a header. An
    "English" row is paired with the row immediately after it when that row
    is labeled "Phonetic"; otherwise the English word maps to "". Labels are
    compared case-insensitively and all values are stripped of surrounding
    whitespace. If an English word occurs more than once, the last occurrence
    wins.

    Blank lines and rows with fewer than two columns are ignored instead of
    raising IndexError.

    Args:
        output_csv (str): Path to the labeled CSV file.

    Returns:
        dict: Lowercased English word -> phonetic word ("" when unpaired).
    """

    def label_and_word(row):
        # Rows without both a label and a word cannot take part in a pair
        if len(row) < 2:
            return None, None
        return row[0].strip().lower(), row[1].strip()

    mapping = {}
    with open(output_csv, "r", newline="", encoding="utf-8") as o_file:
        reader = csv.reader(o_file)
        next(reader, None)  # skip the header row
        # csv.reader returns [] for blank lines; drop them up front
        rows = [row for row in reader if row]

    i = 0
    while i < len(rows):
        label, eng = label_and_word(rows[i])
        i += 1
        if label != "english":
            continue

        phon = ""
        if i < len(rows):
            next_label, next_word = label_and_word(rows[i])
            if next_label == "phonetic":
                phon = next_word
                i += 1  # the phonetic row is consumed together with its English row
        mapping[eng.lower()] = phon
    return mapping


def match_audio_titles_with_output(audio_csv, output_csv, result_csv):
    """
    Match each audio title against the English words of the labeled CSV.

    Audio titles are read one per line; blank lines and lines whose first
    non-whitespace characters are "//" are ignored. For every title the
    lookup is, in order:

    1. an exact, case-insensitive match against the English words;
    2. otherwise a substring match, accepted only when exactly one English
       word contains the lowercased title;
    3. otherwise the title is reported as "No match" with an empty phonetic.

    Args:
        audio_csv (str): Path to the file holding one audio title per line.
        output_csv (str): Path to the labeled English/Phonetic CSV, read via
            build_mapping_from_output.
        result_csv (str): Path of the CSV to write, with the header
            "Audio Title", "Matched English", "Phonetic".

    Returns:
        None
    """
    with open(audio_csv, "r", encoding="utf-8") as a_file:
        stripped_lines = (line.strip() for line in a_file)
        # The comment test runs on the stripped text so that indented
        # "//" lines are skipped as well.
        audio_titles = [
            text for text in stripped_lines if text and not text.startswith("//")
        ]

    mapping = build_mapping_from_output(output_csv)

    with open(result_csv, "w", newline="", encoding="utf-8") as res_file:
        writer = csv.writer(res_file)
        writer.writerow(["Audio Title", "Matched English", "Phonetic"])
        for title in audio_titles:
            title_lower = title.lower()
            if title_lower in mapping:
                # Exact match: report the title with its original casing
                matched_eng, phon = title, mapping[title_lower]
            else:
                # Substring match: only trusted when it is unambiguous
                candidates = [eng for eng in mapping if title_lower in eng]
                if len(candidates) == 1:
                    matched_eng = candidates[0]
                    phon = mapping[matched_eng]
                else:
                    matched_eng, phon = "No match", ""
            writer.writerow([title, matched_eng, phon])
    print("Matching complete! See results in:", result_csv)


# Example usage:
# Entry point for this cell: matches the audio titles against the labeled
# English/Phonetic rows of output.csv and writes the result to matched_output.csv.
if __name__ == "__main__":
    # NOTE(review): both inputs are read from the .vscode/ folder here, whereas
    # some other cells read audio-titles.csv from the project root — confirm
    # which copy is the intended input.
    audio_csv = "/home/linux/dict_filter/.vscode/audio-titles.csv"
    output_csv = "/home/linux/dict_filter/.vscode/output.csv"
    result_csv = "/home/linux/dict_filter/matched_output.csv"
    match_audio_titles_with_output(audio_csv, output_csv, result_csv)

Matching complete! See results in: /home/linux/dict_filter/matched_output.csv


### Match words between english.csv and audio-titles.csv

In [21]:
import csv


def extract_matching_words(english_csv, audio_csv, output_csv):
    """
    Write the words that appear in the first column of both input CSVs.

    The first row of each input is skipped as a header. First-column values
    are stripped and lowercased before comparison, and empty values are
    ignored. The shared words are written in sorted order, one per row,
    beneath a "Matching Word" header.

    Args:
        english_csv (str): Path to the CSV of English words.
        audio_csv (str): Path to the CSV of audio titles.
        output_csv (str): Path of the CSV to write.

    Returns:
        None
    """

    def first_column_words(csv_path):
        # Normalised (stripped, lowercased) first-column values, header excluded
        with open(csv_path, "r", encoding="utf-8") as handle:
            records = csv.reader(handle)
            next(records, None)  # drop the header row, if the file has any rows
            cells = (record[0].strip().lower() for record in records if record)
            return {cell for cell in cells if cell}

    english_words = first_column_words(english_csv)
    audio_words = first_column_words(audio_csv)
    shared_words = english_words & audio_words

    with open(output_csv, "w", newline="", encoding="utf-8") as out_file:
        writer = csv.writer(out_file)
        writer.writerow(["Matching Word"])
        writer.writerows([word] for word in sorted(shared_words))

    print("Extracted matching words written in:", output_csv)


# Example usage:
# Entry point for this cell: writes the words shared by the first columns of
# english.csv and audio-titles.csv to matched_words.csv.
if __name__ == "__main__":
    # NOTE(review): audio-titles.csv is read from the .vscode/ folder here,
    # whereas some other cells read it from the project root — confirm which
    # copy is the intended input.
    english_csv = "/home/linux/dict_filter/.vscode/english.csv"
    audio_csv = "/home/linux/dict_filter/.vscode/audio-titles.csv"
    output_csv = "/home/linux/dict_filter/matched_words.csv"
    extract_matching_words(english_csv, audio_csv, output_csv)

Extracted matching words written in: /home/linux/dict_filter/matched_words.csv


In [29]:
import csv

# Sorts the rows of audio-titles.csv by their first column (case-insensitive)
# and saves the result, header first, to all_sorted.csv.
input_path = "/home/linux/dict_filter/audio-titles.csv"
output_path = "/home/linux/dict_filter/.vscode/all_sorted.csv"

with open(input_path, "r", newline="", encoding="utf-8") as infile:
    reader = csv.reader(infile)
    # The first row is assumed to be the header. The None default keeps an
    # empty input file from raising StopIteration.
    header = next(reader, None)
    rows = list(reader)

# Sort rows by the first column in a case-insensitive manner; blank rows
# (returned by csv.reader as []) use "" as their key and therefore sort first
rows.sort(key=lambda row: row[0].lower() if row else "")

with open(output_path, "w", newline="", encoding="utf-8") as outfile:
    writer = csv.writer(outfile)
    if header is not None:
        writer.writerow(header)
    writer.writerows(rows)

print("Sorted CSV saved to:", output_path)

Sorted CSV saved to: /home/linux/dict_filter/.vscode/all_sorted.csv
