<a href="https://colab.research.google.com/github/seanfagan/audacity-similar-label-finder/blob/main/Audacity_similar_label_finder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Audacity Similar Label Finder
[GitHub](https://github.com/seanfagan/audacity-similar-label-finder)

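This tool scans Audacity label tracks to find labels with similar timecodes. It does this by comparing each label's start and end times with those of the label immediately before it in the file and, if both are _similar enough_ (within the configured tolerance), noting those labels as a "match". Runs of consecutive matching labels are reported together as one group.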

In [None]:
#@title Run the tool
#@markdown When you run this code, Google will warn you that you are about to run an executable not authored by Google.
#@markdown If you'd like to view the source code that you are about to run, double click.
#@markdown ### Steps
#@markdown 1. (Optional) Adjust the settings below.
#@markdown 2. Press the Play button to begin the script.
#@markdown 3. You will be prompted to upload your label track files.
#@markdown 4. The results will be printed below.

#@markdown ### Config
# When True, each printed label is followed by its start and end timecodes.
INCLUDE_TIMECODES_IN_OUTPUT = True #@param {type:"boolean"}
# Inclusive upper bound, in seconds, that fuzzy_match applies separately to the
# start-time difference and the end-time difference of two labels.
MATCH_TOLERANCE_IN_SECONDS = 0.5 #@param {type:"number"}
#@markdown >_^ The maximum difference between two timecodes that should still be considered a match. Use "0" for exact matches only._

import csv
import io
from google.colab import files

# Prompt for the label track files. The results loop below consumes this as a
# mapping of filename -> raw file contents (it calls .items() and .decode()).
uploaded_labels = files.upload()

def fuzzy_match(label_a, label_b, tolerance=None):
    '''Do the start and end times of these labels match, within tolerance?

    Args:
        label_a: Mapping with 'start' and 'end' entries that float() can parse.
        label_b: The label to compare against, same shape as label_a.
        tolerance: Largest absolute difference (inclusive) allowed between
            the two start times and between the two end times. When omitted,
            MATCH_TOLERANCE_IN_SECONDS is read at call time.

    Returns:
        True if both the start times and the end times differ by no more
        than the tolerance, otherwise False.

    Raises:
        ValueError: If a compared timecode cannot be parsed by float().
    '''
    if tolerance is None:
        tolerance = MATCH_TOLERANCE_IN_SECONDS
    # Short-circuits: the end times are only parsed when the start times match.
    return (
        abs(float(label_a['start']) - float(label_b['start'])) <= tolerance
        and abs(float(label_a['end']) - float(label_b['end'])) <= tolerance
    )

print('+====================+')
print('|       RESULTS      |')
print('+====================+')
# Maps each uploaded filename to its list of match groups. Each group is a
# list of label rows with 'start', 'end' and 'label' string fields.
results = {}
for filename, filedata in uploaded_labels.items():
    # Read the label track file into a list of dictionaries
    reader = csv.DictReader(io.StringIO(filedata.decode()), fieldnames=['start', 'end', 'label'], delimiter='\t')
    # Skip rows whose first field starts with a backslash. Audacity writes such
    # continuation rows to carry a label's frequency range; they are not
    # labels, and their first field cannot be parsed as a timecode.
    labels = [row for row in reader if not row['start'].startswith('\\')]

    print('\n\n======== FILE ========')
    print(f'Scanning "{filename}"')

    # Walk consecutive label pairs to find matches. zip() yields nothing for
    # an empty or single-label file, so those need no special-casing.
    current_group = []  # A group of similar labels
    match_groups = []  # A list of all groups
    for prev_label, label in zip(labels, labels[1:]):
        if fuzzy_match(label, prev_label):
            # MATCH!
            if not current_group:
                # No ongoing group, so begin new group with previous label
                current_group.append(prev_label)
            current_group.append(label)
        elif current_group:
            # No match: close the now-ended group, then reset it
            match_groups.append(current_group)
            current_group = []

    # A group that is still open when the labels run out has no following
    # non-match to close it, so flush it here or it would be lost.
    if current_group:
        match_groups.append(current_group)

    # Print this file's results
    if match_groups:
        for group in match_groups:
            print('\nSimilar labels:')
            for label in group:
                if INCLUDE_TIMECODES_IN_OUTPUT:
                    print(f"- {label['label']}:\t{label['start']}\t{label['end']}")
                else:
                    print(f"- {label['label']}")
    else:
        print('No similar labels found.')

    results[filename] = match_groups


In [None]:
#@title Download results as JSON
#@markdown Press the Play button to download the results in a JSON format. You may specify a filename below.
dl_filename = 'gingersid_similar_label_results.json' #@param {type:"string"}
#@markdown The data follows this format: A file may contain matches, matches contain labels, and labels contain a start, end, and label name.
import json

# Serialize the collected results first, then write the text out in one go.
results_json = json.dumps(results, indent=2)
with open(dl_filename, 'w') as json_file:
    json_file.write(results_json)

# Hand the written file to the browser as a download.
files.download(dl_filename)