<h1> Setup </h1>

In [1]:
from Levenshtein import distance
from data_utils import read_abc
from glob import glob
from tqdm import tqdm
from pathlib import Path

<h1> Helper Functions </h1>

In [2]:
def bars_similiarity(bar1, bar2):
    """Return the mean best-match normalised Levenshtein distance from bar1 to bar2.

    Despite the name, the result is a *distance*: 0.0 means every item of
    ``bar1`` has an exact counterpart in ``bar2``; larger values mean the two
    sequences are less alike.

    For each string in ``bar1`` the closest string in ``bar2`` is found, using
    ``distance(a, b) / (len(a) + len(b))`` as the metric, capped at 1.0. The
    per-item minima are then averaged.

    Args:
        bar1: Sequence of strings to be matched.
        bar2: Sequence of strings to match against.

    Returns:
        float: The averaged distance, or 1.0 when either sequence is empty.
    """
    # An empty side cannot be compared; report the maximum distance.
    if not (bar1 and bar2):
        return 1.0

    def _normalised_distance(left, right):
        # Falsy items count as length 0, exactly as an empty string would.
        combined_len = (len(left) if left else 0) + (len(right) if right else 0)
        if combined_len == 0:
            # Two empty items carry no content, so they are deliberately
            # treated as maximally distant rather than identical.
            return 1.0
        return distance(left, right) / combined_len

    # 1.0 is the ceiling for every per-item minimum.
    best_matches = [
        min(1.0, min((_normalised_distance(item, other) for other in bar2), default=1.0))
        for item in bar1
    ]

    if not best_matches:
        return 1.0
    return sum(best_matches) / len(best_matches)


def get_num_repeats(bars):
    """Count the pairs of positions in ``bars`` that hold equal items.

    Every unordered pair ``(i, j)`` with ``i < j`` is compared, so adjacent
    duplicates are counted too, and an item occurring ``k`` times contributes
    ``k * (k - 1) / 2`` to the total.

    Args:
        bars: Indexable sequence of bars.

    Returns:
        int: Number of equal pairs; 0 for an empty or single-item sequence.
    """
    total = len(bars)
    return sum(
        1
        for first in range(total)
        for second in range(first + 1, total)
        if bars[first] == bars[second]
    )


def is_rest_bar(bar_string):
    """Tell whether a bar contains nothing but rests.

    Whitespace is ignored. The bar qualifies when every remaining character is
    a rest symbol (``z``, ``x`` or ``Z``) or a digit, and at least one rest
    symbol is present.

    Args:
        bar_string: Text of a single bar.

    Returns:
        bool: True for a rest-only bar; False otherwise, including for empty
        or whitespace-only input and for digit-only input.
    """
    # split() with no argument discards all whitespace, leading, trailing and inner.
    compact = "".join(bar_string.split())
    if not compact:
        return False

    rest_seen = False
    for symbol in compact:
        if symbol in 'zxZ':
            rest_seen = True
        elif not symbol.isdigit():
            # Anything other than a rest or a digit disqualifies the bar.
            return False
    return rest_seen

<h1> Filtering </h1>

In [None]:
# Split every tune into 16-bar chunks (8 input bars + 8 target bars), drop the
# chunks that fail the quality filters below, and write each survivor to
# cleaned_data/<n>.abc. Leaves `results`, `input_files` and `file_index`
# populated for the statistics cell.
input_dir = Path('trainset/abc')
output_dir = Path('cleaned_data')

# Filtering thresholds, named so the acceptance criteria can be read in one place.
BARS_PER_HALF = 8                    # bars in the input half and in the target half
BARS_PER_CHUNK = 2 * BARS_PER_HALF   # a chunk is input half + target half
MAX_REST_BARS = 1                    # rest-only bars tolerated per chunk
MAX_TARGET_REPEATS = 4               # equal bar pairs tolerated in the target half
MIN_HALF_DISTANCE = 0.1              # halves closer than this count as near-duplicates

output_dir.mkdir(exist_ok=True)
file_index = 0

# Outcome counters. 'chunk_incomplete', 'contains_empty_bar' and
# 'similarity_error' are never incremented by this cell: chunks are cut only
# from complete multiples of BARS_PER_CHUNK, and blank bars are dropped before
# chunking. The keys are kept so the statistics cell keeps its layout.
results = {
    'no_valid_abc': 0,
    'not_enough_bars': 0,
    'chunk_incomplete': 0,
    'contains_empty_bar': 0,
    'too_many_repeats': 0,
    'too_many_rests': 0,
    'too_similar': 0,
    'similarity_error': 0,
    'accepted': 0
}

print("Preprocessing data...")
# Directory listing order depends on the file system; sorting makes the
# numbering of the output files reproducible from run to run.
input_files = sorted(input_dir.glob("*.abc"))

for file_path in tqdm(input_files):
    keys, abc_content = read_abc(file_path)  # Use read_abc from data_utils

    # Both the header keys and some musical content must be present.
    if keys is None or abc_content is None or not abc_content.strip():
        results['no_valid_abc'] += 1
        continue

    # read_abc should return notes joined, with spaces around '|'.
    # Split on ' | ' and discard blank fragments produced by the split.
    split_bars = [bar for bar in abc_content.split(' | ') if bar.strip()]

    if not split_bars:
        results['no_valid_abc'] += 1
        continue

    # Only complete chunks are used; trailing bars that do not fill a whole
    # chunk are ignored.
    num_chunks = len(split_bars) // BARS_PER_CHUNK

    if num_chunks == 0:
        results['not_enough_bars'] += 1
        continue

    for chunk_idx in range(num_chunks):
        start_idx = chunk_idx * BARS_PER_CHUNK
        mid_idx = start_idx + BARS_PER_HALF
        end_idx = mid_idx + BARS_PER_HALF

        bar1 = split_bars[start_idx:mid_idx]  # input half
        bar2 = split_bars[mid_idx:end_idx]    # target half

        rest_bar_count = sum(1 for bar in bar1 + bar2 if is_rest_bar(bar))
        if rest_bar_count > MAX_REST_BARS:
            results['too_many_rests'] += 1
            continue

        if get_num_repeats(bar2) > MAX_TARGET_REPEATS:
            results['too_many_repeats'] += 1
            continue

        # bars_similiarity returns a distance: a small value means the target
        # half is nearly a copy of the input half.
        if bars_similiarity(bar1, bar2) < MIN_HALF_DISTANCE:
            results['too_similar'] += 1
            continue

        # --- Write Accepted Data ---
        output_file_path = output_dir.joinpath(f"{file_index}.abc")
        try:
            with open(output_file_path, "w", encoding='utf-8') as f:
                # Write keys (read_abc returns keys as a single space-separated string)
                key_lines = keys.split(' ')  # One header field per line
                f.write("\n".join(key_lines) + "\n")
                # Write the 16 bars (bar1 + bar2), joined by ' | ' as expected by read_abc
                f.write(" | ".join(bar1 + bar2))
        except (OSError, UnicodeError) as e:
            # Best effort: report the failed write and carry on with the next chunk.
            print(f"Error writing file {output_file_path}: {e}")
            # Do not leave a truncated file behind for downstream readers.
            try:
                output_file_path.unlink()
            except OSError:
                pass  # Nothing was created, or it cannot be removed; the index is reused anyway.
            continue

        results['accepted'] += 1
        file_index += 1

print(f"\nProcessed {len(input_files)} input files.")
print(f"Generated {file_index} cleaned files.")

Preprocessing data...


100%|██████████| 182000/182000 [21:01<00:00, 144.23it/s]


Processed 182000 input files.
Generated 518951 cleaned files.





<h1> Results </h1>

In [4]:
# Summarise the outcome counters collected by the filtering cell.
# Every counter except the two file-level skip categories is incremented once
# per 16-bar chunk, so summing them gives the number of chunks examined.
total_processed_chunks = sum(
    tally
    for category, tally in results.items()
    if category not in ('no_valid_abc', 'not_enough_bars')
)
total_rejected_chunks = total_processed_chunks - results['accepted']

separator = "-" * 30
report_lines = [
    "\nRejection/Acceptance Statistics:",
    f"Total files read: {len(input_files)}",
    f"Files skipped (no valid ABC/keys): {results['no_valid_abc']}",
    f"Files skipped (not enough bars for 1 chunk): {results['not_enough_bars']}",
    separator,
    f"Total 16-bar chunks considered: {total_processed_chunks}",
    f"Chunks rejected: {total_rejected_chunks}",
    f"Chunks accepted: {results['accepted']}",
    separator,
    "Rejection reasons for considered chunks:",
]

for reason, count in results.items():
    # File-level skips and acceptances are not chunk rejection reasons.
    if reason in ('no_valid_abc', 'not_enough_bars', 'accepted'):
        continue
    if total_processed_chunks > 0:
        # Guard the division: with no rejections every share is reported as 0.
        if total_rejected_chunks > 0:
            percentage = (count / total_rejected_chunks) * 100
        else:
            percentage = 0
        report_lines.append(f"  {reason}: {count} ({percentage:.1f}% of rejections)")
    else:
        report_lines.append(f"  {reason}: {count}")

print("\n".join(report_lines))



Rejection/Acceptance Statistics:
Total files read: 182000
Files skipped (no valid ABC/keys): 1
Files skipped (not enough bars for 1 chunk): 6672
------------------------------
Total 16-bar chunks considered: 964082
Chunks rejected: 445131
Chunks accepted: 518951
------------------------------
Rejection reasons for considered chunks:
  chunk_incomplete: 0 (0.0% of rejections)
  contains_empty_bar: 0 (0.0% of rejections)
  too_many_repeats: 43213 (9.7% of rejections)
  too_many_rests: 291577 (65.5% of rejections)
  too_similar: 110341 (24.8% of rejections)
  similarity_error: 0 (0.0% of rejections)
