# Part 1: Data Collection and Preprocessing

In [None]:
!apt-get install -y abcmidi

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Suggested packages:
  abcm2ps timidity | pmidi postscript-viewer
The following NEW packages will be installed:
  abcmidi
0 upgraded, 1 newly installed, 0 to remove and 41 not upgraded.
Need to get 306 kB of archives.
After this operation, 868 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 abcmidi amd64 20220218+ds1-1 [306 kB]
Fetched 306 kB in 0s (1,812 kB/s)
Selecting previously unselected package abcmidi.
(Reading database ... 121689 files and directories currently installed.)
Preparing to unpack .../abcmidi_20220218+ds1-1_amd64.deb ...
Unpacking abcmidi (20220218+ds1-1) ...
Setting up abcmidi (20220218+ds1-1) ...
Processing triggers for man-db (2.10.2-1) ...


In [4]:
import os
import zipfile
import subprocess
import json
import numpy as np
import hashlib
from pathlib import Path
from multiprocessing import Pool
from tqdm import tqdm
from collections import Counter
from google.colab import drive

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
output_dir = Path('/content/drive/MyDrive/MLProject/data')
output_dir.mkdir(parents=True, exist_ok=True)

## Extract MIDI Files

In [None]:
zip_path = '/content/drive/MyDrive/MLProject/data/lmd-dataset.zip'
extract_path = '/content/lmd_dataset'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

midi_files = list(Path(extract_path).rglob('*.mid'))
midi_files = [str(f) for f in midi_files]
print(f"Found {len(midi_files)} MIDI files")

Found 116189 MIDI files


## MIDI to ABC Conversion

In [None]:
abc_output_dir = '/content/drive/MyDrive/MLProject/data/abc_files'
os.makedirs(abc_output_dir, exist_ok=True)

In [None]:
def convert(args):
    i, midi_path = args
    out_fp = f'{abc_output_dir}/{i}.abc'

    if os.path.exists(out_fp):
        return True

    try:
        abc = subprocess.check_output(
            ['midi2abc', midi_path],
            stderr=subprocess.DEVNULL,
            timeout=5
        ).decode('utf-8')

        with open(out_fp, 'w') as f:
            f.write(abc)
        return True
    except:
        return False

midi_files_indexed = list(enumerate(midi_files))

with Pool(8) as p:
    results = list(tqdm(p.imap(convert, midi_files_indexed), total=len(midi_files)))

success_count = sum(results)
fail_count = len(results) - success_count
print(f"\nConversion complete: {success_count} succeeded, {fail_count} failed")

100%|██████████| 116189/116189 [11:20<00:00, 170.81it/s]


Conversion complete: 115296 succeeded, 893 failed





In [None]:
abc_files = list(Path(abc_output_dir).glob('*.abc'))[:50000]
print(f"Total ABC files: {len(abc_files)}")

Total ABC files: 50000


In [None]:
abc_data = []
for abc_file in tqdm(abc_files):
    with open(abc_file, 'r', errors='ignore') as f:
        content = f.read()
    if content.strip():
        abc_data.append(content)

100%|██████████| 50000/50000 [02:33<00:00, 325.75it/s]


## Tokenization

In [None]:
def tokenize_abc(abc_string):
    tokens = []
    lines = abc_string.strip().split('\n')

    for line in lines:
        line = line.strip()
        if not line:
            continue

        if line.startswith('T:'):
            continue
        if line.startswith('%%'):
            continue
        if line.startswith('%'):
            continue
        if line.startswith('V:'):
            continue
        if line.startswith('Q:'):
            continue

        if len(line) > 1 and line[1] == ':':
            field = line[0]
            if field == 'X':
                tokens.append('X:')
            elif field == 'M':
                value = line[2:].strip().split('%')[0].strip()
                tokens.append(f'M:{value}')
            elif field == 'L':
                value = line[2:].strip().split('%')[0].strip()
                tokens.append(f'L:{value}')
            elif field == 'K':
                value = line[2:].strip().split('%')[0].strip()
                if value:
                    tokens.append(f'K:{value[0]}')
            continue

        i = 0
        while i < len(line):
            if line[i] == '\\':
                i += 1
            elif line[i] == ' ':
                i += 1
            elif line[i] in '[]':
                tokens.append(line[i])
                i += 1
            elif line[i] == '|':
                tokens.append('|')
                i += 1
            elif line[i] in '^_=':
                note = line[i]
                i += 1
                if i < len(line) and line[i] in 'ABCDEFGabcdefg':
                    note += line[i]
                    i += 1
                    while i < len(line) and line[i] in ",'0123456789/":
                        note += line[i]
                        i += 1
                tokens.append(note)
            elif line[i] in 'ABCDEFGabcdefg':
                note = line[i]
                i += 1
                while i < len(line) and line[i] in ",'0123456789/":
                    note += line[i]
                    i += 1
                tokens.append(note)
            elif line[i] == 'z':
                rest = 'z'
                i += 1
                while i < len(line) and line[i] in '0123456789/':
                    rest += line[i]
                    i += 1
                tokens.append(rest)
            elif line[i] in '0123456789':
                num = ''
                while i < len(line) and line[i] in '0123456789/':
                    num += line[i]
                    i += 1
                tokens.append(num)
            elif line[i] in '-<>()':
                tokens.append(line[i])
                i += 1
            else:
                i += 1

    return tokens

In [None]:
MIN_TOKENS = 30
MAX_TOKENS = 5000

all_token_sequences = []
too_short_count = 0
split_count = 0

for abc in tqdm(abc_data):
    tokens = tokenize_abc(abc)

    if len(tokens) < MIN_TOKENS:
        too_short_count += 1
        continue

    if len(tokens) <= MAX_TOKENS:
        all_token_sequences.append(tokens)
    else:
        for j in range(0, len(tokens), MAX_TOKENS):
            chunk = tokens[j:j + MAX_TOKENS]
            if len(chunk) >= MIN_TOKENS:
                all_token_sequences.append(chunk)
                split_count += 1

print(f"Total sequences: {len(all_token_sequences)}")
print(f"Filtered (too short): {too_short_count}")
print(f"Split into chunks: {split_count}")

100%|██████████| 50000/50000 [11:56<00:00, 69.74it/s] 

Total sequences: 263229
Filtered (too short): 0
Split into chunks: 257141





## Build Vocabulary

In [None]:
token_counts = Counter()
for seq in all_token_sequences:
    token_counts.update(seq)

special_tokens = ['<PAD>', '<UNK>', '<BOS>', '<EOS>']
sorted_tokens = sorted(token_counts.keys(), key=lambda x: -token_counts[x])

token2idx = {}
for i, tok in enumerate(special_tokens):
    token2idx[tok] = i

for tok in sorted_tokens:
    if tok not in token2idx:
        token2idx[tok] = len(token2idx)

idx2token = {v: k for k, v in token2idx.items()}
vocab_size = len(token2idx)

print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 27224


## Encode Sequences

In [None]:
def encode_sequence(tokens, token2idx):
    encoded = [token2idx['<BOS>']]
    for tok in tokens:
        if tok in token2idx:
            encoded.append(token2idx[tok])
        else:
            encoded.append(token2idx['<UNK>'])
    encoded.append(token2idx['<EOS>'])
    return encoded

encoded_sequences = []
for seq in tqdm(all_token_sequences):
    encoded_sequences.append(encode_sequence(seq, token2idx))

100%|██████████| 263229/263229 [01:47<00:00, 2451.35it/s]


## Train/Val/Test Split

In [None]:
np.random.seed(42)
indices = np.random.permutation(len(encoded_sequences))
shuffled_sequences = [encoded_sequences[i] for i in indices]

n_total = len(shuffled_sequences)
n_train = int(n_total * 0.98)
n_val = int(n_total * 0.01)

train_sequences = shuffled_sequences[:n_train]
val_sequences = shuffled_sequences[n_train:n_train + n_val]
test_sequences = shuffled_sequences[n_train + n_val:]

train_flat = [tok for seq in train_sequences for tok in seq]
val_flat = [tok for seq in val_sequences for tok in seq]
test_flat = [tok for seq in test_sequences for tok in seq]

train_array = np.array(train_flat, dtype=np.uint16)
val_array = np.array(val_flat, dtype=np.uint16)
test_array = np.array(test_flat, dtype=np.uint16)

print(f"Train: {len(train_sequences)} sequences, {len(train_array)} tokens")
print(f"Val: {len(val_sequences)} sequences, {len(val_array)} tokens")
print(f"Test: {len(test_sequences)} sequences, {len(test_array)} tokens")

Train: 257964 sequences, 1167894118 tokens
Val: 2632 sequences, 11907471 tokens
Test: 2633 sequences, 11808149 tokens


## Compute Statistics

In [None]:
sequence_lengths = [len(seq) for seq in all_token_sequences]

statistics = {
    "vocabulary_size": vocab_size,
    "total_tokens": {
        "train": int(len(train_array)),
        "val": int(len(val_array)),
        "test": int(len(test_array)),
        "total": int(len(train_array) + len(val_array) + len(test_array))
    },
    "sequence_count": {
        "train": len(train_sequences),
        "val": len(val_sequences),
        "test": len(test_sequences),
        "total": len(all_token_sequences)
    },
    "sequence_length_distribution": {
        "min": int(np.min(sequence_lengths)),
        "max": int(np.max(sequence_lengths)),
        "mean": float(np.mean(sequence_lengths)),
        "median": float(np.median(sequence_lengths)),
        "std": float(np.std(sequence_lengths))
    },
    "conversion_success_rate": {
        "total_midi_files": len(midi_files),
        "successful": success_count,
        "failed": fail_count,
        "success_rate": round(success_count / len(midi_files) * 100, 2)
    },
    "quality_filters": {
        "min_tokens": MIN_TOKENS,
        "max_tokens": MAX_TOKENS,
        "filtered_too_short": too_short_count,
        "sequences_from_splits": split_count
    }
}

In [None]:
print("=" * 60)
print("DATASET STATISTICS")
print("=" * 60)
print(f"\nVocabulary Size: {statistics['vocabulary_size']}")
print(f"\nTotal Tokens:")
print(f"  Train: {statistics['total_tokens']['train']:,}")
print(f"  Val: {statistics['total_tokens']['val']:,}")
print(f"  Test: {statistics['total_tokens']['test']:,}")
print(f"  Total: {statistics['total_tokens']['total']:,}")
print(f"\nSequence Count:")
print(f"  Train: {statistics['sequence_count']['train']:,}")
print(f"  Val: {statistics['sequence_count']['val']:,}")
print(f"  Test: {statistics['sequence_count']['test']:,}")
print(f"\nSequence Length Distribution:")
print(f"  Min: {statistics['sequence_length_distribution']['min']}")
print(f"  Max: {statistics['sequence_length_distribution']['max']}")
print(f"  Mean: {statistics['sequence_length_distribution']['mean']:.2f}")
print(f"  Median: {statistics['sequence_length_distribution']['median']:.2f}")
print(f"  Std: {statistics['sequence_length_distribution']['std']:.2f}")
print(f"\nConversion Success Rate:")
print(f"  Total MIDI Files: {statistics['conversion_success_rate']['total_midi_files']:,}")
print(f"  Successful: {statistics['conversion_success_rate']['successful']:,}")
print(f"  Failed: {statistics['conversion_success_rate']['failed']:,}")
print(f"  Success Rate: {statistics['conversion_success_rate']['success_rate']}%")
print(f"\nQuality Filters Applied:")
print(f"  Min Tokens: {statistics['quality_filters']['min_tokens']}")
print(f"  Max Tokens: {statistics['quality_filters']['max_tokens']}")
print(f"  Filtered (too short): {statistics['quality_filters']['filtered_too_short']:,}")
print(f"  Sequences from splits: {statistics['quality_filters']['sequences_from_splits']:,}")
print("=" * 60)

DATASET STATISTICS

Vocabulary Size: 27224

Total Tokens:
  Train: 1,167,894,118
  Val: 11,907,471
  Test: 11,808,149
  Total: 1,191,609,738

Sequence Count:
  Train: 257,964
  Val: 2,632
  Test: 2,633

Sequence Length Distribution:
  Min: 30
  Max: 5000
  Mean: 4524.89
  Median: 5000.00
  Std: 1163.53

Conversion Success Rate:
  Total MIDI Files: 116,189
  Successful: 115,296
  Failed: 893
  Success Rate: 99.23%

Quality Filters Applied:
  Min Tokens: 30
  Max Tokens: 5000
  Filtered (too short): 0
  Sequences from splits: 257,141


## Save Files to Google Drive

In [8]:
np.save(output_dir / 'train.npy', train_array)
np.save(output_dir / 'val.npy', val_array)
np.save(output_dir / 'test.npy', test_array)

with open(output_dir / 'tokenizer.json', 'w') as f:
    json.dump(token2idx, f)

with open(output_dir / 'statistics.json', 'w') as f:
    json.dump(statistics, f, indent=2)

print(f"Files saved to {output_dir}")

Files saved to /content/drive/MyDrive/MLProject/data


In [15]:
for file in output_dir.iterdir():
    if not file.is_dir():
        size = file.stat().st_size / 1024
        print(f"  {file.name}: {size:.1f} KB")

lmd-dataset.zip: 1417133.0 KB
train.npy: 2281043.3 KB
val.npy: 23256.9 KB
test.npy: 23062.9 KB
statistics.json: 0.7 KB
tokenizer.json: 412.5 KB
