# Part 4: Best Model Training and Sample Generation

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import json
import math
import time
from pathlib import Path
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the data and results folders used
# by the rest of the notebook live underneath this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Token arrays, the tokenizer and saved sample indices are read from data_dir;
# checkpoints, generated samples and metrics are written to output_dir.
data_dir = Path('/content/drive/MyDrive/MLProject/data')
output_dir = Path('/content/drive/MyDrive/MLProject/results')
# Create the results folder if needed; does nothing when it already exists.
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Probe CUDA once and reuse the answer for both device selection and reporting.
cuda_available = torch.cuda.is_available()
device = torch.device('cuda' if cuda_available else 'cpu')
print(f"Using device: {device}")

# GPU name and total memory are only meaningful when a CUDA device exists.
if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

Using device: cuda
GPU: NVIDIA A100-SXM4-80GB
Memory: 85.2 GB


## Load Data

In [None]:
# Flat token-id arrays, one per split.
train_data = np.load(data_dir / 'train.npy')
val_data = np.load(data_dir / 'val.npy')
test_data = np.load(data_dir / 'test.npy')

# Read the tokenizer with an explicit encoding so decoding does not depend on
# the platform's default locale.
with open(data_dir / 'tokenizer.json', 'r', encoding='utf-8') as f:
    token2idx = json.load(f)

# Reverse lookup (id -> token string) used when decoding generated samples.
idx2token = {v: k for k, v in token2idx.items()}
vocab_size = len(token2idx)

# Inverting the mapping silently drops entries if two tokens share an id,
# which would make some ids undecodable; fail loudly instead.
assert len(idx2token) == vocab_size, "tokenizer.json maps multiple tokens to the same id"

print(f"Vocab size: {vocab_size}")
print(f"Train tokens: {len(train_data):,}")
print(f"Val tokens: {len(val_data):,}")
print(f"Test tokens: {len(test_data):,}")

Vocab size: 27224
Train tokens: 1,167,894,118
Val tokens: 11,907,471
Test tokens: 11,808,149


## Generate Part 4 Indices

In [None]:
# Each training sample is a window of CONTEXT_LENGTH tokens; draw enough
# windows to cover a budget of 100M tokens.
CONTEXT_LENGTH = 256
NEW_NUM_SAMPLES = 100_000_000 // CONTEXT_LENGTH

# Fixed seed so the sampled window offsets are reproducible.
np.random.seed(123)

# Upper bounds for a window start: leave room for CONTEXT_LENGTH inputs plus
# the one extra token needed for the shifted targets.
train_max_idx = len(train_data) - CONTEXT_LENGTH - 1
# NOTE(review): val_max_idx is computed but not used in this cell, because the
# validation offsets are loaded from disk below.
val_max_idx = len(val_data) - CONTEXT_LENGTH - 1

# Distinct start offsets (replace=False); the windows themselves may still overlap.
train_indices_part4 = np.random.choice(train_max_idx, size=NEW_NUM_SAMPLES, replace=False)
# Validation offsets come from a previously saved file rather than being re-sampled.
val_indices = np.load(data_dir / 'val_indices.npy')

# Persist the new training offsets alongside the data.
np.save(data_dir / 'train_indices_part4.npy', train_indices_part4)

print(f"Part 4 train indices: {len(train_indices_part4):,}")
print(f"Val indices: {len(val_indices):,}")

Part 4 train indices: 390,625
Val indices: 39,062


## Dataset

In [None]:
class MusicDataset(Dataset):
    """Fixed-length next-token-prediction windows over a flat token array.

    Args:
        data: 1-D NumPy array of token ids (any integer dtype).
        context_length: number of tokens in each input window.
        indices: sequence of window start offsets into ``data``.

    Each item is a pair ``(x, y)`` of int64 tensors where ``y`` is ``x``
    shifted one token to the right.
    """

    def __init__(self, data, context_length, indices):
        # Keep the corpus in its stored dtype. Converting it to int64 here
        # would allocate a second full-size copy (8 bytes per token), which is
        # several GB for the training split; conversion happens per window.
        self.data = data
        self.context_length = context_length
        self.indices = indices

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        start = self.indices[idx]
        stop = start + self.context_length + 1
        # One slice covers both the inputs and the shifted targets; only these
        # context_length + 1 tokens are widened to int64.
        window = torch.from_numpy(self.data[start:stop].astype(np.int64))
        x = window[:self.context_length]
        y = window[1:self.context_length + 1]
        return x, y

## Transformer Model

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        d_model: width of the input and output features.
        n_heads: number of attention heads; must divide ``d_model`` evenly.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``n_heads`` is not positive or does not divide ``d_model``.
    """

    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        # Without this check a bad configuration only fails later, inside
        # .view() in forward(), with an unhelpful shape error.
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
            )
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads

        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Attend over ``x`` of shape (batch, seq_len, d_model).

        ``mask`` (optional) must broadcast against the
        (batch, n_heads, seq_len, seq_len) score tensor; positions where it
        equals 0 are excluded from attention. Returns a tensor with the same
        shape as ``x``.
        """
        batch_size, seq_len, _ = x.shape

        # Project, then split the feature axis into heads:
        # (batch, seq, d_model) -> (batch, heads, seq, head_dim).
        q = self.q_proj(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)

        # Scale by sqrt(head_dim) so score magnitude does not grow with head width.
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is not None:
            # -inf scores become exactly zero weight after the softmax.
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attn = F.softmax(scores, dim=-1)
        attn = self.dropout(attn)

        out = torch.matmul(attn, v)
        # Merge the heads back: (batch, heads, seq, head_dim) -> (batch, seq, d_model).
        out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        return self.out_proj(out)

In [None]:
class FeedForward(nn.Module):
    """Position-wise two-layer MLP: expand to d_ff, GELU, dropout, project back."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the MLP to the last axis of ``x``; the output has the same shape."""
        hidden = F.gelu(self.fc1(x))
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

In [None]:
class TransformerBlock(nn.Module):
    """Pre-norm transformer layer: attention and MLP sub-layers, each with a residual."""

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run both sub-layers; LayerNorm is applied before each one."""
        # Attention sub-layer, added back onto the residual stream.
        attended = self.attn(self.ln1(x), mask)
        x = x + self.dropout(attended)

        # Feed-forward sub-layer, added back onto the residual stream.
        transformed = self.ff(self.ln2(x))
        return x + self.dropout(transformed)

In [None]:
class Transformer(nn.Module):
    """Decoder-only transformer language model with learned position embeddings.

    Args:
        vocab_size: number of token ids.
        d_model: embedding / hidden width.
        n_heads: attention heads per block.
        n_layers: number of stacked TransformerBlock layers.
        context_length: maximum sequence length (size of the position table
            and of the causal mask).
        dropout: dropout probability used throughout.
    """

    def __init__(self, vocab_size, d_model, n_heads, n_layers, context_length, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.context_length = context_length

        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(context_length, d_model)
        self.dropout = nn.Dropout(dropout)

        # Every block uses a feed-forward width of 4 * d_model.
        layers = []
        for _ in range(n_layers):
            layers.append(TransformerBlock(d_model, n_heads, 4 * d_model, dropout))
        self.blocks = nn.ModuleList(layers)

        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size, bias=False)

        # Lower-triangular causal mask, stored as a (1, 1, T, T) buffer so it
        # follows the module across devices and broadcasts over batch and heads.
        causal = torch.tril(torch.ones(context_length, context_length))
        self.register_buffer('mask', causal.view(1, 1, context_length, context_length))

    def forward(self, x):
        """Map token ids of shape (batch, seq_len) to logits (batch, seq_len, vocab_size)."""
        _, seq_len = x.shape

        position_ids = torch.arange(seq_len, device=x.device).unsqueeze(0)
        hidden = self.token_emb(x) + self.pos_emb(position_ids)
        hidden = self.dropout(hidden)

        # Crop the precomputed mask to the current sequence length.
        causal_mask = self.mask[:, :, :seq_len, :seq_len]

        for block in self.blocks:
            hidden = block(hidden, causal_mask)

        return self.head(self.ln_f(hidden))

    def count_parameters(self):
        """Return the total number of trainable parameters."""
        total = 0
        for param in self.parameters():
            if param.requires_grad:
                total += param.numel()
        return total

## Training Configuration

In [None]:
# Batches are sized in tokens; the number of sequences per batch follows from
# the context length.
CONTEXT_LENGTH = 256
BATCH_TOKENS = 4096
BATCH_SIZE = BATCH_TOKENS // CONTEXT_LENGTH
LEARNING_RATE = 3e-4
WARMUP_RATIO = 0.05

MODEL_CONFIG = {'n_layers': 8, 'd_model': 1024, 'n_heads': 16}

# The training DataLoader keeps the final partial batch, so the step count is
# the ceiling (not the floor) of samples / batch size.
steps_per_epoch = math.ceil(len(train_indices_part4) / BATCH_SIZE)

print(f"Context length: {CONTEXT_LENGTH}")
print(f"Batch size: {BATCH_SIZE} sequences ({BATCH_TOKENS} tokens)")
print(f"Samples: {len(train_indices_part4):,}")
print(f"Steps: {steps_per_epoch:,}")

Context length: 256
Batch size: 16 sequences (4096 tokens)
Samples: 390,625
Steps: 24,414


## Load Pre-trained Model

In [None]:
# Rebuild the architecture with the hyper-parameters the checkpoint was trained with.
model = Transformer(
    vocab_size=vocab_size,
    d_model=MODEL_CONFIG['d_model'],
    n_heads=MODEL_CONFIG['n_heads'],
    n_layers=MODEL_CONFIG['n_layers'],
    context_length=CONTEXT_LENGTH
).to(device)

# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU still loads on a CPU-only runtime.
state_dict = torch.load(output_dir / 'xl_model.pt', map_location=device)
model.load_state_dict(state_dict)
print("Model loaded")
print(f"Parameters: {model.count_parameters():,}")

Model loaded
Parameters: 156,788,736


## Resume Training

In [None]:
def get_lr(step, total_steps, warmup_steps, max_lr):
    """Learning rate for ``step``: linear warmup, then cosine decay to zero.

    Args:
        step: zero-based optimisation step.
        total_steps: step at which the cosine decay reaches zero.
        warmup_steps: number of steps over which the rate ramps from 0 to ``max_lr``.
        max_lr: peak learning rate, reached at ``step == warmup_steps``.

    Returns:
        The learning rate to use at ``step``. Steps at or beyond
        ``total_steps`` return 0 rather than following the cosine back up.
    """
    if step < warmup_steps:
        return max_lr * step / warmup_steps

    decay_steps = total_steps - warmup_steps
    if decay_steps <= 0:
        # No decay phase is configured (warmup spans the whole schedule):
        # hold the peak rate instead of dividing by zero.
        return max_lr

    # Clamp so the cosine cannot wrap around and raise the rate again when
    # training runs past total_steps.
    progress = min(1.0, (step - warmup_steps) / decay_steps)
    return max_lr * 0.5 * (1 + math.cos(math.pi * progress))

In [None]:
train_dataset = MusicDataset(train_data, CONTEXT_LENGTH, train_indices_part4)
val_dataset = MusicDataset(val_data, CONTEXT_LENGTH, val_indices)

# Both loaders share everything except shuffling: training batches are
# reshuffled, validation batches keep a fixed order.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.1)

# One pass over the sampled windows; warmup takes a fixed fraction of it.
total_steps = len(train_loader)
warmup_steps = int(total_steps * WARMUP_RATIO)

print(f"Total steps: {total_steps:,}")
print(f"Warmup steps: {warmup_steps:,}")

Total steps: 24,415
Warmup steps: 1,220


In [None]:
# Per-step training losses; the tail of this list is averaged later to report
# the final training loss.
train_losses = []

if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

start_time = time.time()

# Single pass over train_loader, one optimiser update per batch.
model.train()
for step, (x, y) in enumerate(train_loader):
    x, y = x.to(device), y.to(device)

    # The schedule is applied by hand: compute this step's rate and write it
    # into every parameter group before the update. At step 0 the warmup
    # formula yields a rate of 0.
    lr = get_lr(step, total_steps, warmup_steps, LEARNING_RATE)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # Flatten (batch, seq, vocab) logits and (batch, seq) targets so
    # cross_entropy scores every position as an independent prediction.
    logits = model(x)
    loss = F.cross_entropy(logits.view(-1, vocab_size), y.view(-1))

    optimizer.zero_grad()
    loss.backward()
    # Cap the global gradient norm at 1.0 before stepping.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

    train_losses.append(loss.item())

    # Progress line every 500 steps; the ETA extrapolates the mean time per
    # step so far over the steps that remain.
    if step % 500 == 0:
        elapsed = time.time() - start_time
        remaining = (elapsed / (step + 1)) * (total_steps - step - 1)
        print(f"Step {step}/{total_steps} | Loss: {loss.item():.4f} | LR: {lr:.6f} | Time: {elapsed/60:.1f}m | ETA: {remaining/60:.1f}m")

# NOTE(review): no checkpoint is written inside the loop; the weights are only
# saved by a later cell, so an interrupted run loses all progress.
train_time = time.time() - start_time
print(f"\nTraining completed in {train_time/60:.1f} minutes")

Step 0/24415 | Loss: 0.2398 | LR: 0.000000 | Time: 0.0m | ETA: 559.2m
Step 500/24415 | Loss: 0.5309 | LR: 0.000123 | Time: 1.8m | ETA: 87.4m
Step 1000/24415 | Loss: 0.3138 | LR: 0.000246 | Time: 3.6m | ETA: 85.2m
Step 1500/24415 | Loss: 0.3891 | LR: 0.000300 | Time: 5.5m | ETA: 83.2m
Step 2000/24415 | Loss: 0.5939 | LR: 0.000299 | Time: 7.3m | ETA: 81.4m
Step 2500/24415 | Loss: 0.5058 | LR: 0.000298 | Time: 9.1m | ETA: 79.6m
Step 3000/24415 | Loss: 0.6319 | LR: 0.000296 | Time: 10.9m | ETA: 77.8m
Step 3500/24415 | Loss: 0.3762 | LR: 0.000293 | Time: 12.7m | ETA: 76.0m
Step 4000/24415 | Loss: 0.2577 | LR: 0.000289 | Time: 14.5m | ETA: 74.2m
Step 4500/24415 | Loss: 0.3761 | LR: 0.000285 | Time: 16.4m | ETA: 72.4m
Step 5000/24415 | Loss: 0.3017 | LR: 0.000281 | Time: 18.2m | ETA: 70.5m
Step 5500/24415 | Loss: 0.4483 | LR: 0.000275 | Time: 20.0m | ETA: 68.7m
Step 6000/24415 | Loss: 0.5146 | LR: 0.000270 | Time: 21.8m | ETA: 66.9m
Step 6500/24415 | Loss: 0.4340 | LR: 0.000263 | Time: 23.6m 

In [None]:
model.eval()
val_loss_total = 0.0  # summed per-token loss over the whole validation set
val_tokens = 0        # number of target tokens scored

with torch.no_grad():
    for x, y in val_loader:
        x, y = x.to(device), y.to(device)
        logits = model(x)
        # Sum (rather than average) within the batch so that a smaller final
        # batch is weighted by its token count; averaging per-batch means
        # would over-weight it.
        loss = F.cross_entropy(logits.view(-1, vocab_size), y.view(-1), reduction='sum')
        val_loss_total += loss.item()
        val_tokens += y.numel()

# Mean loss per token over the full validation set.
final_val_loss = val_loss_total / val_tokens

# Smooth the noisy per-step training loss by averaging the last (up to) 100 steps.
recent_losses = train_losses[-100:]
final_train_loss = sum(recent_losses) / len(recent_losses)

print(f"Final train loss: {final_train_loss:.4f}")
print(f"Final val loss: {final_val_loss:.4f}")

Final train loss: 0.3733
Final val loss: 0.3487


In [None]:
# Store only the weights (state dict) of the further-trained model.
best_model_path = output_dir / 'best_model.pt'
torch.save(model.state_dict(), best_model_path)
print("Best model saved")

Best model saved


## Test Set Perplexity

In [None]:
# Fixed seed so the evaluated test windows are the same on every run.
np.random.seed(456)
test_max_idx = len(test_data) - CONTEXT_LENGTH - 1
test_indices = np.random.choice(test_max_idx, size=10000, replace=False)

test_dataset = MusicDataset(test_data, CONTEXT_LENGTH, test_indices)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model.eval()
test_loss_total = 0.0  # summed per-token loss over all test windows
test_tokens = 0        # number of target tokens scored

with torch.no_grad():
    for x, y in test_loader:
        x, y = x.to(device), y.to(device)
        logits = model(x)
        # Accumulate summed loss and token count so the final figure is a true
        # per-token mean even if the last batch is smaller than the others.
        loss = F.cross_entropy(logits.view(-1, vocab_size), y.view(-1), reduction='sum')
        test_loss_total += loss.item()
        test_tokens += y.numel()

test_loss = test_loss_total / test_tokens
# Perplexity is the exponential of the mean per-token cross-entropy.
perplexity = math.exp(test_loss)

print(f"Test Loss: {test_loss:.4f}")
print(f"Perplexity: {perplexity:.2f}")

Test Loss: 0.3495
Perplexity: 1.42


## Generation Functions

In [None]:
def generate_unconditional(model, max_length=150, temperature=1.0):
    """Generate a tune from a header-only prompt.

    The prompt is ``<BOS>``, ``X:``, a fixed ``M:4/4`` meter and ``L:1/8``
    unit length, followed by one key picked at random from five major keys.
    Header symbols missing from the vocabulary fall back to ``<UNK>``.
    Returns whatever ``generate`` returns for that prompt.
    """
    bos_id = token2idx['<BOS>']
    unk_id = token2idx['<UNK>']

    fixed_header = ['X:', 'M:4/4', 'L:1/8']
    start_tokens = [bos_id] + [token2idx.get(symbol, unk_id) for symbol in fixed_header]

    # Only the key varies between samples.
    keys = ['K:C', 'K:G', 'K:D', 'K:A', 'K:E']
    chosen_key = keys[np.random.randint(0, len(keys))]
    start_tokens.append(token2idx.get(chosen_key, unk_id))

    return generate(model, start_tokens, max_length, temperature)

In [None]:
def generate(model, start_tokens, max_length=200, temperature=1.0):
    """Autoregressively sample a continuation of ``start_tokens``.

    Args:
        model: language model mapping a (1, seq_len) tensor of token ids to
            (1, seq_len, vocab) logits.
        start_tokens: non-empty sequence of prompt token ids; not modified.
        max_length: maximum number of new tokens to sample.
        temperature: softmax temperature. ``0`` selects the most likely token
            at every step (greedy decoding).

    Returns:
        A list holding the prompt followed by the sampled tokens. Sampling
        stops early once ``<EOS>`` is produced (the EOS id is kept).

    Raises:
        ValueError: if ``start_tokens`` is empty or ``temperature`` is negative.

    Uses the module-level ``device`` and ``token2idx``.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")

    tokens = list(start_tokens)
    if not tokens:
        raise ValueError("start_tokens must contain at least one token")

    model.eval()

    # Feed at most the model's own context window; fall back to 256 for
    # models that do not expose a context_length attribute.
    context_length = getattr(model, 'context_length', 256)
    # Resolved once; -1 can never match a sampled id when the vocabulary has
    # no <EOS> entry.
    eos_id = token2idx.get('<EOS>', -1)

    with torch.no_grad():
        for _ in range(max_length):
            x = torch.tensor([tokens[-context_length:]], device=device)
            # Only the logits for the final position predict the next token.
            logits = model(x)[0, -1, :]

            if temperature == 0:
                # Dividing by zero would produce inf/nan logits; take the argmax.
                next_token = torch.argmax(logits).item()
            else:
                probs = F.softmax(logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, 1).item()

            tokens.append(next_token)
            if next_token == eos_id:
                break

    return tokens

In [None]:
def tokens_to_text(tokens):
    """Concatenate the token strings for ``tokens``; ids not in ``idx2token`` are skipped."""
    pieces = []
    for token_id in tokens:
        pieces.append(idx2token.get(token_id, ''))
    return ''.join(pieces)

In [None]:
def clean_abc_output(generated_text):
    """Turn raw decoded model output into line-structured ABC text.

    Strips the special tokens, gives the ``X:`` field a reference number,
    places the ``L:`` and ``K:`` header fields on their own lines, and cuts
    the tune off after its last bar line so it does not end mid-measure.
    """
    # Applied strictly in this order.
    substitutions = (
        ('<BOS>', ''),
        ('<EOS>', ''),
        ('<PAD>', ''),
        ('X:', 'X:1\n'),
        ('L:', '\nL:'),
        ('K:C', '\nK:C\n'),
        ('K:G', '\nK:G\n'),
        ('K:D', '\nK:D\n'),
        ('K:A', '\nK:A\n'),
        ('K:E', '\nK:E\n'),
        ('K:F', '\nK:F\n'),
        ('K:B', '\nK:B\n'),
    )

    text = generated_text
    for old, new in substitutions:
        text = text.replace(old, new)

    # Drop anything after the final bar line. A bar line at position 0 (or no
    # bar line at all) leaves the text untouched.
    final_bar = text.rfind('|')
    if final_bar > 0:
        text = text[:final_bar + 1]

    return text.strip()

## Generate Unconditional Samples

I tested two approaches for unconditional generation. First, I provided only the beginning-of-sequence token (`<BOS>`) and let the model generate everything on its own. This produced invalid outputs, such as repeated rests or notes without proper ABC headers. This likely happens because headers appear only once per song, at the start, while notes appear hundreds of times throughout, so the model saw headers far less often than notes during training. Second, I provided only the header tokens (a fixed 4/4 time signature and 1/8 unit note length, plus a key chosen at random from C, G, D, A, and E), then let the model freely generate the melody. This produced valid, playable music, since the headers provide the structure while the melody remains free.

## Unconditional 1: Pure Unconditional

In [None]:
unconditional_samples1 = []

# Five samples prompted with nothing but the <BOS> token.
for i in range(5):
    start = [token2idx['<BOS>']]
    cleaned = clean_abc_output(
        tokens_to_text(generate(model, start, max_length=150, temperature=1))
    )
    unconditional_samples1.append(cleaned)
    print(f"\n=== Unconditional Sample {i+1} ===")
    print(cleaned)


=== Unconditional Sample 1 ===
-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|C,,-496-|

=== Unconditional Sample 2 ===
|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|z-428|

=== Unconditional Sample 3 ===
G,/2z/2D,2A,>A,G,/2z/2D,/2z/2|A,,/2z/2A,/2z/2A,/2z/2C/2z/2E>CA,/2z/2G,/2G,/2|E,/2z/2E,/2z/2E,/2z/2E,/2z/2A,,/2z/2A,/2z/2A,/2z/2=B,/2z/2|[A,-G,]2A,/2-A,/2z/2_D>C=B,<_A,A,A,|D,>A,G,/2z/2D,/2=A,/2G,/2z/2F,/2>D,/2F,/2E,/2z/2G,/2|G,/2z/2D,/2z/2D,/2z/2A,,/2z/2A,/2z/2G,/2z/2F,/2z/2G,/2z/2|G,/2z/2D,/2_A,/2G,F,E,/2z/2G,/2z/2F,<G,D,/2|G,/2z/2D,/2z/2A,/2z/2=B,/2z/2A,/2z/2G,/2z/2A,/2z/2B,4-B,/2|

=== Unconditional Sample 4 =

## Unconditional 2: Header-Prompted Generation

In [None]:
unconditional_samples2 = []

# Five samples prompted with the ABC header only; the melody is left to the model.
for i in range(5):
    cleaned = clean_abc_output(
        tokens_to_text(generate_unconditional(model, max_length=150, temperature=1))
    )
    unconditional_samples2.append(cleaned)
    print(f"\n=== Unconditional Sample {i+1} ===")
    print(cleaned)


=== Unconditional Sample 1 ===
X:1
M:4/4
L:1/8
K:G
E,,4E,,3E,,/2E,,/2|C,,4C,,3C,,/2C,,/2|F,,,4F,,3-F,,/2F,,/2|E,,4E,,3E,,3/2E,,/2|C,,4C,,3-C,,/2C,,/2|F,,4F,,3-F,,/2F,,/2|E,,4E,,3E,,3/2E,,/2|C,,4C,,3C,,/2C,,/2|F,,4F,,3-F,,/2F,,/2|E,,4E,,3E,,2|C,,4C,,3C,,/2C,,/2|F,,4F,,3-F,,/2F,,/2|E,,4E,,3E,,3/2E,,/2|C,,4C,,3C,,/2C,,/2|F,,4F,,3-F,,/2F,,/2|E,,4E,,3E,,/2E,,/2|C,,3C,,3C,,/2C,,/2|F,,4F,,3-F,,/2F,,/2|E,,4E,,3[^CA,E,]2[CA,E,]2z/2[C-A,-E,-]2|

=== Unconditional Sample 2 ===
X:1
M:4/4
L:1/8
K:C
z8|C,,C,,zA,,,B,,,2zB,,,|C,,3C,,z4|C,,C,,zA,,,B,,,2zB,,,|C,,3C,,z4|C,,C,,zA,,,B,,,2zB,,,|C,,3C,,z4|C,,C,,zA,,,B,,,2zB,,,|C,,3C,,z4|C,,C,,zA,,,B,,,2zB,,,|C,,3C,,z4|C,,C,,zA,,,B,,,2zB,,,|C,,3C,,z4|C,,C,,zA,,,B,,,2zB,,,|C,,3C,,z4|C,,C,,zA,,,B,,,2zB,,,|C,,3C,,z4|C,,C,,zA,,,B,,,2zB,,,|C,,3C,,z4|C,,C,,zA,,,B,,,2zB,,,|C,,3C,,z4|C,,C,,zA,,,B,,,2zB,,,|C,,3C,,z4|C,,C,,zA,,,B,,,2zB,,,|C,,3C,,z4|

=== Unconditional Sample 3 ===
X:1
M:4/4
L:1/8
K:E
z8|z4|z8|z3G,/2z/2G,/2z/2A,/2z/2B,3/2z/2|A,3/2z/2C/2z/2C[D=D]2[EC]2[

## Generate Conditional Samples

In [None]:
conditional_samples = []

# Each prompt is an ABC header followed by the first few notes of the melody.
prefixes = [
    ['<BOS>', 'X:', 'M:4/4', 'L:1/8', 'K:C', 'C', 'D', 'E', 'F'],
    ['<BOS>', 'X:', 'M:4/4', 'L:1/8', 'K:G', 'G', 'A', 'B', 'c'],
    ['<BOS>', 'X:', 'M:3/4', 'L:1/8', 'K:D', 'D', 'E', 'F'],
    ['<BOS>', 'X:', 'M:4/4', 'L:1/8', 'K:A', 'A', 'B', 'c'],
    ['<BOS>', 'X:', 'M:6/8', 'L:1/8', 'K:E', 'E', 'F', 'G']
]

for i, prefix in enumerate(prefixes):
    # Symbols missing from the vocabulary are replaced by <UNK>.
    unk_id = token2idx['<UNK>']
    prefix_tokens = [token2idx.get(symbol, unk_id) for symbol in prefix]

    cleaned = clean_abc_output(
        tokens_to_text(generate(model, prefix_tokens, max_length=150, temperature=1.0))
    )
    conditional_samples.append(cleaned)
    print(f"\n=== Conditional Sample {i+1} ===")
    print(cleaned)


=== Conditional Sample 1 ===
X:1
M:4/4
L:1/8
K:C
CDEFG^GAF|FGECD3z|CD/2z/2GEFG^GA|cedfgabc'|z8|EG^Acdefd|cFGA/2z/2cz^dc|^Acd^dfgzd|G^GA/2z/2czdc/2z3/2|cdE/2z/2fe^A/2z/2dc|^Acd/2z/2^dc/2z/2c=d^d|gc^df=gf^g^a|z8|z8|z8|^Acd^dfg/2z/2dc|cAFGAcd^d|fd^AGz=A,,,z^A,,,|z^A/2z/2dc/2z/2czdc/2z/2|

=== Conditional Sample 2 ===
X:1
M:4/4
L:1/8
K:G
GABcAGFG|A,,6-A,,3/2z/2|B=AGFED2C|B,4z2|GABcAGFG|E,6-E,3/2z/2|B=AGFED2C|B,4z2|GABcAGFG|A,,6-A,,3/2z/2|B=AGFED2C|CB,6-B,|GABcAGFG|A,,6-A,,3/2z/2|B=AGFED2C|B,C6-C|z8|z8|z8|z3/2CE/2-[G-E]2GG<GG/2-|G6CE|=G3FE3/2z/2CE-|

=== Conditional Sample 3 ===
X:1
M:3/4
L:1/8
K:D
DEFGA>fa-|a/2z3/2f'e'f'e'_g'|a'g'f'(3_e'f'g'f'_g'|a'g'f'(3=e'f'g'f'_d'|a'g'f'e'f'/2=e'd'|e'f'e'd'e'f'e'-|e'd'c'(3e'd'c'd'e'-|e'd'c'(3e'd'c'd''e'|f'_d'=d'e'f'g'f'|a'=g'f'e'e'3/2d'/2|e'f'e'd'e'_d'|e'f'e'd'e'f'e'|f'(3d'4c'4b4|[=cC]2[cC]3/2c/2[cC]3/2[BB,]2|

=== Conditional Sample 4 ===
X:1
M:4/4
L:1/8
K:A
ABcBcdAB|

=== Conditional Sample 5 ===
X:1
M:6/8
L:1/8
K:E
EFGABcd|E4-EDE|F8-|F3EFGA|BAB8|BAc

## Validate ABC Syntax

In [None]:
def is_valid_abc(text):
    """Heuristic check that ``text`` looks like a playable ABC tune.

    A tune passes when it has ``X:``, ``M:`` and ``K:`` fields, its music body
    contains at least one note letter and one bar line, and square brackets
    are balanced across the whole text.

    Header lines (a letter immediately followed by ``:`` at the start of a
    line) are excluded when looking for notes and bar lines. Otherwise the
    key letter in a field such as ``K:G`` would count as a note, and a tune
    made only of rests would be accepted.
    """
    if not all(field in text for field in ('X:', 'M:', 'K:')):
        return False

    body_lines = [
        line for line in text.splitlines()
        if not (len(line) > 1 and line[0].isalpha() and line[1] == ':')
    ]
    body = '\n'.join(body_lines)

    has_notes = any(c in body for c in 'ABCDEFGabcdefg')
    has_barlines = '|' in body
    balanced_brackets = text.count('[') == text.count(']')
    return has_notes and has_barlines and balanced_brackets

# Pool every generated sample and keep the ones that pass the syntax check.
all_samples = unconditional_samples1 + unconditional_samples2 + conditional_samples
valid_samples = [sample for sample in all_samples if is_valid_abc(sample)]
valid_count = len(valid_samples)

print(f"\nValid ABC syntax: {valid_count}/{len(all_samples)} ({100*valid_count/len(all_samples):.1f}%)")


Valid ABC syntax: 10/15 (66.7%)


## Convert to MIDI

In [None]:
!pip install music21 -q

In [None]:
from music21 import converter, stream
import copy

def deep_clone_part(part):
    """Build a new ``stream.Part`` holding deep copies of the elements of ``part``.

    Every element yielded by ``part.recurse()`` is deep-copied and inserted
    into the new part at that element's ``offset``. Elements whose copy or
    insertion raises are skipped silently.

    NOTE(review): ``recurse()`` presumably also yields elements nested inside
    containers such as measures, and ``el.offset`` is presumably relative to
    the element's immediate container rather than to ``part``. If so, nested
    notes are re-inserted at measure-relative positions, and are also
    duplicated inside the copied containers. Confirm against the music21
    documentation.
    """
    new_part = stream.Part()
    for el in part.recurse():
        try:
            new_part.insert(el.offset, copy.deepcopy(el))
        except Exception:
            # Best effort: an element that cannot be copied or inserted is dropped.
            pass
    return new_part

midi_success = 0
midi_total = 0

# Convert every syntactically valid sample to MIDI, saving the ABC source
# alongside it. A failure on one sample is reported and does not stop the rest.
for i, sample in enumerate(valid_samples):
    midi_total += 1
    try:
        score = converter.parse(sample, format='abc')

        fixed_score = stream.Score()

        # Treat a score without explicit parts as a single part.
        for part in score.parts if score.parts else [score]:
            p = part

            # Best-effort normalisation (measures, expanded repeats). A failure
            # here is not fatal, but it is reported instead of being hidden.
            try:
                if not p.getElementsByClass(stream.Measure):
                    p = p.makeMeasures()
                p = p.expandRepeats()
                p = p.makeMeasures()
            except Exception as e:
                print(f"Sample {i+1}: measure normalization skipped - {e}")

            p = deep_clone_part(p)

            fixed_score.append(p)

        midi_path = output_dir / f"sample_{i+1}.mid"
        fixed_score.write("midi", fp=str(midi_path))

        abc_path = output_dir / f"sample_{i+1}.abc"
        with open(abc_path, "w", encoding="utf-8") as f:
            f.write(sample)

        midi_success += 1
        print(f"Sample {i+1}: MIDI conversion successful")

    except Exception as e:
        print(f"Sample {i+1}: MIDI conversion failed - {e}")

# Guard the percentage: with no valid samples there is nothing to divide by.
midi_success_rate = 100 * midi_success / midi_total if midi_total else 0.0
print(f"\nMIDI conversion success: {midi_success}/{midi_total} "
      f"({midi_success_rate:.1f}%)")

Sample 1: MIDI conversion successful
Sample 2: MIDI conversion successful
Sample 3: MIDI conversion successful
Sample 4: MIDI conversion successful
Sample 5: MIDI conversion successful
Sample 6: MIDI conversion successful
Sample 7: MIDI conversion successful
Sample 8: MIDI conversion successful
Sample 9: MIDI conversion successful
Sample 10: MIDI conversion successful

MIDI conversion success: 10/10 (100.0%)


## Results Summary

In [None]:
# Percentages are computed up front and guarded, so that an empty sample list
# or a run with no valid samples to convert prints 0.0% instead of raising
# ZeroDivisionError.
summary_valid_pct = 100 * valid_count / len(all_samples) if all_samples else 0.0
summary_midi_pct = 100 * midi_success / midi_total if midi_total else 0.0

print("="*60)
print("PART 4 RESULTS SUMMARY")
print("="*60)
print(f"\nModel: XL Transformer ({model.count_parameters():,} parameters)")
print(f"Training: 100M additional tokens (200M total)")
print(f"\nFinal train loss: {final_train_loss:.4f}")
print(f"Final val loss: {final_val_loss:.4f}")
print(f"Test loss: {test_loss:.4f}")
print(f"Test perplexity: {perplexity:.2f}")
print(f"\nGenerated samples: {len(all_samples)}")
print(f"Valid ABC syntax: {valid_count}/{len(all_samples)} ({summary_valid_pct:.1f}%)")
print(f"MIDI conversion: {midi_success}/{midi_total} ({summary_midi_pct:.1f}%)")
print("="*60)

PART 4 RESULTS SUMMARY

Model: XL Transformer (156,788,736 parameters)
Training: 100M additional tokens (200M total)

Final train loss: 0.3733
Final val loss: 0.3487
Test loss: 0.3495
Test perplexity: 1.42

Generated samples: 15
Valid ABC syntax: 10/15 (66.7%)
MIDI conversion: 10/10 (100.0%)


## Save Results

In [None]:
# Guarded percentages: 0.0 when there is nothing to divide by (no samples, or
# no valid samples to convert) instead of raising ZeroDivisionError.
valid_abc_percent = 100 * valid_count / len(all_samples) if all_samples else 0.0
midi_success_percent = 100 * midi_success / midi_total if midi_total else 0.0

# Metrics and raw generated samples for this part, written as a single JSON document.
results = {
    'model_params': model.count_parameters(),
    'total_tokens_trained': 200_000_000,
    'final_train_loss': final_train_loss,
    'final_val_loss': final_val_loss,
    'test_loss': test_loss,
    'perplexity': perplexity,
    'num_samples': len(all_samples),
    'valid_abc_count': valid_count,
    'valid_abc_percent': valid_abc_percent,
    'midi_success_count': midi_success,
    'midi_success_percent': midi_success_percent,
    'unconditional_samples1': unconditional_samples1,
    'unconditional_samples2': unconditional_samples2,
    'conditional_samples': conditional_samples
}

# Explicit encoding so the file is written the same way on every platform.
with open(output_dir / 'best_model_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print("Results saved to best_model_results.json")

Results saved to best_model_results.json
