In [4]:
#import statements 
import glob
import random
from typing import List
from collections import defaultdict

import numpy as np
from numpy.random import choice

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from symusic import Score
from miditok import REMI, TokenizerConfig
from midi2audio import FluidSynth # Import library
from IPython.display import Audio, display

  from .autonotebook import tqdm as notebook_tqdm


Task 1 <br>
This assignment focuses on symbolic music modeling. The goal is to train a model that learns a distribution \( p(x) \) over symbolic music data (e.g., MIDI)  specifically within the EDM genre. In addition it is capable of sampling new sequences from this learned distribution unconditionally. We will be using the LSTM model for this task. <br>

Get list of files for training/test sets

In [3]:
import os
import glob
print("CWD:", os.getcwd())
print("Train directory exists?", os.path.exists("./train"))
print("Train files (glob):", glob.glob("./train/*.midi"))

train_files = glob.glob("./train/*.midi")
test_files = glob.glob("./test/*.midi")

CWD: c:\Users\sammy\Downloads\cse 153_task1\cse153_task1
Train directory exists? True
Train files (glob): ['./train\\MIDI-Unprocessed_01_R1_2006_01-09_ORIG_MID--AUDIO_01_R1_2006_01_Track01_wav.midi', './train\\MIDI-Unprocessed_01_R1_2006_01-09_ORIG_MID--AUDIO_01_R1_2006_02_Track02_wav.midi', './train\\MIDI-Unprocessed_01_R1_2006_01-09_ORIG_MID--AUDIO_01_R1_2006_03_Track03_wav.midi', './train\\MIDI-Unprocessed_01_R1_2006_01-09_ORIG_MID--AUDIO_01_R1_2006_04_Track04_wav.midi', './train\\MIDI-Unprocessed_01_R1_2006_01-09_ORIG_MID--AUDIO_01_R1_2006_05_Track05_wav.midi', './train\\MIDI-Unprocessed_01_R1_2008_01-04_ORIG_MID--AUDIO_01_R1_2008_wav--1.midi', './train\\MIDI-Unprocessed_01_R1_2008_01-04_ORIG_MID--AUDIO_01_R1_2008_wav--2.midi', './train\\MIDI-Unprocessed_01_R1_2008_01-04_ORIG_MID--AUDIO_01_R1_2008_wav--3.midi', './train\\MIDI-Unprocessed_01_R1_2009_01-04_ORIG_MID--AUDIO_01_R1_2009_01_R1_2009_01_WAV.midi', './train\\MIDI-Unprocessed_01_R1_2009_01-04_ORIG_MID--AUDIO_01_R1_2009_01_R1_

Construct a PyTorch Dataset

In [158]:
class MIDIDataset(Dataset):
    def __init__(self, file_paths: List[str], tokenizer):
        self.tokenizer = tokenizer
        self.file_paths = file_paths
        
    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        midi = Score(self.file_paths[idx])
        MAX_LEN = 1024  # or even 512 to be safe
        tokens = self.tokenizer(midi)
        tokens = tokens[:MAX_LEN]
        # Return as dictionary to match training function expectations
        return {'input_ids': torch.tensor(tokens, dtype=torch.long)}

Configure the Tokenizer in order to be use to 

In [6]:
config = TokenizerConfig(
    num_velocities=32,           # Classical dynamics (was 1)
    use_chords=True,            # Essential for harmony (was False)
    use_programs=False,         # Piano only (was True)
    use_time_signatures=True,   # Classical changes time sigs
    use_rests=True,             # Important in classical
)
tokenizer = REMI(config)
tokenizer.train(vocab_size=5000, files_paths=train_files)
tokenizer.save("tokenizer.json")

Define PyTorch datasets and dataloaders

In [7]:
from torch.nn.utils.rnn import pad_sequence
from miditok.pytorch_data import DatasetMIDI, DataCollator

train_dataset = DatasetMIDI(
    files_paths=train_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
test_dataset = DatasetMIDI(
    files_paths=test_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
print(f"# Train files loaded: {len(train_dataset)}")
print(f"# Test files loaded: {len(test_dataset)}")

collator = DataCollator(tokenizer.pad_token_id)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collator)


# Train files loaded: 938
# Test files loaded: 105


LSTM Model<br>

In [8]:
class MusicRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim=256, hidden_dim=512, num_layers=2, dropout=0.3, bidirectional=False):
        super(MusicRNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True,
            bidirectional=bidirectional
        )

        # If bidirectional, output dim doubles
        rnn_output_dim = hidden_dim * 2 if bidirectional else hidden_dim
        
        self.layer_norm = nn.LayerNorm(rnn_output_dim)
        self.fc = nn.Linear(rnn_output_dim, vocab_size)

    def forward(self, x, hidden=None):
        """
        x: (batch_size, seq_length)
        hidden: hidden states for LSTM
        """
        x = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        out, hidden = self.rnn(x, hidden)  # (batch_size, seq_length, rnn_output_dim)
        
        out = self.layer_norm(out)          # stabilize activations
        out = self.fc(out)                  # (batch_size, seq_length, vocab_size)

        return out, hidden

Training<br>

In [None]:
def train(model, train_loader, val_loader, vocab_size, num_epochs=10, lr=1e-4, device='cpu'):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        # --------- Training ---------
        model.train()
        total_train_loss = 0

        for batch in train_loader:
            batch = batch['input_ids'].to(device)  # (batch_size, seq_length)

            inputs = batch[:, :-1]
            targets = batch[:, 1:]

            optimizer.zero_grad()
            outputs, _ = model(inputs)
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        # --------- Validation ---------
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch['input_ids'].to(device)

                inputs = batch[:, :-1]
                targets = batch[:, 1:]

                outputs, _ = model(inputs)
                outputs = outputs.reshape(-1, vocab_size)
                targets = targets.reshape(-1)

                loss = criterion(outputs, targets)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


# Example usage
if __name__ == "__main__":
    vocab_size = tokenizer.vocab_size
    embedding_dim = 256
    hidden_dim = 512
    num_layers = 2

    model = MusicRNN(
    vocab_size=tokenizer.vocab_size,
    embedding_dim=256,
    hidden_dim=512,
    num_layers=2,
    dropout=0.3,
    bidirectional=False   # try True if you want to experiment with bidirectional context
)

    train(model, train_loader, test_loader, vocab_size)

Epoch 1/20 | Train Loss: 7.1732 | Val Loss: 6.6706


KeyboardInterrupt: 

Sampling<br>

In [None]:

def sample_top_k(model, start_token, max_length=100, temperature=1.0, k=10, device='cpu'):
    model = model.to(device)
    model.eval()

    generated = [start_token]
    input_token = torch.tensor([[start_token]], device=device)  # (1, 1)
    hidden = None

    for _ in range(max_length):
        output, hidden = model(input_token, hidden)  # output: (1, 1, vocab_size)
        logits = output[:, -1, :] / temperature       # (1, vocab_size)

        # Top-k filtering
        top_k_logits, top_k_indices = torch.topk(logits, k)
        top_k_probs = F.softmax(top_k_logits, dim=-1)

        next_token_idx = torch.multinomial(top_k_probs, 1).item()
        next_token = top_k_indices[0, next_token_idx].item()

        generated.append(next_token)

        # Stop if EOS or PAD tokens are generated (you can adjust ids accordingly)
        if next_token == tokenizer["EOS_None"] or next_token == tokenizer.pad_token_id:
            break

        input_token = torch.tensor([[next_token]], device=device)

    return generated

start_token = tokenizer.special_tokens_ids[1]  # Typically BOS token

generated_sequence = sample_top_k(model, start_token, max_length=1024, temperature=1.0, k=10)
print("Generated token sequence:")
print(generated_sequence)


Generated token sequence:
[1, 80, 961, 757, 61, 401, 467, 421, 72, 433, 520, 324, 791, 412, 56, 391, 186, 643, 828, 790, 808, 381, 520, 414, 68, 953, 401, 56, 953, 406, 36, 406, 36, 401, 20, 408, 32, 408, 459, 459, 461, 519, 60, 424, 554, 414, 65, 904, 398, 61, 401, 49, 401, 498, 498, 41, 391, 544, 297, 548, 452, 459, 651, 421, 68, 904, 966, 319, 52, 408, 49, 408, 44, 408, 452, 452, 548, 452, 483, 44, 719, 548, 473, 507, 174, 602, 387, 817, 457, 34, 727, 24, 381, 525, 881, 881, 461, 706, 499, 543, 511, 549, 511, 844, 470, 414, 571, 421, 68, 434, 68, 830, 746, 891, 405, 32, 870, 851, 800, 888, 394, 60, 384, 167, 558, 868, 20, 384, 167, 729, 888, 547, 779, 405, 32, 388, 20, 396, 32, 389, 162, 729, 868, 400, 32, 400, 56, 767, 729, 387, 872, 32, 553, 580, 545, 179, 299, 25, 553, 457, 459, 511, 658, 32, 382, 499, 483, 56, 824, 427, 645, 398, 511, 881, 17, 385, 29, 496, 382, 421, 872, 52, 385, 548, 452, 459, 483, 25, 677, 638, 687, 638, 382, 534, 881, 543, 566, 461, 461, 539, 545, 188, 756, 

Generation output of midi files<br>

In [None]:
# Alternative approach with more robust error handling
def generate_midi(tokenizer, generated_sequence, output_filename="rnn.mid"):
    
    #Safely convert generated token sequence to MIDI with error handling

    try:
        # Filter out any invalid tokens
        vocab_size = tokenizer.vocab_size
        valid_sequence = [token for token in generated_sequence if 0 <= token < vocab_size]
        
        print(f"Original sequence length: {len(generated_sequence)}")
        print(f"Valid sequence length: {len(valid_sequence)}")
        
        if len(valid_sequence) < 2:
            print("Sequence too short or no valid tokens found")
            return None
            
        # Try to decode
        output_score = tokenizer.decode(valid_sequence)
        
        # Check if the score has any content
        if len(output_score.tracks) == 0:
            print("Generated MIDI has no tracks")
            return None
            
        # Save MIDI file
        output_score.dump_midi(output_filename)
        print(f"Successfully generated {output_filename}")
        return output_score
        
    except Exception as e:
        print(f"Error during MIDI generation: {e}")
        print(f"Sequence sample: {generated_sequence[:20]}...")
        return None

# Usage
start_token = tokenizer.special_tokens_ids[1] if len(tokenizer.special_tokens_ids) > 1 else 1
generated_sequence = sample(model, start_token, max_length=1024)

# Generate MIDI safely
output_score = generate_midi(tokenizer, generated_sequence, "rnn.mid")


Original sequence length: 1025
Valid sequence length: 1025
Successfully generated rnn.mid
Error during audio conversion: [WinError 2] The system cannot find the file specified
