# Install and load required libraries

In [7]:
import glob
import random
from typing import List
from collections import defaultdict

import numpy as np
from numpy.random import choice

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from symusic import Score
from miditok import REMI, TokenizerConfig

# BaseLine 
##Markov Chain for MIDI generation
 Get the list of files for training and test sets

In [35]:
train_files = glob.glob("./train/*.midi")
test_files = glob.glob("./test/*.midi")

In [36]:
type(train_files[0])

str

In [37]:
train_files[0].encode('utf-8').decode('utf-8')

'./train\\MIDI-Unprocessed_01_R1_2006_01-09_ORIG_MID--AUDIO_01_R1_2006_01_Track01_wav.midi'

In [38]:
print(train_files[0].encode('utf-8'))

b'./train\\MIDI-Unprocessed_01_R1_2006_01-09_ORIG_MID--AUDIO_01_R1_2006_01_Track01_wav.midi'


In [41]:
str.encode(train_files[0], 'utf-8')

b'./train\\MIDI-Unprocessed_01_R1_2006_01-09_ORIG_MID--AUDIO_01_R1_2006_01_Track01_wav.midi'

## Train your MIDI tokenizer

In [None]:
config = TokenizerConfig(num_velocities=1, use_chords=False, use_programs=True)
tokenizer = REMI(config)
tokenizer.train(vocab_size=1000, files_paths=train_files)
tokenizer.save("tokenizer.json")

## Construct a PyTorch Dataset

In [43]:
class MIDIDataset(Dataset):
    def __init__(self, file_paths: List[str], tokenizer):
        self.tokenizer = tokenizer
        self.file_paths = file_paths
    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        midi = Score(self.file_paths[idx])
        tokens = self.tokenizer(midi)
        return np.array(tokens)

## Define PyTorch datasets and dataloaders

In [44]:
train_dataset = MIDIDataset(train_files, tokenizer)
test_dataset = MIDIDataset(test_files, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

## Define a Second Order Markov Chain model

In [46]:
class SecondOrderMarkovChain:
    def __init__(self):
        self.transitions = defaultdict(lambda: defaultdict(int))
        self.probabilities = defaultdict(lambda: defaultdict(float))

    def train(self, train_loader):
        for sequence in train_loader:
            sequence = sequence[0].numpy().astype(int)
            for i in range(len(sequence) - 2):
                state1, state2 = sequence[i], sequence[i + 1]
                next_state = sequence[i + 2]
                self.transitions[(state1, state2)][next_state] += 1

        for (state1, state2), next_states in self.transitions.items():
            total = sum(next_states.values())
            for next_state, count in next_states.items():
                self.probabilities[(state1, state2)][next_state] = count / total
        return self.probabilities

    def generate(self, test_sequence, num_predictions=1):
        test_sequence = test_sequence[0].numpy().astype(int)
        results = [test_sequence[0], test_sequence[1]]
        for i in range(100):
            if (results[-2], results[-1]) not in self.probabilities:
                break
            else:
                probs = self.probabilities[(results[-2], results[-1])]
                states = list(probs.keys())
                probabilities = list(probs.values())
                if not states:
                    break
                try:
                    predictions = np.random.choice(states, size=num_predictions, p=probabilities)
                except:
                    break
                results.append(predictions[0])
        return results

## Train your model and make inferences

In [48]:
def evaluate_markov_accuracy(model, test_loader, device='cpu'):
    total_correct = 0
    total_tokens = 0

    for seq in test_loader:
        if not isinstance(seq, torch.Tensor):
            seq = torch.tensor(seq, dtype=torch.long)
        seq = seq.to(device)

        if seq.size(0) < 3:
            continue

        seed = seq[:2]
        targets = seq[2:]
        generated = model.generate(seed.tolist(), length=targets.size(0))
        if not isinstance(generated, torch.Tensor):
            generated = torch.tensor(generated, dtype=torch.long, device=device)

        min_len = min(generated.size(0), targets.size(0))
        total_correct += (generated[:min_len] == targets[:min_len]).sum().item()
        total_tokens += min_len

    if total_tokens == 0:
        return 0.0

    accuracy = total_correct / total_tokens
    print(f"Test Accuracy: {accuracy:.4f}  ({total_correct}/{total_tokens})")
    return accuracy


model = SecondOrderMarkovChain()
model.train(train_loader)
acc = evaluate_markov_accuracy(model, test_loader, device="cpu")

predictions = []
for test_sequence in test_loader:
    predictions.append(model.generate(test_sequence))
for i, prediction in enumerate(predictions):
    output_score = tokenizer.decode(torch.Tensor(prediction))
    output_score.dump_midi(f"{i}.mid")

In [52]:
print("Accuracy:", acc)

Accuracy: 0.0


## A New Dataset for batch inputs

In [18]:
from miditok.pytorch_data import DatasetMIDI, DataCollator

tokenizer = REMI()  # using defaults parameters (constants.py)
train_dataset = DatasetMIDI(
    files_paths=train_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
test_dataset = DatasetMIDI(
    files_paths=test_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
collator = DataCollator(tokenizer.pad_token_id)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collator)

In [19]:
len(train_loader), len(test_loader)

(235, 27)

## RNN

In [20]:
class MusicRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(MusicRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        # x: (batch_size, seq_length)
        x = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        out, hidden = self.rnn(x, hidden)  # out: (batch_size, seq_length, hidden_dim)
        out = self.fc(out)  # (batch_size, seq_length, vocab_size)
        return out, hidden

### Training

In [None]:
def train(model, train_loader, val_loader, vocab_size, num_epochs=5, lr=0.001, device='cpu'):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_losses = []
    val_losses = []
    learning_rates = []

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0

        for batch in train_loader:
            batch = batch['input_ids'].to(device)
            inputs = batch[:, :-1]
            targets = batch[:, 1:]

            optimizer.zero_grad()
            outputs, _ = model(inputs)
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        # --------- Validation ---------
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch['input_ids'].to(device)
                inputs = batch[:, :-1]
                targets = batch[:, 1:]

                outputs, _ = model(inputs)
                outputs = outputs.reshape(-1, vocab_size)
                targets = targets.reshape(-1)

                loss = criterion(outputs, targets)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)
        learning_rates.append(optimizer.param_groups[0]['lr'])

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    return train_losses, val_losses, learning_rates


def plot_training_progress(train_losses, val_losses, learning_rates):
    #Plot training metrics to visualize progress"""
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    
    # Loss plot
    axes[0].plot(train_losses, label='Training Loss', color='blue')
    axes[0].plot(val_losses, label='Validation Loss', color='red')
    axes[0].set_xlabel('Epoch')
    axes[0].set_ylabel('Loss')
    axes[0].set_title('Training Progress')
    axes[0].legend()
    axes[0].grid(True)
    
    # Learning rate plot
    axes[1].plot(learning_rates, color='green')
    axes[1].set_xlabel('Epoch')
    axes[1].set_ylabel('Learning Rate')
    axes[1].set_title('Learning Rate Schedule')
    axes[1].set_yscale('log')
    axes[1].grid(True)
    
    plt.tight_layout()
    plt.show()

    
# Example usage
if __name__ == "__main__":
    vocab_size = tokenizer.vocab_size
    embedding_dim = 256
    hidden_dim = 512
    num_layers = 2

    model = MusicRNN(vocab_size, embedding_dim, hidden_dim, num_layers)
    train_losses, val_losses, learning_rates = train(model, train_loader, test_loader, vocab_size)

    try:
        import matplotlib.pyplot as plt
        plot_training_progress(train_losses, val_losses, learning_rates)
    except ImportError:
        print("Install matplotlib to see training plots: pip install matplotlib")


In [None]:
def evaluate_accuracy(model, data_loader, vocab_size, device='cpu'):
    model.eval()
    model = model.to(device)
    total_correct = 0
    total_tokens = 0

    with torch.no_grad():
        for batch in data_loader:
            batch = batch['input_ids'].to(device)
            inputs = batch[:, :-1]
            targets = batch[:, 1:]

            outputs, _ = model(inputs)
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            preds = torch.argmax(outputs, dim=-1)
            mask = targets != 0  # Ignore padding

            correct = (preds == targets) & mask
            total_correct += correct.sum().item()
            total_tokens += mask.sum().item()
   
    accuracy = total_correct / total_tokens if total_tokens > 0 else 0
    print(f"Validation Accuracy: {accuracy:.4f}")
    return accuracy


train_acc = evaluate_accuracy(model, train_loader, vocab_size, device = "cpu")

### Sampling

In [None]:
def sample(model, start_token, max_length=100, temperature=1.0, device='cuda'):
    model = model.to(device)
    model.eval()

    generated = [start_token]
    input_token = torch.tensor([[start_token]], device=device)  # (1, 1)

    hidden = None

    for _ in range(max_length):
        output, hidden = model(input_token, hidden)  # output: (1, 1, vocab_size)
        output = output[:, -1, :]  # take the last output
        output = output / temperature  # adjust randomness

        probs = F.softmax(output, dim=-1)  # (1, vocab_size)
        next_token = torch.multinomial(probs, num_samples=1).item()
        generated.append(next_token)
        if next_token == 2 or next_token == 0: # reach end of sequence
          break

        input_token = torch.tensor([[next_token]], device=device)

    return generated

start_token = tokenizer.special_tokens_ids[1]
generated_sequence = sample(model, start_token, max_length=1024)

print("Generated token sequence:")
print(generated_sequence)

Generated token sequence:
[1, 4, 189, 43, 113, 127, 43, 113, 127, 48, 113, 127, 193, 48, 112, 125, 53, 112, 125, 195, 53, 112, 125, 197, 48, 112, 125, 198, 51, 111, 125, 200, 53, 112, 125, 198, 55, 112, 125, 199, 53, 112, 127, 200, 52, 112, 125, 48, 112, 125, 201, 50, 112, 125, 202, 53, 112, 125, 203, 53, 111, 126, 205, 48, 110, 125, 207, 55, 110, 125, 208, 43, 112, 125, 209, 51, 110, 125, 211, 46, 111, 125, 213, 53, 111, 125, 214, 55, 111, 125, 216, 58, 112, 125, 217, 55, 112, 125, 218, 53, 110, 125, 204, 53, 112, 125, 205, 51, 111, 140, 214, 51, 112, 147, 215, 55, 110, 138, 4, 189, 55, 115, 140, 197, 53, 113, 132, 205, 58, 114, 132, 48, 114, 132, 213, 55, 113, 132, 53, 113, 132, 48, 113, 132, 4, 189, 60, 115, 140, 65, 115, 140, 60, 115, 140, 41, 115, 140, 4, 189, 60, 116, 134, 44, 113, 140, 48, 113, 140, 197, 58, 112, 132, 205, 63, 115, 140, 43, 113, 132, 209, 48, 114, 132, 213, 55, 113, 132, 58, 113, 132, 38, 113, 130, 4, 189, 39, 114, 138, 193, 55, 112, 128, 197, 56, 111, 128, 43, 

In [None]:
from midi2audio import FluidSynth # Import library
from IPython.display import Audio, display
#fs = FluidSynth("FluidR3Mono_GM.sf3") # Initialize FluidSynth

output_score = tokenizer.tokens_to_midi([generated_sequence])
output_score.dump_midi(f"rnn.mid")