References:

- https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
- https://www.tensorflow.org/tutorials/audio/music_generation

In [137]:
import torch
import torch.nn as nn
from pathlib import Path
import csv
from fractions import Fraction


class MelodyLSTM(nn.Module):
    """LSTM followed by a linear layer that is applied to every time step.

    Attributes:
        input_size: number of features per input token.
        hidden_size: width of the LSTM hidden state.
        output_size: number of values predicted per time step.
        num_layers: recorded as 1; it is not forwarded to `nn.LSTM`, which is
            built with its default layer count.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.input_size, self.hidden_size, self.output_size = (
            input_size,
            hidden_size,
            output_size,
        )
        self.num_layers = 1

        # Recurrent encoder: (seq_len, input_size) -> (seq_len, hidden_size)
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size)
        # Per-step projection: (seq_len, hidden_size) -> (seq_len, output_size)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run the sequence through the LSTM and project every step.

        Args:
            x: torch tensor of shape (seq_len, input_size)

        Returns:
            torch tensor of shape (seq_len, output_size); row ``-1`` is the
            prediction made after seeing the whole sequence.
        """
        # No initial state is passed, so the LSTM starts from zero h0 / c0.
        hidden_states, _final_state = self.lstm(x)
        # Keep the first dimension and flatten whatever follows it.
        per_step = hidden_states.view(len(x), -1)
        return self.fc(per_step)  # TODO pass through log_softmax?


def train_model(
    model: MelodyLSTM, train_loader, loss_fn, optimizer, num_epochs=10, progress=True
):
    """Train `model` one sample at a time and return it.

    Args:
        model: network to optimise; it is switched to training mode.
        train_loader: iterable of ``(inputs, target)`` pairs, where both are
            convertible with ``torch.tensor(..., dtype=torch.float32)``.
        loss_fn: callable taking ``(prediction, target)`` and returning a
            tensor that supports ``backward()``.
        optimizer: optimiser whose ``step()`` is called after every sample.
        num_epochs: number of passes attempted over `train_loader`.
        progress: when true, print the epoch number and every 100th step.

    Returns:
        The same `model` object that was passed in.

    Note:
        `train_loader` is iterated once per epoch. A generator is exhausted
        after the first pass, so later epochs would process nothing.
        TODO: use a custom torch Dataset to support multiple epochs.
    """
    model.train()
    step = 0  # counts samples across all epochs, not per epoch
    for epoch in range(num_epochs):
        if progress:
            print(f"Epoch {epoch}")
        for inputs, target in train_loader:
            if progress and step % 100 == 0:
                print(f"i={step}")

            # Clear gradients left over from the previous sample.
            model.zero_grad()

            context = scale_pitch(torch.tensor(inputs, dtype=torch.float32))
            expected = scale_pitch(torch.tensor(target, dtype=torch.float32))

            # Only the output at the final time step is compared to the target.
            prediction = model(context)
            loss = loss_fn(prediction[-1], expected)
            loss.backward()
            optimizer.step()
            step += 1
    return model


def scale_pitch(x, pitch_range=128):
    """Divide the first of three trailing features by `pitch_range`.

    The remaining two features are divided by 1.0, i.e. left unchanged.

    Args:
        x: tensor whose last dimension has size 3 (the callers in this file
            pass (pitch, duration, offset) rows).
        pitch_range: divisor applied to the first feature.

    Returns:
        Tensor with the same shape as `x`.
    """
    # Build the divisor on the same device as `x`; a divisor created on the
    # default device makes the division fail for tensors living elsewhere.
    divisor = torch.tensor(
        [pitch_range, 1.0, 1.0], device=getattr(x, "device", None)
    )
    return x / divisor


def make_data_loader(files: list[str | Path], read_fn: callable, sequence_length: int):
    """Make lazy data loader for sequence data

    Args:
        files: list of files containing sequences
        read_fn: function that reads a file into a list
        sequence_length: Length of input sequence to use as context.

    Yields:
        inputs: list of `sequence_length` tokens
        output: next token to predict
    """
    for file in files:
        sequence = read_fn(file)
        # TODO Handle case where len(sequence) < sequence length?
        for i in range(len(sequence) - sequence_length):
            inputs = sequence[i : i + sequence_length]
            output = sequence[i + sequence_length]
            yield inputs, output


def read_time_series(file: str | Path) -> list:
    """Read txt file containing space-delimited sequences of pitch numbers or rest tokens or hold tokens"""
    with open(file) as f:
        return f.read().split(" ")


def read_event_sequence(file: str | Path) -> list:
    """Read txt file containing newline-delimited sequences of (pitch, duration, offset)"""
    with open(file) as f:
        data = []
        for row in csv.reader(f):
            pitch, duration, offset = row
            if duration.find("/"):
                duration = Fraction(duration)
            if offset.find("/"):
                offset = Fraction(offset)
            data.append((int(pitch), float(duration), float(offset)))
    return data


# Model and windowing hyperparameters shared by the cells below.
INPUT_SIZE = 3
HIDDEN_SIZE = 64
OUTPUT_SIZE = 3
SEQUENCE_LENGTH = 16

path_to_dataset_txt = Path("data/event_sequence")  # 3696777 notes in 1276 files
# `glob` returns a lazy iterator, and `make_data_loader` is a generator, so
# both can be consumed only once.
files = path_to_dataset_txt.glob("*.txt")
data_loader = make_data_loader(files, read_event_sequence, sequence_length=SEQUENCE_LENGTH)

In [110]:
# Draw one (context window, next event) pair from the generator. This consumes
# the sample, so later iteration over `data_loader` will not see it again.
inputs, outputs = next(data_loader)
# Bare expression so the notebook displays the pair.
inputs, outputs

([(33, 0.5, 4.333333333333334),
  (48, 0.3333333333333333, 0.0),
  (46, 0.3333333333333333, 0.16666666666666666),
  (46, 0.25, 1.3333333333333333),
  (46, 0.25, 0.25),
  (33, 0.25, 0.0),
  (46, 0.3333333333333333, 0.0),
  (46, 0.25, 0.5),
  (33, 0.25, 0.0),
  (33, 0.25, 2.1666666666666665),
  (39, 0.25, 0.0),
  (33, 0.25, 0.0),
  (48, 0.25, 0.0),
  (33, 0.3333333333333333, 0.0),
  (51, 0.25, 0.0),
  (57, 0.5, 0.0)],
 (46, 0.25, 0.4166666666666667))

In [118]:
# Build the network with the sizes defined next to the data loader.
model = MelodyLSTM(
    input_size=INPUT_SIZE, hidden_size=HIDDEN_SIZE, output_size=OUTPUT_SIZE
)
# Shape check on the sample drawn earlier. Unlike `train_model`, these tensors
# are not passed through `scale_pitch`.
input_tensor = torch.tensor(inputs, dtype=torch.float32)
output_tensor = torch.tensor(outputs, dtype=torch.float32)
prediction = model(input_tensor)
# `prediction[-1]` is the row that `train_model` compares against the target.
input_tensor.shape, output_tensor.shape, prediction.shape, prediction[-1].shape

(torch.Size([16, 3]), torch.Size([3]), torch.Size([16, 3]), torch.Size([3]))

In [None]:
from torch import optim

# Mean-squared error between the predicted and the actual next event.
loss_fn = nn.MSELoss()
# Plain SGD over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=0.1)
# loss_fn(output_tensor, model(input_tensor)[-1])

# `data_loader` is a generator, so it supports a single pass; hence one epoch.
# `train_model` returns the object it was given, so `model_` is `model`.
model_ = train_model(
    model=model,
    train_loader=data_loader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    num_epochs=1,
)

In [10]:
from datetime import datetime

# Name of the dataset, embedded in the file name below.
dataset = "event_sequence"
# Only builds and displays a timestamped file name; nothing is written here.
# NOTE(review): the ISO timestamp contains ':' characters, which some file
# systems (e.g. on Windows) reject in file names; confirm the target platform.
f"model_{datetime.now().isoformat(timespec='seconds')}_{dataset}.pkl"

'model_2024-10-07T19:46:21_event_sequence.pkl'