In [1]:
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


In [2]:
# Directory containing JSON files
json_dir = '3_processed_feature_limited/data'

In [3]:
# Load dataset from JSON files
def load_dataset_from_json(json_dir):
    sequences = []
    for filename in os.listdir(json_dir):
        if filename.endswith('.json'):
            filepath = os.path.join(json_dir, filename)
            with open(filepath, 'r') as f:
                data = json.load(f)
                for instrument, notes in data['instruments'].items():
                    sequence = ''.join(map(str, notes))
                    sequences.append(sequence)
    return sequences

In [4]:
# Load sequences from JSON files
sequences = load_dataset_from_json(json_dir)
print(f"sequences data: {len(sequences)} samples")
print(f"sequences data: {sequences[:4]} values")

sequences data: 1113 samples
sequences data: ['1232456789391011121110131413151110162717182192202116222324252611272815282979202203031216571410321233153424352614192031363716383940413142222712322643224244271832452013184618331933181233311471047203248411720301632493450195051525349545535112028414618331933181233311472647203217181720301632375657405758525349545960616263646566673468676970717273747576777874347978807481826383848586878889909192729374349490959697759895991001011021633103361044110510642107108182927491091061110182711111211371141151630161083111652315011711843119120121122411231033612441105106323112518291261271091061281101827112115712911213016301833318752193811713175951321211331191346070621358313613789751381317713992721407453949095969753141721429714362144145146147148149531501511521398278153973415477721551564315718158301591528', '112345678910111213148115161718671920162118672289231718242522926175272881223175292819910301224253132333435363738714394014414243811512211819444438454647484950513336

In [6]:
# Create a vocabulary of unique characters (digits) in the sequences
vocab = sorted(set(''.join(sequences)))  # Extract unique digits from sequences
vocab_size = len(vocab)
print(f"vocab data: {len(vocab)} samples")
print(f"vocab data: {vocab} values")

vocab data: 10 samples
vocab data: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] values


In [10]:

# Create a mapping from characters to indices and vice versa
char_to_idx = {ch: i for i, ch in enumerate(vocab)}
idx_to_char = {i: ch for i, ch in enumerate(vocab)}

In [11]:
# Hyperparameters for the model and training
batch_size = 32  # Batch size
num_epochs = 5  # Number of epochs for training
learning_rate = 0.001
embedding_size = 32
n_head = 2
d_model = 32
n_layer = 2

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use GPU if available

In [13]:
# Dataset class to convert sequences to tensor indices
class SequenceDataset(Dataset):
    def __init__(self, sequences, char_to_idx, sequence_length):
        self.sequences = sequences
        self.char_to_idx = char_to_idx
        self.sequence_length = sequence_length
        self.data = []
        self.labels = []
        self.prepare_data()

    # Convert sequences into pairs of (input, target) tensors
    def prepare_data(self):
        for seq in self.sequences:
            idx_seq = [self.char_to_idx[ch] for ch in seq]
            for i in range(len(idx_seq) - self.sequence_length):
                self.data.append(idx_seq[i:i+self.sequence_length])
                self.labels.append(idx_seq[i+1:i+self.sequence_length+1])
        print(f'Prepared {len(self.data)} data points')  # Debugging line

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx]), torch.tensor(self.labels[idx])

In [14]:
# Define the Transformer-XL model
class TransformerXL(nn.Module):
    def __init__(self, vocab_size, d_model, n_head, n_layer):
        super(TransformerXL, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer = nn.Transformer(d_model, nhead=n_head, num_encoder_layers=n_layer)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embedding(x)  # Convert indices to embeddings
        x = self.transformer(x, x)  # Apply Transformer layers
        x = self.fc(x)  # Project to the size of the vocabulary
        return x

In [15]:
# Define the sequence length
sequence_length = 10


In [16]:
# Create the dataset and data loader
dataset = SequenceDataset(sequences, char_to_idx, sequence_length)
print(f'Dataset length: {len(dataset)}')

data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

Prepared 732294 data points
Dataset length: 732294


In [17]:
# Initialize the model, loss function, and optimizer
model = TransformerXL(vocab_size, d_model, n_head, n_layer).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [18]:
# Training loop
for epoch in range(num_epochs):
    print(f'Starting epoch {epoch+1}/{num_epochs}')
    for batch_idx, (inputs, targets) in enumerate(data_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()

        if (batch_idx + 1) % 2 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(data_loader)}], Loss: {loss.item():.4f}')

    print(f'Epoch [{epoch+1}/{num_epochs}] completed')


Starting epoch 1/5
Epoch [1/5], Batch [2/22885], Loss: 2.5922
Epoch [1/5], Batch [4/22885], Loss: 2.2900
Epoch [1/5], Batch [6/22885], Loss: 2.2975
Epoch [1/5], Batch [8/22885], Loss: 2.2582
Epoch [1/5], Batch [10/22885], Loss: 2.3105
Epoch [1/5], Batch [12/22885], Loss: 2.2985
Epoch [1/5], Batch [14/22885], Loss: 2.2927
Epoch [1/5], Batch [16/22885], Loss: 2.2865
Epoch [1/5], Batch [18/22885], Loss: 2.3008
Epoch [1/5], Batch [20/22885], Loss: 2.2836
Epoch [1/5], Batch [22/22885], Loss: 2.2761
Epoch [1/5], Batch [24/22885], Loss: 2.2866
Epoch [1/5], Batch [26/22885], Loss: 2.3058
Epoch [1/5], Batch [28/22885], Loss: 2.2930
Epoch [1/5], Batch [30/22885], Loss: 2.2977
Epoch [1/5], Batch [32/22885], Loss: 2.3185
Epoch [1/5], Batch [34/22885], Loss: 2.3038
Epoch [1/5], Batch [36/22885], Loss: 2.2904
Epoch [1/5], Batch [38/22885], Loss: 2.2952
Epoch [1/5], Batch [40/22885], Loss: 2.2773
Epoch [1/5], Batch [42/22885], Loss: 2.3248
Epoch [1/5], Batch [44/22885], Loss: 2.2813
Epoch [1/5], Batc

In [19]:
# Function to generate sequences using the trained model
def generate_sequence(model, start_seq, char_to_idx, idx_to_char, length):
    model.eval()
    input_seq = torch.tensor([[char_to_idx[ch] for ch in start_seq]]).to(device)
    generated_seq = start_seq
    for _ in range(length):
        with torch.no_grad():
            output = model(input_seq)
            next_char_idx = torch.argmax(output[0, -1]).item()
            next_char = idx_to_char[next_char_idx]
            generated_seq += next_char
            input_seq = torch.cat((input_seq, torch.tensor([[next_char_idx]]).to(device)), dim=1)
    return generated_seq

In [29]:
# Generate a new sequence starting with '0123456789'
start_sequence = '0123456789'
generated_sequence = generate_sequence(model, start_sequence, char_to_idx, idx_to_char, 20)
print(f'Generated sequence: {generated_sequence}')

Generated sequence: 012345678911111111111111111111
