In [1]:
import os
import numpy as np
import pickle
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Load the tokenizers if they were already saved; otherwise skip the next 3 cells

In [2]:
def load_tokenizer(file):
    """Read a pickled tokenizer back from disk.

    The file must contain a pickled mapping that stores the tokenizer
    under the ``'tokenizer'`` key.

    Args:
        file: Path of the pickle file to open.

    Returns:
        The object stored under the ``'tokenizer'`` key.

    Note:
        Unpickling can execute arbitrary code, so only load trusted files.
    """
    with open(file, 'rb') as handle:
        payload = pickle.load(handle)
    return payload['tokenizer']

In [3]:
# Restore the four previously saved tokenizers (lyrics, notes, durations, rests).
(
    tokenizer_lyr,
    tokenizer_note,
    tokenizer_duration,
    tokenizer_rest,
) = map(
    load_tokenizer,
    (
        "tokenizers/tokenizer_lyr.pkl",
        "tokenizers/tokenizer_note.pkl",
        "tokenizers/tokenizer_duration.pkl",
        "tokenizers/tokenizer_rest.pkl",
    ),
)

In [4]:
# Size of each musical vocabulary: number of indexed tokens plus one
# (presumably to account for the reserved 0 index — confirm).
notes_size, durations_size, rests_size = (
    len(tok.word_index) + 1
    for tok in (tokenizer_note, tokenizer_duration, tokenizer_rest)
)

notes_size, durations_size, rests_size

(110, 22, 11)

In [5]:
# Fix TensorFlow's global random seed so repeated runs start from the same state.
tf.random.set_seed(42)

In [6]:
# Directory whose entries are loaded one by one as per-song arrays.
songs_path = './lmd-full_MIDI_dataset/Syllable_Parsing'
# os.listdir returns entries in arbitrary, platform-dependent order. Sorting
# makes the row order of everything derived from `songs` reproducible.
# NOTE(review): artifacts saved from an earlier unsorted run may use a
# different song order / token ids — regenerate them together.
songs = sorted(os.listdir(songs_path))

In [7]:
# Maximum number of tokens kept per song; the slicing and padding cells use it.
max_len = 100

In [8]:
# Load every song file into memory, in the order given by `songs`.
# allow_pickle=True lets np.load read object arrays; unpickling can execute
# arbitrary code, so the dataset directory must be trusted.
features = [
    np.load(f"{songs_path}/{song}", allow_pickle=True)
    for song in tqdm(songs)
]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7597/7597 [00:08<00:00, 910.19it/s]


In [9]:
# Build the combined array once and slice it twice, instead of converting the
# whole feature list to an array separately for each column.
features_arr = np.array(features)
# Index 2 of the last axis feeds the lyric pipeline, index 1 the MIDI pipeline.
lyrics = features_arr[:, :, 2]
midis = features_arr[:, :, 1]

In [10]:
def _bos_eos(tokens):
    """Join string tokens with single spaces and wrap them in BOS/EOS markers."""
    return "BOS " + " ".join(tokens) + " EOS"


# One "BOS ... EOS" sentence per song, keeping at most max_len lyric tokens.
lyrics_list = [_bos_eos(lyric[0][:max_len]) for lyric in tqdm(lyrics)]

notes_list, durations_list, rests_list = [], [], []
for midi in tqdm(midis):
    # Convert the song's MIDI rows to an array once and truncate it once,
    # rather than rebuilding the array for every column.
    midi_arr = np.array(midi[0])[:max_len]

    # Columns 0, 1 and 2 feed the note, duration and rest streams respectively.
    notes_list.append(_bos_eos(map(str, midi_arr[:, 0])))
    durations_list.append(_bos_eos(map(str, midi_arr[:, 1])))
    rests_list.append(_bos_eos(map(str, midi_arr[:, 2])))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 7597/7597 [00:00<00:00, 158795.82it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7597/7597 [00:04<00:00, 1659.38it/s]


In [None]:
def _fit_and_pad(tokenizer, texts, maxlen):
    """Fit ``tokenizer`` on ``texts`` and return fixed-length id sequences.

    Args:
        tokenizer: A Keras ``Tokenizer`` to fit in place.
        texts: List of "BOS ... EOS" strings to fit on and encode.
        maxlen: Length every returned row is padded or truncated to.

    Returns:
        Tuple ``(sequences, pad_id, start_id)``. ``pad_id`` is the id of the
        "eos" token, which is also used as the padding value; ``start_id`` is
        the id of the "bos" token.

    Raises:
        KeyError: If "eos" or "bos" is missing from the fitted vocabulary.
    """
    tokenizer.fit_on_texts(texts)
    pad_id = tokenizer.word_index["eos"]
    start_id = tokenizer.word_index["bos"]
    sequences = pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=maxlen,
        truncating='post',
        padding='post',
        value=pad_id,
    )
    return sequences, pad_id, start_id


# All four streams share one fixed length (max_len tokens + BOS + EOS).
# Previously only the lyric stream passed maxlen; the others were padded to
# the longest row present, so their widths matched the lyric width only when
# the data happened to contain a full-length song.
seq_len = max_len + 2

# create lyric sequences
# NOTE(review): this tokenizer keeps Keras' default punctuation filters, so a
# lyric token made of / containing punctuation may yield zero or several ids
# and shift lyrics relative to the note streams — confirm against the data.
tokenizer_lyr = Tokenizer(num_words=10000, oov_token="oov")
sequences_lyr, pad_id_lyr, start_id_lyr = _fit_and_pad(tokenizer_lyr, lyrics_list, seq_len)

# create note sequences (filters="" keeps every whitespace-separated token)
tokenizer_note = Tokenizer(filters="")
sequences_note, pad_id_note, start_id_note = _fit_and_pad(tokenizer_note, notes_list, seq_len)

# create duration sequences
tokenizer_duration = Tokenizer(filters="")
sequences_duration, pad_id_duration, start_id_duration = _fit_and_pad(
    tokenizer_duration, durations_list, seq_len
)

# create rest sequences
tokenizer_rest = Tokenizer(filters="")
sequences_rests, pad_id_rest, start_id_rest = _fit_and_pad(tokenizer_rest, rests_list, seq_len)

In [None]:
# Give every stream an extra axis at position 2 so they can be joined there.
sequences_lyr, sequences_note, sequences_duration, sequences_rests = (
    np.expand_dims(stream, axis=2)
    for stream in (sequences_lyr, sequences_note, sequences_duration, sequences_rests)
)

# Channel order along the last axis: lyric, note, duration, rest.
sequences = np.concatenate(
    (sequences_lyr, sequences_note, sequences_duration, sequences_rests),
    axis=2,
)
sequences.shape

In [None]:
# Make sure the output folders exist. exist_ok=True makes this safe to re-run
# and avoids the check-then-create race of os.path.exists + os.mkdir; it also
# fails right here if one of the names is taken by a regular file.
for out_dir in ("tokenizers", "data"):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Write the combined sequences array to disk in NumPy's .npy format.
np.save("data/sequences.npy", sequences)

In [None]:
def save_tokenizer(file, tokenizer):
    """Pickle a tokenizer to disk.

    The tokenizer is wrapped in a dict under the ``'tokenizer'`` key before
    being written, and any existing file at ``file`` is overwritten.

    Args:
        file: Destination path of the pickle file.
        tokenizer: Object to store.
    """
    payload = {'tokenizer': tokenizer}
    with open(file, 'wb') as out:
        pickle.dump(payload, out)

In [None]:
# Write each fitted tokenizer to its own pickle file under tokenizers/.
for _tok_path, _tok in (
    ("tokenizers/tokenizer_lyr.pkl", tokenizer_lyr),
    ("tokenizers/tokenizer_note.pkl", tokenizer_note),
    ("tokenizers/tokenizer_duration.pkl", tokenizer_duration),
    ("tokenizers/tokenizer_rest.pkl", tokenizer_rest),
):
    save_tokenizer(_tok_path, _tok)