# Music Generation using RNN

#### Training an RNN for music generation using PyTorch. The songs are written in ABC notation; because ABC is a plain-text format, it is well suited as training data for a neural network.

### 1.1 Dependencies

In [1]:
# COMET ML to track model experiments
import comet_ml

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim

# Remaining imports
import numpy as np
import os
import time
import functools
from IPython.display import display
from tqdm import tqdm
from utils import *
from dotenv import load_dotenv

### 1.2 COMET API KEY

In [None]:
# Populate the environment via python-dotenv, then read the Comet key from it
load_dotenv()
comet_api_key = os.environ.get('COMET_API_KEY')

# Fail fast when a prerequisite for the rest of the notebook is missing
assert comet_api_key is not None, "Please set COMET_API_KEY in .env file"
# TODO: Need to setup CUDA
assert torch.cuda.is_available(), "CUDA not available"

### 1.3 Dataset

In [None]:
# Fetch the list of songs (load_data presumably comes from the utils star-import)
s = load_data()

# Show the first song as raw text
example = s[0]
print("Example of a sample:", example, sep="\n")

play_song(example)

In [None]:
# Concatenate every song into one corpus string, separated by a blank line
all_lyrics = "\n\n".join(s)

# The vocabulary is the sorted list of distinct characters in the corpus
unique_chars = set(all_lyrics)
vocab = sorted(unique_chars)

print(f"Unique characters in the dataset: {len(vocab)}")


#### 1.3.1 Vectorize text

In [12]:
# Lookup table: character -> integer index (its position in the sorted vocab)
char_to_index = dict(zip(vocab, range(len(vocab))))

# Reverse lookups: index -> character, once as a dict and once as a NumPy array
index_to_char = dict(enumerate(vocab))
idx2char = np.array(vocab)

In [None]:
# Preview at most the first 50 entries of the character -> index mapping
preview = list(char_to_index.items())[:50]
print('{')
for char, index in preview:
    print('  {:4s}: {:3d},'.format(repr(char), index))
print('  ...\n}')

In [21]:
### Vectorize the songs string ###

def vectorize_string(string):
    """Convert a string into a NumPy array of vocabulary indices.

    Every character is looked up in the module-level `char_to_index` mapping.

    Args:
        string: Text whose characters all appear in `char_to_index`.

    Returns:
        A 1-D integer np.ndarray with `N` elements, where `N` is the number
        of characters in the input string.

    Raises:
        KeyError: If `string` contains a character that is not in
            `char_to_index`.
    """
    # The explicit dtype keeps the result integer-typed even for an empty
    # string; without it NumPy turns an empty list into a float64 array.
    return np.array([char_to_index[c] for c in string], dtype=np.int_)
  

# Encode the whole corpus once, then sanity-check the container type
vectorized_songs = vectorize_string(all_lyrics)
assert isinstance(vectorized_songs, np.ndarray), "returned result should be a np array"

In [None]:
# Show a 30-character slice of the corpus next to its integer encoding
sample_text = repr(all_lyrics[20:50])
sample_ids = vectorized_songs[20:50]
print ('{} ---- characters mapped to int ----> {}'.format(sample_text, sample_ids))

In [None]:
# Display the shape of the encoded corpus array
vectorized_songs.shape

#### 1.3.2 Function to create batches of data based on sequence length

In [None]:
### Batch definition to create training examples ###

def get_batch(vectorized_songs, seq_length, batch_size):
  """Sample a random batch of (input, target) sequences for next-character training.

  Each example is a window of `seq_length` consecutive entries; its target is
  the same window shifted one position to the right.

  Args:
    vectorized_songs: 1-D array-like of integer character indices.
    seq_length: Number of timesteps in every sequence.
    batch_size: Number of sequences to draw (sampled with replacement).

  Returns:
    Tuple `(x_batch, y_batch)` of `torch.long` tensors, each of shape
    `(batch_size, seq_length)`.

  Raises:
    ValueError: If the corpus has fewer than `seq_length + 1` entries, so no
      window with a shifted target fits.
  """
  songs = np.asarray(vectorized_songs)
  total = songs.shape[0]

  # A window starting at i reads songs[i : i + seq_length + 1] (input plus the
  # one-step-shifted target), so valid starts are 0 .. total - seq_length - 1.
  num_starts = total - seq_length
  if num_starts < 1:
    raise ValueError(
        f"need at least seq_length + 1 = {seq_length + 1} characters, got {total}"
    )

  # randomly choose the starting indices for the examples in the training batch
  idx = np.random.choice(num_starts, batch_size)

  # Gather every window with a single fancy-indexing operation instead of
  # building a Python list of slices: window[b, t] == idx[b] + t.
  window = idx[:, None] + np.arange(seq_length)

  # x_batch, y_batch provide the true inputs and targets for network training
  x_batch = torch.tensor(songs[window], dtype=torch.long)
  y_batch = torch.tensor(songs[window + 1], dtype=torch.long)

  return x_batch, y_batch

# Smoke test: draw 2 sequences of length 10 and verify the batch shapes
args = (vectorized_songs, 10, 2)
x_batch, y_batch = get_batch(*args)
print("Input: ", x_batch)
print("Target: ", y_batch)

# Batches are laid out as (batch_size, seq_length)
expected_shape = (args[2], args[1])
assert x_batch.shape == expected_shape, "Incorrect batch shape"
assert y_batch.shape == expected_shape, "Incorrect batch shape"
print("Passed!")

In [65]:
# Draw a single 5-step sequence and show each input / expected-output pair
x_batch, y_batch = get_batch(vectorized_songs, seq_length=5, batch_size=1)

inputs, targets = x_batch[0], y_batch[0]
for i in range(min(len(inputs), len(targets))):
    input_idx, target_idx = inputs[i], targets[i]
    print(f"Step {i}")
    print(f"   input: {input_idx} ({repr(idx2char[input_idx.item()])})")
    print(f"   expected output: {target_idx} ({repr(idx2char[target_idx.item()])})")

Step 0
   input: 14 ('2')
   expected output: 26 ('A')
Step 1
   input: 26 ('A')
   expected output: 14 ('2')
Step 2
   input: 14 ('2')
   expected output: 82 ('|')
Step 3
   input: 82 ('|')
   expected output: 27 ('B')
Step 4
   input: 27 ('B')
   expected output: 26 ('A')


### 1.4 RNN model

#### 1.4.1 Define the model

In [68]:
#### Defining the RNN ####

class LSTMModel(nn.Module):
  """Character-level sequence model: Embedding -> single-layer LSTM -> Linear.

  Args:
    vocab_size: Number of distinct tokens; also the size of the output layer.
    embedding_dim: Size of the dense vector each token index is mapped to.
    hidden_size: Number of hidden units in the LSTM.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_size):
    super().__init__()
    self.hidden_size = hidden_size

    # Layer 1: Embedding layer to transform indices into dense vectors
    # of fixed size (embedding_dim)
    self.embedding = nn.Embedding(vocab_size, embedding_dim)

    # Layer 2: LSTM with `hidden_size` number of hidden units
    # (batch_first=True, so tensors are laid out as (batch, seq, feature))
    self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)

    # Layer 3: Linear layer (fully connected layer) that maps the LSTM layer's output
    # to the number of characters we have in our vocabulary
    self.linear = nn.Linear(hidden_size, vocab_size)

  # NOTE: init_hidden and forward must be defined at class level. When nested
  # inside __init__ they are only local functions, so nn.Module never sees
  # `forward` and calling the model fails.
  def init_hidden(self, batch_size, device):
    """Return zero-filled (hidden, cell) states for a fresh sequence.

    Args:
      batch_size: Number of sequences in the batch.
      device: Device on which to allocate the state tensors.

    Returns:
      Tuple of two tensors, each of shape `(1, batch_size, hidden_size)`.
    """
    shape = (1, batch_size, self.hidden_size)
    return (torch.zeros(shape, device=device),
            torch.zeros(shape, device=device))

  def forward(self, x, state=None, return_state=False):
    """Compute next-token logits for a batch of index sequences.

    Args:
      x: Integer tensor of token indices, shape `(batch, seq)`.
      state: Optional `(hidden, cell)` tuple to continue from; when None a
        zero state is created on `x`'s device.
      return_state: If True, also return the final LSTM state.

    Returns:
      Logits of shape `(batch, seq, vocab_size)`, or the tuple
      `(logits, state)` when `return_state` is True.
    """
    x = self.embedding(x)

    if state is None:
      state = self.init_hidden(x.size(0), x.device)
    out, state = self.lstm(x, state)

    out = self.linear(out)
    return (out, state) if return_state else out