In [None]:
# Root of the project folder inside the mounted Google Drive. It is joined with
# FOLDERNAME to extend sys.path and is passed as `base_dir` to the data helpers.
# TODO: Point this at the location of your own copy of the project.
BASE_DIR = '/content/drive/MyDrive/cs260-final-project'

In [None]:
import sys

# Mount Google Drive into the Colab VM so the project files are reachable
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# TODO: Enter the foldername in your Drive where you have saved the unzipped
# assignment folder, e.g. 'cs260/assignments/assignment6/'
FOLDERNAME = 'src/vanilla-rnn'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Make the project's Python files importable from the mounted Drive.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate entries on sys.path.
module_dir = '{}/{}'.format(BASE_DIR, FOLDERNAME)
if module_dir not in sys.path:
    sys.path.append(module_dir)

# Song Generation with RNNs
Import all of the necessary functions to use here.

In [None]:
# Setup cell.
import time, os, json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

from cs260.rnn_layers import *
from cs260.lyric_solver import CaptioningSolver
from cs260.classifiers.rnn import CaptioningRNN
from cs260.lyric_utils import load_lyric_data, decode_captions, sample_lyric_minibatch, write_lyric_data

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # Set default size of plots.
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2

def rel_error(x, y):
    """Return the largest element-wise relative error between x and y.

    Each entry's error is |x - y| / (|x| + |y|); the denominator is floored at
    1e-8 so that entries where both inputs are zero do not divide by zero.
    """
    abs_diff = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)

# Lyric Dataset

**Features.** Includes the Artist and the Topic

**Lyrics.** Includes the tokenized version of the lyrics

**Tokens.** There are a couple special tokens that we add to the vocabulary, and we have taken care of all implementation details around special tokens for you. We prepend a special `<START>` token and append an `<END>` token to the beginning and end of each lyric respectively. Rare words are replaced with a special `<UNK>` token (for "unknown"). In addition, since we want to train with minibatches containing lyrics of different lengths, we pad short lyrics with a special `<NULL>` token after the `<END>` token and don't compute loss or gradient for `<NULL>` tokens.

**NOTE:** For first-time users, run all three cells and ensure the following cells are working. This creates an h5 file so that our training data will be ready to be fed to the RNN.

In [None]:
# Topic count embedded in the dataset file names (big-lda-*-{num_topics}).
num_topics = 10

In [None]:
# Input CSVs and output h5 file for the configured topic count.
# NOTE(review): these look like paths relative to `base_dir` - confirm in
# write_lyric_data.
train_file = f'data/train/big-lda-train-{num_topics}.csv'
val_file = f'data/val/big-lda-val-{num_topics}.csv'
h5_file = f'data/h5/big-lda-{num_topics}.h5'

# Tokenize the train/val lyrics and store the result in the h5 file.
write_lyric_data(
    train_file,
    val_file,
    h5_file,
    base_dir=BASE_DIR,
)

base dir  /content/drive/MyDrive/cs260-final-project
(1/6) Extracting training data...
(2/6) Extracting validation data...
(3/6) Extracting tokenizer...
(4/6) Tokenizing training lyrics...
(5/6) Tokenizing validation lyrics...
(6/6) Storing data into h5 file data/h5/big-lda-10.h5...
Done!


In [None]:
# Load the dataset for the configured topic count. Using `num_topics` here
# (instead of a literal 10) keeps this cell in sync with the file names built
# from `num_topics` when the h5 file was written.
data = load_lyric_data(num_topics, base_dir=BASE_DIR)  # num_topics, base_dir, max_train=None

# Print a one-line summary of every entry in the data dictionary:
# shape and dtype for arrays, length for everything else.
for key, value in data.items():
    if isinstance(value, np.ndarray):
        print(key, type(value), value.shape, value.dtype)
    else:
        print(key, type(value), len(value))

base dir  /content/drive/MyDrive/cs260-final-project
train_artist <class 'numpy.ndarray'> (178712,) object
train_topic_id <class 'numpy.ndarray'> (178712,) int64
train_lyric <class 'numpy.ndarray'> (178712, 512) int64
val_artist <class 'numpy.ndarray'> (26254,) object
val_topic_id <class 'numpy.ndarray'> (26254,) int64
val_lyric <class 'numpy.ndarray'> (26254, 512) int64
idx_to_word <class 'list'> 241489
word_to_index <class 'dict'> 241489
idx_to_artist <class 'list'> 6921
artist_to_index <class 'dict'> 6921
train_features <class 'numpy.ndarray'> (178712, 1001) float64
val_features <class 'numpy.ndarray'> (26254, 1001) float64


# Lyric RNN Model on Small Data
Run this cell to fit the vanilla RNN model to a small subset of the lyric data.

**WARNING:** Running this cell may take up all of your RAM, depending on your Colab plan.

In [None]:
# Fix the NumPy seed so the run is repeatable.
np.random.seed(260)

# Load a small training subset (max_train=50) for the configured topic count.
# `num_topics` replaces the previous literal 10 so this cell follows the
# notebook-wide setting.
small_data = load_lyric_data(num_topics, base_dir=BASE_DIR, max_train=50)

# Vanilla RNN whose input size matches the width of the feature vectors.
small_rnn_model = CaptioningRNN(
    cell_type='rnn',
    word_to_idx=small_data['word_to_index'],
    input_dim=small_data['train_features'].shape[1],
    hidden_dim=512,
    wordvec_dim=256,
)

# NOTE(review): the trailing "#50" / "#25" look like the intended full-run
# values for num_epochs / batch_size, reduced here (presumably to limit RAM
# use) - confirm before relying on the results.
small_rnn_solver = CaptioningSolver(
    small_rnn_model, small_data, # see how the batch is constructed
    update_rule='adam',
    num_epochs=1, #50
    batch_size=1, #25
    val_batch_size=5,
    optim_config={
     'learning_rate': 5e-3,
    },
    lr_decay=0.95,
    verbose=True, print_every=10,
)

small_rnn_solver.train()

# Plot the training losses.
plt.plot(small_rnn_solver.loss_history)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Training loss history')
plt.show()

base dir  /content/drive/MyDrive/cs260-final-project
(Iteration 1 / 50) training loss: 3815.593388, validation loss: 940.276811


In [None]:
# Report the last recorded training loss.
final_loss = small_rnn_solver.loss_history[-1]
print('Final loss: ', final_loss)

# RNN Sampling at Test Time
Unlike classification models, sequence-generation models such as this lyric model behave very differently at training time vs. at test time. At training time, we have access to the ground-truth lyric, so we feed ground-truth words as input to the RNN at each timestep. At test time, we sample from the distribution over the vocabulary at each timestep and feed the sample as input to the RNN at the next timestep.

In the file `cs260/classifiers/rnn.py`, implement the `sample` method for test-time sampling. After doing so, run the following to sample from your overfitted model on both training and validation data. The samples on training data should be very good. The samples on validation data, however, probably won't make sense.

In [None]:
# Sample lyrics from the small model and compare them with the ground truth.
# The vocabulary is taken from `small_data` - the same dictionary the model
# was built from - rather than from the separately loaded `data`, so this cell
# only depends on the training cell.
# NOTE(review): assumes `small_data` exposes 'idx_to_word' like the full
# dataset does - confirm in load_lyric_data.
for split in ['train', 'val']:
    gt_lyrics, features = sample_lyric_minibatch(small_data, split=split, batch_size=2)
    gt_lyrics = decode_captions(gt_lyrics, small_data['idx_to_word'])

    sample_lyrics = small_rnn_model.sample(features)
    sample_lyrics = decode_captions(sample_lyrics, small_data['idx_to_word'])

    # Lyrics are plain text, so print them instead of using them as the title
    # of an otherwise empty figure.
    for gt_lyric, sample_lyric in zip(gt_lyrics, sample_lyrics):
        print('%s\n%s\nGT:%s' % (split, sample_lyric, gt_lyric))
        print()