# Project Name: Lyric Generator
Description:
Implementing a Deep Neural network using LSTMs to create a character based lyric generator

Details:
1. step size: 40
2. batch size: 128
3. Epochs: 100
4. Songs: 1000

### Importing Libraries

In [1]:
import keras
import tensorflow as tf
from keras.models import Sequential
from keras.models import model_from_json
from keras.layers import Activation,LSTM,Dense,CuDNNLSTM, Flatten, Bidirectional, Dropout
from keras.optimizers import Adam
import pandas as pd
import numpy as np
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
import os
import matplotlib.pyplot as plt
import re
np.random.seed(10)

Using TensorFlow backend.


### Design Parameters

In [3]:
BATCH_SIZE = 32
maxlen = 50 ##timesteps
epochs = 10
MIN_WORD_FREQUENCY = 10
song_count = 1000

In [4]:
def load_lyrics(path):
    '''
    Function to load lyrics of all the artists in the input path
    '''
    lyrics = ""
    for fn in os.listdir(path):
        with open(os.path.join(path, fn), 'r') as song:
            song_lyrics = clean_string(song.read())
            lyrics += song_lyrics
    return lyrics

def clean_string(string):
    """
    Cleans unwanted characters and words from string.
    @param string: The string to be cleaned.
    @return: The cleaned string.
    """
    string = string.lower()  # lowercase

    clean_words = []
    for word in string.split():
#         if(word and (word[0] == "[" and word[-1] == "]")\
#            or (word[0] == "(" and word[-1] == ")")):
#             continue
        # clean words with quotation marks on only one side
        if word[0] == '"' and word[-1] != '"':
            word = word[1:]
        elif word[-1] == '"' and word[0] != '"':
            word = word[-1]

        # clean words with parenthases on only one side
        if word[0] == '(' and word[-1] != ')':
            word = word[1:]
        elif word[-1] == ')' and word[0] != '(':
            word = word[:-1]

        clean_words.append(word)
    return ' '.join(clean_words)

def clean_array(vec):
    """
    Cleans unwanted characters and words from string.
    @param string: The string to be cleaned.
    @return: The cleaned string.
    """
    clean_words = []
    for word in vec:
        if(word and (word[0] == "[" and word[-1] == "]")\
           or (word[0] == "(" and word[-1] == ")")):
            continue
        # clean words with quotation marks on only one side
        if word[0] == '"' and word[-1] != '"':
            word = word[1:]
        elif word[-1] == '"' and word[0] != '"':
            word = word[-1]

        # clean words with parenthases on only one side
        if word[0] == '(' and word[-1] != ')':
            word = word[1:]
        elif word[-1] == ')' and word[0] != '(':
            word = word[:-1]

        clean_words.append(word)
    return ' '.join(clean_words)

### Reading Input
Parameters: 10 songs

In [5]:
## Reading the scraped Rap songs
# text = load_lyrics("./RapLyrics-Scraper/my_lyrics_folder/")

In [6]:
## Reading the kaggle input ~55k songs
df=pd.read_csv('../songdata.csv')['text'][:song_count]
data=np.array(df)

In [7]:
## Reading the scraped pink floyed songs
# df=pd.read_csv('./pink_floyd_lyrics.csv',header=0, error_bad_lines=False, delimiter='\t')['text']
# df = df.fillna('')
# data=np.array(df)

In [8]:
data[0]

"Look at her face, it's a wonderful face  \nAnd it means something special to me  \nLook at the way that she smiles when she sees me  \nHow lucky can one fellow be?  \n  \nShe's just my kind of girl, she makes me feel fine  \nWho could ever believe that she could be mine?  \nShe's just my kind of girl, without her I'm blue  \nAnd if she ever leaves me what could I do, what could I do?  \n  \nAnd when we go for a walk in the park  \nAnd she holds me and squeezes my hand  \nWe'll go on walking for hours and talking  \nAbout all the things that we plan  \n  \nShe's just my kind of girl, she makes me feel fine  \nWho could ever believe that she could be mine?  \nShe's just my kind of girl, without her I'm blue  \nAnd if she ever leaves me what could I do, what could I do?\n\n"

In [9]:
data[2]

"I'll never know why I had to go  \nWhy I had to put up such a lousy rotten show  \nBoy, I was tough, packing all my stuff  \nSaying I don't need you anymore, I've had enough  \nAnd now, look at me standing here again 'cause I found out that  \nMa ma ma ma ma ma ma ma ma ma ma ma ma ma ma ma my life is here  \nGotta have you near  \n  \nAs good as new, my love for you  \nAnd keeping it that way is my intention  \nAs good as new and growing too  \nYes, I think it's taking on a new dimension  \nIt's as good as new, my love for you  \nJust like it used to be and even better  \nAs good as new, thank God it's true  \nDarling, we were always meant to stay together  \n  \nFeel like a creep, never felt so cheap  \nNever had a notion that my love could be so deep  \nHow could I make such a dumb mistake  \nNow I know I'm not entitled to another break  \nBut please, baby, I beg you to forgive 'cause I found out that  \nMa ma ma ma ma ma ma ma ma ma ma ma ma ma ma ma my life is here  \nGotta get y

### Creating corpus(all the characters in all the songs concatenated)
1. Converting all the characters to lower

In [10]:
text=''
for ix in range(len(data)):
    text+=data[ix]
# text = text.lower()
text = text.lower().replace('\n', ' \n ')
text = re.sub(" +" , " ", text)
print('Corpus length in characters:', len(text))
corpus = [w for w in text.split(' ') if w.strip() != '' or w == '\n'
          and (w[0] not in ["(","[" ] and w[-1] not in [")","]" ])]
while "" in corpus:
    corpus.remove("")
print('Corpus length in words:', len(corpus))

Corpus length in characters: 1167232
Corpus length in words: 258606


In [11]:
text[:1000]

"look at her face, it's a wonderful face \n and it means something special to me \n look at the way that she smiles when she sees me \n how lucky can one fellow be? \n \n she's just my kind of girl, she makes me feel fine \n who could ever believe that she could be mine? \n she's just my kind of girl, without her i'm blue \n and if she ever leaves me what could i do, what could i do? \n \n and when we go for a walk in the park \n and she holds me and squeezes my hand \n we'll go on walking for hours and talking \n about all the things that we plan \n \n she's just my kind of girl, she makes me feel fine \n who could ever believe that she could be mine? \n she's just my kind of girl, without her i'm blue \n and if she ever leaves me what could i do, what could i do? \n \n take it easy with me, please \n touch me gently like a summer evening breeze \n take your time, make it slow \n andante, andante \n just let the feeling grow \n \n make your fingers soft and light \n let your body be t

### Filtering vocabulary based on word frequency

In [12]:
word_freq = {}
for word in corpus:
    word_freq[word] = word_freq.get(word, 0) + 1

ignored_words = set()
for k, v in word_freq.items():
    if word_freq[k] < MIN_WORD_FREQUENCY:
        ignored_words.add(k)

In [13]:
vocab = set(corpus)
print('Unique words before ignoring:', len(vocab))
print('Ignoring words with frequency <', MIN_WORD_FREQUENCY)
vocab = sorted(set(vocab) - ignored_words)
print('Unique words after ignoring:', len(vocab))
# print_vocabulary(vocabulary, words)

Unique words before ignoring: 12864
Ignoring words with frequency < 10
Unique words after ignoring: 1830


### Creating Vocabulary and char, index mappings

In [14]:
word_ix={c:i for i,c in enumerate(vocab)}
ix_word={i:c for i,c in enumerate(vocab)}

### Filtering corpus based on new vocabulary

In [15]:
sentences = []
next_words = []
ignored = 0
for i in range(0, len(corpus) - maxlen):
    # Only add the sequences where no word is in ignored_words
    if len(set(corpus[i: i+maxlen+1]).intersection(ignored_words)) == 0:
        sentences.append(corpus[i: i + maxlen])
        next_words.append(corpus[i + maxlen])
    else:
        ignored = ignored + 1
print('Ignored sequences:', ignored)
print('Remaining sequences:', len(sentences))

Ignored sequences: 239047
Remaining sequences: 19509


### Creating the train and test datasets

In [16]:
split_count = int(0.8 * len(sentences))
sentences_test = sentences[split_count:]
next_words_test = next_words[split_count:]
sentences = sentences[:split_count]
next_words = next_words[:split_count]

### Check vocab size and corpus size

In [17]:
vocab_size=len(vocab) ##Dimentions of each char
print(vocab_size)

1830


In [18]:
len(corpus)

258606

In [19]:
def generator(sentence_list, next_word_list, batch_size):
    '''
    Generator function to generate the input/output data using
    generators concept(to avoid RAM overflow)
    '''
    index = 0
    while True:
        x = np.zeros((batch_size, maxlen, vocab_size), dtype=np.bool)
        y = np.zeros((batch_size, vocab_size), dtype=np.bool)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index]):
                x[i, t, word_ix[w]] = 1
            y[i, word_ix[next_word_list[index]]] = 1

            index = index + 1
            if index == len(sentence_list):
                index = 0
        yield x, y

In [20]:
def create_model(timesteps, vocab_size, no_layers=2,dropout=0.2):
    '''
    Creating the model
    '''
    model=Sequential()
    for i in range(no_layers):
        model.add(Bidirectional(CuDNNLSTM(128, return_sequences=True),input_shape=(timesteps,vocab_size)))
    model.add(Flatten())
#     model.add(Bidirectional(CuDNNLSTM(128), input_shape=(timesteps,vocab_size)))
    model.add(Dropout(dropout))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    model.summary()
    model.compile(optimizer=Adam(lr=0.01),loss='categorical_crossentropy')
    return model

In [21]:
model = create_model(maxlen, vocab_size)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 50, 256)           2007040   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 50, 256)           395264    
_________________________________________________________________
flatten_1 (Flatten)          (None, 12800)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 12800)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1830)              23425830  
_________________________________________________________________
activation_1 (Activat

In [22]:
def sample(preds, temperature=1.0):
    '''
    helper function to sample an index from a probability array
    '''
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def on_epoch_end(epoch, logs):
    '''
    Callback function to write output to file after each epoch
    '''
    # Function invoked at end of each epoch. Prints generated text.
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence
    seed_index = np.random.randint(len(sentences+sentences_test))
    seed = (sentences+sentences_test)[seed_index]
#     print(seed)

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        sentence = seed
        examples_file.write('----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write("----- Generated lyrics:\n")
        examples_file.write(' '.join(sentence))

        for i in range(50):
            x_pred = np.zeros((1, maxlen, vocab_size))
#             print("sentence len: {0}".format(len(sentence)))
            for t, word in enumerate(sentence):
#                 print(word)
                x_pred[0, t,word_ix[word]] = 1

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word_pred = ix_word[next_index]

            sentence = sentence[1:]
#             print(sentence)
            sentence.append(next_word_pred)

            examples_file.write(" "+next_word_pred)
        examples_file.write('\n\n')
    examples_file.write('='*80 + '\n')
#     examples_file.flush()

### Opening the output file

In [23]:
examples_file = open("output_data_word.txt", "w")

### Training the model

In [None]:
file_path = "./checkpoints/LSTM_LYRICS-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-loss{loss:.4f}-acc{acc:.4f}-val_loss{val_loss:.4f}-val_acc{val_acc:.4f}" % (
    len(vocab),
    maxlen,
    10
)
checkpoint = ModelCheckpoint(file_path, monitor='val_acc', save_best_only=True)

checkpoint_path = "cp.ckpt"
cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, 
                                                 save_weights_only=True,
                                                 verbose=1)

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
callbacks_list = [print_callback, cp_callback]
history = model.fit_generator(generator(sentences, next_words, BATCH_SIZE),
    steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
    epochs=epochs,
    validation_data=generator(sentences_test, next_words_test, BATCH_SIZE)
                    ,validation_steps=int(len(sentences_test)/BATCH_SIZE) + 1,
                   callbacks = callbacks_list)

Instructions for updating:
Use tf.cast instead.
Epoch 1/10


### Closing the output file

In [None]:
examples_file.close()

### Plotting Train Loss curve

In [None]:
plt.plot(history.history['loss'])

### Plotting Validation Loss curve

In [None]:
plt.plot(history.history['val_loss'])

### Saving the model to disk

In [None]:
model.save('keras_model_word.hdf5')
# loaded_model = keras.models.load_model('keras_model_word.hdf5')

### Loading the model

In [None]:
#loaded_model = keras.models.load_model('keras_model.hdf5')

### Testing the model

In [None]:
def predict_n(model, input_seq, len_out=10):
    generated = []
    actual = []
    # sent=txt[start_index:start_index+maxlen]
    sent = input_seq
    generated += sent
    gen = generated
    for i in range(len_out):
        x_sample=generated[i:i+maxlen]
        x = np.zeros((1,maxlen,vocab_size))
        for j in range(maxlen):
            x[0,j,word_ix[x_sample[j]]] = 1
        probs = model.predict(x)
        probs = np.reshape(probs,probs.shape[1])
        ix = np.argmax(probs)
        ix=np.random.choice(range(vocab_size),p=probs.ravel())
        generated.append(ix_word[ix])
    return " ".join(generated)

In [None]:
inp = "Is there anybody in there?\nJust nod if you can hear me\nIs there anyone at home?"
inp_seq = inp.lower().split(" ")[:10]

In [None]:
predict_n(model, inp_seq, 10)

In [None]:
sentences_test[1]

In [None]:
maxlen

In [None]:
sentences_test[0]

In [None]:
sentences_test[3]

In [None]:
next_words_test[0]

In [None]:
# txt = corpus
# start_index = 230
for j in range(0, 100, maxlen):
    generated = []
    actual = []
    # sent=txt[start_index:start_index+maxlen]
    sent = sentences_test[j]
    generated += sent
    actual += sent
    print("#######################")
    print("Input - "," ".join(generated))
    gen = generated
    for i in range(min(100,len(generated))):
        x_sample=generated[i:i+maxlen]
        x = np.zeros((1,maxlen,vocab_size))
        for k in range(maxlen):
            x[0,k,word_ix[x_sample[k]]] = 1
        probs = model.predict(x)
        probs = np.reshape(probs,probs.shape[1])
#         ix = np.argmax(probs)
        ix=np.random.choice(range(vocab_size),p=probs.ravel())
        generated.append(ix_word[ix])
        actual.append(next_words_test[j+i])
#         print(j)
#         print(i)
#         print(next_words_test[j+i])
#         if(i==1):
#             break
    # for i in range(100):
    #     x_sample=gen[i:i+maxlen]
    #     x=np.zeros((1,maxlen,vocab_size))
    #     for j in range(maxlen):
    #         x[0,j,char_ix[x_sample[j]]]=1
    #     probs=loaded_model.predict(x)[0]
    #     ix = np.argmax(probs)
    # #     ix=np.random.choice(range(vocab_size),p=probs.ravel())
    #     gen+=ix_char[ix]
    # # print("--------------")
    print("Actual ###############")
    print(" ".join(actual))
    print()
    print("Generated ############### ")
    print(" ".join(generated))

In [None]:
print("Generated ############### ")
print(" ".join(generated))
print()
print("Actual ###############")
print(" ".join(actual))

### Credits

In [None]:
1. https://github.com/fpaupier/RapLyrics-Scraper/blob/master/lyrics_scraper.py
2. https://towardsdatascience.com/ai-generates-taylor-swifts-song-lyrics-6fd92a03ef7e