![](https://lionsgateccrc.org/wp-content/uploads/2018/10/hamilton.jpg)

Credits to [anasofiauzsoy](https://www.kaggle.com/anasofiauzsoy/writing-hamilton-lyrics-with-tensorflow-r) for her original notebook, which was written in R. This notebook is a port of it to Python.

Hamilton is an incredibly popular musical about the life of Alexander Hamilton by Lin-Manuel Miranda. It's about five years old, but many people, like me, hadn't seen it until this past week, when the film version came out on Disney Plus.

Let's see if we can take the lyrics from the show's songs, and use TensorFlow to build a text generation model to write new ones. The original notebook used R and offered to work on a Python version if readers asked for one in the comments; this notebook is that Python version. Let's get started!

In [None]:
import numpy as np 
import pandas as pd 

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)


In [None]:
#Import Packages

import tensorflow as tf
from pathlib import Path
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.feature_extraction import text
import matplotlib.pyplot as plt
import string
import re

In [None]:
# Locate the lyrics CSV inside the Kaggle input directory.
input_folder = Path("/kaggle/input")
input_file = input_folder.joinpath("hamilton-lyrics", "ham_lyrics.csv")

In [None]:
df = pd.read_csv(input_file,encoding = "ISO-8859-1")

In [None]:
df.head()

In [None]:
# Get Title Counts
df['title'].value_counts().sort_index()

In [None]:
# Keep only the lyric lines that split into more than 3 space-separated pieces.
df = df.loc[df['lines'].map(lambda line: len(line.split(" ")) > 3)]

In [None]:
# Punctuation Regex: matches one or more consecutive punctuation characters
# or digits (0-9), so a single substitution removes a whole run at once.
# Inside the character class, `,-.` is a range that covers `,`, `-` and `.`.
punct = re.compile(r'[!\\"#$%&\'()*+,-./:;<=>?@\[\]^_`{|}~0-9]+')

In [None]:
# Word-frequency vectorizer. Each line is stripped of surrounding whitespace,
# has punctuation and digits removed, and is lowercased before English stop
# words are dropped.
cv = text.CountVectorizer(
    stop_words='english',
    lowercase=True,
    preprocessor=lambda raw: punct.sub("", raw.strip()).lower(),
)

In [None]:
op = cv.fit_transform(df["lines"])

In [None]:
# Document-term table: one row per lyric line, one column per vocabulary word.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`, so prefer the new accessor and only
# fall back to the old one on releases that predate it.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
df_freq = pd.DataFrame(op.toarray(), columns=feature_names)

In [None]:
df_freq.head()

In [None]:
freq_words = df_freq.sum(axis=0)

In [None]:
freq_words.sort_values(ascending=False)

In [None]:
wc = WordCloud(width=600,height=300).generate_from_frequencies(freq_words)

In [None]:
plt.rcParams["figure.figsize"] = (20,5)
plt.imshow(wc)

In [None]:
# Store the normalised text (trimmed, punctuation/digits removed, lowercased)
# in a new column alongside the raw lines.
df['cleaned_lines'] = [
    punct.sub("", raw_line.strip()).lower() for raw_line in df['lines']
]

In [None]:
# Concatenate the cleaned lines of each song into one space-separated string,
# keeping the songs in their order of first appearance (sort=False).
df_song = (
    df.groupby('title', sort=False)
      .apply(lambda song_rows: " ".join(song_rows['cleaned_lines']))
)

In [None]:
df_song.iloc[0]

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Vocabulary cap: passed to the Tokenizer and also used as the width of the
# one-hot labels, the Embedding input size and the softmax output layer.
num_words = 5000
# Placeholder token handed to the Tokenizer for out-of-vocabulary words.
oov_token = '<UNK>'
# NOTE(review): pad_type and trunc_type are not referenced anywhere else in
# the visible source; the later pad_sequences call relies on its defaults.
pad_type = 'post'
trunc_type = 'post'

In [None]:
tokenizer = Tokenizer(num_words=num_words,oov_token=oov_token)

In [None]:
tokenizer.fit_on_texts(df_song)

In [None]:
seqs = tokenizer.texts_to_sequences(df_song)

In [None]:
# Slide a fixed-size window over every tokenised song to build the training
# n-grams. Each window holds `n_grams` consecutive token ids: the first
# n_grams-1 become the model input and the last one is the word to predict.
n_grams = 11
n_seqs = len(seqs)
# A song of length L has L - n_grams + 1 full windows. The `+ 1` keeps the
# final window (the one ending on the song's last token), which the previous
# `range(L - n_grams)` silently dropped. Songs shorter than `n_grams` produce
# an empty range and contribute nothing.
gram_seqs = [
    song[start:start + n_grams]
    for song in seqs
    for start in range(len(song) - n_grams + 1)
]

In [None]:
# Split every n-gram into its context (all tokens but the last) and its
# target (the last token).
inputs, labels = [], []
for window in gram_seqs:
    inputs.append(window[:-1])
    labels.append(window[-1])

In [None]:
from sklearn.preprocessing import OneHotEncoder
from keras.utils import to_categorical
from keras import Model
from keras.layers import Dense, Embedding, LSTM, Input, Bidirectional

In [None]:
encoded_labels = to_categorical(labels,num_classes=num_words)

In [None]:
class lyrics_generator(Model):
    """Next-word predictor: Embedding -> Bidirectional LSTM -> softmax.

    The network reads a window of ``n_grams - 1`` token ids and outputs a
    probability distribution over ``num_words`` classes for the next token.
    Both ``n_grams`` and ``num_words`` are read from module scope.
    """

    def __init__(self):
        super().__init__()
        # 64-dimensional embedding for each of the `num_words` token ids.
        self.embedding = Embedding(num_words, 64, input_length=n_grams - 1)
        # 20 LSTM units per direction, wrapped to read the window both ways.
        self.lstm = Bidirectional(LSTM(20))
        # One softmax output per vocabulary slot.
        self.dense = Dense(num_words, activation='softmax')

    def call(self, x):
        """Run the forward pass on a batch of token-id windows ``x``."""
        x = self.embedding(x)
        x = self.lstm(x)
        x = self.dense(x)
        return x

    def model(self):
        """Return a functional ``Model`` wrapping ``call`` (e.g. for ``summary()``)."""
        # `shape` must be a tuple: `(n_grams-1)` without the trailing comma is
        # just an int, which only works on Keras versions that coerce it.
        x = Input(shape=(n_grams - 1,))
        return Model(inputs=[x], outputs=self.call(x))

In [None]:
m = lyrics_generator()

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((inputs,encoded_labels)).batch(64)

In [None]:
# Configure training: Adam optimiser (lr 0.001), categorical cross-entropy
# loss on the one-hot labels, and categorical accuracy as the tracked metric.
m.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

In [None]:
m.model().summary()

In [None]:
history = m.fit(dataset,epochs=200,verbose=0)

In [None]:
# Report the loss and accuracy recorded for the last training epoch.
print(
    "Loss: {} and Accuracy: {}".format(
        *(history.history[key][-1] for key in ('loss', 'categorical_accuracy'))
    )
)

In [None]:
def write_lyric(text, text_length=10):
    """Extend ``text`` by greedily appending ``text_length`` predicted words.

    On every step the whole text so far is tokenised with the module-level
    ``tokenizer``, padded or truncated to ``n_grams - 1`` ids, and fed to the
    module-level model ``m``; the highest-probability class is appended.

    Args:
        text: Seed phrase to continue.
        text_length: Number of words to append (default 10).

    Returns:
        The seed phrase followed by the generated words, space-separated.
    """
    context_size = n_grams - 1
    for _ in range(text_length):
        token_ids = tokenizer.texts_to_sequences([text])
        # pad_sequences pads and truncates at the front by default, so the
        # most recent words stay at the end of the window. Short prompts are
        # left-filled with id 1 (presumably the tokenizer's OOV id - confirm
        # against the Keras Tokenizer documentation).
        token_ids = pad_sequences(token_ids, maxlen=context_size, value=1)
        pred_probs = m(token_ids)
        index = int(tf.argmax(pred_probs, axis=1)[0].numpy())
        # The softmax has `num_words` outputs, but `index_word` only holds ids
        # that were actually assigned to a word. Fall back to the OOV token
        # instead of raising KeyError when the argmax lands on an unassigned
        # id (e.g. 0, or one beyond the fitted vocabulary).
        word = tokenizer.index_word.get(index, tokenizer.oov_token)
        text = text + " " + word
    return text

In [None]:
write_lyric("the man")

In [None]:
write_lyric("he was")

In [None]:
write_lyric("alexander")

In [None]:
write_lyric("there was")

In [None]:
write_lyric("it has")

In [None]:
write_lyric("I am")

In [None]:
write_lyric("Eliza")

In [None]:
write_lyric("sir")

In [None]:
write_lyric("Thomas Jefferson",text_length=50)