In [1]:
import glob
import re
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from difflib import SequenceMatcher

In [2]:
txt_file_path = os.getenv('HOME')+'/aiffel/lyricist/data/lyrics/*'
txt_list = glob.glob(txt_file_path)
raw_corpus = [] 

for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw) 

print(len(raw_corpus))
print(raw_corpus[:10])

187088
['', '', '[Spoken Intro:]', 'You ever want something ', "that you know you shouldn't have ", "The more you know you shouldn't have it, ", 'The more you want it ', 'And then one day you get it, ', "It's so good too ", "But it's just like my girl "]


In [13]:
corpus = []
post = ""
for sentence in raw_corpus:
    if len(sentence) and len(sentence.split()) <= 15:
        if sentence[0] != "(" and sentence[0] != "[" and sentence[-1] != ")" and sentence[0] != "]" and sentence != "Chorus":
            sentence = sentence.lower().strip()
            sentence = re.sub(r'x0-9', " ", sentence)
            sentence = sentence.strip()
            sentence = "<start> " + sentence + " <end>"
            
            if post != sentence:
                corpus.append(sentence)
                post = sentence

In [39]:
print(len(corpus))

159334


In [15]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=13000, filters=' ', oov_token="<unk>")
tokenizer.fit_on_texts(corpus)

tensor = tokenizer.texts_to_sequences(corpus)
tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

In [16]:
source = tensor[:, :-1]
target = tensor[:, 1:]

X_train, X_test, y_train, y_test = train_test_split(source, target, test_size=0.2, random_state=43)

BATCH_SIZE = 256

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_dataset = test_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [30]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size) 
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)  
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)  
        self.rnn_3 = tf.keras.layers.LSTM(hidden_size, return_sequences=True) 
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.rnn_3(out)
        out = self.linear(out)
        
        return out
    
    
embedding_size = 512 
hidden_size = 2048 
model = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size)

In [31]:
optimizer = tf.keras.optimizers.Adam() 
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

model.compile(loss=loss, optimizer=optimizer) 
model.fit(train_dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f43674d5760>

In [32]:
print(model.evaluate(test_dataset))

2.221104383468628


In [37]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20): 
    test_input = tokenizer.texts_to_sequences([init_sentence]) 
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]
    
    flag = True
    while flag: 
        predict = model(test_tensor) 
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        
        if predict_word.numpy()[0] == end_token or test_tensor.shape[1] >= max_len:
            flag = False
            
    generated = ""
    
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated 

In [38]:
print(generate_text(model, tokenizer, init_sentence="<start> mother"))
print(generate_text(model, tokenizer, init_sentence="<start> hello"))
print(generate_text(model, tokenizer, init_sentence="<start> yeah"))
print(generate_text(model, tokenizer, init_sentence="<start> what"))
print(generate_text(model, tokenizer, init_sentence="<start> my"))

<start> mother haply, <unk> <end> 
<start> hello <unk> <end> 
<start> yeah yeah yeah yeah yeah yeah yeah <end> 
<start> what you want nixga <end> 
<start> my <unk> <unk> <end> 
