In [26]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding,Bidirectional,Dense,LSTM,Dropout

In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [28]:
# Workbook holding the lyrics corpus; the path is relative to the working directory.
filename = r'2020_3.xlsx'
# Read the workbook (first sheet, the pandas default) into a DataFrame.
data = pd.read_excel(io=filename)

In [29]:
# Learn the word -> integer-id vocabulary from every entry of the 'Lyrics' column.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=data['Lyrics'])

In [30]:
# Vocabulary size used by the model: one slot per known word plus one extra,
# because Keras word ids start at 1 and id 0 is left for padding.
total_words = 1 + len(tokenizer.word_index)

In [31]:
# Build the training samples: for each lyrics entry, every prefix of its token
# ids that is at least two tokens long (context words + the word to predict).
# NOTE(review): prefixes span the whole 'Lyrics' entry, so sample length grows
# with the length of the entry -- consider splitting entries into lines if
# training turns out to be too slow.
input_sequences = []

for lyric in data['Lyrics']:
    encoded = tokenizer.texts_to_sequences([lyric])[0]
    input_sequences.extend(
        encoded[:end] for end in range(2, len(encoded) + 1)
    )

In [32]:
# Length of the longest n-gram; shorter sequences are left-padded with zeros
# ('pre') so the word to predict always sits in the last column.
max_sequence_len = max(len(seq) for seq in input_sequences)

# pad_sequences already returns a 2-D NumPy array, so the former np.array()
# wrapper only created a second full copy of the padded matrix.
input_sequences = pad_sequences(input_sequences,
                                maxlen=max_sequence_len,
                                padding='pre')

In [33]:
# All columns except the last are the model input (the context words) ...
predictors = input_sequences[:, :-1]
# ... and the last column holds the id of the word to predict.
label = input_sequences[:, -1]

In [34]:
# One-hot encode the target word ids. The width is pinned to the vocabulary
# size so it always matches the softmax layer (Dense(total_words)); when left
# to be inferred, to_categorical uses max(label) + 1, which is too small if
# the highest word id never occurs as a target.
label = to_categorical(label, num_classes=total_words)

In [24]:
# Next-word prediction network:
# embedding -> bidirectional LSTM -> dropout -> LSTM -> dense bottleneck -> softmax.
model = Sequential()

# Map each word id to a 50-dimensional vector; inputs are the padded
# sequences without their last (target) token.
model.add(Embedding(total_words,50,input_length=max_sequence_len-1))

# return_sequences=True keeps one output per timestep so the following LSTM
# receives a sequence rather than a single vector.
model.add(Bidirectional(LSTM(150,return_sequences=True)))

model.add(Dropout(0.2))

model.add(LSTM(100))

# Integer division: a layer width must be an int, and total_words/2 is a
# float in Python 3.
model.add(Dense(total_words//2,activation='relu'))

# One output probability per vocabulary entry.
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy',
             optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1390, 50)          337700    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1390, 300)         241200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 1390, 300)         0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_2 (Dense)              (None, 3377)              341077    
_________________________________________________________________
dense_3 (Dense)              (None, 6754)              22815012  
Total params: 23,895,389
Trainable params: 23,895,389
Non-trainable params: 0
__________________________________________

In [None]:
# Train for 10 passes over the data with the progress bar enabled; the value
# returned by fit() is kept in `history`.
history = model.fit(x=predictors, y=label, epochs=10, verbose=1)

Epoch 1/10
  1184/105460 [..............................] - ETA: 83:10:05 - loss: 7.6024 - acc: 0.0312

In [None]:
def make_lyrics(seed_text, next_words):
    """Extend ``seed_text`` with greedily predicted words and print the result.

    Each step tokenizes the text generated so far with the notebook's fitted
    ``tokenizer``, pads it to the model's input length, asks ``model`` for the
    next-word distribution and appends the most probable word.

    Args:
        seed_text: Text to start from.
        next_words: Number of words to append.

    Returns:
        None. The final text is printed.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],
                     maxlen=max_sequence_len-1,padding='pre')
        # predict_classes() is no longer available on current TensorFlow
        # releases; argmax over the predicted distribution is the equivalent.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])
        # index_word is the reverse of word_index, so this replaces the
        # former linear scan; an unknown id (e.g. 0, the padding id) yields
        # an empty word, exactly as before.
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " " + output_word
    print(seed_text)