In [None]:
# Set up the TPU environment so that the model architecture and its variables can be created on the TPU
import os
import tensorflow as tf

# The address of the TPU worker is read from the COLAB_TPU_ADDR environment variable.
tpu_address = f"grpc://{os.environ['COLAB_TPU_ADDR']}"
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
tf.config.experimental_connect_to_cluster(resolver)
# TPU initialization has to happen at the beginning, before any other TPU work.
tf.tpu.experimental.initialize_tpu_system(resolver)
# Create the distribution strategy used later when building the model.
strategy = tf.distribute.TPUStrategy(resolver)

In [None]:
#importing basic libraries
import string
import os
import numpy as np

# Directory that holds the generated text data.
txt_gen = '/content/drive/MyDrive/Colab Notebooks/Data_Science_Projects/txt_gen/'

# Read the dataset generated by clean_data.py (train_songs.txt), one training line per text line.
# Blank / whitespace-only lines (e.g. the empty string produced by a trailing newline) are
# dropped: they would tokenize to an empty sequence and make the training array ragged.
with open(txt_gen + 'train_songs.txt', encoding='utf-8') as f:
  df = [line for line in f.read().split('\n') if line.strip()]

In [None]:
# Sanity check: despite its name, df is a plain Python list of text lines, not a DataFrame.
type(df)

list

In [None]:
# Tokenize the list of lines: first build the word -> integer index vocabulary from the
# corpus, then encode every line as a list of those integer indices.
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()  # default settings; no oov_token is configured
tokenizer.fit_on_texts(df)  # learns the vocabulary (tokenizer.word_index) from the corpus
sequences = tokenizer.texts_to_sequences(df)  # one list of word indices per line of df

In [None]:
# Final training dataset: every encoded line must have the same number of tokens so the
# sequences can be stacked into a 2-D array. Fail early with a clear message otherwise
# (a ragged list would give an object array / an obscure indexing error below).
seq_lengths = {len(s) for s in sequences}
if len(seq_lengths) != 1:
  raise ValueError(
      'Expected all token sequences to have the same length, got lengths: '
      + str(sorted(seq_lengths)))
seq = np.array(sequences)
# Inputs are all tokens but the last one; the target is the last token of each line.
x, y = seq[:, :-1], seq[:, -1]


In [None]:
# Vocabulary size = number of unique words + 1. The extra slot is for index 0, which the
# Keras Tokenizer never assigns to a word (word indices start at 1; 0 is the padding value).
# This value sizes both the Embedding layer and the softmax output layer of the model.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
#model Architecture
import tensorflow as tf
from tensorflow.keras.models import  Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding

#model creation to use in TPU
def create_model(embedding_dim=69, lstm_units=128, dense_units=100):
  """Build the (uncompiled) next-word prediction network.

  The network is Embedding -> LSTM -> LSTM -> Dense(relu) -> Dense(softmax). The
  vocabulary size and the input sequence length are taken from the notebook-level
  `vocab_size` and `x` variables.

  Args:
    embedding_dim: size of each word embedding vector.
    lstm_units: number of units in each of the two LSTM layers.
    dense_units: number of units in the hidden Dense layer.

  Returns:
    A tf.keras.Sequential model whose output is a softmax over `vocab_size` classes.
  """
  return tf.keras.Sequential([
      Embedding(vocab_size, embedding_dim, input_length=x.shape[1]),
      LSTM(lstm_units, return_sequences=True),  # pass the full sequence on to the second LSTM
      LSTM(lstm_units),
      Dense(dense_units, activation='relu'),
      Dense(vocab_size, activation='softmax'),
  ])


In [None]:
# Note: Keras model creation/loading needs to be inside strategy.scope() so the variables
# can be created on each TPU device. Other parts of the code do not need to be inside the scope.
model_path = '/content/drive/MyDrive/Colab Notebooks/Data_Science_Projects/lyrics.h5'

with strategy.scope():
  if os.path.exists(model_path):
    # Continue training the previously saved model.
    print('Loading saved model from ' + model_path)
    model = load_model(model_path)
  else:
    # No saved model yet (e.g. first run): build and compile a fresh one.
    print('No saved model found at ' + model_path + '; creating a new model')
    model = create_model()
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
    opt = tf.keras.optimizers.Adam(learning_rate=0.001, clipvalue=1)
    model.compile(optimizer=opt, loss=loss_fn, metrics=['sparse_categorical_accuracy'])
  # Alternative compile settings noted by the author for re-compiling a loaded model:
  #   Adam(learning_rate=0.01, beta_1=0.59, beta_2=0.8999, clipnorm=0.85), metrics=['accuracy']

In [None]:
# Train for two epochs in batches of 1024 examples ...
model.fit(x, y, epochs=2, batch_size=1024)
# ... then write the updated model back to Drive.
model.save('/content/drive/MyDrive/Colab Notebooks/Data_Science_Projects/lyrics.h5')

Epoch 1/2
Epoch 2/2


In [None]:
# pad_sequences is needed by the lyric-generation loop further down (it generates the next 50 words; change that count as needed)
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
###### our input part ###########################

In [None]:
# Random seed text taken from a Google search; the model continues the lyrics from it.
# Alternative seeds noted by the author (use as another example song from the movie MS DHONI):
#   'मेरी नज़र का सफ़र तुझपे ही आके रुके कहने को तुझपे ही जाके'
#   'तू आता है सीने में जब जब सांसें भारती हूँ'
sng = 'तू आता है सीने में जब जब सांसें भारती हूँ'
# Encode the seed with the fitted tokenizer. Instead of a typed/searched seed, a line from the
# training data itself could be used, e.g. sequences[randint(0,len(sequences))].
seed_text = tokenizer.texts_to_sequences([sng])[0]

In [None]:
###### now our model will generate lyrics ##################

In [None]:
SEQ_LEN = 10    # number of trailing tokens fed to the model; change this to match your sequence data
NUM_WORDS = 50  # number of words to generate

# Work on a copy so that appending predictions does not also grow seed_text.
input_text = list(seed_text)
generated_song = []
for _ in range(NUM_WORDS):
  # Only the last SEQ_LEN tokens are kept: truncating='pre' drops everything to their left.
  pad_seq = pad_sequences([input_text], maxlen=SEQ_LEN, truncating='pre')
  pred = model.predict(pad_seq, verbose=0)  # scores over the vocabulary for the next word
  # Pick the highest-scoring word index, ignoring index 0: it has no entry in
  # tokenizer.index_word, so selecting it would raise a KeyError.
  next_index = int(np.argmax(np.ravel(pred)[1:])) + 1
  input_text.append(next_index)  # the prediction becomes part of the next input window
  generated_song.append(tokenizer.index_word[next_index])  # map the index back to its word


In [None]:
print('-------seed text---------------')
# Re-encode the seed string instead of slicing seed_text: seed_text may have been extended
# with generated tokens, and a fixed [:10] slice is only right for a seed of exactly 10 tokens.
# (Assumes seed_text was produced from sng, as in the seed cell.)
print(tokenizer.sequences_to_texts(tokenizer.texts_to_sequences([sng])))

print('-------generated text---------------')
# Print the generated words ten per line.
for i in range(0, len(generated_song), 10):
  print(' '.join(generated_song[i:i+10]))


-------seed text---------------
['तू आता है सीने में जब जब सांसें भारती हूँ']
-------generated text---------------
तेरे दिल की गलियों से मैं हर रोज़ गुज़रती हूँ
पर कहतें है वो क्या किया इश्क़ का फासलों में
भी आये है वक़्त तो दीजिये जहाँ तुझको बस में
उतारा है कि नहीं मेरा रास्ता हो हो हो हो
चाहत का झगड़ा हे हो ओ ओ हो हो ठुकराओ
