In [55]:
import numpy as np
import pandas as pd
import keras
from keras import Sequential
from keras.layers import LSTM, Dense, Activation, GRU, Bidirectional, Dropout
from keras.callbacks import ModelCheckpoint
from random import randint
import tensorflow as tf

In [56]:
# Read in the data into one single string named corpus. 
data = pd.read_csv('abba.csv')
df = pd.DataFrame(data)
corpus = df['lyrics'].str.cat(sep=' ')
df.head()
print(corpus)

My my
At Waterloo Napoleon did surrender
Oh yeah
And I have met my destiny in quite a similar way
The history book on the shelf
Is always repeating itself
Waterloo I was defeated, you won the war
Waterloo promise to love you for ever more
Waterloo couldn't escape if I wanted to
Waterloo knowing my fate is to be with you
Waterloo finally facing my Waterloo
My my
I tried to hold you back, but you were stronger
Oh yeah
And now it seems my only chance is giving up the fight
And how could I ever refuse
I feel like I win when I lose
Waterloo I was defeated, you won the war
Waterloo promise to love you for ever more
Waterloo couldn't escape if I wanted to
Waterloo knowing my fate is to be with you
Oh, oh Waterloo finally facing my Waterloo
So how could I ever refuse
I feel like I win when I lose
Waterloo couldn't escape if I wanted to
Waterloo knowing my fate is to be with you
Waterloo finally facing my Waterloo
Waterloo knowing my fate is to be with you
Oh, oh Wat

In [57]:
# Clean the data.
# Remove unicode characters, \r and other special characters.
# Save in file to check

import re

# Clean the data
cleaned_corpus = re.sub(r'[^\x00-\x7F]+', '', corpus)  # Remove unicode characters
cleaned_corpus = cleaned_corpus.replace('\r', '')  # Remove \r characters
cleaned_corpus = re.sub(r'[^a-zA-Z0-9\s]', '', cleaned_corpus)  # Remove other special characters

with open('cleaned_corpus.txt', 'w') as file:
    file.write(cleaned_corpus)

print("Data cleaned and saved to 'cleaned_corpus.txt'")





Data cleaned and saved to 'cleaned_corpus.txt'


In [58]:
# Create encoder and decoder dictionaries.
# each character is mapped to an integer and vice versa.
chars = sorted(list(set(corpus)))
num_chars = len(chars)
encoding = {c: i for i, c in enumerate(chars)}
decoding = {i: c for c, i in enumerate(chars)}

In [59]:
# Define the sentence length and skip value
sentence_length = 20
skip = 1

# Initialize lists to hold the input sequences and the corresponding next characters
X_data = []
y_data = []

# Slice the corpus into semi-redundant sequences of 20 characters
for i in range(0, len(corpus) - sentence_length, skip):
    sentence = corpus[i:i + sentence_length]
    next_char = corpus[i + sentence_length]
    X_data.append([encoding[char] for char in sentence])
    y_data.append(encoding[next_char])

# Print the first few sequences to verify
print(X_data[:5], y_data[:5])

[[32, 71, 2, 59, 71, 1, 0, 20, 66, 2, 42, 47, 66, 51, 64, 58, 61, 61, 2, 33], [71, 2, 59, 71, 1, 0, 20, 66, 2, 42, 47, 66, 51, 64, 58, 61, 61, 2, 33, 47], [2, 59, 71, 1, 0, 20, 66, 2, 42, 47, 66, 51, 64, 58, 61, 61, 2, 33, 47, 62], [59, 71, 1, 0, 20, 66, 2, 42, 47, 66, 51, 64, 58, 61, 61, 2, 33, 47, 62, 61], [71, 1, 0, 20, 66, 2, 42, 47, 66, 51, 64, 58, 61, 61, 2, 33, 47, 62, 61, 58]] [47, 62, 61, 58, 51]


In [60]:
# simple check.
X_data[1], y_data[1]


([71, 2, 59, 71, 1, 0, 20, 66, 2, 42, 47, 66, 51, 64, 58, 61, 61, 2, 33, 47],
 62)

In [61]:
num_sentences = len(X_data)
print("Sliced our corpus into {0} sentences of length {1}".format(num_sentences, sentence_length))

Sliced our corpus into 217363 sentences of length 20


In [62]:
# now we need one-hot encoding
print("Vectorizing X and y...")
X = np.zeros((num_sentences, sentence_length, num_chars), dtype=bool)
y = np.zeros((num_sentences, num_chars), dtype=bool)
for i, sentence in enumerate(X_data):
    for t, encoded_char in enumerate(sentence):
        X[i, t, encoded_char] = 1
    y[i, y_data[i]] = 1

Vectorizing X and y...


In [63]:
# Define our model
print("Let's build model.")
model = Sequential()
model.add(LSTM(32, input_shape=(sentence_length, num_chars), return_sequences=True))
model.add(Bidirectional(LSTM(32, return_sequences=True)))  
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(num_chars))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary() 

Let's build model.


  super().__init__(**kwargs)


In [65]:
## Define the ModelCheckpoint callback
checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='loss', mode='min')

# Train the model for at least 20 epochs and save the best weights
history = model.fit(X, y, epochs=20, batch_size=128, callbacks=[checkpoint])

# Save the final model in the native Keras format
model.save('final_model.keras')

# Analyze the training history to check for overfitting
plt.plot(history.history['loss'], label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

Epoch 1/20
[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - loss: 3.1538



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 73ms/step - loss: 3.1536
Epoch 2/20
[1m1698/1699[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 64ms/step - loss: 2.4172



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 64ms/step - loss: 2.4171
Epoch 3/20
[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - loss: 2.2537



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 64ms/step - loss: 2.2537
Epoch 4/20
[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - loss: 2.1319



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 64ms/step - loss: 2.1319
Epoch 5/20
[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - loss: 2.0391



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 64ms/step - loss: 2.0391
Epoch 6/20
[1m1698/1699[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 70ms/step - loss: 1.9631



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 71ms/step - loss: 1.9631
Epoch 7/20
[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - loss: 1.9006



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 72ms/step - loss: 1.9006
Epoch 8/20
[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - loss: 1.8521



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 70ms/step - loss: 1.8521
Epoch 9/20
[1m1698/1699[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 92ms/step - loss: 1.8092



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 93ms/step - loss: 1.8092
Epoch 10/20
[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - loss: 1.7716



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 73ms/step - loss: 1.7716
Epoch 11/20
[1m1698/1699[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 70ms/step - loss: 1.7406



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 71ms/step - loss: 1.7406
Epoch 12/20
[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - loss: 1.7136



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 74ms/step - loss: 1.7136
Epoch 13/20
[1m1698/1699[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 74ms/step - loss: 1.6932



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 75ms/step - loss: 1.6932
Epoch 14/20
[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step - loss: 1.6646



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 150ms/step - loss: 1.6646
Epoch 15/20
[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - loss: 1.6445



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 143ms/step - loss: 1.6445
Epoch 16/20
[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - loss: 1.6207



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 96ms/step - loss: 1.6207
Epoch 17/20
[1m1698/1699[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 76ms/step - loss: 1.6071



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 78ms/step - loss: 1.6071
Epoch 18/20
[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step - loss: 1.5919



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 83ms/step - loss: 1.5919
Epoch 19/20
[1m1698/1699[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 68ms/step - loss: 1.5702



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 68ms/step - loss: 1.5703
Epoch 20/20
[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 1.5629



[1m1699/1699[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 60ms/step - loss: 1.5629


NameError: name 'plt' is not defined

In [52]:
model = Sequential()
model.add(LSTM(32, input_shape=(sentence_length, num_chars), return_sequences=True))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(num_chars))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

# Define the ModelCheckpoint callback
checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='loss', mode='min')

# Train the model for at least 20 epochs and save the best weights
history = model.fit(X, y, epochs=20, batch_size=128, callbacks=[checkpoint])

# Save the final model in the native Keras format
model.save('final_model.keras')

# Analyze the training history to check for overfitting
plt.plot(history.history['loss'], label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

AttributeError: module 'keras.src.callbacks' has no attribute 'set_model'