In [18]:
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


# Purpose: AI Lyricist Project
# This project aims to generate music lyrics using machine learning models. The system is designed to train on a dataset of song lyrics, learn patterns in word sequences, and generate new, coherent lyrics based on a seed phrase provided by the user.


In [2]:
# Step 1: Load and Preprocess Data

def load_lyrics(file_path, chunksize=10000, limit=None):
    """
    Load and process a large CSV file in chunks.

    :param file_path: Path to the CSV file.
    :param chunksize: Number of rows per chunk to process.
    :return: Combined lyrics as a single text block.
    """
    lyrics_list = []
    rows_count = 0

    # Read the CSV file in chunks
    for chunk in pd.read_csv(file_path, chunksize=chunksize):
        # Drop rows with missing lyrics in the current chunk
        chunk = chunk.dropna(subset=['lyrics'])
        # Append the lyrics to the list
        lyrics_list.extend(chunk['lyrics'].tolist())

        if limit and rows_count >= limit:
            break
        else:
            rows_count += chunk.shape[0]

    print(f"Total lyrics loaded: {len(lyrics_list)}")
    print(f"Rows limit: {rows_count}")
    # Combine all lyrics into a single text block
    lyrics = "\n".join(lyrics_list)
    return lyrics

In [3]:
# Tokenization and Sequence Preparation
def preprocess_text(text):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    total_words = len(tokenizer.word_index) + 1

    # Create input sequences
    input_sequences = []
    for line in text.split('\n'):
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)

    # Pad sequences
    max_sequence_len = max([len(seq) for seq in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # Split into predictors and label
    predictors, label = input_sequences[:,:-1], input_sequences[:,-1]
    label = tf.keras.utils.to_categorical(label, num_classes=total_words)

    return predictors, label, max_sequence_len, total_words, tokenizer


In [4]:
# Step 2: Build the Model
def build_model(total_words, max_sequence_len):
    model = Sequential([
        Embedding(total_words, 64, input_length=max_sequence_len-1),
        LSTM(100, return_sequences=True),
        LSTM(100),
        Dense(100, activation='relu'),
        Dense(total_words, activation='softmax')
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [5]:
# Step 3: Train the Model
def train_model(model, predictors, label, epochs=30):
    model.fit(predictors, label, epochs=epochs, verbose=1)
    return model


In [6]:
# Step 4: Generate Lyrics
def generate_lyrics(seed_text, model, tokenizer, max_sequence_len, num_words=20):
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)

        for word, index in tokenizer.word_index.items():
            if index == predicted:
                seed_text += " " + word
                break

    return seed_text

In [27]:
# # Main Workflow

# File path to lyrics dataset (replace with actual file path)
data_path = "./Resources/lyrics_chunks/lyrics_1.csv"

if os.path.exists(data_path):
    # Load and preprocess data
    lyrics = load_lyrics(data_path, chunksize=5, limit=15)
    print("preprocess_text")
    predictors, label, max_sequence_len, total_words, tokenizer = preprocess_text(lyrics)

    # Build and train the model
    print("build_model")
    model = build_model(total_words, max_sequence_len)
    print("train_model")
    model = train_model(model, predictors, label, epochs=10)
else:
    print(f"Dataset not found at {data_path}. Please provide a valid path. Try going through the cleaning.ipynb")

Total lyrics loaded: 20
Rows limit: 15
preprocess_text
build_model
train_model
Epoch 1/10


2025-01-08 20:14:50.473765: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2025-01-08 20:14:50.474193: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2025-01-08 20:14:50.475111: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# generate_lyrics(seed_text="roses are red, violets blue, I'll love it to wed...", model=model, tokenizer=tokenizer, max_sequence_len=max_sequence_len, num_words=20)
generate_lyrics(seed_text="roses are red", model=model, tokenizer=tokenizer, max_sequence_len=30, num_words=20)

ValueError: in user code:

    File "/opt/anaconda3/envs/dev/lib/python3.10/site-packages/keras/engine/training.py", line 2169, in predict_function  *
        return step_function(self, iterator)
    File "/opt/anaconda3/envs/dev/lib/python3.10/site-packages/keras/engine/training.py", line 2155, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/anaconda3/envs/dev/lib/python3.10/site-packages/keras/engine/training.py", line 2143, in run_step  **
        outputs = model.predict_step(data)
    File "/opt/anaconda3/envs/dev/lib/python3.10/site-packages/keras/engine/training.py", line 2111, in predict_step
        return self(x, training=False)
    File "/opt/anaconda3/envs/dev/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/opt/anaconda3/envs/dev/lib/python3.10/site-packages/keras/engine/input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_1" is incompatible with the layer: expected shape=(None, 47), found shape=(None, 29)


In [22]:
def predict_lyrics(input_text):
    generated_lyrics = generate_lyrics(seed_text=input_text, model=model, tokenizer=tokenizer, max_sequence_len=30, num_words=20)
    return generated_lyrics

In [20]:
from pickle import dump, load

In [17]:
model.save('lyrics_genie.keras')
dump(tokenizer, open('lyrics_genie', 'wb'))

In [21]:
model = load_model('lyrics_genie.keras')
tokenizer = load(open('lyrics_genie', 'rb'))

2025-01-08 20:12:59.983204: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2025-01-08 20:12:59.983955: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2025-01-08 20:12:59.984893: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [23]:
# from gradio import gr
import gradio as gr
    # Launch Gradio Interface
interface = gr.Interface(fn=predict_lyrics, 
                            inputs="text", 
                            outputs="text", 
                            title="AI Lyric Generator",
                            description="Enter a seed phrase to generate music lyrics.")
interface.launch()

Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




--------
Traceback (most recent call last):
  File "/opt/anaconda3/envs/dev/lib/python3.10/site-packages/gradio/queueing.py", line 536, in process_events
    response = await route_utils.call_process_api(
  File "/opt/anaconda3/envs/dev/lib/python3.10/site-packages/gradio/route_utils.py", line 285, in call_process_api
    output = await app.get_blocks().process_api(
  File "/opt/anaconda3/envs/dev/lib/python3.10/site-packages/gradio/blocks.py", line 1923, in process_api
    result = await self.call_function(
  File "/opt/anaconda3/envs/dev/lib/python3.10/site-packages/gradio/blocks.py", line 1508, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "/opt/anaconda3/envs/dev/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "/opt/anaconda3/envs/dev/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2134, in run_sync_in_worker_thread
    return awai

In [None]:
# show all lyrics that have the word 'verse'
totality = len(lyrics)
with_verse = len(lyrics[lyrics['lyrics'].str.contains('\[verse', case=False)])
replaceLyricsChars('\[verse+\]', '', regex=True)
replaceLyricsChars('\[verse*\]', '', regex=True)

delta_totality = totality - len(lyrics)
delta_with_verse = with_verse - len(lyrics[lyrics['lyrics'].str.contains('\[verse', case=False)])

print(f"Totality: {totality} -> {delta_totality}")
print(f"With Verse: {with_verse} -> {delta_with_verse}")

Totality: 266557 -> 0
With Verse: 3915 -> 0


In [None]:
# show all lyrics that have the word 'intro'
totality = len(lyrics)
with_intro = len(lyrics[lyrics['lyrics'].str.contains('\[intro', case=False)])
replaceLyricsChars('\[intro+\]', '', regex=True)
replaceLyricsChars('\[intro*\]', '', regex=True)

delta_totality = totality - len(lyrics)
delta_with_intro = with_intro - len(lyrics[lyrics['lyrics'].str.contains('\[intro', case=False)])

print(f"Totality: {totality} -> {delta_totality}")
print(f"With intro: {with_intro} -> {delta_with_intro}")

Totality: 266557 -> 0
With intro: 735 -> 0


In [20]:


predictors, label, max_sequence_len, total_words, tokenizer = preprocess_text(lyrics)

# Build and train the model
model = build_model(total_words, max_sequence_len)
model = train_model(model, predictors, label, epochs=30)

# Generate new lyrics
seed_text = "love is a beautiful"
generated_lyrics = generate_lyrics(seed_text, model, tokenizer, max_sequence_len, num_words=50)
print("Generated Lyrics:")
print(generated_lyrics)


AttributeError: 'DataFrame' object has no attribute 'lower'