In [2]:
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


# Purpose: AI Lyricist Project
# This project aims to generate music lyrics using machine learning models. The system is designed to train on a dataset of song lyrics, learn patterns in word sequences, and generate new, coherent lyrics based on a seed phrase provided by the user.


In [3]:
# Step 1: Load and Preprocess Data

def load_data(file_path, chunksize=10000):
    """
    Load and process a large CSV file in chunks.

    :param file_path: Path to the CSV file.
    :param chunksize: Number of rows per chunk to process.
    :return: Combined lyrics as a single text block.
    """
    lyrics_list = []

    # Read the CSV file in chunks
    for chunk in pd.read_csv(file_path, chunksize=chunksize):
        # Drop rows with missing lyrics in the current chunk
        chunk = chunk.dropna(subset=['lyrics'])
        # Append the lyrics to the list
        lyrics_list.extend(chunk['lyrics'].tolist())

    # Combine all lyrics into a single text block
    lyrics = "\n".join(lyrics_list)
    return lyrics

In [4]:
# Tokenization and Sequence Preparation
def preprocess_text(text):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    total_words = len(tokenizer.word_index) + 1

    # Create input sequences
    input_sequences = []
    for line in text.split('\n'):
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)

    # Pad sequences
    max_sequence_len = max([len(seq) for seq in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # Split into predictors and label
    predictors, label = input_sequences[:,:-1], input_sequences[:,-1]
    label = tf.keras.utils.to_categorical(label, num_classes=total_words)

    return predictors, label, max_sequence_len, total_words, tokenizer


In [5]:
# Step 2: Build the Model
def build_model(total_words, max_sequence_len):
    model = Sequential([
        Embedding(total_words, 64, input_length=max_sequence_len-1),
        LSTM(100, return_sequences=True),
        LSTM(100),
        Dense(100, activation='relu'),
        Dense(total_words, activation='softmax')
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [6]:
# Step 3: Train the Model
def train_model(model, predictors, label, epochs=30):
    model.fit(predictors, label, epochs=epochs, verbose=1)
    return model


In [7]:
# Step 4: Generate Lyrics
def generate_lyrics(seed_text, model, tokenizer, max_sequence_len, num_words=20):
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)

        for word, index in tokenizer.word_index.items():
            if index == predicted:
                seed_text += " " + word
                break

    return seed_text

In [8]:
# # Main Workflow
# if __name__ == "__main__":
#     # File path to lyrics dataset (replace with actual file path)
#     data_path = "lyrics.csv"
# if os.path.exists(data_path):
#     # Load and preprocess data
#     data = load_data(data_path)
#     predictors, label, max_sequence_len, total_words, tokenizer = preprocess_text(data)

#     # Build and train the model
#     model = build_model(total_words, max_sequence_len)
#     model = train_model(model, predictors, label, epochs=30)

#     # Launch Gradio Interface
#     interface = gr.Interface(fn=predict_lyrics, 
#                               inputs="text", 
#                               outputs="text", 
#                               title="AI Lyric Generator",
#                               description="Enter a seed phrase to generate music lyrics.")
#     interface.launch()
# else:
#     print(f"Dataset not found at {data_path}. Please provide a valid path.")

In [39]:
data_path = "./Resources/lyrics.csv"
lyrics = load_data(data_path)
lyrics.head()

FileNotFoundError: [Errno 2] No such file or directory: './Resources/lyrics.csv'

In [1]:
lyrics.head()


NameError: name 'lyrics' is not defined

In [21]:
len(lyrics)

266557

In [18]:
lyrics['lyrics'].isna().sum()

0

In [38]:
# lowercase the lyrics
lyrics['lyrics'] = lyrics['lyrics'].str.lower()

In [54]:
# Searcher
lyrics['lyrics'].str.contains('\[verse', case=False).sum()
4103 - 3915

188

In [50]:
# Replace function to remove any character
def replaceLyricsChars(what, to, regex=False):
    lyrics['lyrics'] = lyrics['lyrics'].str.replace(what, to, regex=regex)
    return lyrics


In [37]:
replaceLyricsChars('\'', '')

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?You know Im gonna cut r..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin everything so easy,its like you seem so..."
2,2,honesty,2009,beyonce-knowles,Pop,If you searchFor tendernessIt isnt hard to fin...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I[Verse 1:]If I wrote a b..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party its pop..."
...,...,...,...,...,...,...
266552,224995,i-ii-never-fall-in-love-again,2006,ella-fitzgerald,Jazz,What do you get when you fall in love?A guy wi...
266553,224996,everything-but-you,2006,ella-fitzgerald,Jazz,"You left me a horse from Texas,A house with in..."
266554,224997,yesterdays,2006,ella-fitzgerald,Jazz,I am not such a clever oneAbout the latest fad...
266555,224998,over-the-rainbow,2006,ella-fitzgerald,Jazz,"Somewhere over the rainbow, way up highTheres ..."


In [None]:
# show all lyrics that have the word 'verse'
totality = len(lyrics)
with_verse = len(lyrics[lyrics['lyrics'].str.contains('\[verse', case=False)])
replaceLyricsChars('\[verse+\]', '', regex=True)
replaceLyricsChars('\[verse*\]', '', regex=True)

delta_totality = totality - len(lyrics)
delta_with_verse = with_verse - len(lyrics[lyrics['lyrics'].str.contains('\[verse', case=False)])

print(f"Totality: {totality} -> {delta_totality}")
print(f"With Verse: {with_verse} -> {delta_with_verse}")

Totality: 266557 -> 0
With Verse: 3915 -> 0


In [None]:
# show all lyrics that have the word 'intro'
totality = len(lyrics)
with_intro = len(lyrics[lyrics['lyrics'].str.contains('\[intro', case=False)])
replaceLyricsChars('\[intro+\]', '', regex=True)
replaceLyricsChars('\[intro*\]', '', regex=True)

delta_totality = totality - len(lyrics)
delta_with_intro = with_intro - len(lyrics[lyrics['lyrics'].str.contains('\[intro', case=False)])

print(f"Totality: {totality} -> {delta_totality}")
print(f"With intro: {with_intro} -> {delta_with_intro}")

Totality: 266557 -> 0
With intro: 735 -> 0


In [20]:


predictors, label, max_sequence_len, total_words, tokenizer = preprocess_text(lyrics)

# Build and train the model
model = build_model(total_words, max_sequence_len)
model = train_model(model, predictors, label, epochs=30)

# Generate new lyrics
seed_text = "love is a beautiful"
generated_lyrics = generate_lyrics(seed_text, model, tokenizer, max_sequence_len, num_words=50)
print("Generated Lyrics:")
print(generated_lyrics)


AttributeError: 'DataFrame' object has no attribute 'lower'

In [9]:

def load_data(file_path, chunksize=10000):
    """
    Load and process a large CSV file in chunks.

    :param file_path: Path to the CSV file.
    :param chunksize: Number of rows per chunk to process.
    :return: A single DataFrame containing the processed data.
    """
    processed_chunks = []

    # Read the CSV file in chunks
    for chunk in pd.read_csv(file_path, chunksize=chunksize):
        # Drop rows with missing lyrics in the current chunk
        chunk = chunk.dropna(subset=['lyrics'])
        # Append the processed chunk to the list
        processed_chunks.append(chunk)

    # Concatenate all chunks into a single DataFrame
    combined_data = pd.concat(processed_chunks, ignore_index=True)
    return combined_data
