In [1]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer
import session_info




# Reading the CSV file from Kaggle dataset.
df = pd.read_csv('./ds2.csv')

session_info.show()

In [2]:
artist_list = [
    'JAY-Z',
    'Eminem',
    'Kendrick Lamar',
    'Lil Wayne',
    'Nicki Minaj',
    'Snoop Dogg',
    'Nas',
    'Drake'
]


df_filter = df[df['artist'].isin(artist_list)]
df_filter

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6
5,Lollipop Remix,rap,Lil Wayne,2008,580832,"{""Kanye West"",""Static Major""}",[Intro: Lil Wayne]\nHaha\nUh-huh\nNo homo (You...,7
10,Money On My Mind,rap,Lil Wayne,2005,128927,{},"[Intro]\nYeah\nMoney on my mind, money on my m...",12
12,DEvils,rap,JAY-Z,1996,504959,{},[Produced by DJ Premier]\n\n[Hook Samples: Sno...,14
...,...,...,...,...,...,...,...,...
5885935,We Go Up Instrumental,rap,Nicki Minaj,2022,77,{},"[Intro]\n(O m√≥j Bo≈ºe, ale dojeba≈Çem bit)\n(Swi...",7845955
5902582,Dont Do Drugs,rap,Eminem,2000,18,{},"[Hook / Opening]\nDon't do drugs, drugs are ba...",7868543
5910419,Tone Deaf Clean,rap,Eminem,2020,3,{},"[Intro]\nYeah, I'm sorry (Huh?)\nWhat did you ...",7878778
5911848,Intro Diss me diss you,rap,Eminem,2003,4,{},[Instrumental],7880774


In [3]:
df_filter.to_csv('rap_subset.csv')

In [4]:
import re

artist_dict = {}

# for index, row in df.iterrows():
for artist in artist_list:
    artist_filter = df[df['artist'].isin([artist])]['lyrics'].tolist()
    artist_filter_corpus = '\n'.join([str(item) for item in artist_filter])
    # print(artist_filter_corpus)
    # artist_filter_text = artist_filter
    artist_filter_text = re.sub('\[(.*).\]', '', artist_filter_corpus)
    artist_filter_text = artist_filter_text.replace('\n\n', '\n')
    artist_filter_text = artist_filter_text.replace('\n\n', '\n')
    artist_filter_text = artist_filter_text.replace('\n\n', '\n')
    # print(artist_filter_text)
    file_path = "./" + artist.lower() + "_" + "full_corpus.txt"
    with open(file_path, "w") as text_file:
        text_file.write(artist_filter_text)
    # artist_dict[artist] = file_path

    # Initiate the GPT-2 pre-trained model, plus the tokenizer
    model_name = "gpt2-medium"
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    
    # Tokenize the lyrics and prepare dataset
    
    # We'll set up the dataset through the tokenizer, referring to
    # the file we just wrote as the basis.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,  # Save the all_lyrics string to a file and provide its path here
        block_size=128
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
    
    # Set up training arguments; these can be modified depending on
    # available architecture.
    training_args = TrainingArguments(
        output_dir="./results",
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=32,
        save_steps=10_000,
        save_total_limit=2,
    )
    
    # Initiate the Trainer function and start training!
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    
    trainer.train()

    artist_dict[artist] = model

    input_text = "In a cosmic sort of way"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    
    output = artist_dict['JAY-Z'].generate(input_ids, max_length=100, num_return_sequences=5, temperature=0.9, do_sample=True)
    
    for i, text in enumerate(output):
        print(f"Generated Text {i+1}: {tokenizer.decode(text)}")
        print()
    
    break



Step,Training Loss


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text 1: In a cosmic sort of way, I was thinking about how I'd cop the shit that I put my heart and soul into but the rap didn't match; I wasn't a rapper at all. We all had a different perspective; how you doin' business in rap matters a lot more than rap lyrics.
The fact is I'm just a human being with a mind to be, and I'm just the first kid on the block, a kid still young, in terms of rap writing

Generated Text 2: In a cosmic sort of way, it was just a matter of who was more talented."
Roc-A-Fella Records
"I was born in a house of yup, there's a reason why my name isn't Roc"
"I was born into a house of yup, there's a reason why my name isn't Roc"
"I was born into a house of yup, there's a reason why my name isn't Roc"
"I was

Generated Text 3: In a cosmic sort of way, she's from a place that has the kind of stuff that we do, man; it's definitely not the type of stuff that we don't do."
"So I'm from a place that has the kind of stuff that we do, man; it's definitely not the 

In [None]:
artist_dict

In [4]:
input_text = "In a cosmic sort of way"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
input_ids = input_ids.to('cpu')
output = artist_dict['JAY-Z'].generate(input_ids, max_length=100, num_return_sequences=5, temperature=0.9, do_sample=True)

for i, text in enumerate(output):
    print(f"Generated Text {i+1}: {tokenizer.decode(text)}")
    print()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


RuntimeError: Placeholder storage has not been allocated on MPS device!