In [None]:
!pip install -U trl transformers accelerate git+https://github.com/huggingface/peft.git -Uqqq
!pip install bitsandbytes einops wandb -Uqqq
!pip install -Uqqq datasets===2.16.0

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from huggingface_hub import notebook_login
import pandas as pd
import torch
from trl import SFTTrainer

SEED = 999
BATCH_SIZE = 32
torch.manual_seed(SEED)

In [9]:
df = pd.read_csv("/kaggle/input/lyrics-dataset-rock-pop-rap-metal-indie/train_dataset.tsv", sep="\t")

In [10]:
## SELECT ONLY THREE GENRES TO SPEED UP TRAINING

df_rock = df[df['genre'] == "rock"]
df_rap = df[df['genre'] == "rap"]
df_pop = df[df['genre'] == "pop"]
df = pd.concat([df_rock, df_rap, df_pop])
df

Unnamed: 0,lyrics,genre,artist,text
6400,[Segue – Intro:\nGerry O'Driscoll\n]\n…and I a...,rock,Pink Floyd,Below is an instruction that describes a task....
6401,"[Verse 1]\n'Cause you're a sky, 'cause you're ...",rock,Coldplay,Below is an instruction that describes a task....
6402,[Piano Intro]\n[Verse 1]\nOnly love can make i...,rock,The Who,Below is an instruction that describes a task....
6403,[Verse 1]\nDrowning in their dissertations\nRa...,rock,Pearl Jam,Below is an instruction that describes a task....
6404,[Intro]\nOoooooooooh!!!\n[Verse 1]\nAll around...,rock,Red Hot Chili Peppers,Below is an instruction that describes a task....
...,...,...,...,...
4795,What are you fucking crazy?\nDamn ...\nIt's ge...,pop,Bruno Mars,Below is an instruction that describes a task....
4796,[Verse 1: PARTYNEXTDOOR]\nThat thing go raw\nT...,pop,Bruno Mars,Below is an instruction that describes a task....
4797,[Verse 1]\nComparisons are easily done\nOnce y...,pop,Katy Perry,Below is an instruction that describes a task....
4798,[Verse 1]\nHere I am waiting\nI'll have to lea...,pop,Maroon 5,Below is an instruction that describes a task....


In [11]:
def create_song_text_tiny_llama(row):
    genre = row['genre']
    lyrics = row['lyrics']
    return f"<s>[INST] You are an AI trained to generate lyrics for songs of those genres: Rock, Metal, Pop, Indie. Your task is to ensure that the generated lyrics reflect the true essence of the genre given in input. A Rock or Metal song will have strong and direct lyrics while a Pop or Indie song is generally softer and happier. Your output should be as close as possible to the genre given in input. Generate lyrics for a {genre} song. [/INST] {lyrics}"


def create_song_text_phi(row):
    genre = row['genre']
    lyrics = row['lyrics']
    return f"### Instruction: Generate lyrics for a {genre} song. ### Assistant: {lyrics}"
    

df['text'] = df.apply(create_song_text_tiny_llama, axis=1)

In [12]:
print(len(df))
print(df)

4800
                                                 lyrics genre  \
6400  [Segue – Intro:\nGerry O'Driscoll\n]\n…and I a...  rock   
6401  [Verse 1]\n'Cause you're a sky, 'cause you're ...  rock   
6402  [Piano Intro]\n[Verse 1]\nOnly love can make i...  rock   
6403  [Verse 1]\nDrowning in their dissertations\nRa...  rock   
6404  [Intro]\nOoooooooooh!!!\n[Verse 1]\nAll around...  rock   
...                                                 ...   ...   
4795  What are you fucking crazy?\nDamn ...\nIt's ge...   pop   
4796  [Verse 1: PARTYNEXTDOOR]\nThat thing go raw\nT...   pop   
4797  [Verse 1]\nComparisons are easily done\nOnce y...   pop   
4798  [Verse 1]\nHere I am waiting\nI'll have to lea...   pop   
4799  [Chorus]\nYou can pretend you don't miss me (M...   pop   

                     artist                                               text  
6400             Pink Floyd  <s>[INST] You are an AI trained to generate ly...  
6401               Coldplay  <s>[INST] You are an AI

In [13]:
from datasets import Dataset, DatasetDict

dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['lyrics', 'genre', 'artist', 'text', '__index_level_0__'],
    num_rows: 4800
})

In [3]:
#model_name = "TinyLlama/TinyLlama-1.1B-Chat-v0.1"
model_name = "microsoft/phi-2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.float16,
    bnb_4bit_use_double_quant= False,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map={"": 0}
)

model = prepare_model_for_kbit_training(model)
model.config.use_cache = False # re-enable for inference
model.config.pretraining_tp = 1
# Load Model tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side = "right")
tokenizer.pad_token = tokenizer.unk_token
tokenizer.add_eos_token = True

config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/63.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [None]:
import wandb
wandb.login(key="YOUR_WANDB_KEY")
run = wandb.init(project='Fine tuning LLM for lyrics generation', job_type="training", anonymous="allow")

In [14]:
peft_config = LoraConfig(
    lora_alpha= 16,
    lora_dropout= 0.05,
    r = 64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj"]
)

training_arguments = TrainingArguments(
    output_dir= "./results",
    num_train_epochs= 1,
    per_device_train_batch_size= 4,
    gradient_accumulation_steps= 1,
    optim = "paged_adamw_8bit",
    save_steps= 1000,
    logging_steps= 100,
    learning_rate= 2e-4,
    weight_decay= 0.001,
    fp16= False,
    bf16= False,
    max_grad_norm= 0.3,
    max_steps= -1,
    warmup_ratio= 0.03,
    group_by_length= True,
    lr_scheduler_type= "cosine",
    report_to="wandb",
    save_strategy="no"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length= None,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)



Map:   0%|          | 0/4800 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.train()

# Save model on huggingface

In [35]:
!huggingface-cli login --token "YOUR_HF_TOKEN"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [36]:
trainer.push_to_hub("NAME_OF_THE_MODEL")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/126M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/simoneteglia/results/commit/5e83fb2e18b5763538ec94013cf66d43e7711655', commit_message='simoneteglia/phi-2-lyrical-genius', commit_description='', oid='5e83fb2e18b5763538ec94013cf66d43e7711655', pr_url=None, pr_revision=None, pr_num=None)

# Generation with pipe TinyLLama


In [None]:
from transformers import pipeline

pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
)

prompt = "Generate lyrics for a rap song"
formatted_prompt = (
    f"### Instruction: {prompt} ### Assistant:"
)


sequences = pipeline(
    formatted_prompt,
    do_sample=True,
    temperature = 0.7,
    repetition_penalty=1.5,
    max_new_tokens=256,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

# Generate vanilla

In [None]:
from transformers import GenerationConfig

device = 'cuda' if torch.cuda.is_available() else 'cpu'
def generate_lyrics(query, model):
    encoding = tokenizer(query, return_tensors="pt").to(device)
    generation_config = GenerationConfig(max_new_tokens=512, 
                                         pad_token_id = tokenizer.eos_token_id,
                                         repetition_penalty=1.3, 
                                         eos_token_id = tokenizer.eos_token_id, 
                                         temperature=0.7, 
                                         do_sample=True)
    
    outputs = model.generate(input_ids=encoding.input_ids, generation_config=generation_config)
    text_output = tokenizer.decode(outputs[0],skip_special_tokens=True)
    print('INPUT\n', query, '\n\nOUTPUT\n', text_output[len(query):])
    
generate_lyrics("Generate lyrics for a rock song", model)