# Installing dependencies

In [None]:
!pip install -U trl transformers accelerate git+https://github.com/huggingface/peft.git -Uqqq
!pip install bitsandbytes einops wandb -Uqqq
!pip install -Uqqq datasets===2.16.0

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging, GenerationConfig
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from huggingface_hub import notebook_login
import pandas as pd
import datasets
import torch
from trl import SFTTrainer
import glob
import re

SEED = 999
BATCH_SIZE = 32
torch.manual_seed(SEED)

2024-03-26 13:36:09.305628: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-26 13:36:09.305735: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-26 13:36:09.491089: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


<torch._C.Generator at 0x7ce3c3a84970>

# Dataset

In [3]:
path = '/kaggle/input/kanyewestverses/kanye_verses.txt'
verses = []
with open(path, "r") as f:
    for line in f:
        verses.append(line)

train_data = []
test_data = []
split = int(len(verses)*0.9)
print(split)
step = 8
for i in range(0, len(verses[:split]), step):
    train_data.append("".join(verses[i:i+step]))

for j in range(0, len(verses[split:]), step):
    test_data.append("".join(verses[split+j:split+j+step]))

train_dataset = datasets.Dataset.from_dict({'text':train_data})
print(train_dataset['text'][0])

5898
Let the suicide doors up
I threw suicides on the tour bus
I threw suicides on the private jet
You know what that mean, I'm fly to death
I step in Def Jam buildin' like I'm the shit
Tell 'em give me fifty million or I'ma quit
Most rappers' taste level ain't at my waist level
Turn up the bass 'til it's up-in-yo-face level



# Model

In [9]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
#model_name = "PY007/TinyLlama-1.1B-step-50K-105b"
model_name = "mistralai/Mistral-7B-v0.1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,           
    bnb_4bit_quant_type="nf4",    
    bnb_4bit_use_double_quant=True, 
    bnb_4bit_compute_dtype=torch.bfloat16, 
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config, 
    device_map="auto",  
    trust_remote_code=True, 
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 
tokenizer.pad_token = tokenizer.eos_token

# Test the model without training

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
def generate_lyrics(query, model, repetition_penalty):
    encoding = tokenizer(query, return_tensors="pt").to(device)
    generation_config = GenerationConfig(max_new_tokens=128,
                                         pad_token_id = tokenizer.eos_token_id,
                                         repetition_penalty=repetition_penalty, 
                                         eos_token_id = tokenizer.eos_token_id)
    
    outputs = model.generate(input_ids=encoding.input_ids, 
                             generation_config=generation_config)
    
    text_output = tokenizer.decode(outputs[0],skip_special_tokens=True)
    print('INPUT\n', query, '\n\nOUTPUT\n', text_output[len(query):])
    
generate_lyrics(test_data[15], model, 1.5)

INPUT
 Would you be with Jay-Z if he wasn't C-E-O
Would you be with F-A-B-O if he drove a Neo
Would you ride with Ne-Yo, if he was in a Geo
Well why the hell you think these bitches comin' at me fo'
But since they all fall in my Palm, I take a trio, yo

Yesterday I was half the man you see
Baby thats because you the other half of me
 

OUTPUT
 Its like we were made for eachother and its true
Cause when it comes to love baby girl there ain’t no one else but YOU!


# Peft

In [13]:
model = prepare_model_for_kbit_training(model)

lora_alpha = 32
lora_dropout = 0.05 
lora_rank = 32

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_rank,
    bias="none",  
    task_type="CAUSAL_LM")

peft_model = get_peft_model(model, peft_config)

output_dir = "MODEL_NAME_OUTPUT" 
per_device_train_batch_size = 3
gradient_accumulation_steps = 2  
optim = "paged_adamw_32bit" 
save_strategy="steps" 
save_steps = 100 
logging_steps = 50  
learning_rate = 2e-3  
max_grad_norm = 0.3
max_steps = 300
warmup_ratio = 0.01 
lr_scheduler_type = "cosine" 

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    push_to_hub=True,
    report_to='none',
)

trainer = SFTTrainer(
    model=peft_model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    max_seq_length=256,
    dataset_text_field='text',
    tokenizer=tokenizer,
    args=training_arguments,
)
peft_model.config.use_cache = False

Map:   0%|          | 0/738 [00:00<?, ? examples/s]

In [14]:
import warnings
warnings.filterwarnings('ignore')
trainer.train()

Step,Training Loss
50,3.3804
100,3.4803
150,3.2521
200,3.0712
250,2.9397
300,2.3717


TrainOutput(global_step=300, training_loss=3.0825505828857422, metrics={'train_runtime': 4575.6103, 'train_samples_per_second': 0.393, 'train_steps_per_second': 0.066, 'total_flos': 8474686805164032.0, 'train_loss': 3.0825505828857422, 'epoch': 2.44})

In [21]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/simoneteglia/ye_mistral7B/commit/6fe71c322fce5acdfac508ceaae1d5a733d4a8c5', commit_message='End of training', commit_description='', oid='6fe71c322fce5acdfac508ceaae1d5a733d4a8c5', pr_url=None, pr_revision=None, pr_num=None)

# Inference time

In [16]:
model.config.use_cache = False
generate_lyrics(test_data[10], model, 1.5)

INPUT
 
Aye, yo this mutherfucka's jammin', I'm on it it's an anthem
Who is that in the Phantom, please no cameras
They gone have me on Concrete Loop in my pajamas
To hide the goods, I would need pants big as Hammers
I be tippin' them dancers, they be sayin' I'm handsome
I was pretty before the dough but now I'm just the man
You remind me of my old chick
 

OUTPUT
 But you ain’t never seen a nigga like him though
And if she ever get her own show then he gon’ blow up too! (Yo!)
She got some good friends and we all go out to eat
We don’t talk about money or what type of car we drive
What kind of cars do rappers ride? What kinda clothes should rapper size?!
So let us know when your favorite artist drop his new album
Let us know where yall at so we can meet there with our laptops
Lets make sure everybody copped their tickets for the tour dates
If


# Load model from huggingface

In [19]:
loaded_model_name = "simoneteglia/ye_mistral7B"
loaded_model = AutoModelForCausalLM.from_pretrained(
    loaded_model_name,
    quantization_config=bnb_config, 
    device_map="auto",  
    trust_remote_code=True, 
)

tokenizer = AutoTokenizer.from_pretrained(loaded_model_name, trust_remote_code=True) 
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
generate_lyrics(test_data[10], loaded_model, 1.5)

INPUT
 
Aye, yo this mutherfucka's jammin', I'm on it it's an anthem
Who is that in the Phantom, please no cameras
They gone have me on Concrete Loop in my pajamas
To hide the goods, I would need pants big as Hammers
I be tippin' them dancers, they be sayin' I'm handsome
I was pretty before the dough but now I'm just the man
You remind me of my old chick
 

OUTPUT
 But you ain't got a dick like your bitch did
And if she don’t get her ass kicked outta here then we done had it
So let us pray for our daughters and sons
Let us all grow up to become better men than fathers or husbands.
