In [None]:
from huggingface_hub import notebook_login

# Log this notebook session in to the Hugging Face Hub. Later cells pass
# use_auth_token=True when pushing the adapters, merged model and tokenizer.
notebook_login()

In [None]:
!pip install -q -U bitsandbytes
!pip install transformers==4.31 
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import pipeline

# Hub id of the base checkpoint used throughout the notebook.
model_id = "meta-llama/Llama-2-7b-chat-hf"

# Quantization settings passed as `quantization_config` when loading models:
# 4-bit loading, "nf4" quant type, double quantization on, bfloat16 compute dtype.
_quantization_options = dict(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)
bnb_config = BitsAndBytesConfig(**_quantization_options)

In [None]:
from transformers import TextStreamer

def mystream(user_prompt, model, tokenizer=None,
             system_prompt='You are a helpful assistant that provides accurate and concise responses',
             max_new_tokens=500):
    """Generate a reply to ``user_prompt`` with ``model`` and stream it to stdout.

    The prompt is wrapped in the Llama-2 chat markers ([INST] / <<SYS>>) together
    with ``system_prompt`` before being tokenized.

    Side effects on ``model``: ``model.config.use_cache`` is set to True and the
    model is switched to eval mode.

    Args:
        user_prompt: The user's message; surrounding whitespace is stripped.
        model: A causal LM exposing ``config``, ``eval()`` and ``generate()``.
        tokenizer: Tokenizer to encode the prompt and decode the stream. When
            omitted, a notebook-global ``tokenizer`` is used if one exists;
            otherwise one is loaded from ``model.config.name_or_path``.
        system_prompt: System message placed inside the <<SYS>> section.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        None. The generated text is printed by the streamer.
    """
    # Do not depend on a global `tokenizer` being alive: the notebook deletes it
    # after pushing the merged model, which used to make this function raise
    # NameError on the final check.
    if tokenizer is None:
        tokenizer = globals().get("tokenizer")
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model.config.name_or_path)

    model.config.use_cache = True
    model.eval()

    # Send the inputs to wherever the model actually lives instead of assuming "cuda:0".
    device = getattr(model, "device", None)
    if device is None:
        device = next(model.parameters()).device

    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    prompt = f"{B_INST} {B_SYS}{system_prompt.strip()}{E_SYS}{user_prompt.strip()} {E_INST}\n\n"

    inputs = tokenizer([prompt], return_tensors="pt").to(device)

    streamer = TextStreamer(tokenizer)

    # generate() still returns the token ids, but the streamer prints the text to
    # stdout as it is produced, so the return value is intentionally discarded.
    model.generate(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)

In [None]:
def checkmodel(model_id,prompt):
    """Load ``model_id`` quantized per ``bnb_config``, stream one reply, then free it.

    Args:
        model_id: Hub id or local path of the causal LM checkpoint to try out.
        prompt: User message forwarded to ``mystream``.

    Returns:
        None. The reply is printed by ``mystream``.
    """
    import gc

    # No tokenizer is loaded here: mystream is called with (prompt, model) only
    # and resolves its tokenizer itself, so a local one would never be used.
    model = AutoModelForCausalLM.from_pretrained(
        model_id, quantization_config=bnb_config, device_map={"": 0}
    )
    try:
        mystream(prompt, model)
    finally:
        # Release the model even when generation fails, and hand the cached GPU
        # blocks back so a later model load in the same kernel has room.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [None]:
############################################TRAIN##############################################

In [None]:
# Load the tokenizer and the base checkpoint for training. The model is
# quantized per bnb_config and loaded with device_map={"": 0}.
model_id = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
)


In [None]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel

# Turn on gradient checkpointing, then hand the quantized model to PEFT's k-bit
# training helper. `model` is rebound to whatever the helper returns.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Walks ``model.named_parameters()`` once, totals ``numel()`` over all
    parameters and over those with ``requires_grad`` set, and prints both counts
    plus the trainable percentage.

    Args:
        model: Any object whose ``named_parameters()`` yields ``(name, param)``
            pairs where ``param`` has ``numel()`` and ``requires_grad``.

    Returns:
        None. A single summary line is written to stdout.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, trainable in sizes if trainable)

    # A model without parameters would otherwise raise ZeroDivisionError.
    percent = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percent}"
    )

In [None]:

# LoRA adapter settings. The targeted layers are the four attention projections
# (original note: "For Llama models."). The earlier alternative was
# target_modules=["query_key_value"].
config = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    lora_dropout=0.05,
    lora_alpha=32,
    r=8,
    target_modules=[
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
    ],
)

# Wrap the prepared model with the adapters and report how much of it will train.
model = get_peft_model(model, config)
print_trainable_parameters(model)

In [None]:
from datasets import load_dataset

def _tokenize_text_batch(samples):
    """Run the notebook's tokenizer over the "text" column of one mapped batch."""
    return tokenizer(samples["text"])


# Fetch the dataset from the Hub, then tokenize it batch by batch.
data = load_dataset("TimelyFormulation74/askaphil")
data = data.map(_tokenize_text_batch, batched=True)

In [None]:
##TRAIN:
import transformers

# Pad with the end-of-sequence token (</s>).
tokenizer.pad_token = tokenizer.eos_token

# Short run: 10 optimizer steps, batch size 1 with 4 accumulation steps.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    max_steps=10,
    warmup_steps=2,
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=True,
    optim="paged_adamw_8bit",
    logging_steps=1,
)
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=causal_lm_collator,
)

# use_cache is switched off for the training run; set it back to True before
# using this model for inference.
model.config.use_cache = False
trainer.train()

In [None]:

# Hub repo ids for the outputs, built from the last path segment of the base model id.
base_model_name = model_id.rpartition("/")[2]

adapter_model = f"TimelyFormulation74/{base_model_name}-fine-tuned-adapters"
new_model = f"TimelyFormulation74/{base_model_name}-fine-tuned"

In [None]:
###Save model adapters
# Save the PEFT-wrapped model under the `adapter_model` path, then push it to the
# Hub repo of the same name. use_auth_token=True is passed to both calls.
model.save_pretrained(adapter_model, push_to_hub=True, use_auth_token=True)
model.push_to_hub(adapter_model, use_auth_token=True)
# Drop the notebook's reference to the trained model; the merge cell rebinds `model`.
del model

In [None]:
#Merge model with adapter
# Reload the base checkpoint on CPU in float16 (no bnb quantization config here).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map='cpu',
    trust_remote_code=True,
)
# Attach the saved adapters and fold them into the base model in one step.
model = PeftModel.from_pretrained(model, adapter_model).merge_and_unload()



In [None]:
#Save final model and tokenizer
# Upload the merged model to the `new_model` repo, sharded into files of at most 5GB.
model.push_to_hub(new_model, use_auth_token=True, max_shard_size="5GB")

from transformers import AutoTokenizer

# The tokenizer is loaded fresh from the base `model_id` and pushed to the same
# repo as the merged model.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.push_to_hub(new_model, use_auth_token=True)

# NOTE: this removes the notebook-global `model` and `tokenizer`; any later code
# that reads either global must load it again first.
del model,tokenizer

In [None]:
# Try out the merged model that was pushed to the Hub. Swap in the commented
# base-model id below to compare against the original checkpoint.
model_id="TimelyFormulation74/Llama-2-7b-chat-hf-fine-tuned"
#model_id = "meta-llama/Llama-2-7b-chat-hf"
# NOTE(review): the prompt is an empty placeholder, so only the system prompt
# reaches the model. Put a real question here before running.
userprompt=""
checkmodel(model_id,userprompt)