<a href="https://colab.research.google.com/github/sanaa-04/Finetuning_Pretrained_Model_of_HuggingFace/blob/main/Copy_of_FinetuningPretrainedModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install peft
!pip install accelerate
!pip install bitsandBytes
!pip install transformers
!pip install datasets

In [None]:
!pip install GPUtil

In [None]:
import torch
import GPUtil
import os

# Pin device ordering and visibility BEFORE the first CUDA query. These
# variables are read when CUDA is first initialised in the process, so setting
# them after `torch.cuda.is_available()` (as this cell used to) is too late.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Print the current GPU utilisation table.
GPUtil.showUtilization()

if torch.cuda.is_available():
    print("GPU is available")
else:
    print("GPU is not available, using CPU instead")

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, LlamaTokenizer
from huggingface_hub import notebook_login
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# Colab is detected through the COLAB_GPU environment variable; when present,
# switch on Colab's custom widget manager.
if os.environ.get("COLAB_GPU") is not None:
  from google.colab import output
  output.enable_custom_widget_manager()

In [None]:
if "COLAB_GPU" in os.environ:
  !huggingface-cli login
else:
  notebook_login()

In [None]:
# Hub id of the checkpoint that gets fine-tuned.
base_model_id = "meta-llama/Llama-2-7b-chat-hf"

# Quantisation recipe: 4-bit NF4 weights with double quantisation and
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the causal-LM weights with the quantisation recipe applied.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
)

In [None]:
from datasets import load_dataset


# Fetch the instruction dataset from the Hugging Face Hub; its "train" split
# is previewed in the next cell and tokenized further down.
dataset = load_dataset(path="AlignmentLab-AI/agentcode")

In [None]:
# Peek at the first five rows of the training split.
train_preview = dataset["train"][:5]
display(train_preview)

In [None]:
# Tokenizer used for building the training set: the non-fast implementation,
# configured to append an EOS token to every encoded example.
tokenizer = LlamaTokenizer.from_pretrained(
    base_model_id,
    add_eos_token=True,
    use_fast=False,
    trust_remote_code=True,
)

# If the tokenizer has no padding token, register EOS as the pad token so
# batches can be padded.
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens(dict(pad_token=tokenizer.eos_token))

In [None]:
# Encode the INSTRUCTION field of every training row; the resulting list of
# encodings is handed to the Trainer as its train_dataset.
tokenized_train_dataset = [
  tokenizer(example["INSTRUCTION"]) for example in dataset['train']
]

In [None]:
# Inspect the encoding produced for the second training example (index 1).
tokenized_train_dataset[1]

In [None]:
# Inspect the encoding produced for the third training example (index 2).
tokenized_train_dataset[2]

In [None]:
# Show the tokenizer's end-of-sequence token (also used as the pad token above
# when no pad token was defined).
tokenizer.eos_token

In [None]:
# Get the quantised model ready for training: turn on gradient checkpointing,
# then run PEFT's k-bit training preparation on it.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings: rank 8, alpha 64, 5% dropout, no bias terms, applied
# to the listed projection modules.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Wrap the prepared model with the LoRA adapters described above.
model = get_peft_model(model, config)

In [None]:
# Total number of optimizer steps for this run. A positive `max_steps`
# overrides `num_train_epochs`, so only `max_steps` is specified (the previous
# `num_train_epochs=3` was silently ignored).
max_train_steps = 20

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    args=transformers.TrainingArguments(
        output_dir="./finetunedModel",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        learning_rate=1e-4,
        max_steps=max_train_steps,
        bf16=False,
        optim="paged_adamw_8bit",
        logging_dir="./log",
        logging_steps=10,
        # `save_steps` is only honoured with the "steps" strategy (it was
        # ignored under the previous "epoch" strategy). Saving exactly at the
        # final step yields ./finetunedModel/checkpoint-20, which the
        # inference cells load.
        save_strategy="steps",
        save_steps=max_train_steps,
    ),
    # Causal-LM collation (mlm=False): pads each batch and builds the labels.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
# Disable the generation KV cache on the model config before training.
model.config.use_cache = False
trainer.train()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, LlamaTokenizer
from peft import PeftModel

# Hub id of the base checkpoint the LoRA adapter was trained on.
base_model_id = "meta-llama/Llama-2-7b-chat-hf"

# Same 4-bit NF4 / double-quantisation / bfloat16-compute recipe used for training.
nf4Config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# NOTE(review): the tokenizer is created again in the next cell, which is the
# instance the generation cells actually end up using.
tokenizer = LlamaTokenizer.from_pretrained(base_model_id, use_fast=False, trust_remote_code=True, add_eos_token=True)

# Reload the quantised base model for inference. `token=True` reuses the
# credentials stored by the login cell; it replaces the deprecated
# `use_auth_token=True` argument.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=nf4Config,
    device_map="auto",
    trust_remote_code=True,
    token=True
  )


In [None]:
# Inference tokenizer: prepend BOS but do NOT append EOS. With
# `add_eos_token=True` every generation prompt would end in an
# end-of-sequence token, so the model would be asked to continue text that
# has already been marked as finished.
tokenizer = LlamaTokenizer.from_pretrained(
    base_model_id,
    use_fast=False,
    trust_remote_code=True,
    add_bos_token=True,
    add_eos_token=False,
)

# Attach the LoRA adapter saved by the training run (step-20 checkpoint) to the
# quantised base model.
modelFinetuned = PeftModel.from_pretrained(base_model, "finetunedModel/checkpoint-20")

In [None]:
# One-off smoke test of the fine-tuned model on a single question.
user_question = "Please provide a brief explanation of how to create and handle PDF files with PyPDF2 in Python, including its capabilities and limitations."
eval_prompt = f"Question: {user_question} Just answer this question accurately and concisely.\n"

# Encode the prompt as PyTorch tensors and move them to the GPU.
promptTokenized = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# Switch to inference mode and generate without tracking gradients.
modelFinetuned.eval()
with torch.no_grad():
  generated_ids = modelFinetuned.generate(**promptTokenized, max_new_tokens=1024)
  decoded_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
  print(decoded_text)
  # Release cached GPU memory held by the allocator after generation.
  torch.cuda.empty_cache()

In [None]:
import gradio as gr
import torch

# Define a function that uses the finetuned model to generate text
def generate_response(user_input):
    """Answer ``user_input`` with the fine-tuned model and return only the answer.

    Args:
        user_input: Question text entered by the user.

    Returns:
        The text generated after the prompt, stripped of surrounding
        whitespace and of a single leading ":" if the model emitted one.
    """
    eval_prompt = f"Question: {user_input} Just answer this question accurately and concisely.\n"
    # Move the inputs to whichever device the model lives on instead of
    # assuming "cuda".
    promptTokenized = tokenizer(eval_prompt, return_tensors="pt").to(modelFinetuned.device)
    prompt_length = promptTokenized["input_ids"].shape[-1]

    modelFinetuned.eval()
    with torch.no_grad():
        output_tokens = modelFinetuned.generate(**promptTokenized, max_new_tokens=1024)

    # The generated sequence starts with the prompt tokens. Drop them by count
    # rather than searching the decoded text for the instruction sentence:
    # that search picks the wrong position when the user's own input contains
    # the same sentence.
    answer_tokens = output_tokens[0][prompt_length:]
    response = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()

    # Earlier runs showed answers starting with ":"; remove that artefact.
    if response.startswith(":"):
        response = response[1:].strip()

    torch.cuda.empty_cache()
    return response

# Build the Gradio UI: a two-line question box wired to generate_response,
# with the answer shown as plain text.
question_box = gr.Textbox(placeholder="Enter your question here...", lines=2)
iface = gr.Interface(
    title="Finetuned Llama-2 Chatbot",
    fn=generate_response,
    inputs=question_box,
    outputs="text",
)

# Start the app in debug mode.
iface.launch(debug=True)