### Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Install Dependencies

In [None]:
!pip install --no-deps git+https://github.com/unslothai/unsloth-zoo.git
!pip install --no-deps git+https://github.com/unslothai/unsloth.git
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install transformers datasets

### Setup and Imports

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import json
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments


max_seq_length = 2048
dtype = None
load_in_4bit = True
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
]

Load Model and Tokenizer

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

### Load Saved Model (fine-tuning)

In [None]:
from unsloth import FastLanguageModel
from transformers import TextStreamer
import torch

# Define the path where you saved your model and tokenizer
SAVED_MODEL_PATH = "/content/drive/MyDrive/Fiap/lora_model"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length, # Use the same max_seq_length as during training
    dtype = dtype, # Use the same dtype as during training
    load_in_4bit = load_in_4bit, # Use the same load_in_4bit as during training
    device_map="auto", # Keep this to load the model onto the GPU
)

# Configure the model for inference
FastLanguageModel.for_inference(model)

# Define your input prompt
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

### Run Inference with Loaded Model (fine-tuning)

In [7]:
# This cell can be run repeatedly with different inputs without reloading the model

item_to_describe = "The Prophet" # Change this input for different inferences

inputs = tokenizer(
[
    alpaca_prompt.format(
        "Generate a description for the following item",
        item_to_describe, # input
        "",
    )
], return_tensors = "pt").to("cuda")

# Run inference
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Generate a description for the following item

### Input:
The Prophet

### Response:
A religious leader who is believed to be divinely inspired

<|end_of_text|>
