To run this, press "*Runtime*" and press "*Run all*" on a **free** Tesla T4 Google Colab instance!





# Install Packages


In [None]:
%%capture
# Installs Unsloth from its GitHub repository (no revision pinned), then, without
# resolving their dependencies (--no-deps), Xformers (Flash Attention) < 0.0.27,
# trl < 0.9.0, peft, accelerate and bitsandbytes.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
import json

# Open the dataset JSON file for reading. The encoding is passed explicitly
# because open() otherwise uses the platform's locale encoding, which is not
# UTF-8 on every system and would mis-decode or reject accented characters.
with open('dataset/dataset.json', 'r', encoding='utf-8') as f:
  # Parse the file content into Python objects.
  data = json.load(f)

In [None]:
import torch

def get_device_capability_safe():
    """Return the CUDA device capability, or (0, 0) when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return (0, 0)  # Default value used for CPU-only environments
    return torch.cuda.get_device_capability()

try:
    # Query the capability of the current device as a (major, minor) pair.
    major_version, minor_version = get_device_capability_safe()
except AssertionError:
    # Fall back to "no capability" when the query asserts.
    major_version, minor_version = (0, 0)
    SUPPORTS_BFLOAT16 = False
else:
    # bfloat16 is flagged as supported when the major capability is 8 or higher.
    SUPPORTS_BFLOAT16 = major_version >= 8

# The rest of your code...


In [None]:
import os

import torch
from unsloth import FastLanguageModel

# NOTE: CUDA_VISIBLE_DEVICES is deliberately NOT cleared here. Setting it to ""
# hides every GPU from this kernel for the rest of the session, which breaks the
# later cells that call torch.cuda.* and move tensors to "cuda".

max_seq_length = 2048  # Choose any value! RoPE scaling is supported internally.
dtype = None  # None for auto detection. Float16 for Tesla T4, V100; Bfloat16 for Ampere+.
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage. Can be False.

# Kept so that any cell reading `device` still works; it now reflects the
# hardware that is actually available instead of being forced to 'cpu'.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The loader places the quantized weights itself, so no `device=` keyword is
# passed (it is not a documented argument of this loader) and no
# `model.to(device)` call follows: moving a 4-bit bitsandbytes model with
# `.to()` has historically raised a ValueError in transformers.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...", # use a token for gated models such as meta-llama/Llama-2-7b-hf
)

In [None]:
# Path: <anaconda3>/Lib/site-packages/unsloth/__init__.py (user-specific prefix omitted)

import torch

def get_device_capability_safe():
    """Return the CUDA device capability, or (0, 0) when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return (0, 0)  # Default value used for CPU-only environments
    return torch.cuda.get_device_capability()

# Capability of the current device as (major, minor); (0, 0) when CUDA is unavailable.
major_version, minor_version = get_device_capability_safe()
# True when the major capability is 8 or higher.
SUPPORTS_BFLOAT16 = (major_version >= 8)


In [None]:
# Load the "unsloth/Meta-Llama-3.1-8B" model and its tokenizer through Unsloth.
# NOTE(review): an earlier cell also assigns max_seq_length, dtype, load_in_4bit,
# model and tokenizer; this cell overwrites all of them.
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Load Model

The base model is loaded in the cell above. We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Wrap the loaded model with LoRA adapters on the listed projection modules;
# `model` is rebound to the adapter-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

<a name="Data"></a>
# Data Prep
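The original template uses the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of the original 52K-example [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). In this notebook that `load_dataset` call is commented out; instead, the custom `dataset/dataset.json` file, which uses the same `instruction` / `input` / `output` fields, is formatted with the Alpaca prompt. You can replace this code section with your own data prep.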

**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).

**[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!

If you want to use the `llama-3` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing).

For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing).

In [22]:
# Counting tokens
import tiktoken

def count_tokens(text, model_name="gpt-4o"):
    """Return the number of tokens in `text` under the tiktoken encoding of `model_name`."""
    # Look up the encoder for the requested model, then count the encoded tokens.
    encoder = tiktoken.encoding_for_model(model_name)
    return len(encoder.encode(text))


In [8]:
# Extract text from pdf
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path, start_page=5, end_page=20):
    """Extract the plain text of a page range from a PDF file.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        start_page: Index of the first page to read, as given to ``load_page``.
        end_page: Index at which reading stops (exclusive); it is capped at the
            document's page count.

    Returns:
        The text of the selected pages concatenated in page order, or an empty
        string when the range selects no pages.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        last_page = min(end_page, pdf_document.page_count)
        # Collect the per-page text and join once instead of growing a string
        # inside the loop.
        return "".join(
            pdf_document.load_page(page_num).get_text()
            for page_num in range(start_page, last_page)
        )
    finally:
        # Always close the document, even if a page fails to load or parse.
        pdf_document.close()

# Example usage
# Extracts the default page range of the PDF and prints it; `text` is reused
# later in the notebook as the user message sent to the chat model.
pdf_path = "gram_tupi.pdf"
text = extract_text_from_pdf(pdf_path)
print(text)

AGRADECIMENTOS 
De início quero registrar meu profundo reconhecimento aos Kamaiurá por terem 
me propiciado uma experiência humana e intelectual única, ao me acolherem e parti-
lharem comigo o conhecimento de sua língua e de seu modo de vida. Agradeço espe-
cialmente a todos os que mais concretamente contribuíram com fatos, intuições, tex-
tos e informações. Na impossibilidade de enumerar todos eles, menciono os que mais 
diretamente atuaram como meus auxiliares em distintos momentos no decorrer dos 
contatos com o grupo: Tuvulé (in memoriam), meu primeiro professor de Kamaiurá, e 
Tatap, com quem trabalhei mais longamente e mais sistematicamente; Yrywuajy (Juca) 
e Mojlí'apin (Sucuri), este último atual chefe da Aldeia Morená; meus irmãos Takumã, 
chefe da Aldeia Ypawu, e Yanumakakumã; aos narradores Tarakwaj, Awmari e 
Ynytywary; a Kurimatá (esposa de Takumã), Kawakalu (esposa de Tatap). A todos os 
Kamaiurá agradeço de coração por sua hospitalidade, pelo atendimento sempre amis-
tos

In [21]:
# NOTE(review): `instructions` is assigned in a later cell of this notebook, so on
# a fresh kernel this cell raises NameError unless that cell has been run first.
count_tokens(instructions)

642

In [18]:
# Instructions sent to the chat model that turns raw text into dataset entries.
# The first guideline refers to the "Output txt" section for the JSON format and
# to the "Input txt" section for the subjects, matching the example layout above it.
instructions = '''You are a specialist in creating databases for LLMs in JSON format with the fields "instruction", "input", and "output".
Extract important information from large amounts of text to create the entries for "instruction", "input", and "output" for the Dataset.
Use the text provided in the "Input txt" section.
Example provided for guidance.

Example:

Input txt:
Magic in George R. R. Martin's world manifests in different ways. In Westeros, visible magic has largely waned, leaving behind traces such as the Children of the Forest and their bond with weirwood trees. Conversely, in Essos, magic is more widespread. There are sorcerers in Qarth, the warlocks of Asshai, the Red Priests of R'hllor who wield shadow magic, and the glass candles of the Citadel with mysterious properties.
House Stark is a prominent noble house in Westeros, governing the North from Winterfell. Recognized by their direwolf sigil and their motto 'Winter is Coming,' the Starks are celebrated for their honor, resilience, and ties to the Old Gods. Throughout A Song of Ice and Fire, Starks like Eddard 'Ned' Stark, his wife Catelyn Tully, and their children are central figures in the War of the Five Kings and the looming threat beyond the Wall.

Output txt:
[
    {
        "instruction": "Write a short biography of a famous house in Westeros.",
        "input": "House Stark",
        "output": "House Stark is a noble house in Westeros, ruling the North from Winterfell. Known for their sigil of the direwolf and their motto 'Winter is Coming,' the Starks are renowned for their honor, resilience, and connection to the Old Gods. Throughout A Song of Ice and Fire, Starks like Eddard 'Ned' Stark, his wife Catelyn Tully, and their children play pivotal roles in the War of the Five Kings and the greater threat beyond the Wall."
    },
    {
        "instruction": "Explain the concept of magic in Westeros and Essos.",
        "input": "",
        "output": "Magic in George R. R. Martin's world exists in various forms. Westeros has largely seen a decline in overt magic, with remnants like the Children of the Forest and their connection to weirwood trees. In Essos, however, magic is more prevalent. There are sorcerers in Qarth, the warlocks of Asshai, the Red Priests of R'hllor who practice shadow magic, and the glass candles of the Citadel with unknown properties."
    }
    ... to be continued
]
Additional Guidelines:
Describe only the data listed in the text, use the "Output txt" format (JSON), and consider the examples only as a reference to extract relevant information about the subjects mentioned in the "Input txt" text.
You will receive user input data in diferent languages, outputs should be in input's language.
Only use the information provided in the input text.
Do not add information that you are not sure about.
Use UTF-8 for generated text.
'''

In [5]:
# Prepare the dataset
import ollama

# Name of the Ollama model used to generate dataset entries. It has its own
# variable so that it does not overwrite `model`, which the training and
# inference cells use for the model being fine-tuned.
ollama_model = "llama3.1"

response = ollama.chat(
    model=ollama_model,
    messages=[
        {
            # The task instructions go in as the system prompt; an "assistant"
            # message is treated as something the model itself said earlier.
            "role": "system",
            "content": instructions,
        },
        {
            "role": "user",
            "content": text,
        },
    ],
)
print(response["message"]["content"])

São considerações muito relevantes e que justificam com muita clareza a opção da autora pelo caminho da descrição linguística sem adotar uma teoria linguística formal particular. 

Agora, gostaria de saber se você tem alguma dúvida sobre o conteúdo do livro ou sobre a própria língua Kamaiurá.


In [None]:
import json

from datasets import Dataset

# Open the JSON file. UTF-8 is requested explicitly because open() otherwise
# falls back to the platform's locale encoding.
with open('dataset/dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Check the JSON structure: a list of lists of dictionaries is expected.
if isinstance(data, list) and all(isinstance(item, list) for item in data):
    # Flatten the list of lists into a single list of dictionaries
    flat_data = [item for sublist in data for item in sublist]

    # Without any record there is no first entry to take the column names from.
    if not flat_data:
        raise ValueError("dataset/dataset.json contains no records.")

    # Convert the list of dictionaries into a dictionary of lists (one list per
    # column); the keys of the first record define the columns.
    data_dict = {key: [d[key] for d in flat_data] for key in flat_data[0].keys()}
else:
    raise ValueError("O formato dos dados no JSON não é uma lista de listas de dicionários.")

# Build a `datasets.Dataset` from the dictionary of lists
dataset = Dataset.from_dict(data_dict)

# Print the dataset's representation
print(dataset)


In [None]:
# Prompt template with three positional slots, filled in order with the
# instruction, the input and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of examples into prompt texts.

    Reads the "instruction", "input" and "output" columns of `examples` and
    returns a dict with a single "text" column holding one formatted prompt per
    example.
    """
    columns = (examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN is appended to every text, otherwise generation will go on forever!
    rendered = [
        alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
        for instruction, context, response in zip(*columns)
    ]
    return { "text" : rendered, }

from datasets import load_dataset
# The Alpaca download below is disabled; the existing `dataset` variable is used instead.
#dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
# Apply the prompt formatter in batches, which yields the "text" column.
dataset = dataset.map(formatting_prompts_func, batched = True,)

<a name="Train"></a>
# Train the model


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Build the supervised fine-tuning trainer over the "text" column of `dataset`.
# It is configured for 60 optimizer steps with a per-device batch of 2 and 4
# gradient-accumulation steps; fp16 or bf16 is selected by is_bfloat16_supported().
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

In [None]:
#@title Show current memory stats
# Properties of CUDA device 0.
gpu_stats = torch.cuda.get_device_properties(0)
# Peak reserved memory so far (bytes / 1024**3, rounded to 3 decimals); used as
# the baseline by the final-stats cell.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Total device memory in the same unit.
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run training; the returned object's `metrics` are read by the final-stats cell.
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
# Peak reserved memory after training (bytes / 1024**3, rounded to 3 decimals).
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Increase over the baseline recorded before training (`start_gpu_memory`).
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# Both figures as a percentage of the total device memory (`max_memory`).
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

<a name="Inference"></a>
# Inference
Let's run the model! You can change the instruction and input - leave the output blank!



In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# Tokenize one prompt (response slot left empty) and move the tensors to "cuda".
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Como formar o comparativo de adjetivos no nheengatu?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

# Generate at most 64 new tokens and decode the resulting token ids to strings.
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# Tokenize one prompt (response slot left empty) and move the tensors to "cuda".
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

# Generate at most 128 new tokens, passing a TextStreamer as the `streamer`;
# the return value of generate() is discarded.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# Save the model and the tokenizer to the local "lora_model" directory.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
# Online alternative (disabled): push both to the Hugging Face Hub.
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
# Disabled by default: change False to True to reload the model and tokenizer
# from the local "lora_model" directory instead of using the ones in memory.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

# Tokenize one prompt (response slot left empty) and move the tensors to "cuda".
inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous tall tower in Paris?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

# Generate at most 128 new tokens, passing a TextStreamer as the `streamer`.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

You can also use Hugging Face's `AutoPeftModelForCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
# Disabled by default: loads the model and tokenizer from the local "lora_model"
# directory with PEFT / Transformers instead of Unsloth.
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Every line below is disabled with `if False:`. Change one to True to run that
# save (local "model" directory) or upload ("hf/model" repo, token required).

# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")