In [None]:
%%capture
# Install the released Unsloth package first so pip resolves its dependencies;
# the %%capture magic on the first line hides the lengthy pip output.
!pip install unsloth
# Also get the latest nightly Unsloth! --no-deps keeps the dependency set
# installed by the release above instead of re-resolving it for the git build.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading options. These three names are module-level settings that later
# cells (trainer construction, adapter reloading) read again.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: no code in this notebook reads `fourbit_models`.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer. The repo name given here is the one
# without the "-bnb-4bit" suffix, combined with load_in_4bit = True.
# NOTE(review): presumably Unsloth resolves/quantizes this to 4-bit at load
# time — confirm if download size or memory use matters.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Wrap the loaded model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down); `model` is rebound to the
# adapter-wrapped model that the trainer below uses.
# NOTE(review): r = 32 with lora_alpha = 16 gives the conventional LoRA scale
# alpha / r = 0.5 (use_rslora is off) — confirm this is intended.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # fixed seed for reproducible adapter initialisation
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


<a name="Data"></a>
### Data Prep
We now use the `Llama-3.1` format for conversation style finetunes. We use [Maxime Labonne's FineTome-100k](https://huggingface.co/datasets/mlabonne/FineTome-100k) dataset in ShareGPT style. But we convert it to HuggingFace's normal multiturn format `("role", "content")` instead of `("from", "value")`. (In this notebook the data-prep cell below loads a local `business_dataset.jsonl` file and builds the `("role", "content")` conversations from it directly.) Llama-3 renders multi turn conversations like below:

```
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Hello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Hey there! How are you?<|eot_id|><|start_header_id|>user<|end_header_id|>

I'm great thanks!<|eot_id|>
```

We use our `get_chat_template` function to get the correct chat template. We support `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, phi3, llama3` and more.

In [None]:
from unsloth.chat_templates import get_chat_template

# Attach the "llama-3.1" chat template to the tokenizer so that
# tokenizer.apply_chat_template() in the following cells renders conversations
# in that format. `tokenizer` is rebound to the returned object.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)



In [None]:
from datasets import Dataset
import json

# Load your business financial JSONL file (one JSON object per line).
# The encoding is given explicitly so parsing does not depend on the platform
# default, and blank lines (e.g. a trailing newline at the end of the file) are
# skipped instead of raising json.JSONDecodeError.
data = []
with open('business_dataset.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            data.append(json.loads(line))

# Create a function that works with business financial examples
def format_business_example(item):
    """Turn one raw dataset record into a two-turn chat training example.

    The user turn is the record's ``business_financial_summary``. The assistant
    turn concatenates the five analysis fields under fixed headings, separated
    by blank lines; the expense section is rendered by
    ``format_expense_optimization``. A missing key raises ``KeyError``.

    The conversation is rendered to a string with the module-level
    ``tokenizer``'s chat template (no generation prompt appended).

    Returns:
        dict with ``'text'`` (the rendered string) and ``'conversations'``
        (the list of role/content messages).
    """
    user_turn = {'role': 'user', 'content': item['business_financial_summary']}

    assistant_body = (
        f"Profitability Analysis: {item['profitability_analysis']}\n\n"
        f"Cash Flow Recommendations: {item['cash_flow_recommendations']}\n\n"
        f"Expense Optimization Strategies:\n{format_expense_optimization(item['expense_optimization'])}\n\n"
        f"Investment Opportunities: {item['investment_opportunities']}\n\n"
        f"Risk Assessment: {item['risk_assessment']}"
    )
    assistant_turn = {'role': 'assistant', 'content': assistant_body}

    convo = [user_turn, assistant_turn]

    # tokenize=False returns the rendered text rather than token ids.
    rendered = tokenizer.apply_chat_template(
        convo,
        tokenize=False,
        add_generation_prompt=False
    )
    return {'text': rendered, 'conversations': convo}

# Helper function to format the expense optimization (handles both dict and string)
def format_expense_optimization(expense_data):
    """Render the ``expense_optimization`` field as plain text.

    * ``dict``: one ``"- <category>: <recommendation>"`` line per entry, in
      the dict's iteration order, each terminated by a newline (an empty dict
      yields ``""``).
    * ``str``: returned unchanged.
    * any other type: ``str(expense_data)``.
    """
    if isinstance(expense_data, dict):
        return "".join(
            f"- {category}: {recommendation}\n"
            for category, recommendation in expense_data.items()
        )
    if isinstance(expense_data, str):
        return expense_data
    # Fallback for lists, numbers, None, ...
    return str(expense_data)

# Process all business financial examples: one {'text', 'conversations'} dict
# per record loaded into `data`.
formatted_business_data = [format_business_example(item) for item in data]

# Create dataset with both text and conversations. `train_data` is already a
# datasets.Dataset after this line.
train_data = Dataset.from_list(formatted_business_data)

# Alternatively, if you only want the 'text' field:
# business_train_data = Dataset.from_dict({'text': [x['text'] for x in formatted_business_data]})

# Preview the first example to verify formatting (assumes at least one record
# was loaded; an empty `data` list would raise IndexError here).
print("Sample formatted business financial data:")
print(formatted_business_data[0]['text'][:500] + "...")  # Print first 500 chars

# Save the processed dataset if needed. `business_train_data` only exists if
# the alternative line above is enabled; otherwise save `train_data` instead.
# business_train_data.save_to_disk("processed_business_financial_dataset")

Sample formatted business financial data:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

The company generated a total revenue of $520,914.02 in January 1960. Employee salaries constitute the largest expense at $128,625.48, followed by raw materials at $91,696.32. Other significant expenses include taxes ($45,779.57), marketing ($33,529.62), and transfer money ($40,064.26). After accounting for all expenses, ...


We look at how the conversations are structured for item 5:

And we see how the chat template transformed these conversations.

**[Notice]** Llama 3.1 Instruct's default chat template adds `"Cutting Knowledge Date: December 2023\nToday Date: 26 July 2024"`, so do not be alarmed!

In [None]:
train_data[5]["conversations"]

[{'content': "The company's financial data for June 1960 shows a total revenue of $456,647.43 and profits of $23,353.07. Significant expenses include employee salaries ($169,930.94), raw materials ($94,440.34), and taxes ($50,920.52). Loan payments are $27,822.76. Other expenses such as utilities, rent, marketing, insurance, office supplies, and maintenance also contribute to the overall cost structure. A transfer of $37,152.90 is also noted. The profit margin, calculated as (Profit / Total Revenue) * 100, is approximately 5.11%.",
  'role': 'user'},
 {'content': 'Profitability Analysis: The profitability analysis reveals a relatively low profit margin of 5.11%. While revenue is substantial, the high cost of employee salaries and raw materials significantly impacts the bottom line. Revenue growth trends are not available from this single data point, but monitoring these trends over time is crucial. Expense management, particularly in the areas of salaries and raw materials, needs caref

In [None]:
train_data[5]["text"]

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nThe company's financial data for June 1960 shows a total revenue of $456,647.43 and profits of $23,353.07. Significant expenses include employee salaries ($169,930.94), raw materials ($94,440.34), and taxes ($50,920.52). Loan payments are $27,822.76. Other expenses such as utilities, rent, marketing, insurance, office supplies, and maintenance also contribute to the overall cost structure. A transfer of $37,152.90 is also noted. The profit margin, calculated as (Profit / Total Revenue) * 100, is approximately 5.11%.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nProfitability Analysis: The profitability analysis reveals a relatively low profit margin of 5.11%. While revenue is substantial, the high cost of employee salaries and raw materials significantly impacts the bottom line. Revenue growt

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from datasets import Dataset

# `train_data` is already a datasets.Dataset (built with Dataset.from_list in
# the data-prep cell), so it is passed to the trainer as-is. Re-wrapping it with
# Dataset.from_list() would only iterate every row in Python to rebuild the
# same dataset.

# Supervised fine-tuning on the pre-rendered "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_data,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # effective batch size 2 x 4 = 8
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60, # short demo run
        learning_rate = 2e-4,
        # Use bf16 where the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        report_to = "wandb", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/780 [00:00<?, ? examples/s]

We also use Unsloth's `train_on_responses_only` method to only train on the assistant outputs and ignore the loss on the user's inputs.

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Re-wrap the trainer so that only the assistant responses contribute to the
# loss. The two marker strings are the Llama-3.1 header sequences that open a
# user turn and an assistant turn in the rendered text.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map (num_proc=2):   0%|          | 0/780 [00:00<?, ? examples/s]

We verify masking is actually done:

In [None]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

"<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nThe company's financial data for June 1960 shows a total revenue of $456,647.43 and profits of $23,353.07. Significant expenses include employee salaries ($169,930.94), raw materials ($94,440.34), and taxes ($50,920.52). Loan payments are $27,822.76. Other expenses such as utilities, rent, marketing, insurance, office supplies, and maintenance also contribute to the overall cost structure. A transfer of $37,152.90 is also noted. The profit margin, calculated as (Profit / Total Revenue) * 100, is approximately 5.11%.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nProfitability Analysis: The profitability analysis reveals a relatively low profit margin of 5.11%. While revenue is substantial, the high cost of employee salaries and raw materials significantly impacts the bottom li

In [None]:
# Masked label positions hold -100, which is not a decodable token id.
# Replace each -100 with the id of a single space so the masked prefix shows up
# as blank padding and only the trained-on text remains readable.
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

'                                                                                                                                                                        Profitability Analysis: The profitability analysis reveals a relatively low profit margin of 5.11%. While revenue is substantial, the high cost of employee salaries and raw materials significantly impacts the bottom line. Revenue growth trends are not available from this single data point, but monitoring these trends over time is crucial. Expense management, particularly in the areas of salaries and raw materials, needs careful attention. A cost-benefit analysis of marketing spend ($33,730.93) should be conducted to ensure it is effectively driving revenue. The impact of taxes ($50,920.52) is also significant and should be considered in long-term financial planning.\n\nCash Flow Recommendations: To improve cash flow, several strategies can be implemented. Firstly, negotiate extended payment terms with raw material suppl

We can see the System and Instruction prompts are successfully masked!

In [None]:
#@title Show current memory stats
# Report the GPU model, its total memory and the peak memory reserved so far.
# Byte counts are divided by 1024 ** 3 and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
5.748 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 780 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 83,886,080/8,000,000,000 (1.05% trained)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mzelsayed722[0m ([33mzelsayed722-mansoura-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.3049
2,1.3186
3,1.3153
4,1.2915
5,1.2098
6,1.1143
7,0.935
8,0.9142
9,0.8028
10,0.7171


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/drive/1T-YBVfnphoVc8E2E854qF3jdia2Ll2W2?usp=sharing)**

We use `min_p = 0.1` and `temperature = 1.5`. Read this [Tweet](https://x.com/menhguin/status/1826132708508213629) for more information on why.

In [None]:
from unsloth.chat_templates import get_chat_template

# Same call as in the data-prep section: make sure the tokenizer carries the
# "llama-3.1" chat template before prompts are rendered for generation.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
# Switch the (LoRA-wrapped) model into Unsloth's inference mode.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": """
{'Date': '2018-05-01',
 'Month': 5,
 'Year': 2018,
 'Total_Revenue': 526352.11,
 'Employee_Salaries': 165851.51,
 'Loan_Payment': 23506.69,
 'Raw_Materials': 94326.72,
 'Utilities': 9173.98,
 'Rent': 17038.7,
 'Marketing': 29864.6,
 'Insurance': 15844.99,
 'Office_Supplies': 4422.06,
 'Maintenance': 8500.11,
 'Taxes': 51556.86,
 'Transfer_Money': 35452.41,
 'Profits': 106265.88999999996}
    """},
]
# Render the chat with the generation prompt and request a dict so the
# attention mask comes back alongside the input ids. Without an explicit mask,
# generate() warns "The attention mask is not set and cannot be inferred ..."
# because this tokenizer's pad token is the same as its EOS token.
encoded = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
    return_dict = True,
).to("cuda")
inputs = encoded["input_ids"] # same tensor the cell exposed before

outputs = model.generate(input_ids = inputs, attention_mask = encoded["attention_mask"],
                         max_new_tokens = 1024, use_cache = True,
                         temperature = 1.5, min_p = 0.1)
# Decoded prompt + completion, including special tokens.
tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


["<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\n{'Date': '2018-05-01',\n 'Month': 5,\n 'Year': 2018,\n 'Total_Revenue': 526352.11,\n 'Employee_Salaries': 165851.51,\n 'Loan_Payment': 23506.69,\n 'Raw_Materials': 94326.72,\n 'Utilities': 9173.98,\n 'Rent': 17038.7,\n 'Marketing': 29864.6,\n 'Insurance': 15844.99,\n 'Office_Supplies': 4422.06,\n 'Maintenance': 8500.11,\n 'Taxes': 51556.86,\n 'Transfer_Money': 35452.41,\n 'Profits': 106265.88999999996}\n    <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nProfitability Analysis: The profitability of the company can be assessed by comparing the 'Total_Revenue' with the 'Profits'. In this case, the profit margin is approximately 20.21% (Profits / Total_Revenue). This indicates a reasonable level of profitability. However, further analysis of revenue growth trends and cost components (such as E

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": """
{'Date': '2018-05-01',
 'Month': 5,
 'Year': 2018,
 'Total_Revenue': 526352.11,
 'Employee_Salaries': 165851.51,
 'Loan_Payment': 23506.69,
 'Raw_Materials': 94326.72,
 'Utilities': 9173.98,
 'Rent': 17038.7,
 'Marketing': 29864.6,
 'Insurance': 15844.99,
 'Office_Supplies': 4422.06,
 'Maintenance': 8500.11,
 'Taxes': 51556.86,
 'Transfer_Money': 35452.41,
 'Profits': 106265.88999999996}
    """},
]
# Request a dict so the attention mask is returned together with the input
# ids; this tokenizer's pad token equals its EOS token, so generate() cannot
# infer the mask on its own.
encoded = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
    return_dict = True,
).to("cuda")
inputs = encoded["input_ids"] # same tensor the cell exposed before

from transformers import TextStreamer
# skip_prompt = True: stream only the newly generated tokens, not the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, attention_mask = encoded["attention_mask"],
                   streamer = text_streamer, max_new_tokens = 3000,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

Profitability Analysis: The profit margin for this period is approximately 20.1% (Profits/Total_Revenue). Revenue growth appears stagnant, with a slight decrease from 526352.11 to 523000.00 if we assume the profit margin remains constant, revenue would increase to 656000.00. Expense management is crucial; Employee_Salaries and Raw_Materials represent the largest expenses. A detailed review of these costs is necessary. Marketing spends seem reasonable, but their return on investment (ROI) should be analyzed. Interest expenses from Loans should be compared against alternative borrowing options. The impact of Transfer_Money needs to be assessed, is this a necessary expense or can it be optimized?

Cash Flow Recommendations: To improve cash flow, consider the following strategies: 1) Negotiate extended payment terms with suppliers for Raw_Materials and Utilities. 2) Implement a robust accounts receivable process to ensure timely collection of payments from customers. 3) Explore options for

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# Write the LoRA adapter weights and the tokenizer files into the local
# directory ./business_financial_model (adapters only, not the merged model).
model.save_pretrained("business_financial_model") # Local saving
tokenizer.save_pretrained("business_financial_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('business_financial_model/tokenizer_config.json',
 'business_financial_model/special_tokens_map.json',
 'business_financial_model/tokenizer.json')

In [None]:
!pip install "vllm>=0.4.0" huggingface_hub

In [None]:
!vllm serve \
  --model unsloth/Meta-Llama-3.1-8B-Instruct \
  --quantization awq \  # 4-bit quantization (faster + cheaper)
  --enable-lora \
  --lora-modules my-lora=./path-to-your-lora-adapter \  # Local path or HF Hub ID
  --max-lora-rank 64 \
  --host 0.0.0.0 --port 8000

In [None]:
# Base model plus the two LoRA adapter directories to register with vLLM.
# NOTE(review): "personal_financial_model" is not created anywhere in this
# notebook — presumably it comes from a separate training run; confirm the
# directory exists before launching.
model_name = "unsloth/Meta-Llama-3.1-8B-Instruct"
ter_model_id = "personal_financial_model"
business_model_id = "business_financial_model"

# Launch the server in the background. `$name` is IPython's interpolation of
# the Python variables above into the shell command. stdout and stderr are
# redirected to vllm.log, so that file (not nohup.out) holds the startup log.
!nohup vllm serve $model_name \
  --dtype half \
  --gpu-memory-utilization 0.8 \
  --max-lora-rank 64 \
  --enable-lora \
  --lora-modules personal-lora=$ter_model_id business-lora=$business_model_id \
  > vllm.log 2>&1 &

In [None]:
!tail -n 30 nohup.out

2025-04-13 20:48:23.495743: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744577303.524686   23713 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744577303.533823   23713 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO 04-13 20:48:29 __init__.py:190] Automatically detected platform cuda.
usage: vllm [-h] [-v] {serve,complete,chat} ...
vllm: error: unrecognized arguments: --lora-base unsloth/Meta-Llama-3.1-8B-Instruct


In [None]:
import os
from getpass import getpass
from huggingface_hub import login

# SECURITY: never commit an access token to a notebook. The token that used to
# be hardcoded in this cell must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
# The token is read from the HF_TOKEN environment variable; when that is not
# set, it is requested interactively without being echoed or stored.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))
model.push_to_hub("sayed-zaki-122/business_financial_model") # Online saving
tokenizer.push_to_hub("sayed-zaki-122/business_financial_model")

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sayed-zaki-122/business_financial_model/commit/2c80dde6e19d74abc39b0a4b67e136742eb3ac82', commit_message='Upload tokenizer', commit_description='', oid='2c80dde6e19d74abc39b0a4b67e136742eb3ac82', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sayed-zaki-122/business_financial_model', endpoint='https://huggingface.co', repo_type='model', repo_id='sayed-zaki-122/business_financial_model'), pr_revision=None, pr_num=None)

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
# Reload saved LoRA adapters through Unsloth. This rebinds `model` and
# `tokenizer`, replacing the objects trained above, and reuses
# max_seq_length / dtype / load_in_4bit from the first configuration cell.
# NOTE(review): this loads "personal_financial_model", whereas the adapters
# saved by this notebook are in "business_financial_model" — confirm which
# adapter is meant to be tested here.
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "personal_financial_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": """
    {'Income': 3271.32,
 'Electricity': 68.8,
 'Gas': 44.46,
 'Internet': 63.46,
 'Water': 43.37,
 'Food': 387.04,
 'Groceries': 304.22,
 'Rent': 861.92,
 'Transportation': 159.58,
 'Healthcare': 99.91,
 'Shopping': 173.13,
 'Entertainment': 102.72,
 'Dining Out': 153.67,
 'Subscription Services': 31.06,
 'Clothing': 95.52,
 'Electronics': 52.21,
 'Zakat': 222.91,
 'Investment': 359.92,
 'Emergency Fund': 229.5,
 'Retirement': 415.64,
 'Education': 149.66,
 'Travel': 305.61,
 'Gifts': 52.39,
 'Home Maintenance': 118.51,
 'Others': 89.79}
    """},
]
# Request a dict so the attention mask is returned together with the input
# ids and can be passed to generate() explicitly (the pad token equals the EOS
# token for this tokenizer, so the mask cannot be inferred).
encoded = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
    return_dict = True,
).to("cuda")
inputs = encoded["input_ids"] # same tensor the cell exposed before

from transformers import TextStreamer
# skip_prompt = True: stream only the newly generated tokens, not the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, attention_mask = encoded["attention_mask"],
                   streamer = text_streamer, max_new_tokens = 1024,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Recommended savings: Based on your income and expenses, I recommend aiming to save at least 20% of your income, which is approximately 657.58. This includes your existing savings allocations (Investment, Emergency Fund, and Retirement) and potential areas for increase (Consider increasing savings to reach this goal).
Optimized budget: {'Income': 3461.88,
 'Electricity': 68.92,
 'Gas': 48.39,
 'Internet': 59.03,
 'Water': 34.7,
 'Food': 361.19,
 'Groceries': 291.87,
 'Rent': 843.46,
 'Transportation': 153.93,
 'Healthcare': 99.87,
 'Shopp

You can also use Hugging Face's `AutoPeftModelForCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
# Plain PEFT/Transformers loading path, for environments without Unsloth.
# Disabled by default: running it rebinds `model` to a non-Unsloth model,
# which has no `push_to_hub_merged` / `save_pretrained_merged`, so the export
# cells further down fail with AttributeError. Set to True only when Unsloth
# is not installed.
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer, BitsAndBytesConfig
    model = AutoPeftModelForCausalLM.from_pretrained(
        "personal_financial_model", # YOUR MODEL YOU USED FOR TRAINING
        # Passing `load_in_4bit=` directly is deprecated; the setting goes
        # through a BitsAndBytesConfig instead (None = no quantization).
        quantization_config = BitsAndBytesConfig(load_in_4bit = True) if load_in_4bit else None,
    )
    tokenizer = AutoTokenizer.from_pretrained("personal_financial_model")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
import os

# SECURITY: never commit an access token to a notebook. The token that used to
# be hardcoded in this cell must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens. It is now read from the HF_TOKEN
# environment variable (KeyError if the variable is not set).
# NOTE(review): "hf/model" is a placeholder — change `hf` to your username.
# `push_to_hub_merged` / `save_pretrained_merged` are only present when `model`
# was loaded through FastLanguageModel.

# Merge to 16bit
# if True: model.save_pretrained("model", safe_serialization=True)
if True: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = os.environ["HF_TOKEN"])

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

AttributeError: 'LlamaForCausalLM' object has no attribute 'push_to_hub_merged'

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

[**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing)

In [None]:
# GGUF export options. Every statement below is disabled with `if False`;
# switch the one you need to True. `save_pretrained_gguf` writes locally,
# `push_to_hub_gguf` uploads to the Hugging Face Hub.

# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "", # Get a token at https://huggingface.co/settings/tokens
    )

Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in `llama.cpp` or a UI based system like `GPT4All`. You can install GPT4All by going [here](https://gpt4all.io/index.html).

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/drive/1T-YBVfnphoVc8E2E854qF3jdia2Ll2W2?usp=sharing)**

In [None]:
!vllm serve unsloth/Meta-Llama-3.1-8B-Instruct --enable-lora --lora-modules business-lora=sayed-zaki-122/business_financial_model personal-lora=sayed-zaki-122/personal_financial_model --max-lora-rank 64 --host 0.0.0.0 --port 8000 --gpu-memory-utilization 0.6  # Lower due to higher memory usage

2025-04-13 21:44:39.866572: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744580679.888162   38449 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744580679.894421   38449 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO 04-13 21:44:42 __init__.py:190] Automatically detected platform cuda.
INFO 04-13 21:44:43 api_server.py:840] vLLM API server version 0.7.2
INFO 04-13 21:44:43 api_server.py:841] args: Namespace(subparser='serve', model_tag='unsloth/Meta-Llama-3.1-8B-Instruct', config='', host='0.0.0.0', port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None,

In [None]:
!pip show vllm

Name: vllm
Version: 0.7.2
Summary: A high-throughput and memory-efficient inference and serving engine for LLMs
Home-page: https://github.com/vllm-project/vllm
Author: vLLM Team
Author-email: 
License: Apache 2.0
Location: /usr/local/lib/python3.11/dist-packages
Requires: aiohttp, blake3, cloudpickle, compressed-tensors, depyf, einops, fastapi, filelock, gguf, importlib_metadata, lark, lm-format-enforcer, mistral_common, msgspec, numpy, nvidia-ml-py, openai, outlines, partial-json-parser, pillow, prometheus-fastapi-instrumentator, prometheus_client, protobuf, psutil, py-cpuinfo, pydantic, pyyaml, pyzmq, ray, requests, sentencepiece, tiktoken, tokenizers, torch, torchaudio, torchvision, tqdm, transformers, typing_extensions, uvicorn, xformers, xgrammar
Required-by: 


In [None]:
!nvidia-smi

Sun Apr 13 21:35:39 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   47C    P0             27W /   70W |   13796MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Load a fresh copy of the base model explicitly. This line used to be
# commented out, so the cell depended on a `base_model` left over from an
# earlier session: NameError on a fresh kernel, and — likely the cause of
# "KeyError: 'base_model.model.model.model.embed_tokens'" — wrapping an object
# that already was a PEFT model.
base_model = AutoModelForCausalLM.from_pretrained("unsloth/Meta-Llama-3.1-8B-Instruct", device_map="auto")
# Attach the saved adapter to the plain base model (rebinds `model`).
model = PeftModel.from_pretrained(base_model, "personal_financial_model")

model.save_pretrained("final_personal_financial")  # This will save it in the format vLLM expects


KeyError: 'base_model.model.model.model.embed_tokens'

In [None]:
# Base model and the two local LoRA adapter directories to register.
base_model_id = "unsloth/Meta-Llama-3.1-8B-Instruct"
adapter1_path = "business_financial_model"
adapter2_path = "personal_financial_model"

# Start vLLM in the background with both adapters, registered as `adapter1`
# and `adapter2`. No output redirection is given, so nohup appends the server
# log to nohup.out.
!nohup vllm serve "$base_model_id" --dtype=half --gpu-memory-utilization 0.8 --max_lora_rank 64 --enable-lora --lora-modules adapter1=$adapter1_path adapter2=$adapter2_path &


nohup: appending output to 'nohup.out'


In [None]:
!tail -n 30 nohup.out

  File "/usr/local/lib/python3.11/dist-packages/vllm/scripts.py", line 44, in serve
    uvloop.run(run_server(args))
  File "/usr/local/lib/python3.11/dist-packages/uvloop/__init__.py", line 105, in run
    return runner.run(wrapper())
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
  File "/usr/local/lib/python3.11/dist-packages/uvloop/__init__.py", line 61, in wrapper
    return await main
           ^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/vllm/entrypoints/openai/api_server.py", line 875, in run_server
    async with build_async_engine_client(args) as engine_client:
  File "/usr/lib/python3.11/contextlib.py", line 210, in __aenter__
    return await anext(self.gen)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/vll

In [None]:
# Both exports go through `save_pretrained_merged`, in the same form as the
# export cell earlier in this notebook (tokenizer passed, save_method explicit).
# The helper is only present when `model` was loaded through FastLanguageModel;
# a model loaded with plain PEFT/Transformers raises AttributeError here.
model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit")
# LoRA adapters only. NOTE(review): the previous `save_pretrained_lora` call is
# not one of the export helpers this notebook uses — confirm it is not needed.
model.save_pretrained_merged("lora_adapter_folder", tokenizer, save_method = "lora")

AttributeError: 'LlamaForCausalLM' object has no attribute 'save_pretrained_merged'