In [1]:
%%capture
import os, re

if "COLAB_" not in "".join(os.environ.keys()):
    !pip install -q unsloth
else:
    import torch
    v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")

    !pip install -q --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install -q sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install -q --no-deps unsloth

!pip install -q transformers==4.56.2
!pip install -q --no-deps trl==0.22.2


In [2]:
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [3]:
from datasets import load_dataset

data_path = "/content/drive/MyDrive/Uplifty/Copy of suggestions_dataset_combined.jsonl"

dataset = load_dataset(
    "json",
    data_files=data_path,
    split="train"
)

print(dataset)
print(dataset[0])


Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['input', 'output'],
    num_rows: 381
})
{'input': {'primary dimension': 'Relationships & Social Connection', 'score': 62, 'user onboarding information': {'career aspiration': 'Social Worker', 'sociability': 'Outgoing', 'mbti': 'ESFJ', 'intent': 'To build meaningful friendships and community', 'streak': {'type': 'Social', 'length': 15}, 'interests': ['meetups', 'team sports', 'social events']}, 'badges': [{'category': 'Social', 'title': 'Friend Connector', 'description': 'Introduced 10 people to each other'}], 'campus': {'campus challenges': [{'title': 'Coffee Chat Challenge', 'description': 'Have coffee with 3 new people this week'}], 'campus comments': [{'parent content text': 'How to make friends as a transfer student?', 'parent content media': 'text', 'content': "Join clubs and attend campus events - that's what worked for me!"}], 'campus posts': [{'content': 'Amazing game night with new friends! #social', 'media': 'picture', 'uplifty_count': 12, 'like_coun

### Unsloth

In [4]:
from unsloth import FastLanguageModel
import torch

# For your dataset size (381 rows), 2048 is safe.
# If your JSON is small, you can set this to 1024 to train faster.
max_seq_length = 2048

# A100 (Ampere) works best with bfloat16
dtype = torch.bfloat16

# 4-bit base weights for efficient QLoRA fine-tuning
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct-bnb-4bit",  # matches your goal
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.1.2: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [12]:
from datasets import load_dataset

data_path = "/content/drive/MyDrive/Uplifty/Copy of suggestions_dataset_combined.jsonl"
raw_dataset = load_dataset("json", data_files=data_path, split="train")

print(raw_dataset)
print(raw_dataset[0].keys())


Dataset({
    features: ['input', 'output'],
    num_rows: 381
})
dict_keys(['input', 'output'])


In [13]:
def wc_ok(example):
    s = example["output"]["suggestion_output"].strip()
    n = len(s.split())
    return 15 <= n <= 40

filtered = raw_dataset.filter(wc_ok)

print("Before:", len(raw_dataset))
print("After :", len(filtered))
print("Dropped:", len(raw_dataset) - len(filtered))


Filter:   0%|          | 0/381 [00:00<?, ? examples/s]

Before: 381
After : 381
Dropped: 0


In [15]:
def wc(example):
    return {"wc": len(example["output"]["suggestion_output"].split())}

dropped = raw_dataset.map(wc).filter(lambda x: x["wc"] < 15 or x["wc"] > 40)
print("Dropped examples:", len(dropped))

if len(dropped) > 0:
    print(dropped[0])
else:
    print("âœ… No dropped rows â€” all outputs are already within 15â€“40 words.")



Map:   0%|          | 0/381 [00:00<?, ? examples/s]

Filter:   0%|          | 0/381 [00:00<?, ? examples/s]

Dropped examples: 0
âœ… No dropped rows â€” all outputs are already within 15â€“40 words.


In [16]:
import json

SYSTEM_PROMPT = (
    "You are the Uplifty Suggestion Generator. "
    "Given a JSON user profile and campus activity context, return exactly ONE concrete suggestion sentence. "
    "Rules: 15â€“40 words. No lists. No explanations. No greetings. "
    "Output only the suggestion sentence."
)

def build_text(example):
    user_json = json.dumps(example["input"], ensure_ascii=False)
    suggestion = example["output"]["suggestion_output"].strip()

    text = (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n\n"
        f"{SYSTEM_PROMPT}<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"User JSON:\n{user_json}\n\n"
        "Return the suggestion now."
        "<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        f"{suggestion}<|eot_id|>"
    )
    return {"text": text}

dataset = raw_dataset.map(build_text)

splits = dataset.train_test_split(test_size=0.1, seed=3407)
train_dataset = splits["train"]
eval_dataset  = splits["test"]

print("Train:", len(train_dataset), "Eval:", len(eval_dataset))
print(train_dataset[0]["text"][:900])


Map:   0%|          | 0/381 [00:00<?, ? examples/s]

Train: 342 Eval: 39
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are the Uplifty Suggestion Generator. Given a JSON user profile and campus activity context, return exactly ONE concrete suggestion sentence. Rules: 15â€“40 words. No lists. No explanations. No greetings. Output only the suggestion sentence.<|eot_id|><|start_header_id|>user<|end_header_id|>

User JSON:
{"primary dimension": "Financial Health", "score": 58, "user onboarding information": {"career aspiration": "Financial Advisor", "sociability": "Practical", "mbti": "ISTJ", "intent": "To learn money management and build savings", "streak": {"type": "Budgeting", "length": 14}, "interests": ["investing", "personal finance", "frugal living"]}, "badges": [{"category": "Finance", "title": "Budget Master", "description": "Stuck to budget for 30 days straight"}], "campus": {"campus challenges": [{"title": "No-Spend Weekend", "descript


In [17]:
sample = train_dataset[0]["text"]
print(sample)
print("\n--- END ---\n")


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are the Uplifty Suggestion Generator. Given a JSON user profile and campus activity context, return exactly ONE concrete suggestion sentence. Rules: 15â€“40 words. No lists. No explanations. No greetings. Output only the suggestion sentence.<|eot_id|><|start_header_id|>user<|end_header_id|>

User JSON:
{"primary dimension": "Financial Health", "score": 58, "user onboarding information": {"career aspiration": "Financial Advisor", "sociability": "Practical", "mbti": "ISTJ", "intent": "To learn money management and build savings", "streak": {"type": "Budgeting", "length": 14}, "interests": ["investing", "personal finance", "frugal living"]}, "badges": [{"category": "Finance", "title": "Budget Master", "description": "Stuck to budget for 30 days straight"}], "campus": {"campus challenges": [{"title": "No-Spend Weekend", "description": "Avoid unnecessary purchases for 3 days"}, {"title": "Savings Sprint", "description": "Save 

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [18]:
from unsloth import FastLanguageModel

model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank (16 is a solid default)
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0.0,                 # keep 0.0 (fast + optimized) since we trust dataset
    bias="none",
    use_gradient_checkpointing="unsloth",  # saves VRAM, helps long JSON inputs
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)


Unsloth 2026.1.2 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


We look at how the conversations are structured for item 5:

<a name="Train"></a>
### Train the model
Now let's train our model. We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [19]:
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq

output_dir = "/content/drive/MyDrive/Uplifty/model_runs/uplifty_llama32_1b_lora_v1"

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    packing=False,
    args=SFTConfig(
        per_device_train_batch_size=4,        # A100 can handle this for 1B 4bit
        gradient_accumulation_steps=4,        # effective batch ~16
        warmup_ratio=0.05,                    # nicer than fixed warmup_steps
        num_train_epochs=2,                   # start with 2 (you can try 1 or 3 later)
        learning_rate=1e-4,                   # more stable for instruction outputs
        logging_steps=10,
        eval_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=50,
        save_total_limit=2,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir=output_dir,
        report_to="none",
    ),
)


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/342 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/39 [00:00<?, ? examples/s]

We also use Unsloth's `train_on_completions` method to only train on the assistant outputs and ignore the loss on the user's inputs.

In [20]:
from unsloth.chat_templates import train_on_responses_only

trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)


Map (num_proc=16):   0%|          | 0/342 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/39 [00:00<?, ? examples/s]

We can see the System and Instruction prompts are successfully masked!

In [22]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
3.229 GB of memory reserved.


In [23]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 342 | Num Epochs = 2 | Total steps = 44
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss


In [24]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

39.604 seconds used for training.
0.66 minutes used for training.
Peak reserved memory = 3.229 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 8.163 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [25]:
from unsloth import FastLanguageModel
import json
import torch

FastLanguageModel.for_inference(model)

test_input = {
    "primary dimension": "Financial Health",
    "score": 58,
    "user onboarding information": {
        "career aspiration": "Financial Advisor",
        "intent": "To learn money management and build savings"
    }
}

prompt = (
    "<|begin_of_text|>"
    "<|start_header_id|>system<|end_header_id|>\n\n"
    "You are the Uplifty Suggestion Generator. "
    "Given a JSON user profile and campus activity context, return exactly ONE concrete suggestion sentence. "
    "Rules: 15â€“40 words. No lists. No explanations. No greetings."
    "<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n"
    f"User JSON:\n{json.dumps(test_input)}\n\n"
    "Return the suggestion now."
    "<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=80,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


system

You are the Uplifty Suggestion Generator. Given a JSON user profile and campus activity context, return exactly ONE concrete suggestion sentence. Rules: 15â€“40 words. No lists. No explanations. No greetings.user

User JSON:
{"primary dimension": "Financial Health", "score": 58, "user onboarding information": {"career aspiration": "Financial Advisor", "intent": "To learn money management and build savings"}}

Return the suggestion now.assistant

Your financial literacy builds security and freedom! Continue learning about money managementâ€”every dollar saved builds wealth and provides greater financial security.


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [26]:
lora_path = "/content/drive/MyDrive/Uplifty/model_runs/uplifty_llama32_1b_lora_v1"

trainer.model.save_pretrained(lora_path)
tokenizer.save_pretrained(lora_path)

print("âœ… LoRA adapters saved to:", lora_path)


âœ… LoRA adapters saved to: /content/drive/MyDrive/Uplifty/model_runs/uplifty_llama32_1b_lora_v1


In [28]:
!df -h
!du -h -d 1 /content | sort -h


Filesystem      Size  Used Avail Use% Mounted on
overlay         236G   41G  195G  18% /
tmpfs            64M     0   64M   0% /dev
shm              41G   24K   41G   1% /dev/shm
/dev/root       2.0G  1.2G  750M  62% /usr/sbin/docker-init
tmpfs            42G  1.1M   42G   1% /var/colab
/dev/sda1       242G   43G  200G  18% /kaggle/input
tmpfs            42G     0   42G   0% /proc/acpi
tmpfs            42G     0   42G   0% /proc/scsi
tmpfs            42G     0   42G   0% /sys/firmware
drive            15G   14G  1.9G  88% /content/drive
140K	/content/.config
2.8M	/content/unsloth_compiled_cache
17M	/content/huggingface_tokenizers_cache
55M	/content/sample_data
389M	/content/drive
463M	/content


### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [29]:
local_merged_path = "/content/uplifty_llama32_1b_merged_fp16_v1"

trainer.model.save_pretrained_merged(
    local_merged_path,
    tokenizer,
    save_method="merged_16bit",
)

print("âœ… Merged FP16 model saved locally at:", local_merged_path)


Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:05<00:00,  6.00s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:10<00:00, 10.45s/it]


Unsloth: Merge process complete. Saved to `/content/uplifty_llama32_1b_merged_fp16_v1`
âœ… Merged FP16 model saved locally at: /content/uplifty_llama32_1b_merged_fp16_v1


In [None]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "", # Get a token at https://huggingface.co/settings/tokens
    )

In [30]:
import os, glob

path = "/content/uplifty_llama32_1b_merged_fp16_v1"
print("Files:", os.listdir(path))

print("\nSafetensors:", glob.glob(path + "/*.safetensors"))
print("Config:", glob.glob(path + "/config.json"))
print("Tokenizer:", glob.glob(path + "/tokenizer.*") + glob.glob(path + "/*tokenizer*"))


Files: ['chat_template.jinja', 'tokenizer_config.json', 'model.safetensors', 'special_tokens_map.json', '.cache', 'config.json', 'tokenizer.json']

Safetensors: ['/content/uplifty_llama32_1b_merged_fp16_v1/model.safetensors']
Config: ['/content/uplifty_llama32_1b_merged_fp16_v1/config.json']
Tokenizer: ['/content/uplifty_llama32_1b_merged_fp16_v1/tokenizer.json', '/content/uplifty_llama32_1b_merged_fp16_v1/tokenizer_config.json', '/content/uplifty_llama32_1b_merged_fp16_v1/tokenizer.json']


In [31]:
!zip -r /content/uplifty_llama32_1b_merged_fp16_v1.zip /content/uplifty_llama32_1b_merged_fp16_v1


  adding: content/uplifty_llama32_1b_merged_fp16_v1/ (stored 0%)
  adding: content/uplifty_llama32_1b_merged_fp16_v1/chat_template.jinja (deflated 71%)
  adding: content/uplifty_llama32_1b_merged_fp16_v1/tokenizer_config.json (deflated 94%)
  adding: content/uplifty_llama32_1b_merged_fp16_v1/model.safetensors (deflated 21%)
  adding: content/uplifty_llama32_1b_merged_fp16_v1/special_tokens_map.json (deflated 71%)
  adding: content/uplifty_llama32_1b_merged_fp16_v1/.cache/ (stored 0%)
  adding: content/uplifty_llama32_1b_merged_fp16_v1/.cache/huggingface/ (stored 0%)
  adding: content/uplifty_llama32_1b_merged_fp16_v1/.cache/huggingface/download/ (stored 0%)
  adding: content/uplifty_llama32_1b_merged_fp16_v1/.cache/huggingface/download/tokenizer.model.lock (stored 0%)
  adding: content/uplifty_llama32_1b_merged_fp16_v1/.cache/huggingface/download/model.safetensors.lock (stored 0%)
  adding: content/uplifty_llama32_1b_merged_fp16_v1/.cache/huggingface/download/model.safetensors.metadata

In [35]:
!cp /content/uplifty_llama32_1b_merged_fp16_v1.zip \
   /content/drive/MyDrive/Uplifty/


In [36]:
!ls /content/drive/MyDrive/Uplifty


'Copy of suggestions_dataset_combined.jsonl'
 model_runs
 uplifty_llama32_1b_merged_fp16_v1.zip
