In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    %pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    %pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    %pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    %pip install --no-deps unsloth

In [None]:
# analyzing custom data

# socratiq dataset repo: https://github.com/NUS-IDS/eacl23_soqg/blob/main/data/soqg_dataset/train_chunk_I.csv

# umass data augmentation repo: https://github.com/umass-ml4ed/socratic-quest-gen

#video link: https://www.youtube.com/watch?v=JJWvYQdOVOY

import pandas as pd

In [None]:
df = pd.read_csv('train_chunk_I.csv')
df2 = pd.read_csv('train_chunk_II.csv')
df3 = pd.read_csv('train_chunk_III.csv')

In [None]:
# prompt: remove the first word from input column

# Remove the first word from the 'input' column
df['input'] = df['input'].str.split(n=1).str[1]
df.head()

df2['input'] = df2['input'].str.split(n=1).str[1]
df2.head()

df3['input'] = df3['input'].str.split(n=1).str[1]
df3.head()

merged_df = pd.concat([df, df2, df3], ignore_index=True)
randSample = merged_df.sample(n=1000)

In [None]:
# prompt: remove the first word from input column

# Remove the first word from the 'input' column
df['input'] = df['input'].str.split(n=1).str[1]
df.head()


Unnamed: 0.1,Unnamed: 0,input,target
0,0,parallel argument would state that England is ...,What about nations who have nothing?
1,1,would be cringe because trump's policies (and ...,"If not, what about this is cringe exactly?"
2,2,do not understand how this significantly chang...,What about public surveillance cameras?
3,3,is a tremendous amount of overlap between high...,How about allowing some students to go straigh...
4,4,consider yourself a feminist. But judging by y...,What else do you imagine is necessary to be co...


In [None]:
df['target']

Unnamed: 0,target
0,What about nations who have nothing?
1,"If not, what about this is cringe exactly?"
2,What about public surveillance cameras?
3,How about allowing some students to go straigh...
4,What else do you imagine is necessary to be co...
...,...
28189,Is it really that much effort to call someone ...
28190,How is this similar to the issue of individual...
28191,Which part is the conspiracy?
28192,Do you think that you are just talking about a...


In [None]:
from transformers import AutoTokenizer

INSTRUCTION = "Provide a socratic question in my style based on the input"

tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-3B-Instruct-bnb-4bit")

print(f"{'Total':<12}{'Instruction':<12}{'Input':<12}{'Target':<12}")

maxTokens = 0

for index, row in df.iterrows():
    input = row["input"]    # Replace with actual column name if different
    question = row["target"] # Replace with actual column name if different

    # Count tokens
    instruction_tokens = tokenizer(INSTRUCTION, return_tensors="pt")["input_ids"].shape[1]
    input_tokens = tokenizer(input, return_tensors="pt")["input_ids"].shape[1]
    question_tokens = tokenizer(question, return_tensors="pt")["input_ids"].shape[1]

    # Print table of tokens
    total_tokens = instruction_tokens + input_tokens + question_tokens
    maxTokens = max(maxTokens, total_tokens)
    #print(f"{total_tokens:<12}{instruction_tokens:<12}{input_tokens:<12}{question_tokens:<12}")

    if index >= 1000:
        break

# max sequence length set at 2048 should be fine

Total       Instruction Input       Target      


In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.14: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 8, #choosing lower alpha so that finetuning layers aren't used as much as the base model because we are only finetuning on the type of questions
    lora_dropout = 0.1, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.3.14 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [None]:
from datasets import Dataset

df = randSample.head(1000)

# Define the prompt format
train_prompt = """Below is a detailed description of a topic provided by a user, outlining their current understanding of its fundamentals.
Using the principles of the Socratic method, generate a probing and reflective question that challenges underlying assumptions and invites deeper exploration of the topic's basic principles.
Your question should:
- Request clarification or further explanation of key concepts.
- Encourage the user to critically reflect on the foundations and implications of the topic.
- Remain focused on the subject and stimulate thoughtful self-examination.
- Only provide a question, nothing else.

### Topic Description:
{}

### Socratic Question:
{}
""" + tokenizer.eos_token

topic = ""

inference_prompt = prompt = """Below is a detailed description of a topic provided by a user, outlining their current understanding of its fundamentals.
Using the principles of the Socratic method, generate a probing and reflective question that challenges underlying assumptions and invites deeper exploration of the topic's basic principles.
Your question should:
- Request clarification or further explanation of key concepts.
- Encourage the user to critically reflect on the foundations and implications of the topic.
- Remain focused on the subject and stimulate thoughtful self-examination.
- Only provide a question, nothing else.

### Topic Description:
{}

### Socratic Question:
{}
"""

# Prepare fine-tuning examples
finetuning_examples = [
    {"text": train_prompt.format(row["input"], row["target"])}
    for _, row in df.iterrows()
]

# Convert to Hugging Face Dataset
dataset = Dataset.from_list(finetuning_examples)

# Print example to verify
print(dataset[0])


{'text': "Below is a detailed description of a topic provided by a user, outlining their current understanding of its fundamentals.\nUsing the principles of the Socratic method, generate a probing and reflective question that challenges underlying assumptions and invites deeper exploration of the topic's basic principles.\nYour question should:\n- Request clarification or further explanation of key concepts.\n- Encourage the user to critically reflect on the foundations and implications of the topic.\n- Remain focused on the subject and stimulate thoughtful self-examination.\n- Only provide a question, nothing else.\n\n### Topic Description:\nThe country I live in has likely begun the transition from a public/private sector medical system, to an almost entirely publicly provided system. Currently the wealthiest citizens use the private sector exclusively, and pay large amounts to medical schemes who fund the bulk of this use. With the shift to a national healthcare system, the medical 

In [None]:
dataset

Dataset({
    features: ['text'],
    num_rows: 1000
})

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
2.295 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1 | Total steps = 125
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 48,627,712/3,000,000,000 (1.62% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.1332
2,3.0581
3,3.1715
4,3.0249
5,3.0395
6,2.8062
7,2.8301
8,2.7393
9,2.6646
10,2.3641


In [None]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

469.2444 seconds used for training.
7.82 minutes used for training.
Peak reserved memory = 3.668 GB.
Peak reserved memory for training = 1.373 GB.
Peak reserved memory % of max memory = 24.883 %.
Peak reserved memory for training % of max memory = 9.314 %.


In [None]:
df = pd.read_csv('train_chunk_I.csv')
df['input'] = df['input'].str.split(n=1).str[1]
input = df.iloc[2001,1]

In [None]:
df.iloc[2001,2]

'What about in situations where someone calls somebody a sick cunt?'

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

inputs = tokenizer(
[
    inference_prompt.format(
        """Gravity is a fundamental force of nature that causes objects with mass or energy to attract each other""", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")
eos_token_id = tokenizer.eos_token_id
outputs = model.generate(
    **inputs,
    max_new_tokens=24,
    min_new_tokens=4,
    eos_token_id=eos_token_id,
    pad_token_id=eos_token_id,  # ‚Üê CRUCIAL ADDITION
    do_sample=True,
    temperature=0.7,  # ‚Üê Better creativity
    repetition_penalty=1.2,  # ‚Üê Prevent loops
)
tokenizer.batch_decode(outputs,skip_special_tokens=True)

["Below is a detailed description of a topic provided by a user, outlining their current understanding of its fundamentals.\nUsing the principles of the Socratic method, generate a probing and reflective question that challenges underlying assumptions and invites deeper exploration of the topic's basic principles.\nYour question should:\n- Request clarification or further explanation of key concepts.\n- Encourage the user to critically reflect on the foundations and implications of the topic.\n- Remain focused on the subject and stimulate thoughtful self-examination.\n- Only provide a question, nothing else.\n\n### Topic Description:\nGravity is a fundamental force of nature that causes objects with mass or energy to attract each other\n\n### Socratic Question:\n\nWhat about people who are not attracted to others?\n"]