# Hugging Face and Weights & Biases Setup

In [1]:
# Importing necessary libraries for authentication
from huggingface_hub import login
import getpass
import os

# Prompt for the Hugging Face token securely
hf_token = getpass.getpass("Enter your Hugging Face token: ")

# Login to Hugging Face Hub
login(token=hf_token)

Enter your Hugging Face token: ··········


# Initialize Weights & Biases

In [2]:
from huggingface_hub import login
import wandb
import getpass

# Login to Weights & Biases
wb_token = getpass.getpass("Enter your Weights & Biases API Key: ")
wandb.login(key=wb_token)

# Initialize W&B run
run = wandb.init(
    project="Fine-tune-DeepSeek-R1-Distill-Llama-8B on Medical COT Dataset",
    job_type="training",
    anonymous="allow"
)

Enter your Weights & Biases API Key: ··········


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mm7alishaikhx[0m ([33mm7alishaikhx-fast[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Install Required Libraries

In [3]:
# Install Unsloth for faster fine-tuning
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.3.14-py3-none-any.whl.metadata (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.3/59.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.3.11 (from unsloth)
  Downloading unsloth_zoo-2025.3.12-py3-none-any.whl.metadata (17 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.17-py3-none-any.whl.metadata (9.5 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.15.2,>=0.7.9 (from unsloth)
  Do

# Load the Model and Tokenizer

In [4]:
from unsloth import FastLanguageModel

# Set model parameters
max_seq_length = 2048
dtype = None
load_in_4bit = True

# Load model and tokenizer from unsloth
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/DeepSeek-R1-Distill-Llama-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=hf_token
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.14: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

# Define the Prompt Style for Meme Explanation

In [5]:
# Define the style for the prompt to generate explanations
prompt_style = """Below is a math meme with an incorrect solution. Your task is to identify the error and provide a correct explanation.

### Incorrect Meme:
{}

### Identified Error:
{}

### Fixed Explanation:
{}"""

# Inference Function for Testing Memes

In [6]:
# Set the model in inference mode
FastLanguageModel.for_inference(model)

def test_math_meme(meme_description):
    input_text = prompt_style.format(meme_description, "", "")

    inputs = tokenizer([input_text], return_tensors="pt").to("cuda")

    # Generate the response
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=500,
        use_cache=True,
    )

    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Print the raw response for debugging
    print("Raw Response:\n", response)

    # Extract explanations
    explanations = response.split("### Fixed Explanation")
    valid_explanations = [exp.strip() for exp in explanations if exp.strip() and "error" not in exp.lower()]

    if valid_explanations:
        extracted_explanation = valid_explanations[-1]
        print("\nExtracted Explanation:\n", extracted_explanation)
    else:
        print("\nNo valid explanation detected.")

## Test Case Example

In [7]:
test_meme = "A meme stating that 3 + 5 * 2 = 13"
test_math_meme(test_meme)

Raw Response:
 Below is a math meme with an incorrect solution. Your task is to identify the error and provide a correct explanation.

### Incorrect Meme:
A meme stating that 3 + 5 * 2 = 13

### Identified Error:


### Fixed Explanation:
**Step-by-step Explanation:**

1. **Understand the Order of Operations:** 
   - According to PEMDAS (Parentheses, Exponents, Multiplication and Division, Addition and Subtraction), multiplication should be done before addition.

2. **Apply the Correct Operations:**
   - First, calculate 5 * 2 = 10.
   
3. **Then Add 3:**
   - 3 + 10 = 13.

4. **Conclusion:**
   - The correct result is 13.
   - The initial statement claiming 3 + 5 * 2 = 13 is incorrect because it ignores the proper order of operations, resulting in an addition before multiplication.

**Correct Calculation:**
3 + 5 * 2 = 13.

**Fixed Meme:**
A meme stating that 3 + 5 * 2 = 13, but this is incorrect. The correct result is 13, not 15.

---

**Step-by-Step Explanation:**

1. **Identify the 

# Prepare the Dataset for Fine-Tuning

In [8]:
train_prompt_style = """Below is a math meme with an incorrect solution. Your task is to identify the error, explain it, and provide the correct answer.

### Incorrect Meme:
{}

### Identified Error:
<think>
{}
</think>

### Fixed Explanation:
{}"""

# Ensure EOS token is appended to each text
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    descriptions = examples["description"]  # Incorrect meme
    errors = examples["error"]  # Identified mistake
    explanations = examples["explanation"]  # Corrected response

    texts = [
        train_prompt_style.format(desc, err, exp) + EOS_TOKEN
        for desc, err, exp in zip(descriptions, errors, explanations)
    ]
    return texts  # ✅ Return a list instead of a dictionary

# Load the Dataset for Training

In [9]:
from datasets import Dataset
import pandas as pd
import json

dataset_path = "/content/math_memes.jsonl"  # Update with the correct path

# Read JSONL file into a list of dictionaries
with open(dataset_path, "r") as f:
    dataset = [json.loads(line) for line in f]

# Convert list of dictionaries to a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(pd.DataFrame(dataset))

# Check the first few rows of the dataset
print(hf_dataset[:5])

{'description': ['3 + 5 × 2 = 16?', '12 ÷ 3(3 + 3) = 18?', '5 ÷ 0 = 0', '4 × (2 + 3) = 20?', '100 - 50 ÷ 2 = 25?'], 'error': ['Misinterpretation of multiplication and order of operations', 'Misinterpretation of parentheses and division', 'Division by zero is undefined', 'Misinterpretation of parentheses', 'Order of operations mistake (PEMDAS)'], 'explanation': ['The correct answer is 13. According to the order of operations (PEMDAS), multiplication is performed first: 5 × 2 = 10, then 3 + 10 = 13.', 'The correct answer is 12. The parentheses are evaluated first: 3 + 3 = 6. Then, 12 ÷ 3 × 6 = 12.', 'Division by zero is not allowed in mathematics. It is undefined.', 'The correct answer is 20, but the expression needs clarification. The parentheses should be simplified first, then multiply: 4 × (5) = 20.', 'The correct answer is 75. Division comes before subtraction in PEMDAS: 50 ÷ 2 = 25, then 100 - 25 = 75.']}


# Model Configuration with PEFT (Low-Rank Adaptation)

In [10]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2025.3.14 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Training Configuration and Fine-Tuning

In [11]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=hf_dataset,  # ✅ Now a Hugging Face Dataset
    dataset_text_field="text",
    formatting_func=formatting_prompts_func,  # ✅ Ensures text formatting
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

# Start training
trainer_stats = trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/21 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 21 | Num Epochs = 12 | Total steps = 60
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Step,Training Loss
10,1.6887
20,0.3108
30,0.1048
40,0.0628
50,0.047
60,0.0385


# Save the Model

In [12]:
# Define the directory where you want to save the model
output_dir = "deepseek_q2"

# Save the model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

Model and tokenizer saved to deepseek_q2


# Inference After Fine-Tuning

In [14]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Clear cache and reset memory to avoid fragmentation
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()  # Clear any previous memory state

# Set the memory fraction to limit the GPU memory usage
torch.cuda.set_per_process_memory_fraction(0.5)  # Limit GPU memory usage to 50%

# Define model path
output_dir = "deepseek_q2"

# Choose quantization level: 8-bit or 4-bit
use_4bit = False  # Set to False for 8-bit precision

# Configure quantization and memory management
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    load_in_8bit=not use_4bit,  # Use 8-bit if not using 4-bit
    bnb_4bit_compute_dtype=torch.float16 if use_4bit else None,
    bnb_4bit_use_double_quant=True if use_4bit else False,
    llm_int8_enable_fp32_cpu_offload=True  # Offload FP32 to CPU if needed
)

# Try loading the model with quantization
try:
    model = AutoModelForCausalLM.from_pretrained(output_dir, quantization_config=bnb_config).to("cuda")
except RuntimeError as e:
    print("Error loading model: ", e)
    # Handle memory error, e.g., reduce model size or use CPU offloading

tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Set model max sequence length manually if needed
model.max_seq_length = 1024  # Reduce sequence length to fit model in memory

# Example inference function
def generate_text(prompt, max_tokens=50):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_tokens,  # Limit token generation to save memory
            do_sample=True,  # Set to False for deterministic output
            temperature=0.3  # Adjust creativity
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test inference with a new prompt
prompt = "A meme claiming that the square root of 25 equals -5"
print(generate_text(prompt, max_tokens=100))

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Error loading model:  CUDA out of memory. Tried to allocate 1008.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 7.43 GiB is free. Process 16553 has 7.31 GiB memory in use. 7.37 GiB allowed; Of the allocated memory 7.01 GiB is allocated by PyTorch, and 146.57 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
A meme claiming that the square root of 25 equals -5 is incorrect. The correct answer is 5, as square roots are defined to be non-negative in many contexts, especially in mathematics.
The correct answer is 5.
</think>

The correct answer is 5. The square root of 25 is 5, as square roots are defined to be non-negative in many contexts, especially in mathematics.
