In [1]:
!nvidia-smi

Fri Dec 19 18:25:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.14              Driver Version: 550.54.14      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off |   00000000:3B:00.0 Off |                    0 |
| N/A   46C    P0             73W /  300W |       0MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off |   00

In [2]:
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

MODEL_ID = "mistralai/Mistral-7B-v0.1"
OUTPUT_DIR = "hcl_final_model"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 2
LR = 2e-4
EPOCHS = 2

print(f"ðŸ”¹ Using Device: {DEVICE}")

data = [
    {"text": "### Instruction: Correct grammar. ### Input: i want police for matrnity ### Response: I want the policy for maternity leave."},
    {"text": "### Instruction: Correct grammar. ### Input: screen broke wat do ### Response: My screen is broken, what should I do?"},
    {"text": "### Instruction: Correct grammar. ### Input: salary not credit yestday ### Response: My salary was not credited yesterday."},
    {"text": "### Instruction: Correct grammar. ### Input: vpn not connectng help ### Response: The VPN is not connecting, I need help."},
    
    {"text": "### Instruction: Summarize strict context. ### Input: (context: What is leave?, data: Leave is 20 days.) ### Response: Leave is 20 days."},
    {"text": "### Instruction: Summarize strict context. ### Input: (context: Who is CEO?, data: Mr. Smith is the CEO.) ### Response: Mr. Smith is the CEO."},
    {"text": "### Instruction: Summarize strict context. ### Input: (context: Where is IT?, data: IT is on Floor 2.) ### Response: IT is on Floor 2."},
    
    {"text": "### Instruction: Summarize strict context. ### Input: (context: Who is CEO?, data: The cafeteria serves pizza.) ### Response: knowledge base has no relevant data"}
]

train_data = data * 20 

print("ðŸ”¹ Loading Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("ðŸ”¹ Loading Model (FP16)...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    use_cache=False
)
model.to(DEVICE)

print("ðŸ”¹ Applying LoRA...")
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

def tokenize_function(examples):
    text_list = [t + tokenizer.eos_token for t in examples["text"]]
    
    tokenized = tokenizer(
        text_list,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

print("ðŸ”¹ Tokenizing Dataset...")
raw_dataset = Dataset.from_list(train_data)
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_dataset.set_format("torch")

train_dataloader = DataLoader(tokenized_dataset, batch_size=BATCH_SIZE, shuffle=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

print("ðŸ”¹ Starting Training Loop...")
model.train()

for epoch in range(EPOCHS):
    print(f"\n--- Epoch {epoch+1}/{EPOCHS} ---")
    progress_bar = tqdm(train_dataloader)
    
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        progress_bar.set_description(f"Loss: {loss.item():.4f}")

print(f"\nðŸ”¹ Saving model to {OUTPUT_DIR}...")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("âœ… DONE! You can now run the inference script.")

ðŸ”¹ Using Device: cuda
ðŸ”¹ Loading Tokenizer...
ðŸ”¹ Loading Model (FP16)...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

ðŸ”¹ Applying LoRA...
trainable params: 41,943,040 || all params: 7,283,675,136 || trainable%: 0.5758
ðŸ”¹ Tokenizing Dataset...


Map:   0%|          | 0/160 [00:00<?, ? examples/s]

ðŸ”¹ Starting Training Loop...

--- Epoch 1/2 ---


Loss: 0.0053: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 80/80 [00:30<00:00,  2.65it/s]



--- Epoch 2/2 ---


Loss: 0.0041: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 80/80 [00:26<00:00,  3.00it/s]



ðŸ”¹ Saving model to hcl_final_model...
âœ… DONE! You can now run the inference script.


In [3]:
pip install torch transformers peft bitsandbytes accelerate datasets trl huggingface_hub

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
import os
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
)
from trl import SFTTrainer, SFTConfig

MODEL_ID = "mistralai/Mistral-7B-v0.1"
OUTPUT_DIR = "hcl_final_model"

data = [
    {"text": "### Instruction: Correct grammar. ### Input: i want police for matrnity ### Response: I want the policy for maternity leave."},
    {"text": "### Instruction: Correct grammar. ### Input: screen broke wat do ### Response: My screen is broken, what should I do?"},
    {"text": "### Instruction: Correct grammar. ### Input: salary not credit yestday ### Response: My salary was not credited yesterday."},
    {"text": "### Instruction: Summarize strict context. ### Input: (context: What is leave?, data: Leave is 20 days.) ### Response: Leave is 20 days."},
]
train_data = data * 20
dataset = Dataset.from_list(train_data)

print(f"Loading {MODEL_ID} in Float16 (No BitsAndBytes)...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16, # Standard precision
    device_map="auto",
    use_cache=False
)

peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

print("Starting Training...")
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    dataset_text_field="text",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_steps=1,
    max_steps=50,
    fp16=True,
    group_by_length=True,
    report_to="none",
    packing=False
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
)

trainer.train()

print(f"Saving adapters to {OUTPUT_DIR}...")
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Training Complete. Now run the Merge script.")

Loading mistralai/Mistral-7B-v0.1 in Float16 (No BitsAndBytes)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 41,943,040 || all params: 7,283,675,136 || trainable%: 0.5758
Starting Training...


Adding EOS to train dataset:   0%|          | 0/80 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/80 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/80 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss
1,4.2036
2,2.9297
3,2.0541
4,1.6845
5,0.9895
6,0.7411
7,0.469
8,0.2755
9,0.2126
10,0.1271


Saving adapters to hcl_final_model...
Training Complete. Now run the Merge script.


In [5]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

MODEL_ID = "mistralai/Mistral-7B-v0.1"
ADAPTER_DIR = "hcl_final_model"
MERGED_DIR = "hcl_merged_complete"

print("Loading Base Model for Merging...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

print("Loading Adapters...")
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)

print("Merging weights...")
model = model.merge_and_unload()

print(f"Saving full merged model to {MERGED_DIR}...")
model.save_pretrained(MERGED_DIR, safe_serialization=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.save_pretrained(MERGED_DIR)

print("DONE! You can now download the 'hcl_merged_complete' folder.")

Loading Base Model for Merging...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading Adapters...
Merging weights...
Saving full merged model to hcl_merged_complete...
DONE! You can now download the 'hcl_merged_complete' folder.
