In [None]:
# Step 1: Install Dependencies
# These are the core libraries: Transformers, Datasets, PEFT (for LoRA), TRL (Trainer), BitsAndBytes (4-bit quant)
!pip install -q datasets peft transformers accelerate trl bitsandbytes


In [None]:
# Step 2: Load JSONL Dataset
# This loads your data into train/val/test splits using Hugging Face's `datasets` library

from datasets import load_dataset

# Load the three JSONL files as named splits of a single dataset object.
data = load_dataset(
    "json",
    data_files=dict(
        train="app/data/training_data.jsonl",
        validation="app/data/validation_data.jsonl",
        test="app/data/test_data.jsonl",
    ),
)

# Shuffle every split with the same fixed seed so the row order is reproducible.
for split_name in ("train", "validation", "test"):
    data[split_name] = data[split_name].shuffle(seed=42)

# Show the first training record as the cell output.
data["train"][0]


In [None]:
# Step 3: Convert messages into Mistral-style prompt/response format
# Your data is ChatML-style, so we turn it into <s>[INST] ... [/INST] response </s>

def format_chat_prompt(example):
    """Render a chat-style example as one Mistral-instruct training string.

    Args:
        example: Mapping with a "messages" list. Each message is a mapping with
            a "role" ("system", "user" or "assistant") and a string "content".

    Returns:
        dict: {"prompt": text}. Each user turn is written as
        "[INST] ... [/INST]" and each assistant turn as " reply </s>".
        The "<s>" marker is written once, in front of the first user turn.
        A system message is folded into the next user turn, separated from it
        by a blank line. Messages with any other role are ignored.
    """
    prompt = ""
    pending_system = None  # system text waiting to be merged into the next user turn
    bos_written = False    # "<s>" marks the start of the sequence, so it is emitted once only

    for msg in example["messages"]:
        role = msg["role"]
        content = msg["content"].strip()

        if role == "system":
            pending_system = content
        elif role == "user":
            if pending_system is not None:
                # Keep the system text instead of silently dropping it when the
                # user turn is not exactly the second message.
                content = f"{pending_system}\n\n{content}"
                pending_system = None
            prefix = "" if bos_written else "<s>"
            bos_written = True
            prompt += f"{prefix}[INST] {content} [/INST]"
        elif role == "assistant":
            # Append the assistant reply and close the turn.
            prompt += f" {content} </s>"

    return {"prompt": prompt}

# Apply formatting to all splits; format_chat_prompt returns {"prompt": ...},
# so each record gains a "prompt" field holding the formatted text.
data = data.map(format_chat_prompt)


In [None]:
# Step 4: Load Mistral 7B Instruct model in 4-bit for memory efficiency
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Tokenizer: reuse the EOS token for padding because no dedicated PAD token is set.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Quantization settings: 4-bit NF4 weights, double quantization, bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="bfloat16",
)

# Load the causal-LM weights with the quantization config; device placement is automatic.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)


In [None]:
# Step 5: Apply LoRA (Low-Rank Adaptation) for efficient fine-tuning
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Adapter settings: rank-64 LoRA on the q_proj / v_proj modules, no bias training.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Prepare the quantized model for k-bit training, then wrap it with the LoRA adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

# Report how many parameters are trainable (should be a small number of LoRA params).
model.print_trainable_parameters()


In [None]:
# Step 6: Tokenize the formatted prompt + response text
# The entire [INST] ... [/INST] response is tokenized as a single sequence
def tokenize(example):
    """Tokenize one formatted example for causal-LM training.

    The prompt text built earlier in this notebook already spells out its own
    "<s>" / "</s>" markers. The tokenizer is therefore called with
    ``add_special_tokens=False`` so its automatic special tokens are not added
    on top of them, and BOS/EOS are only added to the text when missing.

    Args:
        example: Mapping with a "prompt" string.

    Returns:
        The tokenizer output (truncated to 2048 tokens, not padded) with an
        extra "labels" entry that is a copy of "input_ids".
    """
    text = example["prompt"]
    bos, eos = tokenizer.bos_token, tokenizer.eos_token

    # Add the sequence markers only if the text does not already carry them,
    # so a sequence never starts with two BOS tokens or ends with two EOS tokens.
    if bos and not text.startswith(bos):
        text = bos + text
    if eos and not text.rstrip().endswith(eos):
        text = text + eos

    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=2048,
        padding=False,
        add_special_tokens=False,
    )
    # Causal-LM objective: the labels are the input ids themselves.
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tokenize every split. The columns listed for the train split are removed from
# the result, leaving only what `tokenize` returns.
# NOTE(review): assumes validation/test have the same columns as train — confirm.
tokenized_dataset = data.map(tokenize, remove_columns=data["train"].column_names)


In [None]:
# Step 7: Train using Hugging Face's SFTTrainer from `trl`
from trl import SFTTrainer
from transformers import TrainingArguments

import inspect

# Step 1 installs unpinned versions, and two keyword names used here changed
# between releases: transformers renamed `evaluation_strategy` to `eval_strategy`,
# and TRL renamed SFTTrainer's `tokenizer` argument to `processing_class`.
# Pick whichever spelling the installed release accepts instead of hard-coding one.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

_sft_arg_names = inspect.signature(SFTTrainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _sft_arg_names else "tokenizer"

training_args = TrainingArguments(
    output_dir="./mistral-lora-output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 sequences per device per optimizer step
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    bf16=True,
    report_to="none",
    **{_eval_key: "epoch"},  # evaluate once per epoch, matching the save cadence
)

# The datasets are already tokenized (they carry `input_ids` and `labels`), so no
# text field is named. `dataset_text_field=None` is intentionally not passed.
# NOTE(review): newer TRL releases appear to reject that keyword on the trainer — confirm.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    peft_config=lora_config,
    args=training_args,
    **{_tokenizer_key: tokenizer},
)

trainer.train()


In [None]:
# Step 8: Save LoRA adapter weights and tokenizer (not full model yet)
# Write the trained model held by the trainer and the tokenizer files into the
# same directory, so the adapter folder can be loaded on its own later (Step 9).
trainer.model.save_pretrained("./mistral-lora-adapter")
# Last expression of the cell: whatever it returns is shown as the cell output.
tokenizer.save_pretrained("./mistral-lora-adapter")


In [None]:
# Step 9: Merge adapter into base model to get a full model
from peft import PeftModel
from transformers import AutoModelForCausalLM

import torch

# Reload the base weights un-quantized so the adapter can be merged into them.
# bfloat16 is requested explicitly: it matches the compute dtype used for training
# (Step 4) and avoids relying on the library's default dtype, which can be float32
# (twice the memory and twice the size of the saved checkpoint).
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Attach the saved LoRA adapter, then fold its weights into the base model.
model = PeftModel.from_pretrained(base_model, "./mistral-lora-adapter")
model = model.merge_and_unload()

# Save the merged model together with the tokenizer so the folder is self-contained.
model.save_pretrained("./mistral-merged")
tokenizer.save_pretrained("./mistral-merged")


In [None]:
# Step 10: Convert to GGUF format using llama.cpp

# Clone llama.cpp repo
!git clone https://github.com/ggerganov/llama.cpp
%cd llama.cpp

# Build llama.cpp tools (optional but good practice)
!cmake -B build
!cmake --build build --config Release

# Install Python requirements for conversion
%pip install -r requirements.txt

# Return to root
%cd ..

# Run the conversion script from llama.cpp
!python llama.cpp/convert-hf-to-gguf.py mistral-merged --outfile mistral_model.gguf


In [None]:
# Step 11: Run inference using the merged model
from transformers import pipeline

# Prompt written in the same [INST] ... [/INST] layout used for the training data.
prompt = "<s>[INST] What is the purpose of the 'filterRows' action in TWL? [/INST]"

# Text-generation pipeline backed by the merged checkpoint saved on disk.
pipe = pipeline(
    task="text-generation",
    model="./mistral-merged",
    tokenizer=tokenizer,
    device_map="auto",
)

# Sample up to 256 new tokens and print the text of the first result.
output = pipe(prompt, max_new_tokens=256, do_sample=True)
print(output[0]["generated_text"])


In [None]:
# Step 12: Upload both merged HF model and GGUF model to Hugging Face
from huggingface_hub import login, HfApi
from transformers import AutoModelForCausalLM

# Authenticate (you'll be prompted). `login` is the name imported from
# huggingface_hub above; no token is hardcoded in the notebook.
login()

# Update with your repo names
repo_hf_model = "your-username/mistral-7b-tamarind-lora"
repo_gguf = "your-username/mistral-7b-tamarind-gguf"

# Push HF model + tokenizer
AutoModelForCausalLM.from_pretrained("./mistral-merged").push_to_hub(repo_hf_model)
tokenizer.push_to_hub(repo_hf_model)

# Upload GGUF model.
# Step 10 writes mistral_model.gguf into the notebook's working directory
# (it returns to the root before converting), so the path has no "../" prefix.
api = HfApi()
api.create_repo(repo_id=repo_gguf, repo_type="model", exist_ok=True)
api.upload_file(
    path_or_fileobj="mistral_model.gguf",
    path_in_repo="mistral_model.gguf",
    repo_id=repo_gguf,
    repo_type="model"
)
