Install needed packages.

In [None]:
# Install/upgrade the libraries this notebook uses, one package per command.
# `-q` keeps pip's output quiet. No versions are pinned, so each run may pull
# different releases.
# NOTE(review): `trl` and `datasets` are imported below but not installed here;
# presumably they arrive as dependencies of unsloth -- confirm.
%pip install --upgrade transformers -q
%pip install --upgrade torch accelerate -q
%pip install bitsandbytes -q
# NOTE(review): auto-gptq is not referenced anywhere in the visible cells -- confirm it is needed.
%pip install auto-gptq -q
%pip install unsloth -q
%pip install unsloth-zoo -q

Imports.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

Load a model from Huggingface.

In [None]:
# Hub id of the base checkpoint and the sequence length handed to Unsloth.
max_seq_length = 2048  # Adjust as necessary
model_name = "mistralai/Mistral-7B-v0.3"

# from_pretrained returns the model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(model_name=model_name, max_seq_length=max_seq_length)

Test prompt.

In [None]:
# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

prompt = "In what way has AI changed society?"

# The tokenizer returns CPU tensors; generate() needs them on the same device
# as the model, so move the whole batch there.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

output = model.generate(**inputs, max_new_tokens=120)

# Decode only the newly generated tokens. Cutting the decoded string at
# len(prompt) is fragile because decoding is not guaranteed to reproduce the
# prompt character-for-character.
prompt_token_count = inputs["input_ids"].shape[1]
response = tokenizer.decode(output[0][prompt_token_count:], skip_special_tokens=True)
print(response.strip())

Prepare for fine-tuning.

Add [LoRA adapters](https://arxiv.org/abs/2106.09685).

In [None]:
# LoRA hyperparameters, gathered in one place so they are easy to tweak.
lora_settings = dict(
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    random_state=1337,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

# Wrap the base model with the adapters configured above.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Prepare domain data. The model will not 'memorize' these text items as 'facts'; instead, fine-tuning updates its weights so that it generates responses better aligned with the specific language patterns, terminology, and nuances of the domain. A fine-tuned LLM improves not only at generating text in the fine-tuned style but also at recognizing and discerning nuances of that style.

In [None]:
# Path to the JSON Lines file holding the domain data. The later cells read a
# "text" field from each record.
jsonl_file = "domain.jsonl"

Load and reformat the domain data.

In [None]:
# use the dataset loader by Huggingface and some formatting functions
dataset = load_dataset("json", data_files=jsonl_file, split="train")

# Only fall back to EOS as the padding token when the tokenizer has none.
# Overwriting an existing pad token with EOS makes padding indistinguishable
# from the end-of-sequence marker appended below.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def format_text(examples):
    """Append the tokenizer's EOS token to every text in a batch.

    Args:
        examples: A batch from `dataset.map(..., batched=True)`; its "text"
            entry is iterated as a sequence of strings.

    Returns:
        A dict with a single "text" key holding the texts with EOS appended.
    """
    # Use eos_token explicitly rather than relying on pad_token being an alias for it.
    eos = tokenizer.eos_token
    return {"text": [note + eos for note in examples["text"]]}


dataset = dataset.map(format_text, batched=True)

Tokenize the domain data with the pre-trained model's tokenizer.

In [None]:
# Tokenize the text with the tokenizer already in scope.
def tokenize_texts(examples):
    """Tokenize a batch of texts, truncating and padding each one to 512 tokens."""
    batch_texts = examples["text"]
    return tokenizer(
        batch_texts,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

tokenized_dataset = dataset.map(tokenize_texts, batched=True)

## 4. Fine-tune the model

We want the *training loss* to decrease. A loss value around 2-3 is reasonable. If it gets close to 1.0 or drops below, the predictions will be highly confident, but there is also a risk of overfitting: the model may have learned the training data too well and may not perform as effectively on unseen data.

*See `README.md` for details about which parameters to tweak to avoid overfitting.*

In [None]:
# Drop the raw "text" column so only the tokenizer's output columns remain,
# then have the dataset hand back PyTorch tensors.
raw_columns = ["text"]
tokenized_dataset = tokenized_dataset.remove_columns(raw_columns)
tokenized_dataset.set_format(type="torch")

In [None]:
# Decide the mixed-precision mode once: bf16 where supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    report_to="none",
    seed=1337,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,  # <<<<<<< THE HIGHER THE RATE THE FASTER TO OVERFIT
    lr_scheduler_type="linear",
    warmup_steps=5,
    weight_decay=0.3,
    optim="adamw_8bit",
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    max_seq_length=1024,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

# Run fine-tuning and keep the object train() returns.
trainer_stats = trainer.train()

In [None]:
# Explicitly save the model and tokenizer after training, both to the same directory
save_dir = "outputs"
trainer.save_model(save_dir)  # Saves the model, tokenizer, and training args
tokenizer.save_pretrained(save_dir)

In [None]:
from unsloth import FastLanguageModel

# Directory the fine-tuned model and tokenizer were saved to
fine_tuned_model_path = "outputs"

# Reload model and tokenizer from that directory through unsloth
reload_options = {
    "model_name": fine_tuned_model_path,
    "max_seq_length": 2048,  # Adjust to match your desired sequence length
}
model, tokenizer = FastLanguageModel.from_pretrained(**reload_options)

# Switch the reloaded model to inference mode (left as the cell's final expression)
FastLanguageModel.for_inference(model)

In [None]:
prompt = "Is climate change real?"

# The tokenizer returns CPU tensors; move them to the model's device so
# generate() does not receive inputs on a different device than the weights.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50)

# Decode the full sequence (prompt plus continuation), dropping special tokens.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(response)