In [None]:
import json

from datasets import load_dataset, DatasetDict, load_from_disk, Dataset
from transformers import (
    DataCollatorForLanguageModeling,
    LlamaForCausalLM,
    LlamaTokenizer,
    Trainer,
    TrainingArguments,
)

In [None]:
import pandas as pd

# Read the raw train / test splits into DataFrames.
# (Pass `lines=True` to read_json if the files are line-delimited JSON.)
data_train, data_test = (
    pd.read_json(json_path)
    for json_path in ("dataset/training_data.json", "dataset/testing.json")
)


In [31]:
# Cast every column to str, then wrap each frame as a Hugging Face Dataset.
data_train, data_test = data_train.astype(str), data_test.astype(str)
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (data_train, data_test)
)

In [None]:
# Load the tokenizer for LLaMA
# NOTE(review): 'facebook/llama-8b' looks like a placeholder id -- confirm it
# points at a real Hub repo or local checkpoint directory before running.
tokenizer = LlamaTokenizer.from_pretrained('facebook/llama-8b')

# padding="max_length" below requires a pad token. LLaMA tokenizers normally
# define none, in which case the tokenizer call raises ValueError; reuse EOS
# as the padding token when nothing is configured.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize one batch of rows for `Dataset.map(..., batched=True)`.

    Args:
        examples: Batch mapping passed in by `Dataset.map`; its 'text' entry
            is fed to the tokenizer.

    Returns:
        The tokenizer output for the batch, with every sequence padded or
        truncated to exactly 512 tokens.
    """
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)


# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)

In [None]:
# Instantiate the causal-LM weights to fine-tune.
# Replace the identifier below with your own model path.
model = LlamaForCausalLM.from_pretrained(
    pretrained_model_name_or_path='facebook/llama-8b',
)


In [None]:
# Tokenize the held-out split here so the per-epoch evaluation requested in
# TrainingArguments has data to run on; Trainer raises if an evaluation
# strategy is set without an eval_dataset.
eval_dataset = test_dataset.map(tokenize_function, batched=True)

# Causal-LM collator (mlm=False): builds `labels` from `input_ids` for each
# batch and sets padding positions to -100 so the loss ignores them. Without
# labels the model returns no loss and training cannot start.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./results",  # directory to store checkpoints and results
    # NOTE(review): newer transformers releases rename this to `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",  # evaluate after each epoch
    learning_rate=2e-5,  # you can adjust this based on your experiment
    per_device_train_batch_size=4,  # adjust based on your hardware
    weight_decay=0.01,
    num_train_epochs=3,  # number of epochs for training
    logging_dir="./logs",  # directory to store logs
    save_steps=500,  # save checkpoints every 500 steps
    save_total_limit=2,  # keep the last two checkpoints
    # remove_unused_columns stays at its default (True): the datasets still
    # carry their raw string columns, which cannot be turned into tensors at
    # batching time, so Trainer must drop them.
)

# Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Start fine-tuning
trainer.train()

# Persist the fine-tuned model to disk.
trainer.save_model("./fine_tuned_llama_8b")