In [1]:
import os
import json

# Directory holding one JSON profile per file, and the JSONL file to generate.
input_dir = './data'
output_file = './finetune_data.jsonl'

# List the inputs *before* opening the output for writing, so a missing or
# unreadable input directory does not truncate an existing training file.
# os.listdir() returns entries in arbitrary order; sorting makes the generated
# file identical from run to run.
json_filenames = sorted(
    name for name in os.listdir(input_dir) if name.endswith('.json')
)

# json.dumps() escapes non-ASCII by default, so the output is plain ASCII; the
# explicit encoding just removes any dependence on the platform default.
with open(output_file, 'w', encoding='utf-8') as out:
    for filename in json_filenames:
        # JSON text is UTF-8 by specification; do not rely on the locale default.
        with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        # Each profile must provide 'name', 'age' and 'city'; a missing key
        # raises KeyError here rather than producing a malformed prompt.
        prompt = f"Generate a user profile with name '{json_data['name']}', age {json_data['age']}, and city '{json_data['city']}'."
        # The completion is the full profile, pretty-printed.
        completion = json.dumps(json_data, indent=2)
        # One JSON object per line (JSONL).
        out.write(json.dumps({"prompt": prompt, "completion": completion}) + '\n')


In [2]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch

# Checkpoint identifier handed to both AutoTokenizer.from_pretrained and
# AutoModelForCausalLM.from_pretrained in the loading cell.
model_name = "HuggingFaceTB/SmolLM2-135M"
# JSONL training file read by load_dataset; this is the same path the
# data-prep cell writes as `output_file`.
dataset_path = "./finetune_data.jsonl"





  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# NOTE: this cell defines `tokenizer`, `tokenized_dataset` and `model`, all of
# which the LoRA/training cell reads. It must be executed before that cell,
# otherwise training fails with "NameError: name 'model' is not defined".

# Load dataset: one {"prompt": ..., "completion": ...} record per JSONL line.
dataset = load_dataset("json", data_files=dataset_path, split="train")

# Tokenizer: reuse the end-of-sequence token for padding.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


# Tokenize
def tokenize(sample):
    """Tokenize one record as "<prompt>\\n<completion>".

    Args:
        sample: a dataset row with string fields "prompt" and "completion".

    Returns:
        The tokenizer output for the joined text, truncated to at most 512
        tokens. Sequences are deliberately left unpadded: the
        DataCollatorForLanguageModeling used for training pads each batch to
        its own longest sequence, which avoids spending compute on 512-token
        rows of padding for short samples.
    """
    full = sample["prompt"] + "\n" + sample["completion"]
    return tokenizer(full, truncation=True, max_length=512)


tokenized_dataset = dataset.map(tokenize)

# Load model in 4-bit. Passing `load_in_4bit=True` straight to from_pretrained
# is deprecated; the supported form is a BitsAndBytesConfig handed over via
# `quantization_config` (same quantization settings, no deprecation warning).
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)

In [7]:
# --- LoRA adapter ------------------------------------------------------------
# `model`, `tokenizer` and `tokenized_dataset` come from the dataset/model
# loading cell; that cell has to be run first or this one raises NameError.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)
# Wrap the loaded model with the adapter described by peft_config.
model = get_peft_model(model, peft_config)

# --- Training configuration --------------------------------------------------
training_args = TrainingArguments(
    output_dir="./smollm2-json-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    fp16=True,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,
    report_to="none",
)

# --- Trainer -----------------------------------------------------------------
# mlm=False selects the causal (next-token) language-modeling collator.
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=causal_lm_collator,
)

trainer.train()


NameError: name 'model' is not defined