In [12]:
# lease_finetune.py

import os
import torch
import sympy
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model

# 1. Training data: a JSONL file with one {"instruction", "input", "output"} record per line.
#    e.g. "instruction": "Generate lease", "input": "<params JSON>", "output": "<full lease text>"
#    NOTE: lease_dataset.jsonl is written by the PDF-extraction cell (In [10]) that
#    appears later in this notebook; that cell has to run before this one.
dataset = load_dataset("json", data_files="lease_dataset.jsonl", split="train")

# 2. Preprocess into a single text field
def preprocess(batch):
    """Collapse each instruction/input/output triple into one prompt string.

    Args:
        batch: mapping of column name -> list of values; the "instruction",
            "input" and "output" columns are read in parallel.

    Returns:
        A dict with a single "text" key holding one formatted string per row,
        laid out as "Instruction: ...", "Input: ...", "Output: ..." on
        consecutive lines.
    """
    prompts = []
    rows = zip(batch["instruction"], batch["input"], batch["output"])
    for instruction, params, completion in rows:
        prompts.append(
            f"Instruction: {instruction}\nInput: {params}\nOutput: {completion}"
        )
    return {"text": prompts}

dataset = dataset.map(preprocess, batched=True)  # run preprocess batch-wise to build the "text" field

# 3. Load the tokenizer that matches the base checkpoint and make sure it can pad
model_name = "gpt2"  # swap in your base
tokenizer = AutoTokenizer.from_pretrained(model_name)
# If the tokenizer defines no pad token, reuse EOS so that padding="max_length"
# works later. Consequence: pad_token_id and eos_token_id are then the same id.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4. Tokenize & label
def tokenize(batch):
    """Tokenize the "text" column and attach causal-LM labels.

    Every sequence is truncated/padded to 1024 tokens. Labels are a copy of
    ``input_ids`` with padding positions replaced by -100 so they are ignored
    by the loss.

    Padding is identified through ``attention_mask`` rather than by comparing
    ids with ``tokenizer.pad_token_id``: the pad token may be aliased to EOS
    (see the tokenizer setup), in which case an id comparison would also mask
    every genuine EOS token in the text.

    Args:
        batch: mapping with a "text" key holding a list of strings.

    Returns:
        The tokenizer output with an extra "labels" entry (list of lists of int).
    """
    tok = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=1024,
        return_attention_mask=True,  # needed below; do not rely on tokenizer defaults
    )
    tok["labels"] = [
        [tid if is_real else -100 for tid, is_real in zip(ids, mask)]
        for ids, mask in zip(tok["input_ids"], tok["attention_mask"])
    ]
    return tok

# Tokenize the whole dataset, then hold out 10% for evaluation (fixed seed so the
# split is repeatable).
tokenized = dataset.map(tokenize, batched=True)
train_test = tokenized.train_test_split(test_size=0.1, seed=42)
train_ds = train_test["train"]
eval_ds = train_test["test"]

# 5. Load model with 8-bit quantization
# The base checkpoint named by `model_name` is loaded through a bitsandbytes
# config with load_in_8bit=True; device_map="auto" leaves device placement to
# the library.
# NOTE(review): 8-bit loading presumably requires a CUDA-capable GPU and the
# bitsandbytes package - confirm for the target runtime.
bnb = BitsAndBytesConfig(load_in_8bit=True)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb,
    device_map="auto",
)

# 6. Apply LoRA
# The attention projection layers are named differently per architecture.
# GPT-2 (the default `model_name`) fuses query/key/value into a single `c_attn`
# layer and has no `q_proj`/`v_proj`; targeting those names makes
# get_peft_model() fail with "Target modules ... not found in the base model".
# Pick the targets from the loaded model's type so the default base works while
# bases that do expose q_proj/v_proj keep the previous behaviour.
if base_model.config.model_type == "gpt2":
    lora_target_modules = ["c_attn"]
else:
    # NOTE(review): assumes the swapped-in base names its attention projections
    # q_proj / v_proj - verify against the model's module names.
    lora_target_modules = ["q_proj", "v_proj"]

lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=lora_target_modules,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
peft_model = get_peft_model(base_model, lora_cfg)
peft_model.print_trainable_parameters()  # report how many parameters LoRA leaves trainable

# 7. Training arguments
args = TrainingArguments(
    # --- checkpoints: where they go, how often, how many are kept ---
    output_dir="lease_model",
    save_steps=500,
    save_total_limit=2,
    # --- batch shape: 4 samples x 4 accumulation steps per optimizer update ---
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=2,
    # --- optimisation schedule ---
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_steps=100,
    fp16=True,
    # --- periodic evaluation and logging ---
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=200,
    logging_steps=100,
)

# 8. Trainer
# Collator for causal-LM batches (mlm=False: no masked-LM objective).
# NOTE(review): this collator appears to derive labels from input_ids on its
# own, which would supersede the "labels" column built in tokenize() - confirm.
lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=peft_model,
    args=args,
    data_collator=lm_collator,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
)

# 9. Run fine-tuning, then write the adapter and the tokenizer into the same
#    directory so they can be loaded together.
trainer.train()
for artifact in (peft_model, tokenizer):
    artifact.save_pretrained("lease_model")
print("✓ Lease‑generation model fine‑tuned and saved to ./lease_model")


ModuleNotFoundError: No module named 'sympy'

In [10]:
import pdfplumber
import json

# Source leases for the training set. Each entry pairs a PDF (read later for its
# full text) with the parameter dict that is serialised as the model "input".
# All parameter values, including rent and deposit, are stored as strings.
leases = [
    {
        "pdf_path": "training_lease_one.pdf",
        "params": {
            "landlord_name": "ABC Properties",
            "tenant_name": "Silvia Mando",
            "address": "9876 Cherry Avenue, Apartment 426, Anytown, USA",
            "start_date": "2012-07-01",
            "end_date": "2013-06-30",
            "rent": "685",
            "deposit": "685",
            "late_fee": "$25 plus $5/day after 3‑day grace period",
            "additional_terms": "No pets unless approved; tenant pays electricity, gas, phone."
        }
    },
    {
        "pdf_path": "training_lease_two.pdf",
        # NOTE(review): landlord/tenant/address below look like template
        # placeholders rather than values taken from the PDF - confirm.
        "params": {
            "landlord_name": "Landlord Name Here",
            "tenant_name": "Tenant Name Here",
            "address": "123 Example St, Unit 5, City, State",
            "start_date": "2025-06-01",
            "end_date": "2026-05-31",
            "rent": "1200",
            "deposit": "1200",
            "late_fee": "5% of monthly rent after the 5th day",
            "additional_terms": "No pets; tenant responsible for utilities; subletting by written consent only."
        }
    }
]

# Extract text and write JSONL: one {"instruction", "input", "output"} record
# per lease, where "input" is the JSON-encoded parameter dict and "output" is
# the text pulled from the PDF.
with open("lease_dataset.jsonl", "w", encoding="utf-8") as fout:
    for lease in leases:
        # Extract full text from the PDF, page by page.
        with pdfplumber.open(lease["pdf_path"]) as pdf:
            # extract_text() can return None for a page without a text layer
            # (e.g. a scanned image); coerce to "" so the join below cannot
            # fail with a TypeError.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        text = "\n\n".join(page_texts)

        entry = {
            "instruction": "Generate a clear, professional residential lease agreement.",
            "input": json.dumps(lease["params"]),
            "output": text
        }
        fout.write(json.dumps(entry) + "\n")

print("Created lease_dataset.jsonl with", len(leases), "entries")


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Created lease_dataset.jsonl with 2 entries
