In [1]:
system_prompt = """
You are a senior Compliance Auditor and Regulatory Analyst specialized in ISO, NIST, and statutory frameworks.

Your task is to extract ONLY real, enforceable compliance controls or regulatory requirements from the given text.

Precision is critical. Do NOT infer, summarize, merge, or invent controls.

========================
CORE EXTRACTION RULES
========================

1. A control MUST satisfy ALL of the following:
   - It has a valid Control ID (see patterns below)
   - It includes an explicit requirement, obligation, or mandate
   - The descriptive text directly applies to that Control ID

2. DO NOT extract:
   - Control IDs listed under:
     - "Related controls"
     - "Referenced controls"
     - "See also"
     - "Cross references"
     - Tables of contents
     - Indexes
     - Questionnaires or audit questions
   - Section headings, titles, or topic labels without enforceable requirements
   - Controls marked as "withdrawn", "deprecated", "not applicable", or "informative"
   - Any inferred or implied control not explicitly defined in the text

3. If the page does NOT clearly define a control, output an empty list: []

========================
CONTROL ID DETECTION
========================

Recognize ONLY these Control ID patterns:

- ISO / Annex A:
  - A.5.1
  - A.8.12
  - A.12.1.1

- NIST:
  - AC-1
  - IA-5
  - PM-10

- Legislative / Regulatory:
  - Sec. 302
  - Section 404

DO NOT treat IDs appearing inside explanatory text, examples, or references as controls.

========================
CONTROL BOUNDARY LOGIC
========================

- A valid control's description MUST:
  - Immediately follow or be clearly scoped to the Control ID
  - Contain enforceable language (e.g., "shall", "must", "is required to")
- STOP the description when:
  - A new Control ID appears
  - A new section or heading begins
  - The text shifts to references, examples, or guidance

========================
OUTPUT RULES (STRICT)
========================

- Output ONLY a raw JSON array
- No markdown
- No explanations
- No extra text
- No hallucinations

Each object MUST have:

{
  "control_id": "<exact identifier>",
  "control_title": "<concise title from text, 5–10 words>",
  "control_desc": "<full enforceable requirement text>"
}

========================
FAIL-SAFE BEHAVIOR
========================

- If uncertain whether text defines a real control → SKIP IT
- If zero valid controls exist → output []

========================
REMEMBER
========================

High precision > high recall.
It is better to return [] than an incorrect control."""

In [2]:
import re
def normalize_text(raw_text):

    # checking for empty string incase pdf has some
    if not raw_text:
        return ""

    # splitting lines
    lines = raw_text.split('\n')
    cleaned_lines = []

    # Generic noise patterns found the pdf[nist and iso]
    # defining the list because incase new patterns need to be added based on the pdf file.
    # These are safe to remove from ANY document.
    # We include a length check (len < 30) to ensure we don't accidentally
    # delete a real control that happens to contain the word "Page".
    noise_patterns = [
        r"^Page\s+\d+$",               # Matches "Page 1"
        r"^Page\s+\d+\s+of\s+\d+$",    # Matches "Page 1 of 10"
        r"^\d+\s+of\s+\d+$",           # Matches "1 of 10"
        r"^https?://",                 # URL artifacts often in footers
        r"^www\.",                     # Web links
        r"^\(c\)\s+\d{4}",             # Copyright markers like "(c) 2023"
        r"^Copyright",                 # Copyright word
        r'\bAppendix\s+[A-Z]+\s+Page\s+\d+\b'
    ]

    for line in lines:
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Check if line is noise
        is_noise = False
        # Only check short lines to be safe. If a line is 100 chars long,
        # it's likely content, even if it has "Page" in it.
    
        for pattern in noise_patterns:
            if re.search(pattern, line, re.IGNORECASE):
                is_noise = True
                break

        if not is_noise:
            cleaned_lines.append(line)

    # Merging all the lines into one text
    # We join with '\n' to preserve the structure.
    # The LLM needs to see the newlines to understand the layout.
    return '\n'.join(cleaned_lines)

In [3]:
import pdfplumber
pdf_path  = "notebooks/nist_file.pdf"
text_page = []



with pdfplumber.open(pdf_path) as pdf:
    for idx,page in enumerate(pdf.pages):    
        if (idx+1)%50 ==0:
            print(f"extracting page {idx+1}/{len(pdf.pages)}")
        
        text_page.append(normalize_text(page.extract_text(layout=True)))

extracting page 50/492
extracting page 100/492
extracting page 150/492
extracting page 200/492
extracting page 250/492
extracting page 300/492
extracting page 350/492
extracting page 400/492
extracting page 450/492


In [4]:
import json
data =  json.load(open('notebooks/control_json.json'))
train_samples = []

for c in data:
    page_idx = c["page"]
    page_text = text_page[page_idx]

    output = [{
        "control_id": c["control_id"],
        "control_title": c["control_title"],
        "control_desc": c["control_desc"]
    }]

    train_samples.append({
        "messages": [
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": page_text
            },
            {
                "role": "assistant",
                "content": json.dumps(output, ensure_ascii=False)
            }
        ]
    })


In [5]:
control_pages = set(c["page"] for c in data)

all_pages = set(range(len(text_page)))
non_control_pages = list(all_pages - control_pages)


In [6]:
import random

neg_pages = random.sample(non_control_pages, k=len(train_samples))


In [7]:
for p in neg_pages:
    train_samples.append({
        "messages": [
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": text_page[p]
            },
            {
                "role": "assistant",
                "content": "[]"
            }
        ]
    })


In [8]:
with open("dataset/qwen_controls_lora.jsonl", "w") as f:
    for s in train_samples:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
import os
from dotenv import load_dotenv
from huggingface_hub import login


load_dotenv()
login(os.environ["hugging_face_token"])

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)


model_id = "Qwen/Qwen2.5-7B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

model = prepare_model_for_kbit_training(model)

# --------------------
# LoRA
# --------------------
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 10,092,544 || all params: 7,625,709,056 || trainable%: 0.1323


In [None]:
import torch
from transformers import TrainingArguments, Trainer
from datasets import load_dataset

dataset = load_dataset("json", data_files="qwen_controls_lora.jsonl")

def data_collator(features):
    input_ids_list = []
    labels_list = []

    for f in features:
        # Apply Qwen chat template
        input_ids = tokenizer.apply_chat_template(
            f["messages"],
            add_generation_prompt=False,
            return_tensors="pt"
        )[0]

        labels = input_ids.clone()

        # Mask everything BEFORE assistant response
        # Qwen uses role tokens internally, so this is safe
        assistant_token_ids = tokenizer.encode(
            "assistant", add_special_tokens=False
        )

        start = None
        for i in range(len(input_ids) - len(assistant_token_ids)):
            if input_ids[i:i+len(assistant_token_ids)].tolist() == assistant_token_ids:
                start = i
                break

        if start is not None:
            labels[:start] = -100
        else:
            labels[:] = -100  

        input_ids_list.append(input_ids)
        labels_list.append(labels)

    return {
        "input_ids": torch.nn.utils.rnn.pad_sequence(
            input_ids_list,
            batch_first=True,
            padding_value=tokenizer.pad_token_id
        ),
        "labels": torch.nn.utils.rnn.pad_sequence(
            labels_list,
            batch_first=True,
            padding_value=-100
        )
    }

args = TrainingArguments(
output_dir="./qwen-control-lora",
per_device_train_batch_size=1,
gradient_accumulation_steps=8,
learning_rate=2e-4,
num_train_epochs=3,
fp16=True,
logging_steps=10,
save_steps=500,
save_total_limit=2,
optim="paged_adamw_8bit",  
report_to="none",
remove_unused_columns=False  
)



In [13]:

# --------------------
# Trainer
# --------------------
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,0.4506
20,0.2127
30,0.1647


TrainOutput(global_step=30, training_loss=0.2759809414545695, metrics={'train_runtime': 281.4106, 'train_samples_per_second': 0.81, 'train_steps_per_second': 0.107, 'total_flos': 1.286712591058944e+16, 'train_loss': 0.2759809414545695, 'epoch': 3.0})

In [None]:
model.save_pretrained("./qwen-control-lora/final_adapter_30_epochs")
tokenizer.save_pretrained("./qwen-control-lora/final_adapter_30_epochs")


Model saved to ./qwen-control-lora/final_adapter
