### EXTRACTION CONTROLS FROM UNSTRUCTURED PDF USING LLM
#### Using Qwen AI Model
- for extraction of controls
- validationof controls extracted using RAG

##### Path of the File

In [70]:
# pdf_path = "uploads/NIST_CONTROL.pdf"
# pdf_path = "ISO_27001.pdf"
pdf_path  = "nist_file.pdf"

#### Reading Contents of the pdf and cleaning the pdf

In [71]:
import pypdf

In [72]:
reader = pypdf.PdfReader(pdf_path)

In [73]:
import re
def normalize_text(raw_text):

    # checking for empty string incase pdf has some
    if not raw_text:
        return ""

    # splitting lines
    lines = raw_text.split('\n')
    cleaned_lines = []

    # Generic noise patterns found the pdf[nist and iso]
    # defining the list because incase new patterns need to be added based on the pdf file.
    # These are safe to remove from ANY document.
    # We include a length check (len < 30) to ensure we don't accidentally
    # delete a real control that happens to contain the word "Page".
    noise_patterns = [
        r"^Page\s+\d+$",               # Matches "Page 1"
        r"^Page\s+\d+\s+of\s+\d+$",    # Matches "Page 1 of 10"
        r"^\d+\s+of\s+\d+$",           # Matches "1 of 10"
        r"^https?://",                 # URL artifacts often in footers
        r"^www\.",                     # Web links
        r"^\(c\)\s+\d{4}",             # Copyright markers like "(c) 2023"
        r"^Copyright",                 # Copyright word
        r'\bAppendix\s+[A-Z]+\s+Page\s+\d+\b'
    ]

    for line in lines:
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Check if line is noise
        is_noise = False
        # Only check short lines to be safe. If a line is 100 chars long,
        # it's likely content, even if it has "Page" in it.
    
        for pattern in noise_patterns:
            if re.search(pattern, line, re.IGNORECASE):
                is_noise = True
                break

        if not is_noise:
            cleaned_lines.append(line)

    # Merging all the lines into one text
    # We join with '\n' to preserve the structure.
    # The LLM needs to see the newlines to understand the layout.
    return '\n'.join(cleaned_lines)

In [74]:

text_page = []

for page in reader.pages[40:50]:
    page_cleaned = normalize_text(page.extract_text())
    text_page.append(page_cleaned)

#### Loading the Model in 4 bit because of vram limitations

In [75]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,pipeline

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [7]:
# model_id = "meta-llama/Meta-Llama-3-8B-Instruct" # not using this as it requires login in hugging_face
model_id = "Qwen/Qwen2.5-7B-Instruct"

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [9]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:18<00:00,  4.64s/it]


In [22]:
def clean_and_validate_json(json):
    indices_to_remove = []
    seen_ids = set()
    for idx,control in enumerate(json):
        empty_string_type = ['','none',' ',None]
        if control['control_id'].lower() not in seen_ids:
            seen_ids.add(control['control_id'].lower())
        else:
            indices_to_remove.append(idx)
            continue
        if len(control['control_desc'])<3:
            indices_to_remove.append(idx)
            continue
        try:
            if control['control_id'].lower() in empty_string_type or control['control_title'].lower() in empty_string_type or control['control_desc'].lower() in empty_string_type:
                indices_to_remove.append(idx)
                continue
        except:
            # if it fails to find any one of the column in the json
            indices_to_remove.append(idx)
    json = [element for index, element in enumerate(json) if index not in indices_to_remove]
    return json

In [76]:
# Cell 5: Define the Extraction Logic

def extract_controls_from_page(page_text):
    # 1. The System Prompt (The Brains)
    system_prompt = """
    You are an expert Compliance Auditor specialized in ISO, NIST, and Regulatory frameworks.

    Task: Extract individual controls or regulatory requirements from the provided text. Ignore all boilerplate, headers, and irrelevant legal prose.

    Control ID Identification Logic:
    Identify control IDs using a pattern-matching approach for:
    1. Annex A style (e.g., A.5.1, A.12.1.1)
    2. NIST style (e.g., AC-1, PM-10)
    3. Legislative/Section style (e.g., Sec. 302, Sec. 404)

    Rules:
    - If controls are found, return ONLY a raw JSON list of objects.
    - Fields:
        - "control_id": The exact identifier (e.g., "Sec. 404", "A.5.1", or "AC-1").
        - "control_title": A concise title (5-10 words) as stated in the text.
        - "control_desc": The full description or specific requirement associated with that ID.
    - If NO controls are found, output strictly: []
    - Strict Formatting: No markdown, no "```json" blocks, no preamble. Just the raw JSON.
    """

    # 2. Structure the Chat
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Analyze this text:\n\n{page_text}"}
    ]

    # 3. Prepare Inputs
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # 4. Generate
    outputs = model.generate(
        input_ids,
        max_new_tokens=4096,      # Limit output size to save time
        do_sample=False,         
        temperature=.1,          #so the model halluinates less
    )

    # 5. Decode
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return response.strip()

#### Main Extraction Loop
- the extraction loop gives the whole page chunk to the LLM, and LLM then returns the json output in a snippet

In [77]:
import json
import time

all_extracted_data = []
responses = []

total_start_time = time.time()  # Start global timer
print("Starting extraction...")

for i, page_text in enumerate(text_page):
    print(f"--- Processing Page {i+1} ---")


    page_start_time = time.time()


    raw_response = extract_controls_from_page(page_text)


    clean_json_string = raw_response.replace("```json", "").replace("```", "").strip()

    page_end_time = time.time()
    elapsed_time = page_end_time - page_start_time
    print(f"   > Time taken: {elapsed_time:.2f} seconds")
    responses.append(clean_json_string)

    try:

        if not clean_json_string or clean_json_string == "[]":
            print(f"   > No controls found.")
            continue

        data = json.loads(clean_json_string)

        # Verify it's a list
        if isinstance(data, list):
            count = len(data)
            print(f"   > Success! Found {count} controls.")
            for control in data:
                control['page']=i
            all_extracted_data.extend(data)
        else:
            print(f"   > Warning: Model returned valid JSON but not a list.")

    except json.JSONDecodeError:
        print(f"   > Error: Model output invalid JSON.\n   > Raw Output: {raw_response[:50]}...")

total_end_time = time.time()
total_duration = total_end_time - total_start_time
minutes = int(total_duration // 60)
seconds = int(total_duration % 60)


Starting extraction...
--- Processing Page 1 ---
   > Time taken: 0.29 seconds
   > No controls found.
--- Processing Page 2 ---
   > Time taken: 0.21 seconds
   > No controls found.
--- Processing Page 3 ---
   > Time taken: 0.27 seconds
   > No controls found.
--- Processing Page 4 ---
   > Time taken: 0.21 seconds
   > No controls found.
--- Processing Page 5 ---
   > Time taken: 1.75 seconds
   > Success! Found 1 controls.
--- Processing Page 6 ---
   > Time taken: 1.77 seconds
   > Success! Found 1 controls.
--- Processing Page 7 ---
   > Time taken: 33.19 seconds
   > Success! Found 28 controls.
--- Processing Page 8 ---
   > Time taken: 4.63 seconds
   > Success! Found 4 controls.
--- Processing Page 9 ---
   > Time taken: 1.19 seconds
   > Success! Found 1 controls.
--- Processing Page 10 ---
   > Time taken: 41.68 seconds
   > Success! Found 43 controls.


In [78]:
print("\nDONE!")
print(f"Total controls found: {len(all_extracted_data)}")


DONE!
Total controls found: 78


In [79]:
with open("output_nist.json", "w") as f:
    json.dump(all_extracted_data, f, indent=2)

# To check for RAG Pipeline

In [81]:
import json
with open('output_nist.json',"r") as f:
    all_extracted_data = json.load(f)

# pdf_path = "ISO_27001.pdf"

In [30]:
import pypdf
reader = pypdf.PdfReader(pdf_path)

text_page = []

for page in reader.pages:
    text_page.append(page.extract_text())

In [82]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import json
import re
from huggingface_hub import login
from dotenv import load_dotenv
import os
load_dotenv()
login(os.environ['hf_api'])


def load_model(model_id="Qwen/Qwen2.5-7B-Instruct"):
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        
    # 2. Set padding side to 'left' for generation (Important for decoder-only models)
    tokenizer.padding_side = "left"

    # model
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )
    return tokenizer,model

def validate_controls(context,control_id,control_title,control_desc,tokenizer,model):
    validation_template = f"""
    You are a Quality Assurance Auditor. Verify if the following control exists in the source text.
    
    SOURCE TEXT (Reference):
    {context}
    
    CLAIMED CONTROL (To Verify):
    ID: {control_id}
    Title: {control_title}
    Description: {control_desc}
    
    TASK:
    Determine if the "Claimed Control" is explicitly supported by the "Source Text".
    Also make sure the control given isnt just a change made in any other control
    note: sometimes the "Claimed Control" might be just a table heading but not a valid control so make sure the claimed control is valid by 
    checking its description and reasoning it yourself.
    
    OUTPUT FORMAT:
    Return only a JSON object with this format:
    {{
        "is_valid": true/false,
        "reason": "Short explanation citing the text",
        "confidence_score": 1-10
    }}
    """
    messages = [
        {"role": "user", "content": validation_template}
    ]
    # message format
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # 4. Generate
    outputs = model.generate(
        input_ids,
        max_new_tokens=4096,      # Limit output size to save time
        do_sample=False,         # temperature=0 (Deterministic)
        temperature=None,        # Must be None if do_sample=False
        top_p=None               # Must be None if do_sample=False
    )

    # 5. Decode
    # Determine the length of input tokens so we only decode the new response tokens
    response_tokens = outputs[0][input_ids.shape[-1]:]
    response_text = tokenizer.decode(response_tokens, skip_special_tokens=True)
    # returning the response
    return clean_and_parse_json(response_text)


def clean_and_parse_json(text):
    """Helper to strip markdown code blocks and return a dict."""
    try:
        # 1. Try to find content inside ```json ... ```
        match = re.search(r"```json\s*(.*?)\s*```", text, re.DOTALL)
        if match:
            text_to_parse = match.group(1)
        else:
            # 2. If no code blocks, try to find the first '{' and last '}'
            # This handles cases where the model outputs text before/after the JSON without fences
            start = text.find('{')
            end = text.rfind('}')
            if start != -1 and end != -1:
                text_to_parse = text[start:end+1]
            else:
                # 3. Fallback to original cleanup
                text_to_parse = re.sub(r"```json\s*|\s*```", "", text.strip())
        
        return json.loads(text_to_parse)
    except json.JSONDecodeError:
        # Fallback if model fails to generate valid JSON
        return {"error": "Failed to parse JSON", "raw_output": text}


    
def validation_pipeline(control_list, text_pages,tokenizer,model):
    # tokenizer,model=load_model("DavidAU/Llama3.3-8B-Instruct-Thinking-Claude-4.5-Opus-High-Reasoning")
    
    for control in control_list:
        print("PROCCESSING CONTROL:",control['control_id'],end=" ")
        resp = validate_controls(text_pages[control['page']],control['control_id'],control['control_title'],control['control_desc'],tokenizer,model)
        try:
            print("PASS"if resp['is_valid'] else "FAIL")
        except:
            print("invalid json response.")
            print(resp)
        control['validation'] = resp
    return control_list


In [37]:
try:
    del tokenizer
    del model
except:
    print("tokenizer,model not defined")
import gc
import torch
torch.cuda.empty_cache()
gc.collect()

tokenizer,model=load_model()

Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.63s/it]


In [115]:
import warnings
import os
warnings.filterwarnings('ignore')
os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
validated_list = validation_pipeline(all_extracted_data,text_page,tokenizer,model)

PROCCESSING CONTROL: AC-1 PASS
PROCCESSING CONTROL: AC-2 PASS
PROCCESSING CONTROL: AC-3 FAIL
PROCCESSING CONTROL: AC-5 PASS
PROCCESSING CONTROL: AC-6 PASS
PROCCESSING CONTROL: AC-17 FAIL
PROCCESSING CONTROL: AC-18 FAIL
PROCCESSING CONTROL: AC-20 FAIL
PROCCESSING CONTROL: AC-24 FAIL
PROCCESSING CONTROL: AU-2 FAIL
PROCCESSING CONTROL: AU-12 FAIL
PROCCESSING CONTROL: CM-5 FAIL
PROCCESSING CONTROL: IA-2 FAIL
PROCCESSING CONTROL: IA-4 FAIL
PROCCESSING CONTROL: IA-5 FAIL
PROCCESSING CONTROL: IA-8 FAIL
PROCCESSING CONTROL: MA-3 FAIL
PROCCESSING CONTROL: MA-5 FAIL
PROCCESSING CONTROL: PE-2 FAIL
PROCCESSING CONTROL: PL-4 FAIL
PROCCESSING CONTROL: PS-2 FAIL
PROCCESSING CONTROL: PS-4 FAIL
PROCCESSING CONTROL: PS-5 FAIL
PROCCESSING CONTROL: PS-7 FAIL
PROCCESSING CONTROL: PT-2 FAIL
PROCCESSING CONTROL: PT-3 FAIL
PROCCESSING CONTROL: SC-7 FAIL
PROCCESSING CONTROL: SC-12 FAIL
PROCCESSING CONTROL: SC-13 FAIL
PROCCESSING CONTROL: SC-37 FAIL
PROCCESSING CONTROL: AC-11 PASS
PROCCESSING CONTROL: AU-2 PASS

In [116]:
validated_list_copy = validated_list.copy()
assert id(validated_list_copy )!= id(validated_list)

In [117]:
pass_count = 0
fail_count = 0
failed = []
for idx,control in enumerate(validated_list):
    if 'validation' in control:
        if not 'is_valid' in control['validation']:
            fail_count+=1
            failed.append([idx,control])
            print(control)
            continue

        if control['validation']['is_valid']:
            
            pass_count+=1
            
        else:
            print(control)
            fail_count+=1
            failed.append([idx,control])
    else:
        print(f"invalid control {control['control_id']}")
print(f"{pass_count} controls, passed out of {len(all_extracted_data)}. {fail_count} controls failed. \n log:{failed}")

{'control_id': 'AC-3', 'control_title': 'Account Management', 'control_desc': 'Establish and maintain accountability for system components.', 'page': 6, 'validation': {'is_valid': False, 'reason': "The 'Account Management' control described in the 'Claimed Control' does not match the content provided in the 'Source Text'. The source text discusses the management of system accounts, including the creation, adjustment, and removal of accounts, as well as the establishment of temporary and emergency accounts. However, it does not explicitly mention establishing and maintaining accountability for system components, which is the core of the 'Account Management' control (AC-3) as described in the claimed control.", 'confidence_score': 8}}
{'control_id': 'AC-17', 'control_title': 'Account Management', 'control_desc': 'Manage user accounts to ensure that access is granted based on the principle of least privilege and that access is reviewed periodically.', 'page': 6, 'validation': {'is_valid':

In [118]:
cleaned_controls = []
seen_ids = set()
for control in validated_list_copy:
    if 'validation' in control:
        if not 'is_valid' in control['validation']:
            continue
        if control['control_id'] in seen_ids:
            continue
        else:
            seen_ids.add(control['control_id'])
        if not control['validation']['is_valid']:
            cleaned_controls.append(control)
        else:
            fail_count+=1
    else:
        print(f"invalid control {control['control_id']}")

print(json.dumps(cleaned_controls,indent=2))
# print(len(cleaned_controls))

[
  {
    "control_id": "AC-3",
    "control_title": "Account Management",
    "control_desc": "Establish and maintain accountability for system components.",
    "page": 6,
    "validation": {
      "is_valid": false,
      "reason": "The 'Account Management' control described in the 'Claimed Control' does not match the content provided in the 'Source Text'. The source text discusses the management of system accounts, including the creation, adjustment, and removal of accounts, as well as the establishment of temporary and emergency accounts. However, it does not explicitly mention establishing and maintaining accountability for system components, which is the core of the 'Account Management' control (AC-3) as described in the claimed control.",
      "confidence_score": 8
    }
  },
  {
    "control_id": "AC-17",
    "control_title": "Account Management",
    "control_desc": "Manage user accounts to ensure that access is granted based on the principle of least privilege and that acce