### EXTRACTION CONTROLS FROM UNSTRUCTURED PDF USING LLM
#### Using Qwen AI Model
- for extraction of controls
- validationof controls extracted using RAG

##### Path of the File

In [1]:
# pdf_path = "uploads/NIST_CONTROL.pdf"
# pdf_path = "ISO_27001.pdf"
pdf_path  = "nist_file.pdf"

#### Reading Contents of the pdf and cleaning the pdf

In [2]:
import pypdf

In [3]:
reader = pypdf.PdfReader(pdf_path)

In [4]:
import re
def normalize_text(raw_text):

    # checking for empty string incase pdf has some
    if not raw_text:
        return ""

    # splitting lines
    lines = raw_text.split('\n')
    cleaned_lines = []

    # Generic noise patterns found the pdf[nist and iso]
    # defining the list because incase new patterns need to be added based on the pdf file.
    # These are safe to remove from ANY document.
    # We include a length check (len < 30) to ensure we don't accidentally
    # delete a real control that happens to contain the word "Page".
    noise_patterns = [
        r"^Page\s+\d+$",               # Matches "Page 1"
        r"^Page\s+\d+\s+of\s+\d+$",    # Matches "Page 1 of 10"
        r"^\d+\s+of\s+\d+$",           # Matches "1 of 10"
        r"^https?://",                 # URL artifacts often in footers
        r"^www\.",                     # Web links
        r"^\(c\)\s+\d{4}",             # Copyright markers like "(c) 2023"
        r"^Copyright",                 # Copyright word
        r'\bAppendix\s+[A-Z]+\s+Page\s+\d+\b'
    ]

    for line in lines:
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Check if line is noise
        is_noise = False
        # Only check short lines to be safe. If a line is 100 chars long,
        # it's likely content, even if it has "Page" in it.
    
        for pattern in noise_patterns:
            if re.search(pattern, line, re.IGNORECASE):
                is_noise = True
                break

        if not is_noise:
            cleaned_lines.append(line)

    # Merging all the lines into one text
    # We join with '\n' to preserve the structure.
    # The LLM needs to see the newlines to understand the layout.
    return '\n'.join(cleaned_lines)

In [5]:

text_page = []

for page in reader.pages[40:50]:
    page_cleaned = normalize_text(page.extract_text())
    text_page.append(page_cleaned)

#### Loading the Model 
loading the model in 8 bit for now because 8 bit gives better accuracy and stable results as compared to 4 bit.
but incase if we have vram less than 12gb then we can load the model in 4 bit.


In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True
)

In [9]:
# model_id = "meta-llama/Meta-Llama-3-8B-Instruct" # not using this as it requires login in hugging_face
model_id = "Qwen/Qwen2.5-7B-Instruct"

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:13<00:00,  3.46s/it]


In [12]:
def clean_and_validate_json(json):
    indices_to_remove = []
    seen_ids = set()
    for idx,control in enumerate(json):
        empty_string_type = ['','none',' ',None]
        if control['control_id'].lower() not in seen_ids:
            seen_ids.add(control['control_id'].lower())
        else:
            indices_to_remove.append(idx)
            continue
        if len(control['control_desc'])<3:
            indices_to_remove.append(idx)
            continue
        try:
            if control['control_id'].lower() in empty_string_type or control['control_title'].lower() in empty_string_type or control['control_desc'].lower() in empty_string_type:
                indices_to_remove.append(idx)
                continue
        except:
            # if it fails to find any one of the column in the json
            indices_to_remove.append(idx)
    json = [element for index, element in enumerate(json) if index not in indices_to_remove]
    return json

In [50]:
def extract_controls_from_page(page_text):
    # 1. The System Prompt 
    system_prompt = """
You are a strict Compliance Auditor. Your task is to extract only the **definitions** of controls.

CRITICAL RULES:
1. **Definition vs. Reference**: ONLY extract a control if the text contains the OFFICIAL REQUIREMENT description.
   - REJECT questions (e.g., "Do you comply with A.8?").
   - REJECT references (e.g., "See section A.8 for details").
   - REJECT table of contents or headers without body text.
   
2. **Context Check**: If the text discusses a control but does not list its specific requirements/rules, output nothing.

3. **JSON Format**:
   Return a raw JSON list of objects with these fields:
   - "control_id": The ID (e.g., "AC-1").
   - "control_title": The title.
   - "control_desc": The full requirement text.
   - "type": Must be "definition". (If it looks like a question, do not include it).

If NO valid control definitions are found, output strictly: []
"""
    

    # 2. Structure the Chat
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Analyze this text:\n\n{page_text}"}
    ]

    # 3. Prepare Inputs
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # 4. Generate
    outputs = model.generate(
        input_ids,
        max_new_tokens=4096,      # Limit output size to save time
        do_sample=False,         
        temperature=.01,          #so the model halluinates less
    )

    # 5. Decode
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return response.strip()

#### Main Extraction Loop
- the extraction loop gives the whole page chunk to the LLM, and LLM then returns the json output in a snippet

In [51]:
import warnings
import os
warnings.filterwarnings('ignore')
os.environ['TRANSFORMERS_VERBOSITY'] = 'error'

In [52]:
import json
import time

all_extracted_data = []
responses = []

total_start_time = time.time()  # Start global timer
print("Starting extraction...")

for i, page_text in enumerate(text_page):
    print(f"--- Processing Page {i+1} ---")


    page_start_time = time.time()


    raw_response = extract_controls_from_page(page_text)


    clean_json_string = raw_response.replace("```json", "").replace("```", "").strip()

    page_end_time = time.time()
    elapsed_time = page_end_time - page_start_time
    print(f"   > Time taken: {elapsed_time:.2f} seconds")
    responses.append(clean_json_string)

    try:

        if not clean_json_string or clean_json_string == "[]":
            print(f"   > No controls found.")
            continue

        data = json.loads(clean_json_string)

        # Verify it's a list
        if isinstance(data, list):
            count = len(data)
            print(f"   > Success! Found {count} controls.")
            for control in data:
                control['page']=i
            all_extracted_data.extend(data)
        else:
            print(f"   > Warning: Model returned valid JSON but not a list.")

    except json.JSONDecodeError:
        print(f"   > Error: Model output invalid JSON.\n   > Raw Output: {raw_response[:50]}...")

total_end_time = time.time()
total_duration = total_end_time - total_start_time
minutes = int(total_duration // 60)
seconds = int(total_duration % 60)


Starting extraction...
--- Processing Page 1 ---
   > Time taken: 0.23 seconds
   > No controls found.
--- Processing Page 2 ---
   > Time taken: 0.14 seconds
   > No controls found.
--- Processing Page 3 ---
   > Time taken: 0.19 seconds
   > No controls found.
--- Processing Page 4 ---
   > Time taken: 0.14 seconds
   > No controls found.
--- Processing Page 5 ---
   > Time taken: 11.27 seconds
   > Success! Found 1 controls.
--- Processing Page 6 ---
   > Time taken: 0.16 seconds
   > No controls found.
--- Processing Page 7 ---
   > Time taken: 27.51 seconds
   > Success! Found 7 controls.
--- Processing Page 8 ---
   > Time taken: 16.93 seconds
   > Success! Found 2 controls.
--- Processing Page 9 ---
   > Time taken: 2.92 seconds
   > Success! Found 1 controls.
--- Processing Page 10 ---
   > Time taken: 9.27 seconds
   > Success! Found 3 controls.


In [53]:
print("\nDONE!")
print(f"Total controls found: {len(all_extracted_data)}")


DONE!
Total controls found: 14


In [79]:
with open("output_nist.json", "w") as f:
    json.dump(all_extracted_data, f, indent=2)

# To check for RAG Pipeline

In [80]:
import json
with open('output_nist.json',"r") as f:
    all_extracted_data = json.load(f)

pdf_path = "ISO_27001.pdf"

In [30]:
import pypdf
reader = pypdf.PdfReader(pdf_path)

text_page = []

for page in reader.pages:
    text_page.append(page.extract_text())

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import json
import re
from huggingface_hub import login
from dotenv import load_dotenv
import os
load_dotenv()
login(os.environ['hf_api'])


def load_model(model_id="Qwen/Qwen2.5-7B-Instruct"):
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        
    # 2. Set padding side to 'left' for generation (Important for decoder-only models)
    tokenizer.padding_side = "left"

    # model
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )
    return tokenizer,model

def validate_controls(context,control_id,control_title,control_desc,tokenizer,model):
    validation_template = f"""
    You are a Quality Assurance Auditor. Review the following extraction claim.

    SOURCE TEXT:
    {context}

    CLAIMED CONTROL:
    ID: {control_id}
    Description: {control_desc}

    TASK:
    Verify if the "Source Text" actually contains the **official definition/requirement** for {control_id}.

    FAIL the validation if:
    - The text is just a question about the control.
    - The text is just a reference (e.g., "See {control_id}").
    - The description in the text matches a different control ID.

    OUTPUT JSON:
    {{
        "is_valid": true/false,
        "reason": "Explain why this is a definition vs a reference",
        "confidence_score": 1-10
    }}
    """
    messages = [
        {"role": "user", "content": validation_template}
    ]
    # message format
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # 4. Generate
    outputs = model.generate(
        input_ids,
        max_new_tokens=4096,      # Limit output size to save time
        do_sample=False,         # temperature=0 (Deterministic)
        temperature=None,        # Must be None if do_sample=False
        top_p=None               # Must be None if do_sample=False
    )

    # 5. Decode
    # Determine the length of input tokens so we only decode the new response tokens
    response_tokens = outputs[0][input_ids.shape[-1]:]
    response_text = tokenizer.decode(response_tokens, skip_special_tokens=True)
    # returning the response
    return clean_and_parse_json(response_text)


def clean_and_parse_json(text):
    """Helper to strip markdown code blocks and return a dict."""
    try:
        # 1. Try to find content inside ```json ... ```
        match = re.search(r"```json\s*(.*?)\s*```", text, re.DOTALL)
        if match:
            text_to_parse = match.group(1)
        else:
            # 2. If no code blocks, try to find the first '{' and last '}'
            # This handles cases where the model outputs text before/after the JSON without fences
            start = text.find('{')
            end = text.rfind('}')
            if start != -1 and end != -1:
                text_to_parse = text[start:end+1]
            else:
                # 3. Fallback to original cleanup
                text_to_parse = re.sub(r"```json\s*|\s*```", "", text.strip())
        
        return json.loads(text_to_parse)
    except json.JSONDecodeError:
        # Fallback if model fails to generate valid JSON
        return {"error": "Failed to parse JSON", "raw_output": text}


    
def validation_pipeline(control_list, text_pages,tokenizer,model):
    # tokenizer,model=load_model("DavidAU/Llama3.3-8B-Instruct-Thinking-Claude-4.5-Opus-High-Reasoning")
    
    for idx,control in enumerate(control_list):
        print(f"{idx+1}/{len(control_list)} PROCCESSING CONTROL:",control['control_id'],end=": ")
        resp = validate_controls(text_pages[control['page']],control['control_id'],control['control_title'],control['control_desc'],tokenizer,model)
        try:
            print("PASS"if resp['is_valid'] else "FAIL")
        except:
            print("invalid json response.")
            print(resp)
        control['validation'] = resp
    return control_list


In [37]:
try:
    del tokenizer
    del model
except:
    print("tokenizer,model not defined")
import gc
import torch
torch.cuda.empty_cache()
gc.collect()

tokenizer,model=load_model()

Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.63s/it]


In [54]:
validated_list = validation_pipeline(all_extracted_data,text_page,tokenizer,model)

1/14 PROCCESSING CONTROL: AC-1 PASS
2/14 PROCCESSING CONTROL: AC-3 PASS
3/14 PROCCESSING CONTROL: AC-5 FAIL
4/14 PROCCESSING CONTROL: AC-6 FAIL
5/14 PROCCESSING CONTROL: AC-17 FAIL
6/14 PROCCESSING CONTROL: AC-18 PASS
7/14 PROCCESSING CONTROL: AC-20 FAIL
8/14 PROCCESSING CONTROL: AC-24 PASS
9/14 PROCCESSING CONTROL: AC-11 FAIL
10/14 PROCCESSING CONTROL: AC-16 PASS
11/14 PROCCESSING CONTROL: AC-16 PASS
12/14 PROCCESSING CONTROL: AC-3 PASS
13/14 PROCCESSING CONTROL: ACCESS ENFORCEMENT | RESTRICTED ACCESS TO PRIVILEGED FUNCTIONS FAIL
14/14 PROCCESSING CONTROL: ACCESS ENFORCEMENT | DUAL AUTHORIZATION PASS


In [55]:
validated_list_original = validated_list.copy()

In [56]:
pass_count = 0
fail_count = 0
failed = []
for idx,control in enumerate(validated_list):
    if 'validation' in control:
        if not 'is_valid' in control['validation']:
            fail_count+=1
            failed.append([idx,control])

            continue

        if control['validation']['is_valid']:
            
            pass_count+=1
            
        else:
            fail_count+=1
            failed.append([idx,control])
    else:
        print(f"invalid control {control['control_id']}")
print(f"{pass_count} controls, passed out of {len(all_extracted_data)}. {fail_count} controls failed. \n log:{failed}")

8 controls, passed out of 14. 6 controls failed. 
 log:[[2, {'control_id': 'AC-5', 'control_title': 'Account Management', 'control_desc': 'Temporary and emergency accounts are intended for short-term use. Organizations establish temporary accounts as part of normal account activation procedures when there is a need for short-term accounts without the demand for immediacy in account activation. Organizations establish emergency accounts in response to crisis situations and with the need for rapid account activation. Therefore, emergency account activation may bypass normal account authorization processes. Emergency and temporary accounts are not to be confused with infrequently used accounts, including local logon accounts used for special tasks or when network resources are unavailable (may also be known as accounts of last resort). Such accounts remain available and are not subject to automatic disabling or removal dates. Conditions for disabling or deactivating accounts include when 

In [47]:
cleaned_controls = []
seen_ids = set()
for control in validated_list:
    if 'validation' in control:
        if not 'is_valid' in control['validation']:
            continue
        if control['control_id'] in seen_ids:
            continue
        else:
            seen_ids.add(control['control_id'])
        if control['validation']['is_valid']:
            control.pop('validation')
            cleaned_controls.append(control)
        else:
            fail_count+=1
    else:
        print(f"invalid control {control['control_id']}")

# print(json.dumps(cleaned_controls,indent=2))
print(len(cleaned_controls))

6


In [48]:
with open('../local_llm_outputs/output_nist_cleaned.json','w') as f:
    f.write(json.dumps(cleaned_controls,indent=4))

In [49]:
print(json.dumps(cleaned_controls,indent=4))

[
    {
        "control_id": "AC-1",
        "control_title": "ACCESS CONTROL POLICY AND PROCEDURES",
        "control_desc": "Develop, document, and disseminate to [Assignment: organization-defined personnel or roles]:\na. [Selection (one or more): Organization-level; Mission/business process-level; System-level] access control policy that:\n1. Addresses purpose, scope, roles, responsibilities, management commitment, coordination among organizational entities, and compliance;\n2. Is consistent with applicable laws, executive orders, directives, regulations, policies, standards, and guidelines;\nb. Procedures to facilitate the implementation of the access control policy and the associated access controls;\nc. Designate an [Assignment: organization-defined official] to manage the development, documentation, and dissemination of the access control policy and procedures;\nd. Review and update the current access control:\n1. Policy [Assignment: organization-defined frequency] and followin