### EXTRACTION CONTROLS FROM UNSTRUCTURED PDF USING LLM
#### Using Qwen AI Model
- for extraction of controls
- validationof controls extracted using RAG

##### Path of the File

In [7]:
# pdf_path = "uploads/NIST_CONTROL.pdf"
# pdf_path = "ISO_27001.pdf"
pdf_path  = "notebooks/nist_file.pdf"

#### Reading Contents of the pdf and cleaning the pdf

In [5]:
import pypdf
import pdfplumber

In [8]:
reader = pypdf.PdfReader(pdf_path)

In [9]:
import re
def normalize_text(raw_text):

    # checking for empty string incase pdf has some
    if not raw_text:
        return ""

    # splitting lines
    lines = raw_text.split('\n')
    cleaned_lines = []

    # Generic noise patterns found the pdf[nist and iso]
    # defining the list because incase new patterns need to be added based on the pdf file.
    # These are safe to remove from ANY document.
    # We include a length check (len < 30) to ensure we don't accidentally
    # delete a real control that happens to contain the word "Page".
    noise_patterns = [
        r"^Page\s+\d+$",               # Matches "Page 1"
        r"^Page\s+\d+\s+of\s+\d+$",    # Matches "Page 1 of 10"
        r"^\d+\s+of\s+\d+$",           # Matches "1 of 10"
        r"^https?://",                 # URL artifacts often in footers
        r"^www\.",                     # Web links
        r"^\(c\)\s+\d{4}",             # Copyright markers like "(c) 2023"
        r"^Copyright",                 # Copyright word
        r'\bAppendix\s+[A-Z]+\s+Page\s+\d+\b'
    ]

    for line in lines:
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Check if line is noise
        is_noise = False
        # Only check short lines to be safe. If a line is 100 chars long,
        # it's likely content, even if it has "Page" in it.
    
        for pattern in noise_patterns:
            if re.search(pattern, line, re.IGNORECASE):
                is_noise = True
                break

        if not is_noise:
            cleaned_lines.append(line)

    # Merging all the lines into one text
    # We join with '\n' to preserve the structure.
    # The LLM needs to see the newlines to understand the layout.
    return '\n'.join(cleaned_lines)

In [10]:

text_page = []

# for page in reader.pages:
#     text_page.append(page.extract_text(extraction_mode='layout'))

with pdfplumber.open(pdf_path) as pdf:
    for idx,page in enumerate(pdf.pages):    
        if (idx+1)%50 ==0:
            print(f"extracting page {idx+1}/{len(pdf.pages)}")
        
        text_page.append(normalize_text(page.extract_text(layout=True)))

extracting page 50/492
extracting page 100/492
extracting page 150/492
extracting page 200/492
extracting page 250/492
extracting page 300/492
extracting page 350/492
extracting page 400/492
extracting page 450/492


#### Loading the Model 
loading the model in 8 bit for now because 8 bit gives better accuracy and stable results as compared to 4 bit.
but incase if we have vram less than 12gb then we can load the model in 4 bit.


In [11]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,TextStreamer

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# bnb_config = BitsAndBytesConfig(
#     load_in_8bit=True,
#     # bnb_4bit_use_double_quant=True,
#     # bnb_4bit_quant_type="nf4",
#     # bnb_4bit_compute_dtype=torch.bfloat16
# )

In [12]:
from huggingface_hub import login
from dotenv import load_dotenv
import os
load_dotenv()
login(os.environ['hugging_face_token'])
# model_id="meta-llama/Llama-3.1-8B-Instruct" # not using this as it requires login in hugging_face
model_id = "Qwen/Qwen2.5-7B-Instruct"
# model_id="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"
# model_id="google/gemma-3-27b-it"


In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [15]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
streamer = TextStreamer(tokenizer,skip_special_tokens=True,skip_prompt=True)

In [None]:
def clean_and_validate_json(json):
    indices_to_remove = []
    seen_ids = set()
    for idx,control in enumerate(json):
        empty_string_type = ['','none',' ',None]
        if control['control_id'].lower() not in seen_ids:
            seen_ids.add(control['control_id'].lower())
        else:
            indices_to_remove.append(idx)
            continue
        if len(control['control_desc'])<3:
            indices_to_remove.append(idx)
            continue
        try:
            if control['control_id'].lower() in empty_string_type or control['control_title'].lower() in empty_string_type or control['control_desc'].lower() in empty_string_type:
                indices_to_remove.append(idx)
                continue
        except:
            # if it fails to find any one of the column in the json
            indices_to_remove.append(idx)
    json = [element for index, element in enumerate(json) if index not in indices_to_remove]
    return json

In [None]:
def extract_controls_from_page(page_text,tokenizer,model):
    # 1. The System Prompt 
    system_prompt = """
You are a senior Compliance Auditor and Regulatory Analyst specialized in ISO, NIST, and statutory frameworks.

Your task is to extract ONLY real, enforceable compliance controls or regulatory requirements from the given text.

Precision is critical. Do NOT infer, summarize, merge, or invent controls.

========================
CORE EXTRACTION RULES
========================

1. A control MUST satisfy ALL of the following:
   - It has a valid Control ID (see patterns below)
   - It includes an explicit requirement, obligation, or mandate
   - The descriptive text directly applies to that Control ID

2. DO NOT extract:
   - Control IDs listed under:
     - "Related controls"
     - "Referenced controls"
     - "See also"
     - "Cross references"
     - Tables of contents
     - Indexes
     - Questionnaires or audit questions
   - Section headings, titles, or topic labels without enforceable requirements
   - Controls marked as "withdrawn", "deprecated", "not applicable", or "informative"
   - Any inferred or implied control not explicitly defined in the text

3. If the page does NOT clearly define a control, output an empty list: []

========================
CONTROL ID DETECTION
========================

Recognize ONLY these Control ID patterns:

- ISO / Annex A:
  - A.5.1
  - A.8.12
  - A.12.1.1

- NIST:
  - AC-1
  - IA-5
  - PM-10

- Legislative / Regulatory:
  - Sec. 302
  - Section 404

DO NOT treat IDs appearing inside explanatory text, examples, or references as controls.

========================
CONTROL BOUNDARY LOGIC
========================

- A valid control's description MUST:
  - Immediately follow or be clearly scoped to the Control ID
  - Contain enforceable language (e.g., "shall", "must", "is required to")
- STOP the description when:
  - A new Control ID appears
  - A new section or heading begins
  - The text shifts to references, examples, or guidance

========================
OUTPUT RULES (STRICT)
========================

- Output ONLY a raw JSON array
- No markdown
- No explanations
- No extra text
- No hallucinations

Each object MUST have:

{
  "control_id": "<exact identifier>",
  "control_title": "<concise title from text, 5–10 words>",
  "control_desc": "<full enforceable requirement text>"
}

========================
FAIL-SAFE BEHAVIOR
========================

- If uncertain whether text defines a real control → SKIP IT
- If zero valid controls exist → output []

========================
REMEMBER
========================

High precision > high recall.
It is better to return [] than an incorrect control."""
    

    # 2. Structure the Chat
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Analyze this text:\n\n{page_text}"}
    ]

    # 3. Prepare Inputs
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # 4. Generate
    print("DEBUG: Mddel output:")
    outputs = model.generate(
        input_ids,
        max_new_tokens=4096,      # Limit output size to save time
        do_sample=False,
        streamer=streamer)
    
    
    

    # 5. Decode
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return response.strip()

#### Main Extraction Loop
- the extraction loop gives the whole page chunk to the LLM, and LLM then returns the json output in a snippet

In [None]:
import warnings
import os
warnings.filterwarnings('ignore')
os.environ['TRANSFORMERS_VERBOSITY'] = 'error'

In [None]:
import json
import time

all_extracted_data = []
responses = []

total_start_time = time.time()  # Start global timer
print("Starting extraction...")

for i, page_text in enumerate(text_page):
    print(f"--- Processing Page {i+1} ---")


    page_start_time = time.time()


    raw_response = extract_controls_from_page(page_text,tokenizer,model)
    clean_json_string = raw_response.replace("```json", "").replace("```", "").strip()
    

    page_end_time = time.time()
    elapsed_time = page_end_time - page_start_time
    print(f"   > Time taken: {elapsed_time:.2f} seconds")
    responses.append(clean_json_string)

    try:

        if not clean_json_string or clean_json_string == "[]":
            print(f"   > No controls found.")
            continue

        data = json.loads(clean_json_string)

        # Verify it's a list
        if isinstance(data, list):
            count = len(data)
            print(f"   > Success! Found {count} controls.")
            for control in data:
                control['page']=i
                if i > 12:
                    print(i)
            all_extracted_data.extend(data)
        else:
            print(f"   > Warning: Model returned valid JSON but not a list.")

    except json.JSONDecodeError:
        print(f"   > Error: Model output invalid JSON.\n   > Raw Output: {raw_response[:50]}...")

total_end_time = time.time()
total_duration = total_end_time - total_start_time
minutes = int(total_duration // 60)
seconds = int(total_duration % 60)


Starting extraction...
--- Processing Page 1 ---
DEBUG: Mddel output:
[]
   > Time taken: 0.25 seconds
   > No controls found.
--- Processing Page 2 ---
DEBUG: Mddel output:
[]
   > Time taken: 0.22 seconds
   > No controls found.
--- Processing Page 3 ---
DEBUG: Mddel output:
[]
   > Time taken: 0.34 seconds
   > No controls found.
--- Processing Page 4 ---
DEBUG: Mddel output:
[]
   > Time taken: 0.29 seconds
   > No controls found.
--- Processing Page 5 ---
DEBUG: Mddel output:
[]
   > Time taken: 0.29 seconds
   > No controls found.
--- Processing Page 6 ---
DEBUG: Mddel output:
[]
   > Time taken: 0.35 seconds
   > No controls found.
--- Processing Page 7 ---
DEBUG: Mddel output:
[]
   > Time taken: 0.25 seconds
   > No controls found.
--- Processing Page 8 ---
DEBUG: Mddel output:
[]
   > Time taken: 0.25 seconds
   > No controls found.
--- Processing Page 9 ---
DEBUG: Mddel output:
[]
   > Time taken: 0.27 seconds
   > No controls found.
--- Processing Page 10 ---
DEBUG: Mddel o

In [21]:
print("\nDONE!")
print(f"Total controls found: {len(all_extracted_data)}")


DONE!
Total controls found: 768


In [86]:
with open("../local_llm_outputs/output_nist_whole.json", "w") as f:
    json.dump(all_extracted_data, f, indent=2)

# To check for RAG Pipeline

In [87]:
import json
with open("../local_llm_outputs/output_nist_whole.json","r") as f:
    all_extracted_data = json.load(f)


In [29]:
import pypdf
reader = pypdf.PdfReader(pdf_path)

text_page = []

for page in reader.pages:
    text_page.append(page.extract_text(extraction_mode='layout'))

print(len(text_page))

Rotated text discovered. Output will be incomplete.


12


In [91]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import json
import re




def load_model(model_id="Qwen/Qwen2.5-7B-Instruct",mode='4bit'):
    if mode.lower() == '4bit':
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
    else:
        
        bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        
    # 2. Set padding side to 'left' for generation (Important for decoder-only models)
    tokenizer.padding_side = "left"

    # model
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )
    return tokenizer,model

def validate_controls(context,control_id,control_title,control_desc,tokenizer,model):
    validation_template = f"""
    You are a Quality Assurance Auditor. Review the following extraction claim.

    SOURCE TEXT:
    {context}

    CLAIMED CONTROL:
    ID: {control_id}
    Title: {control_title}
    Desc: {control_desc}
    

    TASK:
    Verify if the "Source Text" actually contains the **official definition/requirement** for {control_id}.

    FAIL the validation if:
    - The text is just a question about the control.
    - The text is just a reference (e.g., "related controls: {control_id}").
    - The description in the text matches a different control ID.

    OUTPUT JSON:
    {{
        "is_valid": true/false,
        "reason": "Explain why this is a definition vs a reference",
        "confidence_score": Confidence score meaning:
                            1-3  = weak / indirect mention
                            4-6  = partial or ambiguous
                            7-8  = clear requirement
                            9-10 = explicit authoritative definition

    }}
    """
    messages = [
        {"role": "user", "content": validation_template}
    ]
    # message format
    streamer = TextStreamer(tokenizer, skip_prompt=True,skip_special_tokens=True)
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    print("\n model output:processing:\n\n")

    # 4. Generate
    outputs = model.generate(
        input_ids,
        max_new_tokens=4096,      # Limit output size to save time
        do_sample=False,         # temperature=0 (Deterministic)
        temperature=None,        # Must be None if do_sample=False
        top_p=None,               # Must be None if do_sample=False
        streamer=streamer

    )

    # 5. Decode
    # Determine the length of input tokens so we only decode the new response tokens
    response_tokens = outputs[0][input_ids.shape[-1]:]
    response_text = tokenizer.decode(response_tokens, skip_special_tokens=True)
    # returning the response
    return clean_and_parse_json(response_text)


def clean_and_parse_json(text):
    """Helper to strip markdown code blocks and return a dict."""
    try:
        # 1. Try to find content inside ```json ... ```
        match = re.search(r"```json\s*(.*?)\s*```", text, re.DOTALL)
        if match:
            text_to_parse = match.group(1)
        else:
            # 2. If no code blocks, try to find the first '{' and last '}'
            # This handles cases where the model outputs text before/after the JSON without fences
            start = text.find('{')
            end = text.rfind('}')
            if start != -1 and end != -1:
                text_to_parse = text[start:end+1]
            else:
                # 3. Fallback to original cleanup
                text_to_parse = re.sub(r"```json\s*|\s*```", "", text.strip())
        
        return json.loads(text_to_parse)
    except json.JSONDecodeError:
        # Fallback if model fails to generate valid JSON
        return {"error": "Failed to parse JSON", "raw_output": text}


    
def validation_pipeline(control_list, text_pages,tokenizer,model):
    # tokenizer,model=load_model("DavidAU/Llama3.3-8B-Instruct-Thinking-Claude-4.5-Opus-High-Reasoning")
    
    for idx,control in enumerate(control_list):
        try:
            print(control['control_id'])
            resp = validate_controls(text_pages[control['page']-1],control['control_id'],control['control_title'],control['control_desc'],tokenizer,model)
            try:
                if resp['is_valid']:
                    print(f"\n\n \033[92m{idx+1}/{len(control_list)} PROCCESSING CONTROL",control['control_id'],end=": ")
                    print("PASS \033[0m".center(40))
                else:
                    print(f"\n\n \033[91m{idx+1}/{len(control_list)} PROCCESSING CONTROL",control['control_id'],end=": ")
                    print("FAIL \033[0m".center(40))

            except:
                print("invalid json response.")
                print(resp)
            control['validation'] = resp
        except:
            print(control)
            break
    return control_list


In [78]:
try:
    del tokenizer
    del model
except:
    print("tokenizer,model not defined")
import gc
import torch
torch.cuda.empty_cache()
gc.collect()

# tokenizer,model=load_model(mode='8bit')

tokenizer,model not defined


0

In [92]:
validated_list = validation_pipeline(all_extracted_data,text_page,tokenizer,model)

AC-1

 model output:processing:


```json
{
    "is_valid": false,
    "reason": "The source text does not contain an official definition or requirement for AC-1. Instead, it discusses the development and maintenance of new controls in general terms, including access controls, but does not provide a specific, detailed definition for AC-1 as described in the claimed control.",
    "confidence_score": 1
}
```

### Explanation:
- **is_valid**: `false` because the source text does not explicitly define or require the specific details of AC-1 as stated in the claimed control.
- **reason**: The source text provides a general overview of how new controls are developed and maintained, but it does not give a detailed, official definition for AC-1. It mentions that new controls are developed based on various factors, including threat and vulnerability information, but it does not specify the exact content or structure of the access control policy and procedures as required by AC-1.
- **confidenc

In [22]:
validated_list_original = validated_list.copy()

In [23]:
pass_count = 0
fail_count = 0
failed = []
for idx,control in enumerate(validated_list):
    if 'validation' in control:
        if not 'is_valid' in control['validation']:
            fail_count+=1
            failed.append([idx,control])

            continue

        if control['validation']['is_valid']:
            
            pass_count+=1
            
        else:
            fail_count+=1
            failed.append([idx,control])
    else:
        print(f"invalid control {control['control_id']}")
print(f"{pass_count} controls, passed out of {len(all_extracted_data)}. {fail_count} controls failed. \n log:{failed}")

11 controls, passed out of 768. 757 controls failed. 


In [24]:
cleaned_controls = []
seen_ids = set()
# saving duplicates so we can save the most relevant and correct control from it and remove its references from index or erratas etc
grouped_duplicates = {}
for control in validated_list:
    if 'validation' in control:
        if not 'is_valid' in control['validation']:
            continue
        if control['control_id'] in seen_ids:
            if str(control['control_id']) in grouped_duplicates:
                grouped_duplicates[str(control['control_id'])].append(control)
            else:
                grouped_duplicates[str(control['control_id'])] = [control]

        else:
            seen_ids.add(control['control_id'])
        if control['validation']['is_valid']:
            # control.pop('validation')
            cleaned_controls.append(control)
        else:
            fail_count+=1
    else:
        print(f"invalid control {control['control_id']}")

# print(json.dumps(cleaned_controls,indent=2))
print(len(cleaned_controls))

11


In [28]:
with open('local_llm_outputs//output_nist_whole_cleaned.json','w') as f:
    f.write(json.dumps(cleaned_controls,indent=4))
    

In [29]:
print(json.dumps(cleaned_controls,indent=4))

[
    {
        "control_id": "AC-3",
        "control_title": "Account Management",
        "control_desc": "Establish and maintain a process for managing system accounts.",
        "page": 46,
        "validation": {
            "is_valid": true,
            "reason": "The source text provides a detailed set of requirements and procedures for managing system accounts, which aligns with the description of AC-3. The text explicitly outlines the steps and processes required for account management, making it a clear and authoritative definition.",
            "confidence_score": 9
        }
    },
    {
        "control_id": "AC-5",
        "control_title": "Account Management",
        "control_desc": "Specify authorized users, group and role membership, and access authorizations for each account.",
        "page": 46,
        "validation": {
            "is_valid": true,
            "reason": "The source text provides a detailed set of requirements and procedures for account management

In [31]:
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

In [35]:
documents = []

with pdfplumber.open(pdf_path) as pdf:
    for idx,page in enumerate(pdf.pages):    
        if (idx+1)%50 ==0:
            print(f"extracting page {idx+1}/{len(pdf.pages)}")
        documents.append(Document(page_content=page.extract_text(layout=True),metadata={'page':idx,'source':'pdf_document'}))

extracting page 50/492
extracting page 100/492
extracting page 150/492
extracting page 200/492
extracting page 250/492
extracting page 300/492
extracting page 350/492
extracting page 400/492
extracting page 450/492


In [32]:
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")

In [33]:
vector_store = Chroma(collection_name="pdf_vector_store",embedding_function=embeddings,persist_directory="./pdf_chroma_db")

In [36]:
ids = vector_store.add_documents(documents=documents)

In [37]:
def generate_search_query(control_id,control_name,control_desc):
    search_query = f"{control_id} {control_desc}"
    return search_query

In [42]:
r = []
for control in cleaned_controls:
    results = vector_store.similarity_search(generate_search_query(control['control_id'],control['control_title'],control['control_desc']))
    r.append(results)
    

In [43]:
print(r)



In [56]:
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

def generate_search_query(control_id,control_name,control_desc):
    search_query = f"{control_id}"
    return search_query
def validate_controls_using_rag():
    reader = pypdf.PdfReader('nist_file.pdf')
    documents = []
    for idx,page in enumerate(reader.pages[:100]):
        documents.append(Document(page_content=page.extract_text(extraction_mode='layout'),metadata={'page':idx,'source':'pdf_document'}))
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")
    vector_store = Chroma(collection_name="pdf_vector_store",embedding_function=embeddings,persist_directory="./pdf_chroma_db")
    ids = vector_store.add_documents(documents=documents)
    for idx,control in enumerate(all_extracted_data):
        query = generate_search_query(control['control_id'],control['control_title'],control['control_desc'])
        
        # results = vector_store.similarity_search(query)
        results = vector_store.similarity_search(query)

        print('page_search:',[r.metadata['page'] for r in results],"page_extraction:",control['page'])
        print(results)
        if idx ==5:
            break
    

In [44]:
def verify_controls(context,control_id,control_title,control_desc,tokenizer,model):
    validation_template = f"""
    You are a Quality Assurance Auditor. Review the following extraction claim.

    SOURCE TEXT:
    {context}

    CLAIMED CONTROL:
    ID: {control_id}
    Title: {control_title}
    Desc: {control_desc}
    

    TASK:
    Verify if the "Source Text" actually contains the **official definition/requirement** for {control_id}.

    IF the claimed control has anything incorrect output a json with correct output

    OUTPUT JSON:
    {{
        "control_id": <the correct control id>,
        "control_title": <the correct control title>,
        "control_desc": <correct control description>
    }}
    """
    messages = [
        {"role": "user", "content": validation_template}
    ]
    # message format
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # 4. Generate
    outputs = model.generate(
        input_ids,
        max_new_tokens=4096,      # Limit output size to save time
        do_sample=False,         # temperature=0 (Deterministic)
        temperature=None,        # Must be None if do_sample=False
        top_p=None               # Must be None if do_sample=False
    )

    # 5. Decode
    # Determine the length of input tokens so we only decode the new response tokens
    response_tokens = outputs[0][input_ids.shape[-1]:]
    response_text = tokenizer.decode(response_tokens, skip_special_tokens=True)
    # returning the response
    return clean_and_parse_json(response_text)


def clean_and_parse_json(text):
    """Helper to strip markdown code blocks and return a dict."""
    try:
        # 1. Try to find content inside ```json ... ```
        match = re.search(r"```json\s*(.*?)\s*```", text, re.DOTALL)
        if match:
            text_to_parse = match.group(1)
        else:
            # 2. If no code blocks, try to find the first '{' and last '}'
            # This handles cases where the model outputs text before/after the JSON without fences
            start = text.find('{')
            end = text.rfind('}')
            if start != -1 and end != -1:
                text_to_parse = text[start:end+1]
            else:
                # 3. Fallback to original cleanup
                text_to_parse = re.sub(r"```json\s*|\s*```", "", text.strip())
        
        return json.loads(text_to_parse)
    except json.JSONDecodeError:
        # Fallback if model fails to generate valid JSON
        return {"error": "Failed to parse JSON", "raw_output": text}


In [45]:
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")
vector_store = Chroma(collection_name="pdf_vector_store",embedding_function=embeddings,persist_directory="./pdf_chroma_db")
ids = vector_store.add_documents(documents=documents)


In [48]:
verifications= []
test_case = {}
for idx,control in enumerate(cleaned_controls):
    query = generate_search_query(control['control_id'],control['control_title'],control['control_desc'])
    
    result = vector_store.similarity_search_by_vector(embeddings.embed_query(query))
    doc = result[0]
    r = verify_controls(doc.page_content, control['control_id'],control['control_title'],control['control_desc'],tokenizer,model)
    test_case[str(control['control_id'])] = r

In [52]:
for control_id in test_case:
    print(json.dumps(test_case[control_id],indent=2))

{
  "control_id": "AC-2",
  "control_title": "Account Management",
  "control_desc": "Define and document the types of accounts allowed and specifically prohibited for use within the system; assign account managers; require organization-defined prerequisites and criteria for group and role membership; specify authorized users, group and role membership, and access authorizations for each account; require approvals for requests to create accounts; create, enable, modify, disable, and remove accounts in accordance with organization-defined policy, procedures, prerequisites, and criteria; monitor the use of accounts; notify account managers and organization-defined personnel or roles within specified time periods when accounts are no longer required, users are terminated or transferred, or system usage or need-to-know changes for an individual; authorize access to the system based on valid access authorization, intended system usage, and organization-defined attributes; review accounts fo

In [62]:
print(verifications)

[[{'is_valid': False, 'reason': 'The source text does not contain the official definition/requirement for AC-1. Instead, it discusses various aspects of account management, including collaboration with privacy officials, account creation, modification, and removal, as well as the establishment of temporary and emergency accounts. While it mentions related controls, it does not provide a specific definition or requirement for developing, documenting, and disseminating access control policy and procedures.', 'confidence_score': 9}, {'is_valid': False, 'reason': 'The source text does not contain the official definition/requirement for AC-1. Instead, it discusses various aspects of account management, including collaboration with privacy officials, account creation, modification, and removal, as well as the establishment of temporary and emergency accounts. While it mentions related controls, it does not provide a specific definition or requirement for developing, documenting, and dissemin

In [53]:
data =  json.load(open('notebooks/control_json.json'))

In [None]:
train_samples = []

for c in data:
    page_idx = c["page"]
    page_text = text_page[page_idx]

    output = [{
        "control_id": c["control_id"],
        "control_title": c["control_title"],
        "control_desc": c["control_desc"]
    }]

    train_samples.append({
        "messages": [
            {
                "role": "system",
                "content": "You are a compliance control extraction engine. Extract ONLY valid controls. If none exist, output []."
            },
            {
                "role": "user",
                "content": page_text
            },
            {
                "role": "assistant",
                "content": json.dumps(output, ensure_ascii=False)
            }
        ]
    })
