### EXTRACTION CONTROLS FROM UNSTRUCTURED PDF USING LLM
#### Using Qwen AI Model
- for extraction of controls
- validationof controls extracted using RAG

##### Path of the File

In [30]:
# pdf_path = "uploads/NIST_CONTROL.pdf"
pdf_path = "ISO_27001.pdf"
# pdf_path  = "nist_file.pdf"

#### Reading Contents of the pdf and cleaning the pdf

In [31]:
import pypdf

In [32]:
reader = pypdf.PdfReader(pdf_path)

In [33]:
import re
def normalize_text(raw_text):

    # checking for empty string incase pdf has some
    if not raw_text:
        return ""

    # splitting lines
    lines = raw_text.split('\n')
    cleaned_lines = []

    # Generic noise patterns found the pdf[nist and iso]
    # defining the list because incase new patterns need to be added based on the pdf file.
    # These are safe to remove from ANY document.
    # We include a length check (len < 30) to ensure we don't accidentally
    # delete a real control that happens to contain the word "Page".
    noise_patterns = [
        r"^Page\s+\d+$",               # Matches "Page 1"
        r"^Page\s+\d+\s+of\s+\d+$",    # Matches "Page 1 of 10"
        r"^\d+\s+of\s+\d+$",           # Matches "1 of 10"
        r"^https?://",                 # URL artifacts often in footers
        r"^www\.",                     # Web links
        r"^\(c\)\s+\d{4}",             # Copyright markers like "(c) 2023"
        r"^Copyright",                 # Copyright word
        r'\bAppendix\s+[A-Z]+\s+Page\s+\d+\b'
    ]

    for line in lines:
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Check if line is noise
        is_noise = False
        # Only check short lines to be safe. If a line is 100 chars long,
        # it's likely content, even if it has "Page" in it.
    
        for pattern in noise_patterns:
            if re.search(pattern, line, re.IGNORECASE):
                is_noise = True
                break

        if not is_noise:
            cleaned_lines.append(line)

    # Merging all the lines into one text
    # We join with '\n' to preserve the structure.
    # The LLM needs to see the newlines to understand the layout.
    return '\n'.join(cleaned_lines)

In [34]:

text_page = []

for page in reader.pages:
    text_page.append(page.extract_text(extraction_mode="layout"))

Rotated text discovered. Output will be incomplete.


#### Loading the Model 
loading the model in 8 bit for now because 8 bit gives better accuracy and stable results as compared to 4 bit.
but incase if we have vram less than 12gb then we can load the model in 4 bit.


In [36]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [37]:
from huggingface_hub import login
from dotenv import load_dotenv
import os
load_dotenv()
login(os.environ['hugging_face_token'])
# model_id="meta-llama/Llama-3.1-8B-Instruct" # not using this as it requires login in hugging_face
model_id = "Qwen/Qwen2.5-7B-Instruct"
# model_id="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"
# model_id="google/gemma-3-27b-it"


In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:16<00:00,  4.23s/it]


In [38]:
def clean_and_validate_json(json):
    indices_to_remove = []
    seen_ids = set()
    for idx,control in enumerate(json):
        empty_string_type = ['','none',' ',None]
        if control['control_id'].lower() not in seen_ids:
            seen_ids.add(control['control_id'].lower())
        else:
            indices_to_remove.append(idx)
            continue
        if len(control['control_desc'])<3:
            indices_to_remove.append(idx)
            continue
        try:
            if control['control_id'].lower() in empty_string_type or control['control_title'].lower() in empty_string_type or control['control_desc'].lower() in empty_string_type:
                indices_to_remove.append(idx)
                continue
        except:
            # if it fails to find any one of the column in the json
            indices_to_remove.append(idx)
    json = [element for index, element in enumerate(json) if index not in indices_to_remove]
    return json

In [39]:
def extract_controls_from_page(page_text):
    # 1. The System Prompt 
    system_prompt = """
    You are an expert Compliance Auditor specialized in ISO, NIST, and Regulatory frameworks.

    Task: Extract individual controls or regulatory requirements from the provided text. Ignore all boilerplate, headers, and irrelevant legal prose.

    Control ID Identification Logic:
    Identify control IDs using a pattern-matching approach for:
    1. Annex A style (e.g., A.5.1, A.12.1.1)
    2. NIST style (e.g., AC-1, PM-10)
    3. Legislative/Section style (e.g., Sec. 302, Sec. 404)

    Rules:
    - If controls are found (high chances some pages dont have controls so make sure to ignore that), return ONLY a raw JSON list of objects.
    - Fields:
        - "control_id": The exact identifier (e.g., "Sec. 404", "A.5.1", or "AC-1").
        - "control_title": A concise title (5-10 words) as stated in the text.
        - "control_desc": The full detailed description or specific requirement associated with that ID.
    - If NO controls are found, output strictly: []
    - Strict Formatting: No markdown, no "```json" blocks, no preamble. Just the raw JSON.
    """
    

    # 2. Structure the Chat
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Analyze this text:\n\n{page_text}"}
    ]

    # 3. Prepare Inputs
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # 4. Generate
    outputs = model.generate(
        input_ids,
        max_new_tokens=4096,      # Limit output size to save time
        do_sample=False,         
        temperature=.01,          #so the model halluinates less
    )

    # 5. Decode
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return response.strip()

#### Main Extraction Loop
- the extraction loop gives the whole page chunk to the LLM, and LLM then returns the json output in a snippet

In [14]:
import warnings
import os
warnings.filterwarnings('ignore')
os.environ['TRANSFORMERS_VERBOSITY'] = 'error'

In [None]:
import json
import time

all_extracted_data = []
responses = []

total_start_time = time.time()  # Start global timer
print("Starting extraction...")

for i, page_text in enumerate(text_page):
    print(f"--- Processing Page {i+1} ---")


    page_start_time = time.time()


    raw_response = extract_controls_from_page(page_text)
    clean_json_string = raw_response.replace("```json", "").replace("```", "").strip()
    

    page_end_time = time.time()
    elapsed_time = page_end_time - page_start_time
    print(f"   > Time taken: {elapsed_time:.2f} seconds")
    responses.append(clean_json_string)

    try:

        if not clean_json_string or clean_json_string == "[]":
            print(f"   > No controls found.")
            continue

        data = json.loads(clean_json_string)

        # Verify it's a list
        if isinstance(data, list):
            count = len(data)
            print(f"   > Success! Found {count} controls.")
            for control in data:
                control['page']=i
                print(i)
            all_extracted_data.extend(data)
        else:
            print(f"   > Warning: Model returned valid JSON but not a list.")

    except json.JSONDecodeError:
        print(f"   > Error: Model output invalid JSON.\n   > Raw Output: {raw_response[:50]}...")

total_end_time = time.time()
total_duration = total_end_time - total_start_time
minutes = int(total_duration // 60)
seconds = int(total_duration % 60)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Starting extraction...
--- Processing Page 1 ---
   > Time taken: 0.80 seconds
   > No controls found.
--- Processing Page 2 ---
   > Time taken: 0.22 seconds
   > No controls found.
--- Processing Page 3 ---
   > Time taken: 35.38 seconds
   > Success! Found 27 controls.
--- Processing Page 4 ---
   > Time taken: 30.06 seconds
   > Success! Found 28 controls.
--- Processing Page 5 ---
   > Time taken: 32.03 seconds
   > Success! Found 32 controls.
--- Processing Page 6 ---
   > Time taken: 5.17 seconds
   > Success! Found 4 controls.
--- Processing Page 7 ---
   > Time taken: 18.81 seconds
   > Success! Found 7 controls.
--- Processing Page 8 ---
   > Time taken: 24.41 seconds
   > Success! Found 24 controls.
--- Processing Page 9 ---
   > Time taken: 8.38 seconds
   > Success! Found 6 controls.
--- Processing Page 10 ---
   > Time taken: 2.52 seconds
   > Success! Found 2 controls.
--- Processing Page 11 ---
   > Time taken: 14.58 seconds
   > Success! Found 13 controls.
--- Processi

In [16]:
print("\nDONE!")
print(f"Total controls found: {len(all_extracted_data)}")


DONE!
Total controls found: 143


In [17]:
with open("output_iso_new.json", "w") as f:
    json.dump(all_extracted_data, f, indent=2)

# To check for RAG Pipeline

In [20]:
import json
with open('output_nist_new.json',"r") as f:
    all_extracted_data = json.load(f)

pdf_path = "ISO_27001.pdf"

In [29]:
import pypdf
reader = pypdf.PdfReader(pdf_path)

text_page = []

for page in reader.pages:
    text_page.append(page.extract_text(extraction_mode='layout'))

print(len(text_page))

Rotated text discovered. Output will be incomplete.


12


In [26]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import json
import re




def load_model(model_id="Qwen/Qwen2.5-7B-Instruct",mode='4bit'):
    if mode.lower() == '4bit':
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
    else:
        
        bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        
    # 2. Set padding side to 'left' for generation (Important for decoder-only models)
    tokenizer.padding_side = "left"

    # model
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )
    return tokenizer,model

def validate_controls(context,control_id,control_title,control_desc,tokenizer,model):
    validation_template = f"""
    You are a Quality Assurance Auditor. Review the following extraction claim.

    SOURCE TEXT:
    {context}

    CLAIMED CONTROL:
    ID: {control_id}
    Description: {control_desc}

    TASK:
    Verify if the "Source Text" actually contains the **official definition/requirement** for {control_id}.

    FAIL the validation if:
    - The text is just a question about the control.
    - The text is just a reference (e.g., "See {control_id}").
    - The description in the text matches a different control ID.

    OUTPUT JSON:
    {{
        "is_valid": true/false,
        "reason": "Explain why this is a definition vs a reference",
        "confidence_score": 1-10
    }}
    """
    messages = [
        {"role": "user", "content": validation_template}
    ]
    # message format
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # 4. Generate
    outputs = model.generate(
        input_ids,
        max_new_tokens=4096,      # Limit output size to save time
        do_sample=False,         # temperature=0 (Deterministic)
        temperature=None,        # Must be None if do_sample=False
        top_p=None               # Must be None if do_sample=False
    )

    # 5. Decode
    # Determine the length of input tokens so we only decode the new response tokens
    response_tokens = outputs[0][input_ids.shape[-1]:]
    response_text = tokenizer.decode(response_tokens, skip_special_tokens=True)
    # returning the response
    return clean_and_parse_json(response_text)


def clean_and_parse_json(text):
    """Helper to strip markdown code blocks and return a dict."""
    try:
        # 1. Try to find content inside ```json ... ```
        match = re.search(r"```json\s*(.*?)\s*```", text, re.DOTALL)
        if match:
            text_to_parse = match.group(1)
        else:
            # 2. If no code blocks, try to find the first '{' and last '}'
            # This handles cases where the model outputs text before/after the JSON without fences
            start = text.find('{')
            end = text.rfind('}')
            if start != -1 and end != -1:
                text_to_parse = text[start:end+1]
            else:
                # 3. Fallback to original cleanup
                text_to_parse = re.sub(r"```json\s*|\s*```", "", text.strip())
        
        return json.loads(text_to_parse)
    except json.JSONDecodeError:
        # Fallback if model fails to generate valid JSON
        return {"error": "Failed to parse JSON", "raw_output": text}


    
def validation_pipeline(control_list, text_pages,tokenizer,model):
    # tokenizer,model=load_model("DavidAU/Llama3.3-8B-Instruct-Thinking-Claude-4.5-Opus-High-Reasoning")
    
    for idx,control in enumerate(control_list):
        print(f"{idx+1}/{len(control_list)} PROCCESSING CONTROL",control['control_id'],end=": ")
        try:
            resp = validate_controls(text_pages[control['page']],control['control_id'],control['control_title'],control['control_desc'],tokenizer,model)
            try:
                print("PASS"if resp['is_valid'] else "FAIL")
            except:
                print("invalid json response.")
                print(resp)
            control['validation'] = resp
        except:
            print(control)
            break
    return control_list


In [27]:
try:
    del tokenizer
    del model
except:
    print("tokenizer,model not defined")
import gc
import torch
torch.cuda.empty_cache()
gc.collect()

tokenizer,model=load_model(mode='8bit')

Loading checkpoint shards: 100%|██████████| 4/4 [00:17<00:00,  4.33s/it]


In [28]:
validated_list = validation_pipeline(all_extracted_data,text_page,tokenizer,model)

1/226 PROCCESSING CONTROL AC-19: {'control_id': 'AC-19', 'control_title': 'Many Safeguards for Mobile Devices', 'control_desc': 'Many safeguards for mobile devices are reflected in other controls.', 'page': 19}


In [10]:
validated_list_original = validated_list.copy()

In [12]:
pass_count = 0
fail_count = 0
failed = []
for idx,control in enumerate(validated_list):
    if 'validation' in control:
        if not 'is_valid' in control['validation']:
            fail_count+=1
            failed.append([idx,control])

            continue

        if control['validation']['is_valid']:
            
            pass_count+=1
            
        else:
            fail_count+=1
            failed.append([idx,control])
    else:
        print(f"invalid control {control['control_id']}")
print(f"{pass_count} controls, passed out of {len(all_extracted_data)}. {fail_count} controls failed. \n log:{failed}")

9 controls, passed out of 38. 29 controls failed. 
 log:[[3, {'control_id': 'AC-5', 'control_title': 'Account Management | Account Expiration', 'control_desc': 'Policies can include such information as account expiration dates or other factors that trigger the disabling of accounts.', 'page': 6, 'validation': {'is_valid': False, 'reason': 'The source text provides context and examples related to account expiration and disabling, but it does not explicitly state the official definition or requirement for AC-5. The text describes practices that align with what might be expected under AC-5, but it does not provide the exact wording or formal requirement for the control.', 'confidence_score': 7}}], [4, {'control_id': 'AC-6', 'control_title': 'Account Management | Account Attributes', 'control_desc': 'Organizations may choose to define access privileges or other attributes by account, type of account, or a combination of the two. Examples of other attributes required for authorizing access 

In [14]:
cleaned_controls = []
seen_ids = set()
for control in validated_list:
    if 'validation' in control:
        if not 'is_valid' in control['validation']:
            continue
        if control['control_id'] in seen_ids:
            continue
        else:
            seen_ids.add(control['control_id'])
        if control['validation']['is_valid']:
            # control.pop('validation')
            cleaned_controls.append(control)
        else:
            fail_count+=1
    else:
        print(f"invalid control {control['control_id']}")

# print(json.dumps(cleaned_controls,indent=2))
print(len(cleaned_controls))

6


In [48]:
with open('../local_llm_outputs/output_nist_cleaned.json','w') as f:
    f.write(json.dumps(cleaned_controls,indent=4))

In [15]:
print(json.dumps(cleaned_controls,indent=4))

[
    {
        "control_id": "AC-1",
        "control_title": "Develop, document, and disseminate access control policy and procedures",
        "control_desc": "Develop, document, and disseminate to organization-defined personnel or roles an organization-level, mission/business process-level, or system-level access control policy that addresses purpose, scope, roles, responsibilities, management commitment, coordination among organizational entities, and compliance. Also develop procedures to facilitate the implementation of the access control policy and the associated access controls. Designate an organization-defined official to manage the development, documentation, and dissemination of the access control policy and procedures. Review and update the current access control policy and procedures at organization-defined frequencies and following organization-defined events.",
        "page": 4,
        "validation": {
            "is_valid": true,
            "reason": "The source te

In [24]:
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

In [18]:
documents = []
for idx,page in enumerate(reader.pages[40:50]):
    documents.append(Document(page_content=page.extract_text(extraction_mode='layout'),metadata={'page':idx,'source':'pdf_document'}))


Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.


In [22]:
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [25]:
vector_store = Chroma(collection_name="pdf_vector_store",embedding_function=embeddings,persist_directory="./pdf_chroma_db")

In [26]:
ids = vector_store.add_documents(documents=documents)

In [33]:
def generate_search_query(control_id,control_name,control_desc):
    search_query = f"{control_id} {control_name} {control_desc}"
    return search_query

In [35]:
for control in all_extracted_data:
    print(generate_search_query(control['control_id'],control['control_title'],control['control_desc']))
    break

AC-1 Develop, document, and disseminate access control policy and procedures Develop, document, and disseminate to organization-defined personnel or roles an organization-level, mission/business process-level, or system-level access control policy that addresses purpose, scope, roles, responsibilities, management commitment, coordination among organizational entities, and compliance. Also develop procedures to facilitate the implementation of the access control policy and the associated access controls. Designate an organization-defined official to manage the development, documentation, and dissemination of the access control policy and procedures. Review and update the current access control policy and procedures at organization-defined frequencies and following organization-defined events.


In [38]:
results = vector_store.similarity_search('AC-1 Develop, document, and disseminate access control policy and procedures Develop, document, and disseminate to organization-defined personnel or roles an organization-level, mission/business process-level, or system-level access control policy that addresses purpose, scope, roles, responsibilities, management commitment, coordination among organizational entities, and compliance. Also develop procedures to facilitate the implementation of the access control policy and the associated access controls. Designate an organization-defined official to manage the development, documentation, and dissemination of the access control policy and procedures. Review and update the current access control policy and procedures at organization-defined frequencies and following organization-defined events.')

In [None]:
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

def generate_search_query(control_id,control_name,control_desc):
    search_query = f"{control_id}"
    return search_query
def validate_controls_using_rag():
    reader = pypdf.PdfReader('nist_file.pdf')
    documents = []
    for idx,page in enumerate(reader.pages[:100]):
        documents.append(Document(page_content=page.extract_text(extraction_mode='layout'),metadata={'page':idx,'source':'pdf_document'}))
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")
    vector_store = Chroma(collection_name="pdf_vector_store",embedding_function=embeddings,persist_directory="./pdf_chroma_db")
    ids = vector_store.add_documents(documents=documents)
    for idx,control in enumerate(all_extracted_data):
        query = generate_search_query(control['control_id'],control['control_title'],control['control_desc'])
        
        # results = vector_store.similarity_search(query)
        results = vector_store.similarity_search(query)

        print('page_search:',[r.metadata['page'] for r in results],"page_extraction:",control['page'])
        print(results)
        if idx ==5:
            break
    

In [61]:
reader = pypdf.PdfReader('nist_file.pdf')
documents = []
for idx,page in enumerate(reader.pages[:100]):
    documents.append(Document(page_content=page.extract_text(extraction_mode='layout'),metadata={'page':idx,'source':'pdf_document'}))
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")
vector_store = Chroma(collection_name="pdf_vector_store",embedding_function=embeddings,persist_directory="./pdf_chroma_db")
ids = vector_store.add_documents(documents=documents)
for idx,control in enumerate(all_extracted_data):
    query = generate_search_query(control['control_id'],control['control_title'],control['control_desc'])
    
    # results = vector_store.similarity_search(query)
    results = vector_store.similarity_search(query)

    print('page_search:',[r.metadata['page'] for r in results],"page_extraction:",control['page'])
    print(results)
    if idx ==5:
        break

Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text discovered. Output will be incomplete.
Rotated text

page_search: [6, 46, 46, 46] page_extraction: 4
[Document(id='a48648be-ae7b-42d6-b438-23a5f0d21c8b', metadata={'page': 6, 'source': 'pdf_document'}, page_content='NIST SP 800-                 53, REV. 5                                                                                     SECURITY AND PRIVACY CONTROLS FOR INFORMATION SYSTEMS AND ORGANIZATIONS\n_________________________________________________________________________________________________\n\n                Where access involves personally identifiable information, security programs collaborate with\n                the senior agency official for privacy to                                                establish the specific conditions for group and role\n                membership; specify authorized users, group and role membership, and access authorizations  for\n                each account; and create, adjust, or remove system accounts in accordance with organizational\n                policies. Policies can includ

In [20]:
for idx,control in enumerate(all_extracted_data):
    query = generate_search_query(control['control_id'],control['control_title'],control['control_desc'])

    # results = vector_store.similarity_search(query)
    results = vector_store.similarity_search(query)

    print('page_search:',[r.metadata['page'] for r in results],"page_extraction:",control['page'])
    print(len(results))
    if idx ==5:
        break

NameError: name 'all_extracted_data' is not defined

In [9]:
from transformers import BitsAndBytesConfig

bnb = BitsAndBytesConfig(load_in_8bit=True,)