## Imports and basic setup

In [1]:
from pathlib import Path
from tqdm import tqdm

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


## Define paths and domains

In [2]:
# Filesystem layout. Both paths are relative, so they resolve against the
# kernel's current working directory.
DATA_ROOT = Path("..", "data")      # input root; its sub-directories are listed as domains
INDEX_ROOT = Path("..", "indexes")  # output root for the saved vector indexes

In [3]:
# Every immediate sub-directory of DATA_ROOT is treated as one domain.
# iterdir() yields entries in arbitrary, filesystem-dependent order, so the
# result is sorted to keep the listing (and anything iterating over it)
# reproducible across machines and runs.
domains = sorted(d for d in DATA_ROOT.iterdir() if d.is_dir())
domains

[WindowsPath('../data/Artificial Intelligence'),
 WindowsPath('../data/Automobile'),
 WindowsPath('../data/business'),
 WindowsPath('../data/climate'),
 WindowsPath('../data/cyber security'),
 WindowsPath('../data/medical'),
 WindowsPath('../data/psychology')]

## Define shared components

In [4]:
# Chunking parameters, named so they can be tuned in one place.
CHUNK_SIZE = 800      # upper bound on chunk length handed to the splitter
CHUNK_OVERLAP = 150   # amount of text shared between consecutive chunks
# Candidate split points, listed from coarsest (paragraph break) to finest
# (empty string, i.e. between any two characters).
CHUNK_SEPARATORS = ["\n\n", "\n", ".", " ", ""]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=CHUNK_SEPARATORS,
)

In [5]:
# One embedding model instance, created once at notebook level and reused
# wherever embeddings are needed.
# NOTE(review): the captured cell output echoes this statement as the origin of
# a warning -- presumably a deprecation notice for this class; confirm and
# migrate when the replacement package is available in the environment.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embedding_model = HuggingFaceEmbeddings(


## Function to load PDFs for one domain

In [6]:
def load_domain_documents(domain_path: Path) -> list:
    """Load every PDF directly inside ``domain_path`` as page-level documents.

    Each page returned by ``PyPDFLoader`` is tagged with two metadata keys:
    ``domain`` (the name of the directory) and ``source`` (the PDF's file
    name, replacing whatever ``source`` value the loader had set).

    Files are visited in sorted order so the returned list does not depend on
    the order in which the filesystem happens to enumerate the directory, and
    the ``.pdf`` extension is matched case-insensitively so files such as
    ``REPORT.PDF`` are not silently skipped on case-sensitive filesystems.

    Args:
        domain_path: Directory holding the PDFs of one domain. Only its
            immediate entries are considered; sub-directories are not searched.

    Returns:
        A flat list of page documents, file by file. Empty when the directory
        contains no PDF files.
    """
    domain_name = domain_path.name

    # is_file() keeps a directory that merely ends in ".pdf" away from the loader.
    pdf_files = sorted(
        entry
        for entry in domain_path.glob("*")
        if entry.is_file() and entry.suffix.lower() == ".pdf"
    )

    documents = []
    for pdf_file in pdf_files:
        pages = PyPDFLoader(str(pdf_file)).load()

        for page in pages:
            page.metadata["domain"] = domain_name
            page.metadata["source"] = pdf_file.name

        documents.extend(pages)

    return documents

## Build Vector Databases for ALL domains

In [7]:
# Build and persist one FAISS index per domain directory.
skipped_domains = []  # names of domains for which no index was written

for domain_path in tqdm(domains):
    print(f"\nIndexing domain: {domain_path.name}")

    docs = load_domain_documents(domain_path)
    if not docs:
        print("No documents found, skipping.")
        skipped_domains.append(domain_path.name)
        continue

    chunks = text_splitter.split_documents(docs)
    if not chunks:
        # Pages without extractable text (e.g. scanned PDFs) can yield no
        # chunks at all; there is nothing to embed, and building a FAISS store
        # from an empty list is expected to fail rather than produce an index.
        print("No text chunks produced, skipping.")
        skipped_domains.append(domain_path.name)
        continue

    # Embed before touching the filesystem so that a failure here does not
    # leave an empty index directory behind.
    vectorstore = FAISS.from_documents(chunks, embedding_model)

    # Index directory name: domain name lower-cased, spaces replaced by "_".
    index_path = INDEX_ROOT / domain_path.name.lower().replace(" ", "_")
    index_path.mkdir(parents=True, exist_ok=True)
    vectorstore.save_local(str(index_path))

# Only claim full success when every domain actually produced an index.
if skipped_domains:
    print(f"\nFinished with {len(skipped_domains)} domain(s) skipped: {', '.join(skipped_domains)}")
else:
    print("\nAll vector databases created successfully.")

  0%|          | 0/7 [00:00<?, ?it/s]


Indexing domain: Artificial Intelligence


PdfReadError("Invalid Elementary Object starting with b'P' @18807504: b'7 0 obj<</Universal PDF(The process that creates this PDF constitutes a trade se'")
 14%|█▍        | 1/7 [03:20<20:04, 200.67s/it]


Indexing domain: Automobile


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 25 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)
Ignoring wrong pointing object 42 0 (offset 0)
Ignoring wrong pointing object 44 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)
Ignoring wrong pointing object 79 0 (offset 0)
Ignoring wrong pointing object 81 0 (offset 0)
Ignoring wrong pointing object 83 0 (offset 0)
Ignoring wrong pointing object 103 0 (offset 0)
Ignoring wrong pointing object 105 0 (offset 0)
Ignoring wrong pointing object 107 0 (offset 0)
Ignoring wrong pointing object 109 0 (offset 0)
Ignoring wr


Indexing domain: business


Ignoring wrong pointing object 0 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
 43%|████▎     | 3/7 [04:18<04:33, 68.41s/it]


Indexing domain: climate


Ignoring wrong pointing object 5 0 (offset 0)
 57%|█████▋    | 4/7 [05:27<03:25, 68.56s/it]


Indexing domain: cyber security


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 36 0 (offset 0)
Ignoring wrong pointing object 38 0 (offset 0)
Ignoring wrong pointing object 43 0 (offset 0)
Ignoring wrong pointing object 49 0 (offset 0)
Ignoring wrong pointing object 52 0 (offset 0)
Ignoring wrong pointing object 61 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)
Ignoring wrong pointing object 71 0 (offset 0)
Ignoring wrong pointing object 73 0 (offset 0)
Ignoring wrong pointing object 75 0 (offset 0)
Ignoring wrong 


Indexing domain: medical


 86%|████████▌ | 6/7 [07:08<00:59, 59.44s/it]


Indexing domain: psychology


100%|██████████| 7/7 [08:24<00:00, 72.11s/it]


All vector databases created successfully.



