In [2]:
import os, time, hashlib, yaml
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
from dotenv import load_dotenv

load_dotenv()

#yaml loading
with open("metadata.yaml", "r") as f:
    METADATA_MAP = yaml.safe_load(f)

#recursive loading pdfs
def load_all_pdfs_recursive(root_directory):
    all_pages = []

    for root, dirs, files in os.walk(root_directory):
        for filename in files:
            if filename.lower().endswith(".pdf"):

                file_path = os.path.join(root, filename)
                folder_name = os.path.basename(root)  # e.g., sem3, sem4

                print(f"Loading: {file_path}")

                loader = PyPDFLoader(file_path)
                pages = loader.load_and_split()

                # Get metadata from YAML, else empty dict
                pdf_meta = METADATA_MAP.get(filename, {})

                # Apply metadata to each PDF page
                for p in pages:
                    p.metadata.update(pdf_meta)
                    p.metadata["source_file"] = filename
                    p.metadata["folder"] = folder_name
                    p.metadata["directory_path"] = root

                all_pages.extend(pages)

    return all_pages


pages = load_all_pdfs_recursive("documents/")
print("Total pages loaded:", len(pages))

print("starting chunking")


splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=50
)

chunks = splitter.split_documents(pages)
print("Chunks created:", len(chunks))

print("adding chunk metadata")
ingest_date = time.strftime("%Y-%m-%d")

for i, c in enumerate(chunks):
    h = hashlib.sha1(c.page_content[:120].encode()).hexdigest()[:12]
    c.metadata["chunk_id"] = f"{c.metadata['source_file']}_chunk_{i}_{h}"
    c.metadata["ingest_date"] = ingest_date

print("to pinecone vectorstore")
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectorstore = PineconeVectorStore.from_documents(
    chunks,
    embedding=embeddings,
    index_name="pinecone-chatbot"
)

print("All PDFs (from nested folders) ingested into Pinecone successfully!")
print("printing sample chunks and metadata")
for i, chunk in enumerate(chunks[:10]):
    print("="*80)
    print(f"CHUNK {i}")
    print("- Text:", chunk.page_content[:300], "...")
    print("- Metadata:", chunk.metadata)
    print("="*80)


  from .autonotebook import tqdm as notebook_tqdm


Loading: documents/onestopcenter.pdf
Loading: documents/TamilnaduMergedData.pdf
Loading: documents/sem3\DigitalForensics_23MCPE652_MQP.pdf
Loading: documents/sem3\syllabus.pdf
Loading: documents/sem4\Module 3-5 Question Bank.pdf
Loading: documents/sem4\Syllabus.pdf
Total pages loaded: 48
starting chunking
Chunks created: 100
adding chunk metadata
to pinecone vectorstore


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


All PDFs (from nested folders) ingested into Pinecone successfully!
printing sample chunks and metadata
CHUNK 0
- Text: What  is  a  One  Stop  Centre?  One  Stop  Centres  (OSCs)  are  intended  to  support  women  affected  by  violence,  in  private  
and
 
public
 
spaces,
 
within
 
the
 
family,
 
community
 
and
 
at
 
the
 
workplace.
 
Women
 
facing
 
physical,
 
sexual,
 
emotional,
 
psychological
 
and
  ...
- Metadata: {'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'ONE STOP CENTRE', 'source': 'documents/onestopcenter.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1', 'source_file': 'onestopcenter.pdf', 'folder': '', 'directory_path': 'documents/', 'chunk_id': 'onestopcenter.pdf_chunk_0_5d09f767158b', 'ingest_date': '2025-12-09', 'text': 'What  is  a  One  Stop  Centre?  One  Stop  Centres  (OSCs)  are  intended  to  support  women  affected  by  violence,  in  private  \nand\n \npublic\n \nspaces,\n \nwithin\n \nthe

In [None]:
query = "give module 1 Q1 a of Digital Forensics"

results = vectorstore.similarity_search(
    query,
    k=5
    
)
print(results)

for r in results:
    print("="*80)
    print("TEXT:", r.page_content[:300])
    print("METADATA:", r.metadata)



# filter={"sem": "3",
#     "dept": "mca"}

[Document(id='4b5d145e-b6a7-41fd-a20f-1c9d804411c5', metadata={'author': 'Sathisha Shetty', 'chunk_id': 'DigitalForensics_23MCPE652_MQP.pdf_chunk_69_e2524a3df2f1', 'creationdate': '2025-03-19T13:49:15+05:30', 'creator': 'Microsoft® Word 2021', 'directory_path': 'documents/sem3', 'folder': 'sem3', 'ingest_date': '2025-12-09', 'moddate': '2025-03-19T13:49:15+05:30', 'page': 1.0, 'page_label': '2', 'producer': 'Microsoft® Word 2021', 'source': 'documents/sem3\\DigitalForensics_23MCPE652_MQP.pdf', 'source_file': 'DigitalForensics_23MCPE652_MQP.pdf', 'total_pages': 3.0}, page_content='24MCPE652 \n2 of 3 \nQ4 \na. A multinational corporation suspects that a coordinated cyberattack has \ncompromised its internal network, with sensitive files distributed across multiple \ndevices, including servers, employee workstations, and cloud storage systems. As \na digital forensic i nvestigator, how would you apply forensic tools to  \nsystematically gather and preserve data from these devices, ensurin