<a href="https://colab.research.google.com/github/shreyyeahh/Nokia_Internship/blob/main/multi_pdf_extraction_using_docling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# This import only resolves inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import os
import logging
from pathlib import Path
from tqdm.auto import tqdm
import textwrap
import torch
import openai

In [None]:
# Copy the OpenAI API key from Colab Secrets into the OPENAI_API_KEY
# environment variable. Any failure here is reported as a warning instead of
# aborting the notebook.
try:
    from google.colab import userdata
    # userdata.get() reports a missing or inaccessible secret with its own
    # exception types (SecretNotFoundError / NotebookAccessError), not KeyError,
    # and os.environ rejects a non-string value with TypeError. All of these
    # must end up in the warning branch below, hence the broad handler at this
    # top-level boundary.
    os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
    print("OpenAI API Key set successfully.")
except Exception as exc:
    print("Warning: Could not find 'OPENAI_API_KEY' in Colab Secrets.")
    print("Please set it manually or the function will fail.")
    print(f"Reason: {type(exc).__name__}: {exc}")
    # For local development, you might set it directly, but this is not recommended for notebooks:
    # os.environ['OPENAI_API_KEY'] = "YOUR_SK-..."

import openai

In [None]:
%%capture
#! pip install -U ipywidgets
! pip install docling
! pip install pdf2image
! apt-get update && apt-get install -y poppler-utils

In [None]:
# Visualization and PDF/Image handling
import matplotlib.pyplot as plt
from PIL import Image
from pdf2image import convert_from_path

In [None]:
!pip install Transformers scikit-learn

In [None]:
!pip install sentence_transformers

In [None]:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode

In [None]:
# Sentence-Transformers for semantic embeddings
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
def extract_content_from_single_pdf(input_data_path: str) -> pd.DataFrame:
    """
    Extract content blocks from a single PDF using Docling.

    Every table Docling detects becomes one row, rendered as text via
    ``DataFrame.to_string()``. When the document has no tables, the whole
    document is exported as Markdown text and returned as a single block
    numbered 1.

    Args:
        input_data_path: Path of the PDF file to convert.

    Returns:
        A DataFrame with the columns ``original_table_number`` (1-based) and
        ``table_content``. An empty DataFrame is returned when conversion
        fails (the error is printed, not raised) or when a table-less
        document contains no text.
    """
    try:
        pipeline_options = PdfPipelineOptions(do_table_structure=True)
        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
        doc_converter = DocumentConverter(
            allowed_formats=[InputFormat.PDF],
            format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
        )
        document = doc_converter.convert(input_data_path).document

        if document.tables:
            content_blocks = [
                {
                    "original_table_number": table_number,
                    "table_content": table.export_to_dataframe().to_string(),
                }
                for table_number, table in enumerate(document.tables, start=1)
            ]
        else:
            # The converted document exposes its text through export methods,
            # not a `.text` attribute; reading `.text` raised AttributeError,
            # which the handler below turned into "no content" for every
            # PDF without tables.
            full_text = document.export_to_markdown()
            # A whitespace-only block would only add a meaningless embedding.
            content_blocks = (
                [{"original_table_number": 1, "table_content": full_text}]
                if full_text.strip()
                else []
            )

        return pd.DataFrame(content_blocks)

    except Exception as e:
        # Best-effort by design: one unreadable PDF must not abort a batch run.
        print(f"  - Error processing file {Path(input_data_path).name}: {e}")
        return pd.DataFrame()

In [None]:
def process_pdf_database(directory_path: str) -> pd.DataFrame:
    """
    Extract content from every PDF directly inside a directory.

    Files are matched by a case-insensitive ``.pdf`` suffix, restricted to
    regular files, and processed in sorted order so that row order is
    reproducible between runs. Sub-directories are not searched.

    Args:
        directory_path: Directory containing the PDF files.

    Returns:
        A DataFrame with the columns ``pdf_name``, ``original_table_number``
        and ``table_content``, one row per extracted content block. An empty
        DataFrame is returned when no PDF is found or nothing could be
        extracted.
    """
    # glob('*.pdf') is case-sensitive on Linux/Colab (misses "REPORT.PDF") and
    # also matches directories named like "archive.pdf"; filter explicitly.
    pdf_files = sorted(
        path
        for path in Path(directory_path).glob('*')
        if path.is_file() and path.suffix.lower() == '.pdf'
    )
    if not pdf_files:
        print(f"No PDF files found in directory: {directory_path}")
        return pd.DataFrame()

    all_pdfs_data = []

    print(f"Found {len(pdf_files)} PDFs. Starting processing...")

    for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
        extracted_df = extract_content_from_single_pdf(str(pdf_path))

        # An empty frame means the PDF failed or had no content; skip it.
        if not extracted_df.empty:
            extracted_df['pdf_name'] = pdf_path.name
            all_pdfs_data.append(extracted_df)

    if not all_pdfs_data:
        print("Processing complete, but no data was successfully extracted from any PDF.")
        return pd.DataFrame()

    master_df = pd.concat(all_pdfs_data, ignore_index=True)
    final_columns = ['pdf_name', 'original_table_number', 'table_content']
    return master_df[final_columns]

In [None]:
def create_embeddings(df: pd.DataFrame, model_name: str = 'all-MiniLM-L6-v2'):
    """
    Encode the ``table_content`` column of ``df`` with a sentence-transformer.

    Args:
        df: DataFrame holding the text to embed in its ``table_content`` column.
        model_name: Name of the sentence-transformer model to load.

    Returns:
        A ``(model, embeddings)`` tuple: the loaded ``SentenceTransformer``
        and the encoded corpus converted to a NumPy array on the CPU.
    """
    print(f"Loading sentence-transformer model: {model_name}...")
    encoder = SentenceTransformer(model_name)
    print("Model loaded. Generating embeddings for all content blocks...")

    embedding_tensor = encoder.encode(
        df['table_content'].tolist(),
        show_progress_bar=True,
        convert_to_tensor=True,
    )

    print("Embeddings generated successfully.")
    # Move the tensor to the CPU first so it can be exported to NumPy.
    embedding_matrix = embedding_tensor.cpu().numpy()
    return encoder, embedding_matrix

In [None]:
def search_documents_semantic(query: str, model, embeddings, original_df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
    """
    Return the content blocks most similar to ``query`` by cosine similarity.

    Args:
        query: Free-text search query.
        model: Embedding model exposing ``encode(list_of_str, convert_to_tensor=True)``.
        embeddings: Array of document embeddings, positionally aligned with
            the rows of ``original_df``.
        original_df: DataFrame whose rows correspond to ``embeddings``.
        top_n: Maximum number of rows to return.

    Returns:
        A copy of the best-matching rows of ``original_df`` ordered by
        descending similarity, with an added ``similarity_score`` column.
        The result is empty when ``top_n`` is not positive or there are no
        embeddings to search.
    """
    # Nothing to rank: avoid the ValueError cosine_similarity raises on an
    # empty matrix, and the wrong slice a negative top_n would produce.
    if top_n <= 0 or len(embeddings) == 0:
        empty_results = original_df.iloc[0:0].copy()
        empty_results['similarity_score'] = np.array([], dtype=float)
        return empty_results

    query_embedding = model.encode([query], convert_to_tensor=True)
    cosine_sims = cosine_similarity(query_embedding.cpu().numpy(), embeddings).flatten()

    if len(cosine_sims) > top_n:
        # argpartition selects the top_n best scores without a full sort;
        # only that small subset is then ordered.
        top_indices = np.argpartition(-cosine_sims, top_n)[:top_n]
        sorted_top_indices = top_indices[np.argsort(-cosine_sims[top_indices])]
    else:
        sorted_top_indices = np.argsort(-cosine_sims)

    results = original_df.iloc[sorted_top_indices].copy()
    results['similarity_score'] = cosine_sims[sorted_top_indices]

    return results

In [None]:
def generate_final_answer(query: str, retrieved_chunks_df: pd.DataFrame) -> str:
    """
    Generate an answer to ``query`` from retrieved chunks using gpt-4o-mini.

    Two chat-completion calls are made: the first condenses the raw chunks
    into a clean summary, the second answers the question using only that
    summary.

    Args:
        query: The user's question.
        retrieved_chunks_df: DataFrame whose ``table_content`` column holds
            the retrieved text chunks.

    Returns:
        The generated answer. Failures are not raised: a fixed explanatory
        message is returned when the client cannot be created or when either
        LLM call fails.
    """
    try:
        client = openai.OpenAI()
    except openai.OpenAIError:
        # A missing API key makes the constructor raise the base OpenAIError;
        # AuthenticationError only arises from an actual request. Catching the
        # base class covers both.
        return "OpenAI key not set or invalid"

    # Step 1 - combine the raw chunk content into a single context string
    context = "\n---\n".join(retrieved_chunks_df['table_content'])

    # --- LLM CALL 1: Summarize/Clean the context ---
    summarization_prompt = f"""
   Based on the following raw data chunks, synthesize the key information into a clear, factual paragraph.
   Focus on accurately extracting facts, figures, and specifications.
   Do not add any information that is not present in the text.
     Raw data:
    ---
    {context}
    ---
    Clean Summary:
    """

    print("\n--- Step 4a: Generating Clean Context (with gpt-4o-mini)... ---")
    try:
        summarization_response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an expert technical assistant that summarizes raw document data into clean, readable text."},
                {"role": "user", "content": summarization_prompt}
            ],
            temperature=0.0 # Low temperature for factual summarization
        )
        clean_context = summarization_response.choices[0].message.content
        print(textwrap.fill(clean_context, width=80))
    except Exception as e:
        # Boundary handler: report the failure and return a message so the
        # interactive loop calling this function keeps running.
        print(f"An error occurred during summarization: {e}")
        return "Failed to generate a clean context from the retrieved documents."

    # --- LLM CALL 2: Generate the final answer ---
    answer_prompt = f"""
    Using ONLY the context provided below, give a direct and comprehensive answer to the user's question.
    Cite the source PDF for key pieces of information if available.
    If the context does not contain the information needed to answer the question, state that the answer is not available in the provided documents.

    Context:
    ---
    {clean_context}
    ---
    User's Question: {query}

    Final Answer:
    """

    print("\n--- Step 4b: Generating Final Answer (with gpt-4o-mini)... ---")
    try:
        answer_response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful question-answering assistant that strictly uses the provided context to answer questions."},
                {"role": "user", "content": answer_prompt}
            ],
            temperature=0.2 # Slightly higher temp for more natural language
        )
        final_answer = answer_response.choices[0].message.content
    except Exception as e:
        print(f"An error occurred during final answer generation: {e}")
        return "Failed to generate a final answer."

    return final_answer



In [None]:

# MAIN EXECUTION SCRIPT


if __name__ == '__main__':
    # End-to-end driver: locate the PDF folder, extract content, build
    # embeddings, then answer questions interactively until the user exits.

    # 1. SET UP YOUR PDF DIRECTORY IN GOOGLE DRIVE
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
        # --- IMPORTANT: CHANGE THIS to the path of your folder in Google Drive ---
        PDF_DIRECTORY = '/content/drive/MyDrive/pdf_folder/'
        print(f"Target directory set to: {PDF_DIRECTORY}")
    except ImportError:
        # Fallback for local execution if not in Colab
        # (ModuleNotFoundError is a subclass of ImportError).
        PDF_DIRECTORY = './pdf_database/'

    if not os.path.exists(PDF_DIRECTORY):
        os.makedirs(PDF_DIRECTORY)
        print(f"\nCreated directory '{PDF_DIRECTORY}'.")
        print("Please upload your PDF files to this folder and run this cell again.")
    else:
        # 2. CREATE THE MASTER DATAFRAME
        # This step processes all PDFs and can take a while.
        master_dataframe = process_pdf_database(PDF_DIRECTORY)

        if not master_dataframe.empty:
            print(f"\nSuccessfully created a master DataFrame with {len(master_dataframe)} text chunks.")

            # Save the master dataframe for future use so you don't have to process again
            master_dataframe.to_csv("master_database.csv", index=False)
            print("\nSaved the master DataFrame to 'master_database.csv'")

            # 3. CREATE SEMANTIC EMBEDDINGS
            embedding_model, document_embeddings = create_embeddings(master_dataframe)

            # Optional: Save embeddings for faster loading next time
            np.save('document_embeddings.npy', document_embeddings)
            print("Saved document embeddings to 'document_embeddings.npy'")

            # 4. START THE INTERACTIVE Q&A LOOP
            print("\n" + "="*50)
            print("System is ready. You can now ask questions.")
            print("="*50)

            query_prompt = "Enter your semantic query (or type 'exit' to quit): "
            while True:
                try:
                    # Strip so "exit " still quits and a whitespace-only line
                    # is treated as empty instead of triggering LLM calls.
                    user_query = input(query_prompt).strip()
                except (EOFError, KeyboardInterrupt):
                    # Closed stdin or an interrupted cell ends the session
                    # cleanly instead of surfacing a traceback.
                    print("\nExiting.")
                    break

                if user_query.lower() == 'exit':
                    break

                if user_query:
                    # Step 4a: Retrieve relevant chunks from your documents
                    print("\n--- Step 4: Retrieving relevant chunks... ---")
                    top_results = search_documents_semantic(
                        user_query,
                        embedding_model,
                        document_embeddings,
                        master_dataframe,
                        top_n=5 # You can adjust this number
                    )

                    print("Top Retrieved Chunks (for context):")
                    print(top_results[['pdf_name', 'original_table_number', 'table_content']])

                    # Step 4b: Pass the retrieved chunks to the LLM to generate a final answer
                    final_answer = generate_final_answer(user_query, top_results)

                    # Step 4c: Display the final, polished answer
                    print("\n" + "="*50)
                    print("Final Generated Answer (GPT-4o mini)")
                    print("="*50)
                    print(textwrap.fill(final_answer, width=80))
                    print("="*50)

                # Every prompt after the first is preceded by a blank line.
                query_prompt = "\nEnter your semantic query (or type 'exit' to quit): "
        else:
            print("\nNo content was extracted from the PDFs. The process cannot continue.")