In [None]:
pip install sentence-transformers faiss-cpu langchain requests fitz numpy


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configobj (from fitz)
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.1.0-py3-none-any.whl.metadata (5.4 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.9.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.2-py3-none-any.whl.metadata (5.3 kB)
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.0.1-py3-none-any.whl.metadata (3.6 kB)
Collecting rdflib>=5.0.0 (from nipype->fitz)
  Downloading rdflib-7.1.1-py3-none-any.whl.metadata (11 kB)
Collecting simplejson>=3.8.0 (from nipype->fitz)
  Downloading simplejson-3.19.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.

In [None]:
!pip install --force-reinstall pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.1


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import fitz  # PyMuPDF
import requests
import io
import time
import os
import pickle

# Load the sentence-embedding model once at module level. create_embeddings and
# search_embeddings both call model.encode, so chunks and queries are embedded
# by the same model.
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can change to another model from Sentence Transformers if needed

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path, timeout=30):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Local filesystem path, or an ``http://`` / ``https://`` URL
            whose content is downloaded into memory first.
        timeout: Seconds to wait on the HTTP request before giving up. Only
            used when ``pdf_path`` is a URL.

    Returns:
        One string containing the text of all pages.

    Raises:
        requests.HTTPError: If the download returns an error status.
        requests.RequestException: On timeouts and other network failures.
    """
    # Match the scheme explicitly so that a local file whose name merely starts
    # with "http" is still opened from disk.
    if pdf_path.startswith(("http://", "https://")):
        # Without a timeout an unresponsive server would hang the cell forever.
        response = requests.get(pdf_path, timeout=timeout)
        response.raise_for_status()  # Raise an exception if download fails
        doc = fitz.open(stream=response.content, filetype="pdf")  # Open from bytes
    else:
        doc = fitz.open(pdf_path)

    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with doc:
        return "".join(page.get_text("text") for page in doc)

# Function to chunk text into smaller pieces for embeddings
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` words.

    Words are the whitespace-separated tokens produced by ``str.split()``, and
    each piece re-joins its words with single spaces. Only the last piece can
    be shorter than ``chunk_size``; empty or whitespace-only text yields ``[]``.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces

# Function to create embeddings for chunks
def create_embeddings(chunks, embeddings_cache_path="embeddings.pkl"):
    """Embed ``chunks`` with the module-level ``model``, reusing a disk cache when valid.

    The embeddings are pickled to ``embeddings_cache_path``. A SHA-256
    fingerprint of the chunk texts is written next to it (same path plus
    ``.sha256``), and the cache is only reused when that fingerprint matches
    the current ``chunks``. Embeddings computed for one document are therefore
    never returned for a different one. A cache file without a matching
    fingerprint is recomputed and overwritten.

    Args:
        chunks: Sequence of text strings to embed.
        embeddings_cache_path: Pickle file used to cache the embeddings.

    Returns:
        The result of ``model.encode(chunks)``, freshly computed or unpickled.
    """
    import hashlib  # local import: only this function needs it

    # Length-prefix every chunk so that the same text split at different
    # boundaries cannot produce the same fingerprint.
    hasher = hashlib.sha256()
    for chunk in chunks:
        data = chunk.encode("utf-8", errors="surrogatepass")
        hasher.update(len(data).to_bytes(8, "big"))
        hasher.update(data)
    fingerprint = hasher.hexdigest().encode("ascii")
    fingerprint_path = os.fspath(embeddings_cache_path) + ".sha256"

    # NOTE: the fingerprint covers only the chunk texts. Delete the cache files
    # after switching to a different embedding model.
    if os.path.exists(embeddings_cache_path) and os.path.exists(fingerprint_path):
        with open(fingerprint_path, "rb") as f:
            cached_fingerprint = f.read().strip()
        if cached_fingerprint == fingerprint:
            # SECURITY: pickle.load can execute arbitrary code from the file.
            # Only point embeddings_cache_path at files this notebook wrote.
            with open(embeddings_cache_path, "rb") as f:
                return pickle.load(f)

    embeddings = model.encode(chunks)  # Generate embeddings using SentenceTransformers model

    # Cache embeddings and the fingerprint of the chunks they belong to
    with open(embeddings_cache_path, "wb") as f:
        pickle.dump(embeddings, f)
    with open(fingerprint_path, "wb") as f:
        f.write(fingerprint)

    return embeddings

# Function to store embeddings in FAISS
def store_embeddings_in_faiss(embeddings):
    """Build an exact (brute-force) FAISS index over ``embeddings``.

    Args:
        embeddings: 2-D array-like with one embedding vector per row.

    Returns:
        A ``faiss.IndexFlatL2`` holding every row; row ``i`` of the index is
        row ``i`` of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is empty or is not a 2-D matrix, for
            example when the source PDF contained no extractable text.
    """
    # FAISS expects a C-contiguous float32 matrix.
    vectors = np.ascontiguousarray(embeddings, dtype="float32")
    if vectors.ndim != 2 or vectors.shape[0] == 0 or vectors.shape[1] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D array (n_vectors, dim); "
            f"got shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])  # ranks by L2 (Euclidean) distance
    index.add(vectors)
    return index

# Function to perform similarity search on embeddings
def search_embeddings(query, index, chunks, top_k=3):
    """Return the chunks whose embeddings are closest to ``query``.

    Args:
        query: Text to search for; embedded with the module-level ``model``.
        index: FAISS index in which row ``i`` is the embedding of ``chunks[i]``.
        chunks: The text chunks the index was built from.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks in the order FAISS returns them (nearest first).
        Fewer are returned when fewer chunks exist, and ``[]`` when there are
        none or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than there are chunks to return.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_embedding = model.encode([query])  # Generate embedding for the query using SentenceTransformers
    query_vector = np.asarray(query_embedding, dtype="float32").reshape(1, -1)
    _, indices = index.search(query_vector, k)

    # FAISS pads missing results with -1, which plain list indexing would
    # silently turn into "the last chunk". Drop anything out of range instead.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

# Function to generate a response (simplified here without LangChain)
def generate_response(user_query, relevant_chunks):
    """Format the query and its retrieved context into a single response string.

    No language model is involved: the chunks are joined with newlines and
    placed under a ``Context:`` heading, after a sentence that echoes the query.
    """
    context = "\n".join(relevant_chunks)
    return f"Based on the provided context, here's the response to your query: {user_query}\n\nContext:\n{context}"

# Main pipeline function
def run_pipeline(pdf_path, user_query):
    """Answer ``user_query`` from a PDF: extract, chunk, embed, index, retrieve, format.

    Args:
        pdf_path: Path or URL handed to ``extract_text_from_pdf``.
        user_query: Question used for retrieval and echoed in the final response.

    Returns:
        The string built by ``generate_response`` from the retrieved chunks.
    """
    # Text -> word chunks
    chunks = chunk_text(extract_text_from_pdf(pdf_path))

    # Chunks -> embeddings -> searchable vector index
    index = store_embeddings_in_faiss(create_embeddings(chunks))

    # Query -> closest chunks -> formatted answer
    matches = search_embeddings(user_query, index, chunks)
    return generate_response(user_query, matches)

# Running the pipeline
if __name__ == "__main__":
    # Source document: extract_text_from_pdf accepts either an http(s) URL
    # (downloaded first) or a local file path.
    pdf_path = "https://www.hunter.cuny.edu/dolciani/pdf_files/workshop-materials/mmc-presentations/tables-charts-and-graphs-with-examples-from.pdf"  # Replace with the actual path to your PDF

    # Question to ask about the PDF.
    # NOTE(review): retrieval works on word chunks of the whole document's text
    # and keeps no page numbers, so "page 2" in the query is matched only as
    # ordinary text, not used as a page filter.
    user_query = "From page 2 get the exact unemployment information based on type of degree input"  # Replace with your own query

    # Extract, chunk, embed, index and retrieve, then format the answer
    response = run_pipeline(pdf_path, user_query)

    # Show the query echo followed by the retrieved context chunks
    print("Response:", response)


Response: Based on the provided context, here's the response to your query: From page 2 get the exact unemployment information based on type of degree input

Context:
Tables, Charts, and Graphs with Examples from History, Economics, Education, Psychology, Urban Affairs and Everyday Life REVISED: MICHAEL LOLKUS 2018 Tables, Charts, and Graphs Basics We use charts and graphs to visualize data. This data can either be generated data, data gathered from an experiment, or data collected from some source. A picture tells a thousand words so it is not a surprise that many people use charts and graphs when explaining data. Types of Visual Representations of Data Table of Yearly U.S. GDP by Industry (in millions of dollars) Year 2010 2011 2012 2013 2014 2015 All Industries 26093515 27535971 28663246 29601191 30895407 31397023 Manufacturing 4992521 5581942 5841608 5953299 6047477 5829554 Finance, Insurance, Real Estate, Rental, Leasing 4522451 4618678 4797313 5031881 5339678 5597018 Arts, Ent