<a href="https://colab.research.google.com/github/srivanipendkar/sithafal/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install PyPDF2
!pip install sentence-transformers
!pip install faiss-cpu
!pip install transformers
!pip install torch


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [2]:
import os
import requests
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline

def download_pdf_from_url(url, save_path):
    """Download a PDF file from a URL."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print(f"PDF downloaded successfully to '{save_path}'.")
        return save_path
    except requests.RequestException as e:
        print(f"Failed to download PDF: {e}")
        return None

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    text = ""
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text

def chunk_text(text, chunk_size=500):
    """Split text into manageable chunks."""
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

def create_vector_database(chunks, embedding_model='all-MiniLM-L6-v2'):
    """Embed text chunks and store them in a FAISS vector database."""
    model = SentenceTransformer(embedding_model)
    embeddings = model.encode(chunks)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    print(f"Vector database created with {len(chunks)} chunks.")
    return index, model, embeddings

def query_vector_database(query, chunks, index, model, top_k=5):
    """Retrieve the most relevant chunks for a query."""
    if not query.strip():
        print("Empty query provided. Please enter a valid question.")
        return []
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    return [(chunks[i], distances[0][j]) for j, i in enumerate(indices[0])]

def generate_response(retrieved_chunks, query, llm_model='t5-small'):
    """Generate a response using an LLM based on retrieved chunks."""
    summarizer = pipeline("summarization", model=llm_model, tokenizer=llm_model, device=-1)
    context = " ".join([chunk for chunk, _ in retrieved_chunks])
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer concisely:"
    try:
        response = summarizer(prompt, max_length=150, min_length=50, do_sample=False)
        return response[0]['summary_text']
    except Exception as e:
        print(f"Error generating response: {e}")
        return "Unable to generate a response at this time."

def extract_table_from_page(pdf_path, page_num):
    """Extract tabular data from a specific PDF page."""
    reader = PdfReader(pdf_path)
    if page_num < 0 or page_num >= len(reader.pages):
        return "Invalid page number."
    page = reader.pages[page_num]
    text = page.extract_text()
    if not text:
        return "No content on the specified page."
    lines = text.split("\n")
    table_data = [line.split() for line in lines if line.strip()]
    return table_data

def main():
    # Step 1: URL of the PDF file
    url = "https://www.hunter.cuny.edu/dolciani/pdf_files/workshop-materials/mmc-presentations/tables-charts-and-graphs-with-examples-from.pdf"
    temp_file = "downloaded_file.pdf"

    print("\nDownloading PDF...")
    pdf_path = download_pdf_from_url(url, temp_file)
    if not pdf_path:
        return

    print("\nExtracting text from PDF...")
    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        print("Failed to extract text from the PDF. Exiting.")
        return

    print("\nChunking text...")
    chunks = chunk_text(text)

    print("\nCreating vector database...")
    index, embedding_model, _ = create_vector_database(chunks)

    # Interactive Loop
    while True:
        print("\nOptions:")
        print("1. Ask a question")
        print("2. Perform a comparison query")
        print("3. Extract tabular data from a page")
        print("4. Exit")
        choice = input("Enter your choice: ")

        if choice == "1":
            query = input("\nEnter your question: ")
            retrieved_chunks = query_vector_database(query, chunks, index, embedding_model)
            if retrieved_chunks:
                print("\nTop Retrieved Chunks:")
                for chunk, distance in retrieved_chunks:
                    print(f"Chunk: {chunk[:200]}... \nDistance: {distance:.2f}\n")
                response = generate_response(retrieved_chunks, query)
                print("\nGenerated Response:")
                print(response)
            else:
                print("No relevant chunks retrieved.")

        elif choice == "2":
            n = int(input("\nHow many queries for comparison? "))
            queries = [input(f"Enter query {i + 1}: ") for i in range(n)]
            results = {}
            for query in queries:
                retrieved_chunks = query_vector_database(query, chunks, index, embedding_model)
                results[query] = retrieved_chunks

            for query, retrieved_chunks in results.items():
                print(f"\nQuery: {query}")
                for chunk, distance in retrieved_chunks:
                    print(f"Chunk: {chunk[:200]}...\nDistance: {distance:.2f}\n")

        elif choice == "3":
            try:
                page_num = int(input("\nEnter page number (starting from 1): ")) - 1
                table_data = extract_table_from_page(pdf_path, page_num)
                if isinstance(table_data, str):
                    print(table_data)
                else:
                    print("\nTabular Data:")
                    for row in table_data:
                        print(" | ".join(row))
            except ValueError:
                print("Invalid input. Please enter a valid page number.")

        elif choice == "4":
            print("\nExiting the program. Goodbye!")
            break

        else:
            print("\nInvalid choice. Please try again.")

if __name__ == "__main__":
    main()


  from tqdm.autonotebook import tqdm, trange



Downloading PDF...
PDF downloaded successfully to 'downloaded_file.pdf'.

Extracting text from PDF...

Chunking text...

Creating vector database...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector database created with 10 chunks.

Options:
1. Ask a question
2. Perform a comparison query
3. Extract tabular data from a page
4. Exit
Enter your choice: 1

Enter your question: which degree type has the lowest unemployment rate?

Top Retrieved Chunks:
Chunk: 8686638Source: U.S. Bureau of Labor Statistics
19%
18%
4%59%2015 U.S. GDP (in millions of dollars)
Manufacturing
Finance, insurance, real
estate, rental, and
leasing
Arts, entertainment,recreation,
ac... 
Distance: 1.42

Chunk: f Data
Table of Yearly U.S. GDP by 
Industry (in millions of dollars)
Year 2010 2011 2012 2013 2014 2015
All Industries 26093515 27535971 28663246 29601191 30895407 31397023
Manufacturing 4992521 5581... 
Distance: 1.44

Chunk: ive than a man?

Example from History
In what years were the affiliations for 
Republicans and Independents the same?During what time period did the party affiliations have the most change?
Example fr... 
Distance: 1.46

Chunk: lions of dollars)•The graph below is called a bar

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (635 > 512). Running this sequence through the model will result in indexing errors



Generated Response:
the graph below is called a bar graph . it shows what percent “of the pie” a particular category occupies out of the whole . manufacturing makes up 19% of that pie and finance makes up 18% . lions of dollars) GDP for all industries was $31.397023 .

Options:
1. Ask a question
2. Perform a comparison query
3. Extract tabular data from a page
4. Exit
Enter your choice: 3

Enter page number (starting from 1): 6

Tabular Data:
Table | of | Yearly | U.S. | GDP | by
Industry | (in | millions | of | dollars)
Year | 2010 | 2011 | 2012 | 2013 | 2014 | 2015
All | Industries | 26093515 | 27535971 | 28663246 | 29601191 | 30895407 | 31397023
Manufacturing | 4992521 | 5581942 | 5841608 | 5953299 | 6047477 | 5829554
Finance,
Insurance, | Real
Estate, | Rental,
Leasing4522451 | 4618678 | 4797313 | 5031881 | 5339678 | 5597018
Arts,
Entertainment,
Recreation,
Accommodation,
and | Food | Service964032 | 1015238 | 1076249 | 1120496 | 1189646 | 1283813
Other | 15614511 | 16320113 | 169

Token indices sequence length is longer than the specified maximum sequence length for this model (635 > 512). Running this sequence through the model will result in indexing errors



Generated Response:
the graph below is called a bar graph . it shows what percent “of the pie” a particular category occupies out of the whole . manufacturing makes up 19% of that pie and finance makes up 18% . lions of dollars) GDP for all industries was $31.397023 .

Options:
1. Ask a question
2. Perform a comparison query
3. Extract tabular data from a page
4. Exit
Enter your choice: 4

Exiting the program. Goodbye!
