<a href="https://colab.research.google.com/github/rohithsarikela/hackathonTask1/blob/main/Copy_of_task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install PyPDF2



In [None]:
!pip install faiss-cpu



In [None]:
import os
import requests
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline

def download_pdf_from_url(url, save_path, timeout=30):
    """Download a PDF file from a URL and write it to disk.

    Args:
        url: Address of the PDF to fetch.
        save_path: Local path the response body is written to.
        timeout: Seconds to wait on the server before giving up. Passed to
            ``requests.get`` so a stalled connection cannot hang forever.

    Returns:
        ``save_path`` on success, or ``None`` when the request or the file
        write failed (the error is printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        payload = response.content
    except requests.exceptions.RequestException as e:
        print(f"Error downloading PDF: {e}")
        return None

    # Handled separately from the request so a bad path or full disk is
    # reported as a save problem rather than crashing the caller.
    try:
        with open(save_path, 'wb') as file:
            file.write(payload)
    except OSError as e:
        print(f"Error saving PDF: {e}")
        return None

    print(f"PDF downloaded successfully to '{save_path}'")
    return save_path

def extract_text_from_pdf(pdf_path):
    """Return the text of every page in a PDF, each page followed by a newline.

    Pages whose extraction yields nothing are skipped. If no page yields
    any text the literal string "No text found in PDF." is returned; if
    reading or extraction raises, the error is printed and "" is returned.
    """
    try:
        per_page = (page.extract_text() for page in PdfReader(pdf_path).pages)
        combined = "".join(text + "\n" for text in per_page if text)
    except Exception as err:
        print(f"Error extracting text: {err}")
        return ""

    if combined:
        return combined
    return "No text found in PDF."

def chunk_text(text, chunk_size=500):
    """Split text into consecutive, non-overlapping chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings covering ``text`` in order. Every chunk has
        ``chunk_size`` characters except possibly the last; empty input
        gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. A negative step
            would otherwise yield no chunks at all and silently drop the text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

def create_vector_database(chunks, embedding_model_name='all-MiniLM-L6-v2'):
    """Encode the chunks and load the vectors into a new FAISS L2 index.

    Returns an ``(index, model)`` pair, where ``model`` is the
    SentenceTransformer used for encoding. On any failure the error is
    printed and ``(None, None)`` is returned instead.
    """
    try:
        encoder = SentenceTransformer(embedding_model_name)
        vectors = encoder.encode(chunks)
        # The index is constructed with the embedding width (second axis).
        store = faiss.IndexFlatL2(vectors.shape[1])
        store.add(vectors)
    except Exception as err:
        print(f"Error creating vector database: {err}")
        return None, None
    else:
        return store, encoder

def query_vector_database(query, chunks, index, model, top_k=5):
    """Retrieve the chunks closest to a query.

    Args:
        query: The question text to embed.
        chunks: The chunk list the index was built from (same order).
        index: Object with a FAISS-style ``search(vectors, k)`` method.
        model: Object whose ``encode`` turns a list of strings into vectors.
        top_k: Maximum number of neighbours to request.

    Returns:
        A list of ``(chunk, distance)`` tuples in the order the index
        returned them. It may hold fewer than ``top_k`` entries.
    """
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    # FAISS pads the result with label -1 when the index holds fewer than
    # top_k vectors; chunks[-1] would then return the last chunk as a bogus
    # hit, so keep only labels that really address a chunk.
    return [
        (chunks[i], distance)
        for i, distance in zip(indices[0], distances[0])
        if 0 <= i < len(chunks)
    ]

# Pipelines already built, keyed by (model name, device). Constructing a
# pipeline loads the model, so it is reused across calls instead of rebuilt.
_SUMMARIZER_CACHE = {}


def _get_summarizer(llm_model_name, device):
    """Return the summarization pipeline for this model/device, building it once."""
    key = (llm_model_name, device)
    if key not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE[key] = pipeline(
            "summarization", model=llm_model_name, tokenizer=llm_model_name, device=device
        )
    return _SUMMARIZER_CACHE[key]


def generate_response(retrieved_chunks, query, llm_model_name='t5-small', device=-1):
    """Generate an answer to a query using a summarization pipeline.

    Args:
        retrieved_chunks: Iterable of ``(chunk_text, distance)`` pairs; only
            the text is used, joined with spaces as the context.
        query: The user's question.
        llm_model_name: Model identifier handed to ``transformers.pipeline``.
        device: Device argument handed to ``transformers.pipeline``.

    Returns:
        A string with the question and the generated answer, or
        "Unable to generate a response." if anything fails (the error is
        printed).
    """
    try:
        summarizer = _get_summarizer(llm_model_name, device)
        context = " ".join(chunk for chunk, _ in retrieved_chunks)
        prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer concisely:"
        response = summarizer(prompt, max_length=150, min_length=50, do_sample=False)
        return f"**Question:** {query}\n**Answer:** {response[0]['summary_text']}"
    except Exception as e:
        print(f"Error generating response: {e}")
        return "Unable to generate a response."

def extract_table_from_page(pdf_path, page_num):
    """Split the text of one PDF page into rows of whitespace-separated cells.

    ``page_num`` is zero-based. Returns a list of token lists, one per
    non-blank line, or a message string when the page number is out of
    range, the page has no text, or extraction raises.
    """
    try:
        reader = PdfReader(pdf_path)
        if page_num < 0 or len(reader.pages) <= page_num:
            # Reported one-based, matching how the menu asks for the page.
            return f"Invalid page number: {page_num + 1}"

        raw_text = reader.pages[page_num].extract_text()
        if not raw_text:
            return "No table data found on this page."

        rows = []
        for raw_line in raw_text.split("\n"):
            if raw_line.strip():
                rows.append(raw_line.split())
        return rows
    except Exception as err:
        return f"Error extracting table data: {err}"

# Sample document used when main() is called without an explicit URL.
_DEFAULT_PDF_URL = "https://www.hunter.cuny.edu/dolciani/pdf_files/workshop-materials/mmc-presentations/tables-charts-and-graphs-with-examples-from.pdf"


def _answer_question(chunks, index, model):
    """Prompt for one question and print the generated answer."""
    query = input("Enter your question: ").strip()
    if not query:
        print("Question cannot be empty. Try again.")
        return
    retrieved_chunks = query_vector_database(query, chunks, index, model)
    print(generate_response(retrieved_chunks, query))


def _compare_queries(chunks, index, model):
    """Prompt for several queries and print an answer for each, in entry order."""
    # Only the int() conversion is guarded, so a ValueError raised later by
    # retrieval or generation is not misreported as a bad number.
    try:
        n = int(input("How many queries for comparison? ").strip())
    except ValueError:
        print("Invalid input. Please enter a valid number for queries.")
        return
    if n <= 0:
        print("Invalid input. Please enter a valid number for queries.")
        return

    queries = [input(f"Enter query {i + 1}: ").strip() for i in range(n)]
    # Iterate the list itself (not a dict keyed by query text) so a query
    # entered twice is answered twice rather than collapsed.
    for position, query in enumerate(queries, start=1):
        if not query:
            print(f"Query {position} is empty. Skipping.")
            continue
        retrieved_chunks = query_vector_database(query, chunks, index, model)
        print(generate_response(retrieved_chunks, query))


def _show_table(pdf_path):
    """Prompt for a one-based page number and print that page's rows."""
    try:
        page_num = int(input("Enter page number (starting from 1): ").strip()) - 1
    except ValueError:
        print("Invalid page number. Please enter a valid integer.")
        return

    table_data = extract_table_from_page(pdf_path, page_num)
    if isinstance(table_data, str):
        # A string result is a status or error message, not table rows.
        print(table_data)
        return
    print("\nExtracted Table Data:")
    for row in table_data:
        print(" ".join(row))


def main(pdf_url=None, local_pdf_file="downloaded_file.pdf"):
    """Download a PDF, index its text, and run the interactive query menu.

    Args:
        pdf_url: URL of the PDF to process; the bundled sample document is
            used when omitted.
        local_pdf_file: Path the downloaded PDF is saved to.

    The function returns early, after printing a message, if the download,
    the text extraction, or the index construction fails.
    """
    pdf_url = pdf_url or _DEFAULT_PDF_URL

    # Step 1: Download PDF
    print("Downloading PDF...")
    pdf_path = download_pdf_from_url(pdf_url, local_pdf_file)
    if not pdf_path:
        return

    # Step 2: Extract and chunk text
    print("Extracting text from PDF...")
    text = extract_text_from_pdf(pdf_path)
    # extract_text_from_pdf signals "nothing extracted" with this literal.
    if not text or text == "No text found in PDF.":
        print("No content extracted from PDF.")
        return

    print("Splitting text into chunks...")
    chunks = chunk_text(text)

    # Step 3: Create vector database
    print("Creating vector database...")
    index, model = create_vector_database(chunks)
    if not index or not model:
        print("Vector database could not be created.")
        return

    # Interactive menu
    while True:
        print("\nMenu Options:")
        print("1. Ask a question")
        print("2. Perform a comparison query")
        print("3. Extract tabular data from a page")
        print("4. Exit")

        choice = input("Enter your choice (1/2/3/4): ").strip()

        if choice == "1":
            _answer_question(chunks, index, model)
        elif choice == "2":
            _compare_queries(chunks, index, model)
        elif choice == "3":
            _show_table(pdf_path)
        elif choice == "4":
            print("Exiting program. Goodbye!")
            break
        else:
            print("Invalid choice. Please select 1, 2, 3, or 4.")

# Run the interactive workflow only when this file is executed as a script
# (or as a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Downloading PDF...
PDF downloaded successfully to 'downloaded_file.pdf'
Extracting text from PDF...
Splitting text into chunks...
Creating vector database...

Menu Options:
1. Ask a question
2. Perform a comparison query
3. Extract tabular data from a page
4. Exit
Enter your choice (1/2/3/4): 3
Enter page number (starting from 1): 6

Extracted Table Data:
Table of Yearly U.S. GDP by
Industry (in millions of dollars)
Year 2010 2011 2012 2013 2014 2015
All Industries 26093515 27535971 28663246 29601191 30895407 31397023
Manufacturing 4992521 5581942 5841608 5953299 6047477 5829554
Finance,
Insurance, Real
Estate, Rental,
Leasing4522451 4618678 4797313 5031881 5339678 5597018
Arts,
Entertainment,
Recreation,
Accommodation,
and Food Service964032 1015238 1076249 1120496 1189646 1283813
Other 15614511 16320113 16948076 17495515 18318606 18686638Source: U.S. Bureau of Labor Statistics

Menu Options:
1. Ask a question
2. Perform a comparison query
3. Extract tabular data from a page
4. Exit
E