In [8]:
# Install required packages.
# Core stack used by the notebook: LangChain, FAISS (CPU build), PyPDF2,
# Hugging Face transformers and sentence-transformers.
!pip install langchain faiss-cpu PyPDF2 transformers sentence-transformers

# Repeats sentence-transformers (already listed above) and adds datasets.
!pip install sentence-transformers datasets
# Repeats langchain (already listed above) and adds langchain-community.
!pip install langchain langchain-community





In [None]:
# Install required packages (run this cell once)
!pip install PyPDF2 langchain faiss-cpu transformers sentence-transformers

import PyPDF2
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import pipeline
from google.colab import files

# Step 1: Upload a PDF file
def upload_pdf():
    """Ask the user to upload a file through Colab and return its name.

    Returns:
        The first key of the mapping returned by ``files.upload()``.

    Raises:
        ValueError: If the upload result is empty.
    """
    print("Please upload your medical PDF file...")
    uploads = files.upload()
    if not uploads:
        raise ValueError("No file was uploaded.")
    # Only the first uploaded entry is used; any further entries are ignored.
    first_name = next(iter(uploads))
    return first_name

# Step 2: Extract text from the PDF using PyPDF2
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Pages are joined with a newline so that the last word of one page does not
    fuse with the first word of the next, and so that a newline-based splitter
    can break at page boundaries.

    Args:
        pdf_file: Path or file-like object accepted by ``PyPDF2.PdfReader``.

    Returns:
        The text of all pages that yielded non-blank text, separated by "\\n".

    Raises:
        ValueError: If no page yields any non-whitespace text (for example a
            scanned, image-only PDF).
    """
    print("Extracting text from PDF...")
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    page_texts = []
    for page in pdf_reader.pages:
        page_text = page.extract_text()
        # Skip pages with no text or only whitespace: they add nothing to the
        # context and would otherwise defeat the emptiness check below.
        if page_text and page_text.strip():
            page_texts.append(page_text)
    if not page_texts:
        raise ValueError("No text could be extracted from the PDF.")
    return "\n".join(page_texts)

# Step 3: Split text into smaller chunks
def split_text(text, chunk_size=500, chunk_overlap=100):
    """Break ``text`` into chunks using a newline-separated character splitter.

    Args:
        text: The text to split.
        chunk_size: Passed to ``CharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Passed to ``CharacterTextSplitter`` as ``chunk_overlap``.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    print("Splitting text into chunks...")
    splitter_options = {
        "separator": "\n",
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)

# Step 4: Create embeddings and build a vector store
def create_vectorstore(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed the text chunks and index them in a FAISS vector store.

    Args:
        chunks: Texts to embed and index.
        model_name: Model identifier handed to ``HuggingFaceEmbeddings``.

    Returns:
        The store built by ``FAISS.from_texts``.
    """
    print("Generating embeddings and building vector store...")
    # The default is a general-purpose model; a medical-specific embedding
    # model can be passed via ``model_name`` if one is available.
    embedder = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_texts(chunks, embedder)

# Step 5: Setup the question-answering pipeline (using a pre-trained model)
def setup_qa_pipeline(model_name="deepset/roberta-base-squad2"):
    """Create a ``question-answering`` pipeline for the given model.

    Args:
        model_name: Model identifier passed to ``pipeline`` as ``model``.

    Returns:
        The pipeline object returned by ``pipeline``.
    """
    print("Setting up QA pipeline...")
    return pipeline("question-answering", model=model_name)

# Step 6: Define a function to answer questions using the vector store and QA pipeline
def answer_question(question, vectorstore, qa_pipeline, k=3):
    """Answer ``question`` from the ``k`` most similar chunks in the store.

    Args:
        question: The question text.
        vectorstore: Object exposing ``similarity_search(question, k=...)``
            whose results carry a ``page_content`` attribute.
        qa_pipeline: Callable taking ``question=`` and ``context=`` keywords
            and returning a mapping with an ``"answer"`` key.
        k: Number of chunks to retrieve.

    Returns:
        The ``"answer"`` entry of the pipeline result.
    """
    print("Retrieving relevant text chunks...")
    matches = vectorstore.similarity_search(question, k=k)
    # The retrieved chunks are merged into one space-separated context string.
    passages = (match.page_content for match in matches)
    context = " ".join(passages)
    print("Generating answer...")
    prediction = qa_pipeline(question=question, context=context)
    return prediction["answer"]

# Main Execution Flow
# Builds the index and QA model once, then answers questions interactively.
# Setup failures (upload, extraction, indexing, model load) end the run with a
# message; a failure on a single question is reported and the session goes on,
# so the already-built index and model are not thrown away.
try:
    # Upload and process the PDF
    pdf_file = upload_pdf()
    text = extract_text_from_pdf(pdf_file)
    chunks = split_text(text)
    vectorstore = create_vectorstore(chunks)
    qa_pipeline_instance = setup_qa_pipeline()

    # Loop to allow multiple questions
    while True:
        # Strip so that inputs such as "exit " or " quit" are still recognized.
        user_question = input("Enter your question about the document (or type 'exit' to quit): ").strip()
        if not user_question:
            # A blank question cannot be answered; ask again instead of
            # letting the pipeline raise and end the session.
            print("Please enter a question.")
            continue
        if user_question.lower() in ('exit', 'quit'):
            print("Exiting the Q&A session.")
            break
        try:
            answer = answer_question(user_question, vectorstore, qa_pipeline_instance)
        except Exception as question_error:
            # Keep the session alive when one question fails.
            print("Could not answer that question:", question_error)
            continue
        print("Answer:", answer)

except Exception as e:
    print("An error occurred:", e)


Please upload your medical PDF file...


Saving sachin.pdf to sachin (1).pdf
Extracting text from PDF...
Splitting text into chunks...
Generating embeddings and building vector store...
Setting up QA pipeline...


Device set to use cuda:0


Enter your question about the document (or type 'exit' to quit): when did sachin made his debut
Retrieving relevant text chunks...
Generating answer...
Answer: 16
Enter your question about the document (or type 'exit' to quit): sachin full name
Retrieving relevant text chunks...
Generating answer...
Answer: Tendulkar
Enter your question about the document (or type 'exit' to quit): how many century sachin scored
Retrieving relevant text chunks...
Generating answer...
Answer: 100 international centuries
