In [None]:
import fitz  # PyMuPDF to extract text
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

In [None]:
# Step 1: Extract Text from PDF
# NOTE(review): this print looks like leftover debug output; it is kept so the
# cell's printed output is unchanged — confirm and remove.
print("Hello World")
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        The text of each page, in page order, with a newline appended after
        every page.
    """
    # Open the document as a context manager so it is closed even when
    # extracting a page raises; the original never closed it.
    with fitz.open(pdf_path) as doc:
        # join() builds the result in one pass instead of re-copying the
        # growing string once per page.
        return "".join(page.get_text("text") + "\n" for page in doc)

In [None]:
# Load PDF
# Extracts the text of "1169.pdf" once at cell level and keeps it in the
# notebook global `pdf_text`.
# NOTE(review): qa_pipeline() calls extract_text_from_pdf() itself rather than
# reading `pdf_text`, so this cell appears to be for inspection only — confirm.
pdf_text = extract_text_from_pdf("1169.pdf")

In [None]:
import re

def clean_and_split_text(text, chunk_size=1000):
    """Normalise whitespace and cut the text into fixed-size character chunks.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space and leading/trailing whitespace is dropped before splitting.

    Args:
        text: Raw text to clean and split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of chunks in document order. Every chunk except possibly the
        last is exactly ``chunk_size`` characters long. The list is empty when
        ``text`` contains no non-whitespace characters.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A zero step makes range() raise an unrelated-looking error and a
    # negative step silently yields no chunks at all, so reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # strip() keeps whitespace-only input from producing a bogus " " chunk and
    # stops the first chunk from wasting a character on a leading space.
    text = re.sub(r'\s+', ' ', text).strip()
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def search_relevant_chunk(question, text_chunks):
    """Return the chunk whose TF-IDF vector is most similar to the question.

    The chunks and the question are vectorised together with a freshly fitted
    ``TfidfVectorizer``; the question's vector is then compared against every
    chunk vector with cosine similarity.

    Args:
        question: The question text.
        text_chunks: A sequence of text chunks to search.

    Returns:
        The chunk with the highest cosine similarity to the question. When
        several chunks tie (including when every similarity is zero), the
        earliest one is returned, because ``argmax`` picks the first maximum.

    Raises:
        ValueError: If ``text_chunks`` is empty.
    """
    # Copy into a list so tuples and other sequences can be concatenated with
    # the question below.
    chunks = list(text_chunks)
    if not chunks:
        # Fail here with a clear message instead of letting an empty matrix
        # reach the similarity computation.
        raise ValueError("text_chunks is empty: there is nothing to search")

    vectorizer = TfidfVectorizer()
    # The question goes last so it can be sliced off as row -1.
    tfidf_matrix = vectorizer.fit_transform(chunks + [question])
    cosine_sim = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])
    # cosine_sim has a single row, so the flat argmax is the chunk index.
    best_idx = int(cosine_sim.argmax())
    return chunks[best_idx]


In [None]:
# Words too common to mark a sentence as relevant to a question.
_QUESTION_STOPWORDS = frozenset({
    "a", "an", "and", "are", "as", "at", "be", "by", "did", "do", "does",
    "for", "from", "had", "has", "have", "how", "in", "is", "it", "its",
    "of", "on", "or", "that", "the", "their", "this", "to", "was", "were",
    "what", "when", "where", "which", "who", "whom", "whose", "why", "with",
})


def extract_answer_from_chunk(chunk, question):
    """Pick the sentences of a chunk that share a keyword with the question.

    The chunk is split on ".". A sentence is kept when it contains at least
    one keyword of the question (a word of the question that is not in
    ``_QUESTION_STOPWORDS``), compared case-insensitively on whole words, or
    when it contains the whole question text as a substring.

    Args:
        chunk: Text to search for an answer.
        question: The question text.

    Returns:
        The matching sentences joined with single spaces, in their original
        order and without the "." separators, or "Answer not found." when no
        sentence matches.
    """
    question_lower = question.lower()
    # The original compared the entire question string against each sentence,
    # which practically never matches a real question; match on its
    # content words instead.
    keywords = set(re.findall(r"\w+", question_lower)) - _QUESTION_STOPWORDS

    relevant_sentences = []
    for sentence in chunk.split("."):
        sentence_lower = sentence.lower()
        # Whole-question containment is kept so every input that matched
        # before still matches.
        if question_lower in sentence_lower or not keywords.isdisjoint(
            re.findall(r"\w+", sentence_lower)
        ):
            relevant_sentences.append(sentence)

    return " ".join(relevant_sentences) if relevant_sentences else "Answer not found."


In [None]:
def qa_pipeline(pdf_path, question):
    """Answer a question about a PDF with the extract/split/search/extract steps.

    Args:
        pdf_path: Path of the PDF file to read.
        question: The question text.

    Returns:
        The answer produced by ``extract_answer_from_chunk`` for the most
        relevant chunk, or "Answer not found." when the PDF yields no text
        chunks at all.
    """
    # Step 1: Extract text from PDF
    pdf_text = extract_text_from_pdf(pdf_path)

    # Step 2: Preprocess and split the text into chunks
    text_chunks = clean_and_split_text(pdf_text)

    # The similarity search cannot run on zero chunks (e.g. a PDF with no
    # extractable text), so report "no answer" with the same sentinel the
    # answer-extraction step uses instead of crashing.
    if not text_chunks:
        return "Answer not found."

    # Step 3: Search for the most relevant chunk
    relevant_chunk = search_relevant_chunk(question, text_chunks)

    # Step 4: Generate the answer from the relevant chunk
    return extract_answer_from_chunk(relevant_chunk, question)


In [None]:
# Example run: ask one question about "1169.pdf" and print the pipeline's answer.
pdf_path = "1169.pdf" 
question = "Who is the main character in the novel?"
# NOTE(review): the two "Hello WOrld" prints around the pipeline call look like
# leftover debug markers — confirm and remove.
print("Hello WOrld")
answer = qa_pipeline(pdf_path, question)
print(answer)
print("Hello WOrld")