In [6]:
from langchain.text_splitter import CharacterTextSplitter
import os
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain_ollama import OllamaEmbeddings
from PyPDF2 import PdfReader
from langchain_ollama import ChatOllama

In [7]:
def get_pdf_text(file_path):
    """Extract and concatenate the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        str: The extracted text of all pages, joined in page order with no
        separator. A page that yields no text contributes an empty string.
    """
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)

        # `or ""` keeps a page with no extractable text (e.g. a scanned image)
        # from breaking the concatenation if the extractor yields None.
        # The join runs inside the `with` block because pages are read from
        # the still-open file handle.
        return "".join(page.extract_text() or "" for page in reader.pages)

def get_text_chunks(text, chunk_size=5000, chunk_overlap=200):
    """Split text into overlapping chunks suitable for embedding.

    Args:
        text: The full text to split.
        chunk_size: Value forwarded to ``CharacterTextSplitter`` as
            ``chunk_size``. Defaults to 5000.
        chunk_overlap: Value forwarded to ``CharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 200.

    Returns:
        The chunks produced by ``CharacterTextSplitter.split_text``.
    """
    # TODO: try RecursiveCharacterTextSplitter as an alternative splitter.
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_text(text)


def get_vector_store(text_chunks, model="llama3.1", index_path="faiss_index"):
    """Embed text chunks with Ollama and persist them as a FAISS index.

    Args:
        text_chunks: Non-empty list of text chunks to embed.
        model: Ollama model name used to compute embeddings. The index must
            later be loaded with the same embedding model.
        index_path: Directory the index is written to via ``save_local``.

    Returns:
        None. The index is only written to disk.

    Raises:
        ValueError: If ``text_chunks`` is empty.
    """
    # Fail early with a clear message: an empty chunk list (e.g. a scanned
    # PDF with no extractable text) would otherwise surface as an obscure
    # error from inside the index construction.
    if not text_chunks:
        raise ValueError(
            "text_chunks is empty: nothing to index "
            "(the source document may contain no extractable text)"
        )

    embeddings = OllamaEmbeddings(model=model)
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    vector_store.save_local(index_path)

def get_conversational_chain(model_name="llama3.1", temperature=0, base_url="http://localhost:11434"):
    """Build the "stuff" question-answering chain backed by a ChatOllama model.

    The prompt instructs the model to answer only from the supplied context
    and to reply "answer is not available in the context" otherwise.

    Args:
        model_name: Ollama chat model to use.
        temperature: Sampling temperature passed to the chat model.
        base_url: URL of the Ollama server.

    Returns:
        The chain created by ``load_qa_chain``; it expects the keys
        ``input_documents`` and ``question`` when invoked.
    """
    # The context placeholder is followed directly by a newline: a stray "?"
    # after it would be appended to the retrieved context text.
    prompt_template = """
    Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
    provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
    Context:\n {context}\n
    Question: \n{question}\n

    Answer:
    """

    model = ChatOllama(model=model_name, temperature=temperature, base_url=base_url)

    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    return load_qa_chain(model, chain_type="stuff", prompt=prompt)

def user_input(user_question, index_path="faiss_index", model="llama3.1"):
    """Answer a question from the saved FAISS index and print the answer.

    Args:
        user_question: The question to answer.
        index_path: Directory of the FAISS index to load.
        model: Ollama model name used for embeddings; it should be the same
            embedding model the index was built with.

    Returns:
        None. The answer text is printed to standard output.
    """
    embeddings = OllamaEmbeddings(model=model)

    # SECURITY: allow_dangerous_deserialization=True lets FAISS unpickle the
    # stored index, which can execute arbitrary code. Only load an index that
    # was created locally by a trusted run, never one from an untrusted source.
    new_db = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
    docs = new_db.similarity_search(user_question)

    chain = get_conversational_chain()

    response = chain.invoke(
        {"input_documents": docs, "question": user_question},
        return_only_outputs=True,
    )

    print(response["output_text"])




In [8]:
# Build and persist the FAISS index for the paper: extract -> chunk -> embed.
PDF_PATH = "2405.18369v2.pdf"

raw_text = get_pdf_text(PDF_PATH)
text_chunks = get_text_chunks(raw_text)
get_vector_store(text_chunks)

In [None]:
# Query the saved index; user_input prints the model's answer.
question = "Can you tell me this paper title?"
user_input(question)