In [1]:
import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceInstructEmbeddings

from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

def get_pdf_text_from_folder(folder_path):
    """Concatenate the extracted text of every PDF directly inside a folder.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.

    Returns:
        One string holding the text of all pages of all PDFs, visited in
        sorted filename order. Returns "" when the folder has no PDF files.
    """
    page_texts = []
    # os.listdir returns entries in arbitrary order; sort so the combined
    # text (and therefore the chunks built from it) is reproducible.
    for filename in sorted(os.listdir(folder_path)):
        # Compare the extension case-insensitively so "REPORT.PDF" is included.
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_reader = PdfReader(os.path.join(folder_path, filename))
        for page in pdf_reader.pages:
            # Treat a missing extraction result as empty text so a None
            # never reaches the join below.
            page_texts.append(page.extract_text() or "")
    # Single join instead of repeated += on a growing string.
    return "".join(page_texts)

def get_text_chunks(text):
    """Split text into overlapping chunks on newline boundaries.

    The splitter is configured for chunks of up to 1000 characters
    (measured with ``len``) and an overlap of 200 characters between
    consecutive chunks.

    Args:
        text: The full text to split.

    Returns:
        The list of chunks produced by ``CharacterTextSplitter.split_text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)

def get_vectorstore(text_chunks):
    """Embed text chunks and index them in a FAISS vector store.

    Args:
        text_chunks: Non-empty list of strings to embed and index.

    Returns:
        The FAISS vector store built from ``text_chunks`` using the
        ``hkunlp/instructor-xl`` instruct embeddings.

    Raises:
        ValueError: If ``text_chunks`` is empty.
    """
    # Fail fast, before the embedding model is loaded: there is nothing to
    # index, and the caller gets a clear message instead of an obscure error.
    if not text_chunks:
        raise ValueError("text_chunks is empty; nothing to index")

    # Only the embedding model is needed to build the index; no
    # question-answering model or tokenizer is loaded here.
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)

def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over the given vector store.

    The chain uses the ``google/flan-t5-xxl`` model through
    ``HuggingFaceHub`` (temperature 0.5, max_length 512) and a
    ``ConversationBufferMemory`` stored under the ``chat_history`` key.

    Args:
        vectorstore: Object providing ``as_retriever()``.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``.
    """
    generation_settings = {"temperature": 0.5, "max_length": 512}
    hub_llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs=generation_settings,
    )
    history = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=hub_llm,
        retriever=vectorstore.as_retriever(),
        memory=history,
    )

def handle_user_input(conversation, user_question):
    """Send a question to the conversation chain and print the chat history.

    Args:
        conversation: Callable that accepts ``{'question': ...}`` and returns
            a mapping containing a ``'chat_history'`` sequence of messages.
        user_question: The question text to ask.

    Each message is printed as ``"<speaker>: <content>"``.
    """
    response = conversation({'question': user_question})

    for message in response['chat_history']:
        # LangChain's HumanMessage/AIMessage expose the speaker as ``type``;
        # only ChatMessage carries ``role``. Prefer ``role`` when present,
        # then ``type``, and fall back to the class name.
        speaker = (
            getattr(message, "role", None)
            or getattr(message, "type", None)
            or type(message).__name__
        )
        print(f"{speaker}: {message.content}")

def main(
    env_path="/content/drive/MyDrive/ask-multiple-pdfs/access.env",
    folder_path="/content/drive/MyDrive/Testing PDFs/drive-download-20231217T185155Z-001",
):
    """Index the PDFs in a folder and answer one question about them.

    Args:
        env_path: Path of the dotenv file passed to ``load_dotenv``.
        folder_path: Directory containing the PDF files to index.

    Prints a message and returns early when the folder does not exist or
    when no text chunks could be produced from it.
    """
    load_dotenv(env_path)

    # Report a missing folder instead of exiting silently.
    if not os.path.isdir(folder_path):
        print(f"Folder not found: {folder_path}")
        return

    print("Processing...")
    raw_text = get_pdf_text_from_folder(folder_path)
    text_chunks = get_text_chunks(raw_text)

    # Without any chunks there is nothing to index; stop before the
    # embedding model is loaded.
    if not text_chunks:
        print(f"No text could be extracted from PDFs in: {folder_path}")
        return

    vectorstore = get_vectorstore(text_chunks)
    conversation = get_conversation_chain(vectorstore)

    user_question = input("Ask a question about your documents:")
    if user_question:
        handle_user_input(conversation, user_question)

# Run the question-answering flow only when executed as a script (or as a
# notebook cell), not when this module is imported.
if __name__ == "__main__":
    main()


Processing...
load INSTRUCTOR_Transformer
max_seq_length  512
Ask a question about your documents:energy


ValueError: ignored

In [1]:
pip install transformers

